Diffstat (limited to 'python/openvino/runtime/python_demos/OpenVINO_benchmark_app')
-rw-r--r--   python/openvino/runtime/python_demos/OpenVINO_benchmark_app/README.md              6
-rw-r--r--   python/openvino/runtime/python_demos/OpenVINO_benchmark_app/benchmark.patch       78
-rw-r--r--   python/openvino/runtime/python_demos/OpenVINO_benchmark_app/benchmark.py         202
-rw-r--r--   python/openvino/runtime/python_demos/OpenVINO_benchmark_app/benchmark_app.patch   14
-rw-r--r--   python/openvino/runtime/python_demos/OpenVINO_benchmark_app/benchmark_app.py       8
-rw-r--r--   python/openvino/runtime/python_demos/OpenVINO_benchmark_app/main.patch           106
-rw-r--r--   python/openvino/runtime/python_demos/OpenVINO_benchmark_app/main.py              703
7 files changed, 1117 insertions, 0 deletions
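The patches below adapt the stock OpenVINO benchmark_app to the FPGA AI Suite (DLA) runtime: benchmark.patch constructs the Core from the DLA_PLUGINS environment variable (falling back to the deprecated DLA_PLUGINS_XML_FILE with a warning) and requires DLA_ARCH_FILE whenever the target device string contains "FPGA". What follows is a minimal sketch of that setup pattern; the environment variable names and the ARCH_PATH property are taken from benchmark.patch, while the device string and any file paths are illustrative placeholders.

import os
from openvino.runtime import Core

# DLA_PLUGINS is passed straight to Core(); an empty value keeps the default plugin set.
dla_plugins = os.environ.get("DLA_PLUGINS", "")
core = Core(dla_plugins)

device = "HETERO:FPGA,CPU"  # placeholder target device string
if "FPGA" in device:
    # FPGA targets additionally need the path to an FPGA AI Suite arch file.
    dla_arch_file = os.environ.get("DLA_ARCH_FILE")
    if dla_arch_file is None:
        raise RuntimeError("To use FPGA, set DLA_ARCH_FILE to the path of an arch file")
    core.set_property(device_name="FPGA", properties={"ARCH_PATH": dla_arch_file})

With DLA_PLUGINS and DLA_ARCH_FILE exported, benchmark_app.py can then be invoked with the sample commands referenced in the README below.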
diff --git a/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/README.md b/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/README.md new file mode 100644 index 0000000..7613e82 --- /dev/null +++ b/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/README.md @@ -0,0 +1,6 @@ +### OpenVINO Benchmark Tool +--- + +For detailed information on the OpenVINO Benchmark Tool, please see the [README](https://github.com/openvinotoolkit/openvino/tree/2023.3.0/tools/benchmark_tool) in the OpenVINO repository. Make sure to match the git tag with your installed version of OpenVINO for compatibility. + +If you need examples of how to use the Benchmark Tool, check the [README](../README.md) in the parent directory for sample commands. diff --git a/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/benchmark.patch b/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/benchmark.patch new file mode 100644 index 0000000..6696804 --- /dev/null +++ b/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/benchmark.patch @@ -0,0 +1,78 @@ +--- /nfs/site/disks/swip_dla_1/resources/inference_engine/2023.3.0_with_dev_tools/1/linux64/suse12/python/openvino/tools/benchmark/benchmark.py 2024-03-01 14:01:50.443877000 -0500 ++++ benchmark.py 2024-04-01 10:06:18.751566000 -0400 +@@ -1,14 +1,15 @@ +-# Copyright (C) 2018-2023 Intel Corporation ++# Copyright (C) 2018-2022 Intel Corporation + # SPDX-License-Identifier: Apache-2.0 + + import os + from datetime import datetime + from math import ceil ++import warnings + from openvino.runtime import Core, get_version, AsyncInferQueue + +-from .utils.constants import GPU_DEVICE_NAME, XML_EXTENSION, BIN_EXTENSION +-from .utils.logging import logger +-from .utils.utils import get_duration_seconds ++from openvino.tools.benchmark.utils.constants import GPU_DEVICE_NAME, XML_EXTENSION, BIN_EXTENSION ++from openvino.tools.benchmark.utils.logging import logger ++from openvino.tools.benchmark.utils.utils import get_duration_seconds + + def percentile(values, percent): + return values[ceil(len(values) * percent / 100) - 1] +@@ -17,7 +18,17 @@ + def __init__(self, device: str, number_infer_requests: int = 0, number_iterations: int = None, + duration_seconds: int = None, api_type: str = 'async', inference_only = None): + self.device = device +- self.core = Core() ++ dla_plugins = os.environ.get('DLA_PLUGINS', default='') ++ if dla_plugins == '': ++ # Backwards compatability for old DLA_PLUGINS_XML_FILE ++ warnings.warn("DLA_PLUGINS_XML_FILE option is deprecated as of 2024.1, Please use DLA_PLUGINS") ++ dla_plugins = os.environ.get('DLA_PLUGINS_XML_FILE', default='') ++ self.core = Core(dla_plugins) ++ if "FPGA" in self.device: ++ dla_arch_file = os.environ.get('DLA_ARCH_FILE') ++ if dla_arch_file is None: ++ raise Exception(f"To use FPGA, you need to specify the path to an arch_file!") ++ self.core.set_property(device_name="FPGA", properties={"ARCH_PATH": dla_arch_file}) + self.nireq = number_infer_requests if api_type == 'async' else 1 + self.niter = number_iterations + self.duration_seconds = get_duration_seconds(duration_seconds, self.niter, self.device) +@@ -59,6 +70,9 @@ + def set_cache_dir(self, cache_dir: str): + self.core.set_property({'CACHE_DIR': cache_dir}) + ++ def set_allow_auto_batching(self, flag: bool): ++ self.core.set_property({'ALLOW_AUTO_BATCHING': flag}) ++ + def read_model(self, path_to_model: str): + model_filename = os.path.abspath(path_to_model) + head, ext = os.path.splitext(model_filename) +@@ -110,7 +124,7 @@ + 
(self.duration_seconds and exec_time < self.duration_seconds) or \ + (iteration % self.nireq): + idle_id = infer_queue.get_idle_request_id() +- if idle_id in in_fly: ++ if idle_id in in_fly: # Is this check neccessary? + times.append(infer_queue[idle_id].latency) + else: + in_fly.add(idle_id) +@@ -162,7 +176,6 @@ + def main_loop(self, requests, data_queue, batch_size, latency_percentile, pcseq): + if self.api_type == 'sync': + times, total_duration_sec, iteration = self.sync_inference(requests[0], data_queue) +- fps = len(batch_size) * iteration / total_duration_sec + elif self.inference_only: + times, total_duration_sec, iteration = self.async_inference_only(requests) + fps = len(batch_size) * iteration / total_duration_sec +@@ -175,6 +188,9 @@ + min_latency_ms = times[0] + max_latency_ms = times[-1] + ++ if self.api_type == 'sync': ++ fps = len(batch_size) * 1000 / median_latency_ms ++ + if pcseq: + for group in self.latency_groups: + if group.times: diff --git a/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/benchmark.py b/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/benchmark.py new file mode 100644 index 0000000..a98b82a --- /dev/null +++ b/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/benchmark.py @@ -0,0 +1,202 @@ +# Copyright (C) 2018-2022 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os +from datetime import datetime +from math import ceil +import warnings +from openvino.runtime import Core, get_version, AsyncInferQueue + +from openvino.tools.benchmark.utils.constants import GPU_DEVICE_NAME, XML_EXTENSION, BIN_EXTENSION +from openvino.tools.benchmark.utils.logging import logger +from openvino.tools.benchmark.utils.utils import get_duration_seconds + +def percentile(values, percent): + return values[ceil(len(values) * percent / 100) - 1] + +class Benchmark: + def __init__(self, device: str, number_infer_requests: int = 0, number_iterations: int = None, + duration_seconds: int = None, api_type: str = 'async', inference_only = None): + self.device = device + dla_plugins = os.environ.get('DLA_PLUGINS', default='') + if dla_plugins == '': + # Backwards compatability for old DLA_PLUGINS_XML_FILE + warnings.warn("DLA_PLUGINS_XML_FILE option is deprecated as of 2024.1, Please use DLA_PLUGINS") + dla_plugins = os.environ.get('DLA_PLUGINS_XML_FILE', default='') + self.core = Core(dla_plugins) + if "FPGA" in self.device: + dla_arch_file = os.environ.get('DLA_ARCH_FILE') + if dla_arch_file is None: + raise Exception(f"To use FPGA, you need to specify the path to an arch_file!") + self.core.set_property(device_name="FPGA", properties={"ARCH_PATH": dla_arch_file}) + self.nireq = number_infer_requests if api_type == 'async' else 1 + self.niter = number_iterations + self.duration_seconds = get_duration_seconds(duration_seconds, self.niter, self.device) + self.api_type = api_type + self.inference_only = inference_only + self.latency_groups = [] + + def __del__(self): + del self.core + + def add_extension(self, path_to_extensions: str=None, path_to_cldnn_config: str=None): + if path_to_cldnn_config: + self.core.set_property(GPU_DEVICE_NAME, {'CONFIG_FILE': path_to_cldnn_config}) + logger.info(f'GPU extensions is loaded {path_to_cldnn_config}') + + if path_to_extensions: + for extension in path_to_extensions.split(","): + logger.info(f"Loading extension {extension}") + self.core.add_extension(extension) + + def print_version_info(self) -> None: + version = get_version() + logger.info('OpenVINO:') + logger.info(f"{'Build ':.<39} 
{version}") + logger.info("") + + logger.info("Device info:") + for device, version in self.core.get_versions(self.device).items(): + logger.info(f"{device}") + logger.info(f"{'Build ':.<39} {version.build_number}") + + logger.info("") + logger.info("") + + def set_config(self, config = {}): + for device in config.keys(): + self.core.set_property(device, config[device]) + + def set_cache_dir(self, cache_dir: str): + self.core.set_property({'CACHE_DIR': cache_dir}) + + def set_allow_auto_batching(self, flag: bool): + self.core.set_property({'ALLOW_AUTO_BATCHING': flag}) + + def read_model(self, path_to_model: str): + model_filename = os.path.abspath(path_to_model) + head, ext = os.path.splitext(model_filename) + weights_filename = os.path.abspath(head + BIN_EXTENSION) if ext == XML_EXTENSION else "" + return self.core.read_model(model_filename, weights_filename) + + def create_infer_requests(self, compiled_model): + if self.api_type == 'sync': + requests = [compiled_model.create_infer_request()] + else: + requests = AsyncInferQueue(compiled_model, self.nireq) + self.nireq = len(requests) + return requests + + def first_infer(self, requests): + if self.api_type == 'sync': + requests[0].infer() + return requests[0].latency + else: + id = requests.get_idle_request_id() + requests.start_async() + requests.wait_all() + return requests[id].latency + + def sync_inference(self, request, data_queue): + exec_time = 0 + iteration = 0 + times = [] + start_time = datetime.utcnow() + while (self.niter and iteration < self.niter) or \ + (self.duration_seconds and exec_time < self.duration_seconds): + if self.inference_only == False: + request.set_input_tensors(data_queue.get_next_input()) + request.infer() + times.append(request.latency) + iteration += 1 + + exec_time = (datetime.utcnow() - start_time).total_seconds() + total_duration_sec = (datetime.utcnow() - start_time).total_seconds() + return sorted(times), total_duration_sec, iteration + + def async_inference_only(self, infer_queue): + exec_time = 0 + iteration = 0 + times = [] + in_fly = set() + start_time = datetime.utcnow() + while (self.niter and iteration < self.niter) or \ + (self.duration_seconds and exec_time < self.duration_seconds) or \ + (iteration % self.nireq): + idle_id = infer_queue.get_idle_request_id() + if idle_id in in_fly: # Is this check neccessary? 
+ times.append(infer_queue[idle_id].latency) + else: + in_fly.add(idle_id) + infer_queue.start_async() + iteration += 1 + + exec_time = (datetime.utcnow() - start_time).total_seconds() + infer_queue.wait_all() + total_duration_sec = (datetime.utcnow() - start_time).total_seconds() + for infer_request_id in in_fly: + times.append(infer_queue[infer_request_id].latency) + return sorted(times), total_duration_sec, iteration + + def async_inference_full_mode(self, infer_queue, data_queue, pcseq): + processed_frames = 0 + exec_time = 0 + iteration = 0 + times = [] + num_groups = len(self.latency_groups) + start_time = datetime.utcnow() + in_fly = set() + while (self.niter and iteration < self.niter) or \ + (self.duration_seconds and exec_time < self.duration_seconds) or \ + (iteration % num_groups): + processed_frames += data_queue.get_next_batch_size() + idle_id = infer_queue.get_idle_request_id() + if idle_id in in_fly: + times.append(infer_queue[idle_id].latency) + if pcseq: + self.latency_groups[infer_queue.userdata[idle_id]].times.append(infer_queue[idle_id].latency) + else: + in_fly.add(idle_id) + group_id = data_queue.current_group_id + infer_queue[idle_id].set_input_tensors(data_queue.get_next_input()) + infer_queue.start_async(userdata=group_id) + iteration += 1 + + exec_time = (datetime.utcnow() - start_time).total_seconds() + infer_queue.wait_all() + total_duration_sec = (datetime.utcnow() - start_time).total_seconds() + + for infer_request_id in in_fly: + times.append(infer_queue[infer_request_id].latency) + if pcseq: + self.latency_groups[infer_queue.userdata[infer_request_id]].times.append(infer_queue[infer_request_id].latency) + + return sorted(times), total_duration_sec, processed_frames, iteration + + def main_loop(self, requests, data_queue, batch_size, latency_percentile, pcseq): + if self.api_type == 'sync': + times, total_duration_sec, iteration = self.sync_inference(requests[0], data_queue) + elif self.inference_only: + times, total_duration_sec, iteration = self.async_inference_only(requests) + fps = len(batch_size) * iteration / total_duration_sec + else: + times, total_duration_sec, processed_frames, iteration = self.async_inference_full_mode(requests, data_queue, pcseq) + fps = processed_frames / total_duration_sec + + median_latency_ms = percentile(times, latency_percentile) + avg_latency_ms = sum(times) / len(times) + min_latency_ms = times[0] + max_latency_ms = times[-1] + + if self.api_type == 'sync': + fps = len(batch_size) * 1000 / median_latency_ms + + if pcseq: + for group in self.latency_groups: + if group.times: + group.times.sort() + group.median = percentile(group.times, latency_percentile) + group.avg = sum(group.times) / len(group.times) + group.min = group.times[0] + group.max = group.times[-1] + return fps, median_latency_ms, avg_latency_ms, min_latency_ms, max_latency_ms, total_duration_sec, iteration diff --git a/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/benchmark_app.patch b/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/benchmark_app.patch new file mode 100644 index 0000000..4a003ad --- /dev/null +++ b/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/benchmark_app.patch @@ -0,0 +1,14 @@ +--- /p/psg/swip/dla/resources/inference_engine/2022.3.0/centos7/openvino_2022/openvino_env/bin/benchmark_app 2023-02-07 15:01:24.336634000 -0500 ++++ benchmark_app.py 2023-05-03 12:01:20.435826000 -0400 +@@ -1,8 +1,8 @@ 
+-#!/nfs/site/disks/swip_dla_1/resources/inference_engine/2022.3.0/centos7/openvino_2022/openvino_env/bin/python ++#!/usr/bin/python3 + # -*- coding: utf-8 -*- + import re + import sys +-from openvino.tools.benchmark.main import main ++import main + if __name__ == '__main__': + sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0]) +- sys.exit(main()) ++ sys.exit(main.main()) diff --git a/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/benchmark_app.py b/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/benchmark_app.py new file mode 100644 index 0000000..d5b9c9a --- /dev/null +++ b/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/benchmark_app.py @@ -0,0 +1,8 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- +import re +import sys +import main +if __name__ == '__main__': + sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0]) + sys.exit(main.main()) diff --git a/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/main.patch b/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/main.patch new file mode 100644 index 0000000..99afb40 --- /dev/null +++ b/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/main.patch @@ -0,0 +1,106 @@ +--- /nfs/site/disks/swip_dla_1/resources/inference_engine/2023.3.0_with_dev_tools/1/linux64/suse12/python/openvino/tools/benchmark/main.py 2024-03-01 14:01:50.466871000 -0500 ++++ main.py 2024-10-29 11:10:06.569928000 -0400 +@@ -7,11 +7,11 @@ + + from openvino.runtime import Dimension,properties + +-from openvino.tools.benchmark.benchmark import Benchmark ++import benchmark as openvino_benchmark + from openvino.tools.benchmark.parameters import parse_args + from openvino.tools.benchmark.utils.constants import MULTI_DEVICE_NAME, \ + CPU_DEVICE_NAME, GPU_DEVICE_NAME, \ +- BLOB_EXTENSION, AUTO_DEVICE_NAME ++ BIN_EXTENSION, AUTO_DEVICE_NAME + from openvino.tools.benchmark.utils.inputs_filling import get_input_data + from openvino.tools.benchmark.utils.logging import logger + from openvino.tools.benchmark.utils.utils import next_step, get_number_iterations, pre_post_processing, \ +@@ -41,13 +41,13 @@ + if args.report_type == "average_counters" and MULTI_DEVICE_NAME in args.target_device: + raise Exception("only detailed_counters report type is supported for MULTI device") + +- _, ext = os.path.splitext(args.path_to_model) +- is_network_compiled = True if ext == BLOB_EXTENSION else False +- is_precisiton_set = not (args.input_precision == "" and args.output_precision == "" and args.input_output_precision == "") ++ if args.number_infer_requests != 1 and "FPGA" in args.target_device: ++ logger.warning(f"If the target FPGA design uses JTAG to access the CSRs on the FPGA AI Suite IP "\ ++ "(e.g. the Agilex 5E Premium Development Kit JTAG Design Example), "\ ++ "then the number of inference request must be 1.") + +- if is_network_compiled and is_precisiton_set: +- raise Exception("Cannot set precision for a compiled model. " \ +- "Please re-compile your model with required precision.") ++ _, ext = os.path.splitext(args.path_to_model) ++ is_network_compiled = True if ext == BIN_EXTENSION else False + + return args, is_network_compiled + +@@ -84,7 +84,7 @@ + # ------------------------------ 2. 
Loading OpenVINO Runtime ------------------------------------------- + next_step(step_id=2) + +- benchmark = Benchmark(args.target_device, args.number_infer_requests, ++ benchmark = openvino_benchmark.Benchmark(args.target_device, args.number_infer_requests, + args.number_iterations, args.time, args.api_type, args.inference_only) + + if args.extensions: +@@ -166,8 +166,11 @@ + supported_properties = benchmark.core.get_property(device, properties.supported_properties()) + if device not in config.keys(): + config[device] = {} +- + ## high-level performance modes ++ # The orginial OV 2022.3 Python API fails with the pc flag, so we comment it out ++ # for both the HETERO and FPGA devices in our patched version of the Python demos ++ if device in ['HETERO', 'FPGA']: ++ continue + set_performance_hint(device) + + if is_flag_set_in_command_line('nireq'): +@@ -429,16 +432,21 @@ + next_step() + + start_time = datetime.utcnow() +- compiled_model = benchmark.core.import_model(args.path_to_model, benchmark.device, device_config) +- duration_ms = f"{(datetime.utcnow() - start_time).total_seconds() * 1000:.2f}" +- logger.info(f"Import model took {duration_ms} ms") +- if statistics: +- statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS, +- [ +- ('import model time (ms)', duration_ms) +- ]) +- app_inputs_info, _ = get_inputs_info(args.shape, args.data_shape, args.layout, args.batch_size, args.scale_values, args.mean_values, compiled_model.inputs) +- batch_size = get_network_batch_size(app_inputs_info) ++ try: ++ with open(args.path_to_model, "rb") as model_stream: ++ model_bytes = model_stream.read() ++ compiled_model = benchmark.core.import_model(model_bytes, device_name) ++ duration_ms = f"{(datetime.utcnow() - start_time).total_seconds() * 1000:.2f}" ++ logger.info(f"Import model took {duration_ms} ms") ++ if statistics: ++ statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS, ++ [ ++ ('import model time (ms)', duration_ms) ++ ]) ++ app_inputs_info, _ = get_inputs_info(args.shape, args.data_shape, args.layout, args.batch_size, args.scale_values, args.mean_values, compiled_model.inputs) ++ batch_size = get_network_batch_size(app_inputs_info) ++ except Exception as e: ++ raise RuntimeError(f"Cannot open or import compiled model file: {args.path_to_model}. Error: {str(e)}") + + # --------------------- 8. 
Querying optimal runtime parameters -------------------------------------------------- + next_step() +@@ -653,7 +661,7 @@ + exeDevice = compiled_model.get_property("EXECUTION_DEVICES") + logger.info(f'Execution Devices:{exeDevice}') + except: +- pass ++ exeDevice = None + logger.info(f'Count: {iteration} iterations') + logger.info(f'Duration: {get_duration_in_milliseconds(total_duration_sec):.2f} ms') + if MULTI_DEVICE_NAME not in device_name: +@@ -692,4 +700,4 @@ + [('error', str(e))] + ) + statistics.dump() +- sys.exit(1) ++ sys.exit(1) +\ No newline at end of file diff --git a/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/main.py b/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/main.py new file mode 100644 index 0000000..e11daec --- /dev/null +++ b/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/main.py @@ -0,0 +1,703 @@ +# Copyright (C) 2018-2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os +import sys +from datetime import datetime + +from openvino.runtime import Dimension,properties + +import benchmark as openvino_benchmark +from openvino.tools.benchmark.parameters import parse_args +from openvino.tools.benchmark.utils.constants import MULTI_DEVICE_NAME, \ + CPU_DEVICE_NAME, GPU_DEVICE_NAME, \ + BIN_EXTENSION, AUTO_DEVICE_NAME +from openvino.tools.benchmark.utils.inputs_filling import get_input_data +from openvino.tools.benchmark.utils.logging import logger +from openvino.tools.benchmark.utils.utils import next_step, get_number_iterations, pre_post_processing, \ + process_help_inference_string, print_perf_counters, print_perf_counters_sort, dump_exec_graph, get_duration_in_milliseconds, \ + get_command_line_arguments, parse_value_per_device, parse_devices, get_inputs_info, \ + print_inputs_and_outputs_info, get_network_batch_size, load_config, dump_config, get_latency_groups, \ + check_for_static, can_measure_as_static, parse_value_for_virtual_device, is_virtual_device, is_virtual_device_found +from openvino.tools.benchmark.utils.statistics_report import StatisticsReport, JsonStatisticsReport, CsvStatisticsReport, \ + averageCntReport, detailedCntReport + +def parse_and_check_command_line(): + def arg_not_empty(arg_value,empty_value): + return not arg_value is None and not arg_value == empty_value + + parser = parse_args() + args = parser.parse_args() + + if args.latency_percentile < 1 or args.latency_percentile > 100: + parser.print_help() + raise RuntimeError("The percentile value is incorrect. The applicable values range is [1, 100].") + + if not args.perf_hint == "none" and (arg_not_empty(args.number_streams, "") or arg_not_empty(args.number_threads, 0) or arg_not_empty(args.infer_threads_pinning, "")): + raise Exception("-nstreams, -nthreads and -pin options are fine tune options. To use them you " \ + "should explicitely set -hint option to none. This is not OpenVINO limitation " \ + "(those options can be used in OpenVINO together), but a benchmark_app UI rule.") + + if args.report_type == "average_counters" and MULTI_DEVICE_NAME in args.target_device: + raise Exception("only detailed_counters report type is supported for MULTI device") + + if args.number_infer_requests != 1 and "FPGA" in args.target_device: + logger.warning(f"If the target FPGA design uses JTAG to access the CSRs on the FPGA AI Suite IP "\ + "(e.g. 
the Agilex 5E Premium Development Kit JTAG Design Example), "\ + "then the number of inference request must be 1.") + + _, ext = os.path.splitext(args.path_to_model) + is_network_compiled = True if ext == BIN_EXTENSION else False + + return args, is_network_compiled + +def main(): + statistics = None + try: + # ------------------------------ 1. Parsing and validating input arguments ------------------------------ + next_step() + logger.info("Parsing input parameters") + args, is_network_compiled = parse_and_check_command_line() + + command_line_arguments = get_command_line_arguments(sys.argv) + if args.report_type: + _statistics_class = JsonStatisticsReport if args.json_stats else CsvStatisticsReport + statistics = _statistics_class(StatisticsReport.Config(args.report_type, args.report_folder)) + statistics.add_parameters(StatisticsReport.Category.COMMAND_LINE_PARAMETERS, command_line_arguments) + + def is_flag_set_in_command_line(flag): + return any(x.strip('-') == flag for x, y in command_line_arguments) + + device_name = args.target_device + + devices = parse_devices(device_name) + device_number_streams = parse_value_per_device(devices, args.number_streams, "nstreams") + device_infer_precision = parse_value_per_device(devices, args.infer_precision, "infer_precision") + + config = {} + if args.load_config: + load_config(args.load_config, config) + + if is_network_compiled: + logger.info("Model is compiled") + + # ------------------------------ 2. Loading OpenVINO Runtime ------------------------------------------- + next_step(step_id=2) + + benchmark = openvino_benchmark.Benchmark(args.target_device, args.number_infer_requests, + args.number_iterations, args.time, args.api_type, args.inference_only) + + if args.extensions: + benchmark.add_extension(path_to_extensions=args.extensions) + + ## GPU (clDNN) Extensions + if GPU_DEVICE_NAME in device_name and args.path_to_cldnn_config: + if GPU_DEVICE_NAME not in config.keys(): + config[GPU_DEVICE_NAME] = {} + config[GPU_DEVICE_NAME]['CONFIG_FILE'] = args.path_to_cldnn_config + + if GPU_DEVICE_NAME in config.keys() and 'CONFIG_FILE' in config[GPU_DEVICE_NAME].keys(): + cldnn_config = config[GPU_DEVICE_NAME]['CONFIG_FILE'] + benchmark.add_extension(path_to_cldnn_config=cldnn_config) + + benchmark.print_version_info() + + # --------------------- 3. Setting device configuration -------------------------------------------------------- + next_step() + + def set_performance_hint(device): + perf_hint = properties.hint.PerformanceMode.UNDEFINED + supported_properties = benchmark.core.get_property(device, properties.supported_properties()) + if properties.hint.performance_mode() in supported_properties: + if is_flag_set_in_command_line('hint'): + if args.perf_hint == "throughput" or args.perf_hint == "tput": + perf_hint = properties.hint.PerformanceMode.THROUGHPUT + elif args.perf_hint == "latency": + perf_hint = properties.hint.PerformanceMode.LATENCY + elif args.perf_hint == "cumulative_throughput" or args.perf_hint == "ctput": + perf_hint = properties.hint.PerformanceMode.CUMULATIVE_THROUGHPUT + elif args.perf_hint=='none': + perf_hint = properties.hint.PerformanceMode.UNDEFINED + else: + raise RuntimeError("Incorrect performance hint. 
Please set -hint option to" + "`throughput`(tput), `latency', 'cumulative_throughput'(ctput) value or 'none'.") + else: + perf_hint = properties.hint.PerformanceMode.THROUGHPUT if benchmark.api_type == "async" else properties.hint.PerformanceMode.LATENCY + logger.warning(f"Performance hint was not explicitly specified in command line. " + + f"Device({device}) performance hint will be set to {perf_hint}.") + if perf_hint != properties.hint.PerformanceMode.UNDEFINED: + config[device][properties.hint.performance_mode()] = perf_hint + else: + logger.warning(f"Device {device} does not support performance hint property(-hint).") + + + def get_device_type_from_name(name) : + new_name = str(name) + new_name = new_name.split(".", 1)[0] + new_name = new_name.split("(", 1)[0] + return new_name + + ## Set default values from dumped config + default_devices = set() + for device in devices: + device_type = get_device_type_from_name(device) + if device_type in config and device not in config: + config[device] = config[device_type].copy() + default_devices.add(device_type) + + for def_device in default_devices: + config.pop(def_device) + + perf_counts = False + # check if using the virtual device + hw_devices_list = devices.copy() + # Remove the hardware devices if AUTO/MULTI/HETERO appears in the devices list. + is_virtual = is_virtual_device_found(devices) + if is_virtual: + devices.clear() + # Parse out the currect virtual device as the target device. + virtual_device = device_name.partition(":")[0] + hw_devices_list.remove(virtual_device) + devices.append(virtual_device) + parse_value_for_virtual_device(virtual_device, device_number_streams) + parse_value_for_virtual_device(virtual_device, device_infer_precision) + + for device in devices: + supported_properties = benchmark.core.get_property(device, properties.supported_properties()) + if device not in config.keys(): + config[device] = {} + ## high-level performance modes + # The orginial OV 2022.3 Python API fails with the pc flag, so we comment it out + # for both the HETERO and FPGA devices in our patched version of the Python demos + if device in ['HETERO', 'FPGA']: + continue + set_performance_hint(device) + + if is_flag_set_in_command_line('nireq'): + config[device][properties.hint.num_requests()] = str(args.number_infer_requests) + + ## Set performance counter + if is_flag_set_in_command_line('pc'): + ## set to user defined value + config[device][properties.enable_profiling()] = True if args.perf_counts else False + elif properties.enable_profiling() in config[device].keys() and config[device][properties.enable_profiling()] == True: + logger.warning(f"Performance counters for {device} device is turned on. 
" + + "To print results use -pc option.") + elif args.report_type in [ averageCntReport, detailedCntReport ]: + logger.warning(f"Turn on performance counters for {device} device " + + f"since report type is {args.report_type}.") + config[device][properties.enable_profiling()] = True + elif args.exec_graph_path is not None: + logger.warning(f"Turn on performance counters for {device} device " + + "due to execution graph dumping.") + config[device][properties.enable_profiling()] = True + elif is_flag_set_in_command_line('pcsort'): + ## set to default value + logger.warning(f"Turn on performance counters for {device} device " + + f"since pcsort value is {args.perf_counts_sort}.") + config[device][properties.enable_profiling()] = True if args.perf_counts_sort else False + else: + ## set to default value + config[device][properties.enable_profiling()] = args.perf_counts + perf_counts = True if config[device][properties.enable_profiling()] == True else perf_counts + + ## insert or append property into hw device properties list + def update_configs(hw_device, property_name, property_value): + (key, value) = properties.device.properties({hw_device:{property_name:property_value}}) + # add property into hw device properties list. + if key not in config[device].keys(): + config[device][key] = value + else: + current_config = config[device][key].get() + if hw_device not in current_config.keys(): + current_config.update(value.get()) + else: + current_device_config = current_config[hw_device] + for prop in value.get().items(): + current_device_config.update(prop[1]) + current_config[hw_device].update(current_device_config) + config[device][key].set(current_config) + + def update_device_config_for_virtual_device(value, config, key): + # check if the element contains the hardware device property + if len(value.split(':')) == 1: + config[device][key] = device_infer_precision[device] + else: + # set device nstreams properties in the AUTO/MULTI plugin + value_vec = value[value.find('{') + 1:value.rfind('}')].split(',') + device_properties = {value_vec[i].split(':')[0] : value_vec[i].split(':')[1] for i in range(0, len(value_vec))} + for hw_device in device_properties.keys(): + update_configs(hw_device, key, device_properties[hw_device]) + + ## infer precision + def set_infer_precision(): + key = properties.hint.inference_precision() + if device in device_infer_precision.keys(): + ## set to user defined value + if key in supported_properties: + config[device][key] = device_infer_precision[device] + elif is_virtual_device(device): + update_device_config_for_virtual_device(device_infer_precision[device], config, key) + else: + raise Exception(f"Device {device} doesn't support config key INFERENCE_PRECISION_HINT!" 
\ + " Please specify -infer_precision for correct devices in format" \ + " <dev1>:<infer_precision1>,<dev2>:<infer_precision2> or via configuration file.") + return + + ## the rest are individual per-device settings (overriding the values the device will deduce from perf hint) + def set_throughput_streams(): + key = get_device_type_from_name(device) + "_THROUGHPUT_STREAMS" + if device in device_number_streams.keys(): + ## set to user defined value + if key in supported_properties: + config[device][key] = device_number_streams[device] + elif properties.streams.num() in supported_properties: + key = properties.streams.num() + config[device][key] = device_number_streams[device] + elif is_virtual_device(device): + key = properties.streams.num() + update_device_config_for_virtual_device(device_number_streams[device], config, key) + else: + raise Exception(f"Device {device} doesn't support config key '{key}'! " + + "Please specify -nstreams for correct devices in format <dev1>:<nstreams1>,<dev2>:<nstreams2>") + elif key not in config[device].keys() and args.api_type == "async" and key not in config[device].keys() \ + and 'PERFORMANCE_HINT' in config[device].keys() and config[device]['PERFORMANCE_HINT'] == '': + ## set the _AUTO value for the #streams + logger.warning(f"-nstreams default value is determined automatically for {device} device. " + + "Although the automatic selection usually provides a reasonable performance, " + "but it still may be non-optimal for some cases, for more information look at README.") + if key in supported_properties: + config[device][key] = get_device_type_from_name(device) + "_THROUGHPUT_AUTO" + elif properties.streams.Num() in supported_properties: + key = properties.streams.Num() + config[device][key] = "-1" # Set AUTO mode for streams number + elif is_virtual_device(device): + # Set nstreams to default value auto if no nstreams specified from cmd line. + for hw_device in hw_devices_list: + hw_supported_properties = benchmark.core.get_property(hw_device, properties.supported_properties()) + key = get_device_type_from_name(hw_device) + "_THROUGHPUT_STREAMS" + value = get_device_type_from_name(hw_device) + "_THROUGHPUT_AUTO" + if key not in hw_supported_properties: + key = properties.streams.Num() + value = properties.streams.Num.AUTO + if key in hw_supported_properties: + update_configs(hw_device, key, value) + if key in config[device].keys(): + device_number_streams[device] = config[device][key] + return + + def set_nthreads_pin(property_name, property_value): + if property_name == properties.affinity(): + if property_value == "YES": + property_value = properties.Affinity.CORE + elif property_value == "NO": + property_value = properties.Affinity.NONE + if property_name in supported_properties or device_name == AUTO_DEVICE_NAME: + # create nthreads/pin primary property for HW device or AUTO if -d is AUTO directly. + config[device][property_name] = property_value + elif is_virtual: + # Create secondary property of -nthreads/-pin only for CPU if CPU device appears in the devices + # list specified by -d. 
+ if CPU_DEVICE_NAME in hw_devices_list: + update_configs(CPU_DEVICE_NAME, property_name, property_value) + return + + if args.number_threads and is_flag_set_in_command_line("nthreads"): + # limit threading for CPU portion of inference + set_nthreads_pin(properties.inference_num_threads(), str(args.number_threads)) + + if is_flag_set_in_command_line('pin'): + ## set for CPU to user defined value + set_nthreads_pin(properties.affinity(), args.infer_threads_pinning) + + set_throughput_streams() + set_infer_precision() + + if is_virtual_device(device): + if device in device_number_streams.keys(): + del device_number_streams[device] + + device_config = {} + for device in config: + if benchmark.device.find(device) == 0: + device_config = config[device] + if args.cache_dir: + benchmark.set_cache_dir(args.cache_dir) + + ## If set batch size, disable the auto batching + if args.batch_size: + logger.warning("Batch size is set. Auto batching will be disabled") + device_config["ALLOW_AUTO_BATCHING"] = False + + topology_name = "" + load_from_file_enabled = is_flag_set_in_command_line('load_from_file') or is_flag_set_in_command_line('lfile') + if load_from_file_enabled and not is_network_compiled: + if args.mean_values or args.scale_values: + raise RuntimeError("--mean_values and --scale_values aren't supported with --load_from_file. " + "The values can be set via model_optimizer while generating xml") + next_step() + print("Skipping the step for loading model from file") + next_step() + print("Skipping the step for loading model from file") + next_step() + print("Skipping the step for loading model from file") + + # --------------------- 7. Loading the model to the device ------------------------------------------------- + next_step() + + start_time = datetime.utcnow() + compiled_model = benchmark.core.compile_model(args.path_to_model, benchmark.device, device_config) + duration_ms = f"{(datetime.utcnow() - start_time).total_seconds() * 1000:.2f}" + logger.info(f"Compile model took {duration_ms} ms") + if statistics: + statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS, + [ + ('compile model time (ms)', duration_ms) + ]) + app_inputs_info, _ = get_inputs_info(args.shape, args.data_shape, args.layout, args.batch_size, args.scale_values, args.mean_values, compiled_model.inputs) + batch_size = get_network_batch_size(app_inputs_info) + elif not is_network_compiled: + # --------------------- 4. Read the Intermediate Representation of the network ----------------------------- + next_step() + + logger.info("Loading model files") + + start_time = datetime.utcnow() + model = benchmark.read_model(args.path_to_model) + topology_name = model.get_name() + duration_ms = f"{(datetime.utcnow() - start_time).total_seconds() * 1000:.2f}" + logger.info(f"Read model took {duration_ms} ms") + logger.info("Original model I/O parameters:") + print_inputs_and_outputs_info(model) + + if statistics: + statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS, + [ + ('read model time (ms)', duration_ms) + ]) + + # --------------------- 5. 
Resizing network to match image sizes and given batch --------------------------- + next_step() + + app_inputs_info, reshape = get_inputs_info(args.shape, args.data_shape, args.layout, args.batch_size, args.scale_values, args.mean_values, model.inputs) + + # use batch size according to provided layout and shapes + batch_size = get_network_batch_size(app_inputs_info) + logger.info(f'Model batch size: {batch_size}') + + if reshape: + start_time = datetime.utcnow() + shapes = { info.name : info.partial_shape for info in app_inputs_info } + logger.info( + 'Reshaping model: {}'.format(', '.join("'{}': {}".format(k, str(v)) for k, v in shapes.items()))) + model.reshape(shapes) + duration_ms = f"{(datetime.utcnow() - start_time).total_seconds() * 1000:.2f}" + logger.info(f"Reshape model took {duration_ms} ms") + if statistics: + statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS, + [ + ('reshape model time (ms)', duration_ms) + ]) + + # --------------------- 6. Configuring inputs and outputs of the model -------------------------------------------------- + next_step() + + pre_post_processing(model, app_inputs_info, args.input_precision, args.output_precision, args.input_output_precision) + print_inputs_and_outputs_info(model) + + # --------------------- 7. Loading the model to the device ------------------------------------------------- + next_step() + start_time = datetime.utcnow() + compiled_model = benchmark.core.compile_model(model, benchmark.device, device_config) + + duration_ms = f"{(datetime.utcnow() - start_time).total_seconds() * 1000:.2f}" + logger.info(f"Compile model took {duration_ms} ms") + if statistics: + statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS, + [ + ('compile model time (ms)', duration_ms) + ]) + else: + if args.mean_values or args.scale_values: + raise RuntimeError("--mean_values and --scale_values aren't supported for compiled model. " + "The values can be set via model_optimizer while generating xml") + next_step() + print("Skipping the step for compiled model") + next_step() + print("Skipping the step for compiled model") + next_step() + print("Skipping the step for compiled model") + + # --------------------- 7. Loading the model to the device ------------------------------------------------- + next_step() + + start_time = datetime.utcnow() + try: + with open(args.path_to_model, "rb") as model_stream: + model_bytes = model_stream.read() + compiled_model = benchmark.core.import_model(model_bytes, device_name) + duration_ms = f"{(datetime.utcnow() - start_time).total_seconds() * 1000:.2f}" + logger.info(f"Import model took {duration_ms} ms") + if statistics: + statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS, + [ + ('import model time (ms)', duration_ms) + ]) + app_inputs_info, _ = get_inputs_info(args.shape, args.data_shape, args.layout, args.batch_size, args.scale_values, args.mean_values, compiled_model.inputs) + batch_size = get_network_batch_size(app_inputs_info) + except Exception as e: + raise RuntimeError(f"Cannot open or import compiled model file: {args.path_to_model}. Error: {str(e)}") + + # --------------------- 8. 
Querying optimal runtime parameters -------------------------------------------------- + next_step() + + ## actual device-deduced settings + keys = compiled_model.get_property(properties.supported_properties()) + logger.info("Model:") + for k in keys: + skip_keys = ('SUPPORTED_METRICS', 'SUPPORTED_CONFIG_KEYS', properties.supported_properties()) + if k not in skip_keys: + value = compiled_model.get_property(k) + if k == properties.device.properties(): + for device_key in value.keys(): + logger.info(f' {device_key}:') + for k2, value2 in value.get(device_key).items(): + if k2 not in skip_keys: + logger.info(f' {k2}: {value2}') + else: + logger.info(f' {k}: {value}') + + # Update number of streams + for device in device_number_streams.keys(): + try: + key = get_device_type_from_name(device) + '_THROUGHPUT_STREAMS' + device_number_streams[device] = compiled_model.get_property(key) + except: + key = 'NUM_STREAMS' + device_number_streams[device] = compiled_model.get_property(key) + + # ------------------------------------ 9. Creating infer requests and preparing input data ---------------------- + next_step() + + # Create infer requests + requests = benchmark.create_infer_requests(compiled_model) + + # Prepare input data + paths_to_input = list() + if args.paths_to_input: + for path in args.paths_to_input: + if ":" in next(iter(path), ""): + paths_to_input.extend(path) + else: + paths_to_input.append(os.path.abspath(*path)) + + data_queue = get_input_data(paths_to_input, app_inputs_info) + + static_mode = check_for_static(app_inputs_info) + allow_inference_only_or_sync = can_measure_as_static(app_inputs_info) + if not allow_inference_only_or_sync and benchmark.api_type == 'sync': + raise Exception("Benchmarking of the model with dynamic shapes is available for async API only. 
" + "Please use -api async -hint latency -nireq 1 to emulate sync behavior.") + + if benchmark.inference_only == None: + if static_mode: + benchmark.inference_only = True + else: + benchmark.inference_only = False + elif benchmark.inference_only and not allow_inference_only_or_sync: + raise Exception("Benchmarking dynamic model available with input filling in measurement loop only!") + + # update batch size in case dynamic network with one data_shape + if allow_inference_only_or_sync and batch_size.is_dynamic: + batch_size = Dimension(data_queue.batch_sizes[data_queue.current_group_id]) + + benchmark.latency_groups = get_latency_groups(app_inputs_info) + + if len(benchmark.latency_groups) > 1: + logger.info(f"Defined {len(benchmark.latency_groups)} tensor groups:") + for group in benchmark.latency_groups: + logger.info(f"\t{str(group)}") + + # Iteration limit + benchmark.niter = get_number_iterations(benchmark.niter, benchmark.nireq, max(len(info.shapes) for info in app_inputs_info), benchmark.api_type) + + # Set input tensors before first inference + for request in requests: + data_tensors = data_queue.get_next_input() + for port, data_tensor in data_tensors.items(): + input_tensor = request.get_input_tensor(port) + if not static_mode: + input_tensor.shape = data_tensor.shape + if not len(input_tensor.shape): + input_tensor.data.flat[:] = data_tensor.data + else: + input_tensor.data[:] = data_tensor.data + + if statistics: + statistics.add_parameters(StatisticsReport.Category.RUNTIME_CONFIG, + [ + ('topology', topology_name), + ('target device', device_name), + ('API', args.api_type), + ('inference_only', benchmark.inference_only), + ('precision', "UNSPECIFIED"), + ('batch size', str(batch_size)), + ('number of iterations', str(benchmark.niter)), + ('number of parallel infer requests', str(benchmark.nireq)), + ('duration (ms)', str(get_duration_in_milliseconds(benchmark.duration_seconds))), + ]) + + for nstreams in device_number_streams.items(): + statistics.add_parameters(StatisticsReport.Category.RUNTIME_CONFIG, + [ + (f"number of {nstreams[0]} streams", str(nstreams[1])), + ]) + + # ------------------------------------ 10. Measuring performance ----------------------------------------------- + + output_string = process_help_inference_string(benchmark, device_number_streams) + + next_step(additional_info=output_string) + + if benchmark.inference_only: + logger.info("Benchmarking in inference only mode (inputs filling are not included in measurement loop).") + else: + logger.info("Benchmarking in full mode (inputs filling are included in measurement loop).") + duration_ms = f"{benchmark.first_infer(requests):.2f}" + logger.info(f"First inference took {duration_ms} ms") + if statistics: + statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS, + [ + ('first inference time (ms)', duration_ms) + ]) + + pcseq = args.pcseq + if static_mode or len(benchmark.latency_groups) == 1: + pcseq = False + + fps, median_latency_ms, avg_latency_ms, min_latency_ms, max_latency_ms, total_duration_sec, iteration = benchmark.main_loop(requests, data_queue, batch_size, args.latency_percentile, pcseq) + + # ------------------------------------ 11. 
Dumping statistics report ------------------------------------------- + next_step() + + if args.dump_config: + dump_config(args.dump_config, config) + logger.info(f"OpenVINO configuration settings were dumped to {args.dump_config}") + + if args.exec_graph_path: + dump_exec_graph(compiled_model, args.exec_graph_path) + + if perf_counts: + perfs_count_list = [] + for request in requests: + perfs_count_list.append(request.profiling_info) + + if args.perf_counts_sort: + total_sorted_list = print_perf_counters_sort(perfs_count_list,sort_flag=args.perf_counts_sort) + if statistics: + statistics.dump_performance_counters_sorted(total_sorted_list) + + elif args.perf_counts: + print_perf_counters(perfs_count_list) + + if statistics: + # if not args.perf_counts_sort: + statistics.dump_performance_counters(perfs_count_list) + + if statistics: + statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS, + [ + ('total execution time (ms)', f'{get_duration_in_milliseconds(total_duration_sec):.2f}'), + ('total number of iterations', str(iteration)), + ]) + if MULTI_DEVICE_NAME not in device_name: + latency_prefix = None + if args.latency_percentile == 50: + latency_prefix = 'latency (ms)' + elif args.latency_percentile != 50: + latency_prefix = 'latency (' + str(args.latency_percentile) + ' percentile) (ms)' + if latency_prefix: + statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS, + [ + (latency_prefix, f'{median_latency_ms:.2f}'), + ]) + statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS, + [ + ("avg latency", f'{avg_latency_ms:.2f}'), + ]) + statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS, + [ + ("min latency", f'{min_latency_ms:.2f}'), + ]) + statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS, + [ + ("max latency", f'{max_latency_ms:.2f}'), + ]) + if pcseq: + for group in benchmark.latency_groups: + statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS, + [ + ("group", str(group)), + ]) + statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS, + [ + ("avg latency", f'{group.avg:.2f}'), + ]) + statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS, + [ + ("min latency", f'{group.min:.2f}'), + ]) + statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS, + [ + ("max latency", f'{group.max:.2f}'), + ]) + statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS, + [ + ('throughput', f'{fps:.2f}'), + ]) + statistics.dump() + + try: + exeDevice = compiled_model.get_property("EXECUTION_DEVICES") + logger.info(f'Execution Devices:{exeDevice}') + except: + exeDevice = None + logger.info(f'Count: {iteration} iterations') + logger.info(f'Duration: {get_duration_in_milliseconds(total_duration_sec):.2f} ms') + if MULTI_DEVICE_NAME not in device_name: + logger.info('Latency:') + if args.latency_percentile == 50: + logger.info(f' Median: {median_latency_ms:.2f} ms') + elif args.latency_percentile != 50: + logger.info(f' {args.latency_percentile} percentile: {median_latency_ms:.2f} ms') + logger.info(f' Average: {avg_latency_ms:.2f} ms') + logger.info(f' Min: {min_latency_ms:.2f} ms') + logger.info(f' Max: {max_latency_ms:.2f} ms') + + if pcseq: + logger.info("Latency for each data shape group:") + for idx,group in enumerate(benchmark.latency_groups): + logger.info(f"{idx+1}.{str(group)}") + if args.latency_percentile == 50: + logger.info(f' Median: {group.median:.2f} ms') + elif args.latency_percentile != 50: + logger.info(f' {args.latency_percentile} 
percentile: {group.median:.2f} ms') + logger.info(f' Average: {group.avg:.2f} ms') + logger.info(f' Min: {group.min:.2f} ms') + logger.info(f' Max: {group.max:.2f} ms') + + logger.info(f'Throughput: {fps:.2f} FPS') + + del compiled_model + + next_step.step_id = 0 + except Exception as e: + logger.exception(e) + + if statistics: + statistics.add_parameters( + StatisticsReport.Category.EXECUTION_RESULTS, + [('error', str(e))] + ) + statistics.dump() + sys.exit(1)
\ No newline at end of file
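For compiled models, main.patch above swaps the original Core.import_model(path, device, config) call for reading the blob into memory and importing the raw bytes, wrapping any failure in a RuntimeError. A short sketch of that import path, assuming a placeholder blob name and target device:

from openvino.runtime import Core

core = Core()
path_to_model = "compiled_model.bin"  # placeholder compiled blob
try:
    with open(path_to_model, "rb") as model_stream:
        model_bytes = model_stream.read()
    # import_model() takes the blob bytes plus the target device name
    compiled_model = core.import_model(model_bytes, "FPGA")
except Exception as e:
    raise RuntimeError(f"Cannot open or import compiled model file: {path_to_model}. Error: {e}")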
