path: root/python/openvino/runtime/python_demos
author    Eric Dao <eric@erickhangdao.com>  2025-03-10 17:54:31 -0400
committer Eric Dao <eric@erickhangdao.com>  2025-03-10 17:54:31 -0400
commit    ab224e2e6ba65f5a369ec392f99cd8845ad06c98 (patch)
tree      a1e757e9341863ed52b8ad4c5a1c45933aab9da4 /python/openvino/runtime/python_demos
parent    40da1752f2c8639186b72f6838aa415e854d0b1d (diff)
completed thesis (HEAD, master)
Diffstat (limited to 'python/openvino/runtime/python_demos')
-rw-r--r--  python/openvino/runtime/python_demos/OpenVINO_benchmark_app/README.md | 6
-rw-r--r--  python/openvino/runtime/python_demos/OpenVINO_benchmark_app/benchmark.patch | 78
-rw-r--r--  python/openvino/runtime/python_demos/OpenVINO_benchmark_app/benchmark.py | 202
-rw-r--r--  python/openvino/runtime/python_demos/OpenVINO_benchmark_app/benchmark_app.patch | 14
-rw-r--r--  python/openvino/runtime/python_demos/OpenVINO_benchmark_app/benchmark_app.py | 8
-rw-r--r--  python/openvino/runtime/python_demos/OpenVINO_benchmark_app/main.patch | 106
-rw-r--r--  python/openvino/runtime/python_demos/OpenVINO_benchmark_app/main.py | 703
-rw-r--r--  python/openvino/runtime/python_demos/OpenVINO_classification_sample_async/README.md | 6
-rw-r--r--  python/openvino/runtime/python_demos/OpenVINO_classification_sample_async/classification_sample_async.patch | 116
-rwxr-xr-x  python/openvino/runtime/python_demos/OpenVINO_classification_sample_async/classification_sample_async.py | 153
-rw-r--r--  python/openvino/runtime/python_demos/README.md | 184
11 files changed, 1576 insertions, 0 deletions
diff --git a/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/README.md b/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/README.md
new file mode 100644
index 0000000..7613e82
--- /dev/null
+++ b/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/README.md
@@ -0,0 +1,6 @@
+### OpenVINO Benchmark Tool
+---
+
+For detailed information on the OpenVINO Benchmark Tool, please see the [README](https://github.com/openvinotoolkit/openvino/tree/2023.3.0/tools/benchmark_tool) in the OpenVINO repository. Make sure to match the git tag with your installed version of OpenVINO for compatibility.
+
+If you need examples of how to use the Benchmark Tool, check the [README](../README.md) in the parent directory for sample commands.
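As a supplement (not part of the upstream or parent README), here is a hypothetical usage sketch of the patched Benchmark Tool in this directory. The model path, plugins XML, architecture file, and device string are placeholders; it sets the environment variables the patched sources below read and launches the local wrapper:

```python
# Hypothetical sketch: configure the FPGA AI Suite environment variables that the
# patched benchmark (benchmark.patch below) reads, then launch the local wrapper.
import os
import subprocess

env = dict(os.environ,
           DLA_PLUGINS="/path/to/dla_plugins.xml",    # placeholder path
           DLA_ARCH_FILE="/path/to/arch_file.arch")   # placeholder path

subprocess.run(
    ["python3", "benchmark_app.py",
     "-m", "/path/to/model.xml",    # placeholder model
     "-d", "HETERO:FPGA,CPU",       # assumed device string for FPGA with CPU fallback
     "-api", "async",
     "-niter", "10"],
    env=env, check=True)
```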
diff --git a/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/benchmark.patch b/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/benchmark.patch
new file mode 100644
index 0000000..6696804
--- /dev/null
+++ b/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/benchmark.patch
@@ -0,0 +1,78 @@
+--- /nfs/site/disks/swip_dla_1/resources/inference_engine/2023.3.0_with_dev_tools/1/linux64/suse12/python/openvino/tools/benchmark/benchmark.py 2024-03-01 14:01:50.443877000 -0500
++++ benchmark.py 2024-04-01 10:06:18.751566000 -0400
+@@ -1,14 +1,15 @@
+-# Copyright (C) 2018-2023 Intel Corporation
++# Copyright (C) 2018-2022 Intel Corporation
+ # SPDX-License-Identifier: Apache-2.0
+
+ import os
+ from datetime import datetime
+ from math import ceil
++import warnings
+ from openvino.runtime import Core, get_version, AsyncInferQueue
+
+-from .utils.constants import GPU_DEVICE_NAME, XML_EXTENSION, BIN_EXTENSION
+-from .utils.logging import logger
+-from .utils.utils import get_duration_seconds
++from openvino.tools.benchmark.utils.constants import GPU_DEVICE_NAME, XML_EXTENSION, BIN_EXTENSION
++from openvino.tools.benchmark.utils.logging import logger
++from openvino.tools.benchmark.utils.utils import get_duration_seconds
+
+ def percentile(values, percent):
+ return values[ceil(len(values) * percent / 100) - 1]
+@@ -17,7 +18,17 @@
+ def __init__(self, device: str, number_infer_requests: int = 0, number_iterations: int = None,
+ duration_seconds: int = None, api_type: str = 'async', inference_only = None):
+ self.device = device
+- self.core = Core()
++ dla_plugins = os.environ.get('DLA_PLUGINS', default='')
++ if dla_plugins == '':
++ # Backwards compatibility for old DLA_PLUGINS_XML_FILE
++ warnings.warn("DLA_PLUGINS_XML_FILE option is deprecated as of 2024.1, please use DLA_PLUGINS")
++ dla_plugins = os.environ.get('DLA_PLUGINS_XML_FILE', default='')
++ self.core = Core(dla_plugins)
++ if "FPGA" in self.device:
++ dla_arch_file = os.environ.get('DLA_ARCH_FILE')
++ if dla_arch_file is None:
++ raise Exception(f"To use FPGA, you need to specify the path to an arch_file!")
++ self.core.set_property(device_name="FPGA", properties={"ARCH_PATH": dla_arch_file})
+ self.nireq = number_infer_requests if api_type == 'async' else 1
+ self.niter = number_iterations
+ self.duration_seconds = get_duration_seconds(duration_seconds, self.niter, self.device)
+@@ -59,6 +70,9 @@
+ def set_cache_dir(self, cache_dir: str):
+ self.core.set_property({'CACHE_DIR': cache_dir})
+
++ def set_allow_auto_batching(self, flag: bool):
++ self.core.set_property({'ALLOW_AUTO_BATCHING': flag})
++
+ def read_model(self, path_to_model: str):
+ model_filename = os.path.abspath(path_to_model)
+ head, ext = os.path.splitext(model_filename)
+@@ -110,7 +124,7 @@
+ (self.duration_seconds and exec_time < self.duration_seconds) or \
+ (iteration % self.nireq):
+ idle_id = infer_queue.get_idle_request_id()
+- if idle_id in in_fly:
+ if idle_id in in_fly: # Is this check necessary?
+ times.append(infer_queue[idle_id].latency)
+ else:
+ in_fly.add(idle_id)
+@@ -162,7 +176,6 @@
+ def main_loop(self, requests, data_queue, batch_size, latency_percentile, pcseq):
+ if self.api_type == 'sync':
+ times, total_duration_sec, iteration = self.sync_inference(requests[0], data_queue)
+- fps = len(batch_size) * iteration / total_duration_sec
+ elif self.inference_only:
+ times, total_duration_sec, iteration = self.async_inference_only(requests)
+ fps = len(batch_size) * iteration / total_duration_sec
+@@ -175,6 +188,9 @@
+ min_latency_ms = times[0]
+ max_latency_ms = times[-1]
+
++ if self.api_type == 'sync':
++ fps = len(batch_size) * 1000 / median_latency_ms
++
+ if pcseq:
+ for group in self.latency_groups:
+ if group.times:
diff --git a/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/benchmark.py b/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/benchmark.py
new file mode 100644
index 0000000..a98b82a
--- /dev/null
+++ b/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/benchmark.py
@@ -0,0 +1,202 @@
+# Copyright (C) 2018-2022 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+from datetime import datetime
+from math import ceil
+import warnings
+from openvino.runtime import Core, get_version, AsyncInferQueue
+
+from openvino.tools.benchmark.utils.constants import GPU_DEVICE_NAME, XML_EXTENSION, BIN_EXTENSION
+from openvino.tools.benchmark.utils.logging import logger
+from openvino.tools.benchmark.utils.utils import get_duration_seconds
+
+def percentile(values, percent):
+ return values[ceil(len(values) * percent / 100) - 1]
+
+class Benchmark:
+ def __init__(self, device: str, number_infer_requests: int = 0, number_iterations: int = None,
+ duration_seconds: int = None, api_type: str = 'async', inference_only = None):
+ self.device = device
+ dla_plugins = os.environ.get('DLA_PLUGINS', default='')
+ if dla_plugins == '':
+ # Backwards compatibility for old DLA_PLUGINS_XML_FILE
+ warnings.warn("DLA_PLUGINS_XML_FILE option is deprecated as of 2024.1, please use DLA_PLUGINS")
+ dla_plugins = os.environ.get('DLA_PLUGINS_XML_FILE', default='')
+ self.core = Core(dla_plugins)
+ if "FPGA" in self.device:
+ dla_arch_file = os.environ.get('DLA_ARCH_FILE')
+ if dla_arch_file is None:
+ raise Exception(f"To use FPGA, you need to specify the path to an arch_file!")
+ self.core.set_property(device_name="FPGA", properties={"ARCH_PATH": dla_arch_file})
+ self.nireq = number_infer_requests if api_type == 'async' else 1
+ self.niter = number_iterations
+ self.duration_seconds = get_duration_seconds(duration_seconds, self.niter, self.device)
+ self.api_type = api_type
+ self.inference_only = inference_only
+ self.latency_groups = []
+
+ def __del__(self):
+ del self.core
+
+ def add_extension(self, path_to_extensions: str=None, path_to_cldnn_config: str=None):
+ if path_to_cldnn_config:
+ self.core.set_property(GPU_DEVICE_NAME, {'CONFIG_FILE': path_to_cldnn_config})
+ logger.info(f'GPU extensions is loaded {path_to_cldnn_config}')
+
+ if path_to_extensions:
+ for extension in path_to_extensions.split(","):
+ logger.info(f"Loading extension {extension}")
+ self.core.add_extension(extension)
+
+ def print_version_info(self) -> None:
+ version = get_version()
+ logger.info('OpenVINO:')
+ logger.info(f"{'Build ':.<39} {version}")
+ logger.info("")
+
+ logger.info("Device info:")
+ for device, version in self.core.get_versions(self.device).items():
+ logger.info(f"{device}")
+ logger.info(f"{'Build ':.<39} {version.build_number}")
+
+ logger.info("")
+ logger.info("")
+
+ def set_config(self, config = {}):
+ for device in config.keys():
+ self.core.set_property(device, config[device])
+
+ def set_cache_dir(self, cache_dir: str):
+ self.core.set_property({'CACHE_DIR': cache_dir})
+
+ def set_allow_auto_batching(self, flag: bool):
+ self.core.set_property({'ALLOW_AUTO_BATCHING': flag})
+
+ def read_model(self, path_to_model: str):
+ model_filename = os.path.abspath(path_to_model)
+ head, ext = os.path.splitext(model_filename)
+ weights_filename = os.path.abspath(head + BIN_EXTENSION) if ext == XML_EXTENSION else ""
+ return self.core.read_model(model_filename, weights_filename)
+
+ def create_infer_requests(self, compiled_model):
+ if self.api_type == 'sync':
+ requests = [compiled_model.create_infer_request()]
+ else:
+ requests = AsyncInferQueue(compiled_model, self.nireq)
+ self.nireq = len(requests)
+ return requests
+
+ def first_infer(self, requests):
+ if self.api_type == 'sync':
+ requests[0].infer()
+ return requests[0].latency
+ else:
+ id = requests.get_idle_request_id()
+ requests.start_async()
+ requests.wait_all()
+ return requests[id].latency
+
+ def sync_inference(self, request, data_queue):
+ exec_time = 0
+ iteration = 0
+ times = []
+ start_time = datetime.utcnow()
+ while (self.niter and iteration < self.niter) or \
+ (self.duration_seconds and exec_time < self.duration_seconds):
+ if self.inference_only == False:
+ request.set_input_tensors(data_queue.get_next_input())
+ request.infer()
+ times.append(request.latency)
+ iteration += 1
+
+ exec_time = (datetime.utcnow() - start_time).total_seconds()
+ total_duration_sec = (datetime.utcnow() - start_time).total_seconds()
+ return sorted(times), total_duration_sec, iteration
+
+ def async_inference_only(self, infer_queue):
+ exec_time = 0
+ iteration = 0
+ times = []
+ in_fly = set()
+ start_time = datetime.utcnow()
+ while (self.niter and iteration < self.niter) or \
+ (self.duration_seconds and exec_time < self.duration_seconds) or \
+ (iteration % self.nireq):
+ idle_id = infer_queue.get_idle_request_id()
+ if idle_id in in_fly: # Is this check necessary?
+ times.append(infer_queue[idle_id].latency)
+ else:
+ in_fly.add(idle_id)
+ infer_queue.start_async()
+ iteration += 1
+
+ exec_time = (datetime.utcnow() - start_time).total_seconds()
+ infer_queue.wait_all()
+ total_duration_sec = (datetime.utcnow() - start_time).total_seconds()
+ for infer_request_id in in_fly:
+ times.append(infer_queue[infer_request_id].latency)
+ return sorted(times), total_duration_sec, iteration
+
+ def async_inference_full_mode(self, infer_queue, data_queue, pcseq):
+ processed_frames = 0
+ exec_time = 0
+ iteration = 0
+ times = []
+ num_groups = len(self.latency_groups)
+ start_time = datetime.utcnow()
+ in_fly = set()
+ while (self.niter and iteration < self.niter) or \
+ (self.duration_seconds and exec_time < self.duration_seconds) or \
+ (iteration % num_groups):
+ processed_frames += data_queue.get_next_batch_size()
+ idle_id = infer_queue.get_idle_request_id()
+ if idle_id in in_fly:
+ times.append(infer_queue[idle_id].latency)
+ if pcseq:
+ self.latency_groups[infer_queue.userdata[idle_id]].times.append(infer_queue[idle_id].latency)
+ else:
+ in_fly.add(idle_id)
+ group_id = data_queue.current_group_id
+ infer_queue[idle_id].set_input_tensors(data_queue.get_next_input())
+ infer_queue.start_async(userdata=group_id)
+ iteration += 1
+
+ exec_time = (datetime.utcnow() - start_time).total_seconds()
+ infer_queue.wait_all()
+ total_duration_sec = (datetime.utcnow() - start_time).total_seconds()
+
+ for infer_request_id in in_fly:
+ times.append(infer_queue[infer_request_id].latency)
+ if pcseq:
+ self.latency_groups[infer_queue.userdata[infer_request_id]].times.append(infer_queue[infer_request_id].latency)
+
+ return sorted(times), total_duration_sec, processed_frames, iteration
+
+ def main_loop(self, requests, data_queue, batch_size, latency_percentile, pcseq):
+ if self.api_type == 'sync':
+ times, total_duration_sec, iteration = self.sync_inference(requests[0], data_queue)
+ elif self.inference_only:
+ times, total_duration_sec, iteration = self.async_inference_only(requests)
+ fps = len(batch_size) * iteration / total_duration_sec
+ else:
+ times, total_duration_sec, processed_frames, iteration = self.async_inference_full_mode(requests, data_queue, pcseq)
+ fps = processed_frames / total_duration_sec
+
+ median_latency_ms = percentile(times, latency_percentile)
+ avg_latency_ms = sum(times) / len(times)
+ min_latency_ms = times[0]
+ max_latency_ms = times[-1]
+
+ if self.api_type == 'sync':
+ fps = len(batch_size) * 1000 / median_latency_ms
+
+ if pcseq:
+ for group in self.latency_groups:
+ if group.times:
+ group.times.sort()
+ group.median = percentile(group.times, latency_percentile)
+ group.avg = sum(group.times) / len(group.times)
+ group.min = group.times[0]
+ group.max = group.times[-1]
+ return fps, median_latency_ms, avg_latency_ms, min_latency_ms, max_latency_ms, total_duration_sec, iteration
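The Benchmark class above wires the DLA plugins file and, for FPGA targets, the architecture file into the OpenVINO Core through environment variables (see its constructor). A minimal sketch of driving this patched class directly; the environment values and model path are placeholders, not taken from the original sources:

```python
# Illustrative only: exercise the patched Benchmark class defined above.
import os
from benchmark import Benchmark  # the patched module in this directory

os.environ["DLA_PLUGINS"] = "/path/to/dla_plugins.xml"     # placeholder path
os.environ["DLA_ARCH_FILE"] = "/path/to/arch_file.arch"    # placeholder path

bench = Benchmark(device="FPGA", number_infer_requests=1,
                  number_iterations=10, api_type="async")
model = bench.read_model("/path/to/model.xml")             # placeholder model
compiled_model = bench.core.compile_model(model, bench.device)
requests = bench.create_infer_requests(compiled_model)
print(f"First inference latency: {bench.first_infer(requests):.2f} ms")
```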
diff --git a/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/benchmark_app.patch b/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/benchmark_app.patch
new file mode 100644
index 0000000..4a003ad
--- /dev/null
+++ b/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/benchmark_app.patch
@@ -0,0 +1,14 @@
+--- /p/psg/swip/dla/resources/inference_engine/2022.3.0/centos7/openvino_2022/openvino_env/bin/benchmark_app 2023-02-07 15:01:24.336634000 -0500
++++ benchmark_app.py 2023-05-03 12:01:20.435826000 -0400
+@@ -1,8 +1,8 @@
+-#!/nfs/site/disks/swip_dla_1/resources/inference_engine/2022.3.0/centos7/openvino_2022/openvino_env/bin/python
++#!/usr/bin/python3
+ # -*- coding: utf-8 -*-
+ import re
+ import sys
+-from openvino.tools.benchmark.main import main
++import main
+ if __name__ == '__main__':
+ sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
+- sys.exit(main())
++ sys.exit(main.main())
diff --git a/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/benchmark_app.py b/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/benchmark_app.py
new file mode 100644
index 0000000..d5b9c9a
--- /dev/null
+++ b/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/benchmark_app.py
@@ -0,0 +1,8 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+import re
+import sys
+import main
+if __name__ == '__main__':
+ sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
+ sys.exit(main.main())
diff --git a/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/main.patch b/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/main.patch
new file mode 100644
index 0000000..99afb40
--- /dev/null
+++ b/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/main.patch
@@ -0,0 +1,106 @@
+--- /nfs/site/disks/swip_dla_1/resources/inference_engine/2023.3.0_with_dev_tools/1/linux64/suse12/python/openvino/tools/benchmark/main.py 2024-03-01 14:01:50.466871000 -0500
++++ main.py 2024-10-29 11:10:06.569928000 -0400
+@@ -7,11 +7,11 @@
+
+ from openvino.runtime import Dimension,properties
+
+-from openvino.tools.benchmark.benchmark import Benchmark
++import benchmark as openvino_benchmark
+ from openvino.tools.benchmark.parameters import parse_args
+ from openvino.tools.benchmark.utils.constants import MULTI_DEVICE_NAME, \
+ CPU_DEVICE_NAME, GPU_DEVICE_NAME, \
+- BLOB_EXTENSION, AUTO_DEVICE_NAME
++ BIN_EXTENSION, AUTO_DEVICE_NAME
+ from openvino.tools.benchmark.utils.inputs_filling import get_input_data
+ from openvino.tools.benchmark.utils.logging import logger
+ from openvino.tools.benchmark.utils.utils import next_step, get_number_iterations, pre_post_processing, \
+@@ -41,13 +41,13 @@
+ if args.report_type == "average_counters" and MULTI_DEVICE_NAME in args.target_device:
+ raise Exception("only detailed_counters report type is supported for MULTI device")
+
+- _, ext = os.path.splitext(args.path_to_model)
+- is_network_compiled = True if ext == BLOB_EXTENSION else False
+- is_precisiton_set = not (args.input_precision == "" and args.output_precision == "" and args.input_output_precision == "")
++ if args.number_infer_requests != 1 and "FPGA" in args.target_device:
++ logger.warning(f"If the target FPGA design uses JTAG to access the CSRs on the FPGA AI Suite IP "\
++ "(e.g. the Agilex 5E Premium Development Kit JTAG Design Example), "\
++ "then the number of inference request must be 1.")
+
+- if is_network_compiled and is_precisiton_set:
+- raise Exception("Cannot set precision for a compiled model. " \
+- "Please re-compile your model with required precision.")
++ _, ext = os.path.splitext(args.path_to_model)
++ is_network_compiled = True if ext == BIN_EXTENSION else False
+
+ return args, is_network_compiled
+
+@@ -84,7 +84,7 @@
+ # ------------------------------ 2. Loading OpenVINO Runtime -------------------------------------------
+ next_step(step_id=2)
+
+- benchmark = Benchmark(args.target_device, args.number_infer_requests,
++ benchmark = openvino_benchmark.Benchmark(args.target_device, args.number_infer_requests,
+ args.number_iterations, args.time, args.api_type, args.inference_only)
+
+ if args.extensions:
+@@ -166,8 +166,11 @@
+ supported_properties = benchmark.core.get_property(device, properties.supported_properties())
+ if device not in config.keys():
+ config[device] = {}
+-
+ ## high-level performance modes
++ # The original OV 2022.3 Python API fails with the pc flag, so we comment it out
++ # for both the HETERO and FPGA devices in our patched version of the Python demos
++ if device in ['HETERO', 'FPGA']:
++ continue
+ set_performance_hint(device)
+
+ if is_flag_set_in_command_line('nireq'):
+@@ -429,16 +432,21 @@
+ next_step()
+
+ start_time = datetime.utcnow()
+- compiled_model = benchmark.core.import_model(args.path_to_model, benchmark.device, device_config)
+- duration_ms = f"{(datetime.utcnow() - start_time).total_seconds() * 1000:.2f}"
+- logger.info(f"Import model took {duration_ms} ms")
+- if statistics:
+- statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS,
+- [
+- ('import model time (ms)', duration_ms)
+- ])
+- app_inputs_info, _ = get_inputs_info(args.shape, args.data_shape, args.layout, args.batch_size, args.scale_values, args.mean_values, compiled_model.inputs)
+- batch_size = get_network_batch_size(app_inputs_info)
++ try:
++ with open(args.path_to_model, "rb") as model_stream:
++ model_bytes = model_stream.read()
++ compiled_model = benchmark.core.import_model(model_bytes, device_name)
++ duration_ms = f"{(datetime.utcnow() - start_time).total_seconds() * 1000:.2f}"
++ logger.info(f"Import model took {duration_ms} ms")
++ if statistics:
++ statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS,
++ [
++ ('import model time (ms)', duration_ms)
++ ])
++ app_inputs_info, _ = get_inputs_info(args.shape, args.data_shape, args.layout, args.batch_size, args.scale_values, args.mean_values, compiled_model.inputs)
++ batch_size = get_network_batch_size(app_inputs_info)
++ except Exception as e:
++ raise RuntimeError(f"Cannot open or import compiled model file: {args.path_to_model}. Error: {str(e)}")
+
+ # --------------------- 8. Querying optimal runtime parameters --------------------------------------------------
+ next_step()
+@@ -653,7 +661,7 @@
+ exeDevice = compiled_model.get_property("EXECUTION_DEVICES")
+ logger.info(f'Execution Devices:{exeDevice}')
+ except:
+- pass
++ exeDevice = None
+ logger.info(f'Count: {iteration} iterations')
+ logger.info(f'Duration: {get_duration_in_milliseconds(total_duration_sec):.2f} ms')
+ if MULTI_DEVICE_NAME not in device_name:
+@@ -692,4 +700,4 @@
+ [('error', str(e))]
+ )
+ statistics.dump()
+- sys.exit(1)
++ sys.exit(1)
+\ No newline at end of file
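For pre-compiled models, the hunk above replaces the upstream import_model(path, ...) call with an explicit read of the model bytes followed by a bytes-based import. A reduced sketch of that flow, assuming a Core constructed the same way as in benchmark.patch; the blob path and device name are placeholders:

```python
# Reduced sketch of the patched import path shown above (paths are placeholders).
import os
from openvino.runtime import Core

core = Core(os.environ.get("DLA_PLUGINS", ""))            # plugins XML, as in benchmark.patch
with open("/path/to/compiled_model.bin", "rb") as model_stream:
    model_bytes = model_stream.read()
compiled_model = core.import_model(model_bytes, "FPGA")   # device name is a placeholder
```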
diff --git a/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/main.py b/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/main.py
new file mode 100644
index 0000000..e11daec
--- /dev/null
+++ b/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/main.py
@@ -0,0 +1,703 @@
+# Copyright (C) 2018-2023 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+import sys
+from datetime import datetime
+
+from openvino.runtime import Dimension,properties
+
+import benchmark as openvino_benchmark
+from openvino.tools.benchmark.parameters import parse_args
+from openvino.tools.benchmark.utils.constants import MULTI_DEVICE_NAME, \
+ CPU_DEVICE_NAME, GPU_DEVICE_NAME, \
+ BIN_EXTENSION, AUTO_DEVICE_NAME
+from openvino.tools.benchmark.utils.inputs_filling import get_input_data
+from openvino.tools.benchmark.utils.logging import logger
+from openvino.tools.benchmark.utils.utils import next_step, get_number_iterations, pre_post_processing, \
+ process_help_inference_string, print_perf_counters, print_perf_counters_sort, dump_exec_graph, get_duration_in_milliseconds, \
+ get_command_line_arguments, parse_value_per_device, parse_devices, get_inputs_info, \
+ print_inputs_and_outputs_info, get_network_batch_size, load_config, dump_config, get_latency_groups, \
+ check_for_static, can_measure_as_static, parse_value_for_virtual_device, is_virtual_device, is_virtual_device_found
+from openvino.tools.benchmark.utils.statistics_report import StatisticsReport, JsonStatisticsReport, CsvStatisticsReport, \
+ averageCntReport, detailedCntReport
+
+def parse_and_check_command_line():
+ def arg_not_empty(arg_value,empty_value):
+ return not arg_value is None and not arg_value == empty_value
+
+ parser = parse_args()
+ args = parser.parse_args()
+
+ if args.latency_percentile < 1 or args.latency_percentile > 100:
+ parser.print_help()
+ raise RuntimeError("The percentile value is incorrect. The applicable values range is [1, 100].")
+
+ if not args.perf_hint == "none" and (arg_not_empty(args.number_streams, "") or arg_not_empty(args.number_threads, 0) or arg_not_empty(args.infer_threads_pinning, "")):
+ raise Exception("-nstreams, -nthreads and -pin options are fine tune options. To use them you " \
+ "should explicitely set -hint option to none. This is not OpenVINO limitation " \
+ "(those options can be used in OpenVINO together), but a benchmark_app UI rule.")
+
+ if args.report_type == "average_counters" and MULTI_DEVICE_NAME in args.target_device:
+ raise Exception("only detailed_counters report type is supported for MULTI device")
+
+ if args.number_infer_requests != 1 and "FPGA" in args.target_device:
+ logger.warning(f"If the target FPGA design uses JTAG to access the CSRs on the FPGA AI Suite IP "\
+ "(e.g. the Agilex 5E Premium Development Kit JTAG Design Example), "\
+ "then the number of inference request must be 1.")
+
+ _, ext = os.path.splitext(args.path_to_model)
+ is_network_compiled = True if ext == BIN_EXTENSION else False
+
+ return args, is_network_compiled
+
+def main():
+ statistics = None
+ try:
+ # ------------------------------ 1. Parsing and validating input arguments ------------------------------
+ next_step()
+ logger.info("Parsing input parameters")
+ args, is_network_compiled = parse_and_check_command_line()
+
+ command_line_arguments = get_command_line_arguments(sys.argv)
+ if args.report_type:
+ _statistics_class = JsonStatisticsReport if args.json_stats else CsvStatisticsReport
+ statistics = _statistics_class(StatisticsReport.Config(args.report_type, args.report_folder))
+ statistics.add_parameters(StatisticsReport.Category.COMMAND_LINE_PARAMETERS, command_line_arguments)
+
+ def is_flag_set_in_command_line(flag):
+ return any(x.strip('-') == flag for x, y in command_line_arguments)
+
+ device_name = args.target_device
+
+ devices = parse_devices(device_name)
+ device_number_streams = parse_value_per_device(devices, args.number_streams, "nstreams")
+ device_infer_precision = parse_value_per_device(devices, args.infer_precision, "infer_precision")
+
+ config = {}
+ if args.load_config:
+ load_config(args.load_config, config)
+
+ if is_network_compiled:
+ logger.info("Model is compiled")
+
+ # ------------------------------ 2. Loading OpenVINO Runtime -------------------------------------------
+ next_step(step_id=2)
+
+ benchmark = openvino_benchmark.Benchmark(args.target_device, args.number_infer_requests,
+ args.number_iterations, args.time, args.api_type, args.inference_only)
+
+ if args.extensions:
+ benchmark.add_extension(path_to_extensions=args.extensions)
+
+ ## GPU (clDNN) Extensions
+ if GPU_DEVICE_NAME in device_name and args.path_to_cldnn_config:
+ if GPU_DEVICE_NAME not in config.keys():
+ config[GPU_DEVICE_NAME] = {}
+ config[GPU_DEVICE_NAME]['CONFIG_FILE'] = args.path_to_cldnn_config
+
+ if GPU_DEVICE_NAME in config.keys() and 'CONFIG_FILE' in config[GPU_DEVICE_NAME].keys():
+ cldnn_config = config[GPU_DEVICE_NAME]['CONFIG_FILE']
+ benchmark.add_extension(path_to_cldnn_config=cldnn_config)
+
+ benchmark.print_version_info()
+
+ # --------------------- 3. Setting device configuration --------------------------------------------------------
+ next_step()
+
+ def set_performance_hint(device):
+ perf_hint = properties.hint.PerformanceMode.UNDEFINED
+ supported_properties = benchmark.core.get_property(device, properties.supported_properties())
+ if properties.hint.performance_mode() in supported_properties:
+ if is_flag_set_in_command_line('hint'):
+ if args.perf_hint == "throughput" or args.perf_hint == "tput":
+ perf_hint = properties.hint.PerformanceMode.THROUGHPUT
+ elif args.perf_hint == "latency":
+ perf_hint = properties.hint.PerformanceMode.LATENCY
+ elif args.perf_hint == "cumulative_throughput" or args.perf_hint == "ctput":
+ perf_hint = properties.hint.PerformanceMode.CUMULATIVE_THROUGHPUT
+ elif args.perf_hint=='none':
+ perf_hint = properties.hint.PerformanceMode.UNDEFINED
+ else:
+ raise RuntimeError("Incorrect performance hint. Please set -hint option to"
+ "`throughput`(tput), `latency', 'cumulative_throughput'(ctput) value or 'none'.")
+ else:
+ perf_hint = properties.hint.PerformanceMode.THROUGHPUT if benchmark.api_type == "async" else properties.hint.PerformanceMode.LATENCY
+ logger.warning(f"Performance hint was not explicitly specified in command line. " +
+ f"Device({device}) performance hint will be set to {perf_hint}.")
+ if perf_hint != properties.hint.PerformanceMode.UNDEFINED:
+ config[device][properties.hint.performance_mode()] = perf_hint
+ else:
+ logger.warning(f"Device {device} does not support performance hint property(-hint).")
+
+
+ def get_device_type_from_name(name) :
+ new_name = str(name)
+ new_name = new_name.split(".", 1)[0]
+ new_name = new_name.split("(", 1)[0]
+ return new_name
+
+ ## Set default values from dumped config
+ default_devices = set()
+ for device in devices:
+ device_type = get_device_type_from_name(device)
+ if device_type in config and device not in config:
+ config[device] = config[device_type].copy()
+ default_devices.add(device_type)
+
+ for def_device in default_devices:
+ config.pop(def_device)
+
+ perf_counts = False
+ # check if using the virtual device
+ hw_devices_list = devices.copy()
+ # Remove the hardware devices if AUTO/MULTI/HETERO appears in the devices list.
+ is_virtual = is_virtual_device_found(devices)
+ if is_virtual:
+ devices.clear()
+ # Parse out the currect virtual device as the target device.
+ virtual_device = device_name.partition(":")[0]
+ hw_devices_list.remove(virtual_device)
+ devices.append(virtual_device)
+ parse_value_for_virtual_device(virtual_device, device_number_streams)
+ parse_value_for_virtual_device(virtual_device, device_infer_precision)
+
+ for device in devices:
+ supported_properties = benchmark.core.get_property(device, properties.supported_properties())
+ if device not in config.keys():
+ config[device] = {}
+ ## high-level performance modes
+ # The original OV 2022.3 Python API fails with the pc flag, so we comment it out
+ # for both the HETERO and FPGA devices in our patched version of the Python demos
+ if device in ['HETERO', 'FPGA']:
+ continue
+ set_performance_hint(device)
+
+ if is_flag_set_in_command_line('nireq'):
+ config[device][properties.hint.num_requests()] = str(args.number_infer_requests)
+
+ ## Set performance counter
+ if is_flag_set_in_command_line('pc'):
+ ## set to user defined value
+ config[device][properties.enable_profiling()] = True if args.perf_counts else False
+ elif properties.enable_profiling() in config[device].keys() and config[device][properties.enable_profiling()] == True:
+ logger.warning(f"Performance counters for {device} device is turned on. " +
+ "To print results use -pc option.")
+ elif args.report_type in [ averageCntReport, detailedCntReport ]:
+ logger.warning(f"Turn on performance counters for {device} device " +
+ f"since report type is {args.report_type}.")
+ config[device][properties.enable_profiling()] = True
+ elif args.exec_graph_path is not None:
+ logger.warning(f"Turn on performance counters for {device} device " +
+ "due to execution graph dumping.")
+ config[device][properties.enable_profiling()] = True
+ elif is_flag_set_in_command_line('pcsort'):
+ ## set to default value
+ logger.warning(f"Turn on performance counters for {device} device " +
+ f"since pcsort value is {args.perf_counts_sort}.")
+ config[device][properties.enable_profiling()] = True if args.perf_counts_sort else False
+ else:
+ ## set to default value
+ config[device][properties.enable_profiling()] = args.perf_counts
+ perf_counts = True if config[device][properties.enable_profiling()] == True else perf_counts
+
+ ## insert or append property into hw device properties list
+ def update_configs(hw_device, property_name, property_value):
+ (key, value) = properties.device.properties({hw_device:{property_name:property_value}})
+ # add property into hw device properties list.
+ if key not in config[device].keys():
+ config[device][key] = value
+ else:
+ current_config = config[device][key].get()
+ if hw_device not in current_config.keys():
+ current_config.update(value.get())
+ else:
+ current_device_config = current_config[hw_device]
+ for prop in value.get().items():
+ current_device_config.update(prop[1])
+ current_config[hw_device].update(current_device_config)
+ config[device][key].set(current_config)
+
+ def update_device_config_for_virtual_device(value, config, key):
+ # check if the element contains the hardware device property
+ if len(value.split(':')) == 1:
+ config[device][key] = device_infer_precision[device]
+ else:
+ # set device nstreams properties in the AUTO/MULTI plugin
+ value_vec = value[value.find('{') + 1:value.rfind('}')].split(',')
+ device_properties = {value_vec[i].split(':')[0] : value_vec[i].split(':')[1] for i in range(0, len(value_vec))}
+ for hw_device in device_properties.keys():
+ update_configs(hw_device, key, device_properties[hw_device])
+
+ ## infer precision
+ def set_infer_precision():
+ key = properties.hint.inference_precision()
+ if device in device_infer_precision.keys():
+ ## set to user defined value
+ if key in supported_properties:
+ config[device][key] = device_infer_precision[device]
+ elif is_virtual_device(device):
+ update_device_config_for_virtual_device(device_infer_precision[device], config, key)
+ else:
+ raise Exception(f"Device {device} doesn't support config key INFERENCE_PRECISION_HINT!" \
+ " Please specify -infer_precision for correct devices in format" \
+ " <dev1>:<infer_precision1>,<dev2>:<infer_precision2> or via configuration file.")
+ return
+
+ ## the rest are individual per-device settings (overriding the values the device will deduce from perf hint)
+ def set_throughput_streams():
+ key = get_device_type_from_name(device) + "_THROUGHPUT_STREAMS"
+ if device in device_number_streams.keys():
+ ## set to user defined value
+ if key in supported_properties:
+ config[device][key] = device_number_streams[device]
+ elif properties.streams.num() in supported_properties:
+ key = properties.streams.num()
+ config[device][key] = device_number_streams[device]
+ elif is_virtual_device(device):
+ key = properties.streams.num()
+ update_device_config_for_virtual_device(device_number_streams[device], config, key)
+ else:
+ raise Exception(f"Device {device} doesn't support config key '{key}'! " +
+ "Please specify -nstreams for correct devices in format <dev1>:<nstreams1>,<dev2>:<nstreams2>")
+ elif key not in config[device].keys() and args.api_type == "async" and key not in config[device].keys() \
+ and 'PERFORMANCE_HINT' in config[device].keys() and config[device]['PERFORMANCE_HINT'] == '':
+ ## set the _AUTO value for the #streams
+ logger.warning(f"-nstreams default value is determined automatically for {device} device. " +
+ "Although the automatic selection usually provides a reasonable performance, "
+ "but it still may be non-optimal for some cases, for more information look at README.")
+ if key in supported_properties:
+ config[device][key] = get_device_type_from_name(device) + "_THROUGHPUT_AUTO"
+ elif properties.streams.Num() in supported_properties:
+ key = properties.streams.Num()
+ config[device][key] = "-1" # Set AUTO mode for streams number
+ elif is_virtual_device(device):
+ # Set nstreams to default value auto if no nstreams specified from cmd line.
+ for hw_device in hw_devices_list:
+ hw_supported_properties = benchmark.core.get_property(hw_device, properties.supported_properties())
+ key = get_device_type_from_name(hw_device) + "_THROUGHPUT_STREAMS"
+ value = get_device_type_from_name(hw_device) + "_THROUGHPUT_AUTO"
+ if key not in hw_supported_properties:
+ key = properties.streams.Num()
+ value = properties.streams.Num.AUTO
+ if key in hw_supported_properties:
+ update_configs(hw_device, key, value)
+ if key in config[device].keys():
+ device_number_streams[device] = config[device][key]
+ return
+
+ def set_nthreads_pin(property_name, property_value):
+ if property_name == properties.affinity():
+ if property_value == "YES":
+ property_value = properties.Affinity.CORE
+ elif property_value == "NO":
+ property_value = properties.Affinity.NONE
+ if property_name in supported_properties or device_name == AUTO_DEVICE_NAME:
+ # create nthreads/pin primary property for HW device or AUTO if -d is AUTO directly.
+ config[device][property_name] = property_value
+ elif is_virtual:
+ # Create secondary property of -nthreads/-pin only for CPU if CPU device appears in the devices
+ # list specified by -d.
+ if CPU_DEVICE_NAME in hw_devices_list:
+ update_configs(CPU_DEVICE_NAME, property_name, property_value)
+ return
+
+ if args.number_threads and is_flag_set_in_command_line("nthreads"):
+ # limit threading for CPU portion of inference
+ set_nthreads_pin(properties.inference_num_threads(), str(args.number_threads))
+
+ if is_flag_set_in_command_line('pin'):
+ ## set for CPU to user defined value
+ set_nthreads_pin(properties.affinity(), args.infer_threads_pinning)
+
+ set_throughput_streams()
+ set_infer_precision()
+
+ if is_virtual_device(device):
+ if device in device_number_streams.keys():
+ del device_number_streams[device]
+
+ device_config = {}
+ for device in config:
+ if benchmark.device.find(device) == 0:
+ device_config = config[device]
+ if args.cache_dir:
+ benchmark.set_cache_dir(args.cache_dir)
+
+ ## If set batch size, disable the auto batching
+ if args.batch_size:
+ logger.warning("Batch size is set. Auto batching will be disabled")
+ device_config["ALLOW_AUTO_BATCHING"] = False
+
+ topology_name = ""
+ load_from_file_enabled = is_flag_set_in_command_line('load_from_file') or is_flag_set_in_command_line('lfile')
+ if load_from_file_enabled and not is_network_compiled:
+ if args.mean_values or args.scale_values:
+ raise RuntimeError("--mean_values and --scale_values aren't supported with --load_from_file. "
+ "The values can be set via model_optimizer while generating xml")
+ next_step()
+ print("Skipping the step for loading model from file")
+ next_step()
+ print("Skipping the step for loading model from file")
+ next_step()
+ print("Skipping the step for loading model from file")
+
+ # --------------------- 7. Loading the model to the device -------------------------------------------------
+ next_step()
+
+ start_time = datetime.utcnow()
+ compiled_model = benchmark.core.compile_model(args.path_to_model, benchmark.device, device_config)
+ duration_ms = f"{(datetime.utcnow() - start_time).total_seconds() * 1000:.2f}"
+ logger.info(f"Compile model took {duration_ms} ms")
+ if statistics:
+ statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS,
+ [
+ ('compile model time (ms)', duration_ms)
+ ])
+ app_inputs_info, _ = get_inputs_info(args.shape, args.data_shape, args.layout, args.batch_size, args.scale_values, args.mean_values, compiled_model.inputs)
+ batch_size = get_network_batch_size(app_inputs_info)
+ elif not is_network_compiled:
+ # --------------------- 4. Read the Intermediate Representation of the network -----------------------------
+ next_step()
+
+ logger.info("Loading model files")
+
+ start_time = datetime.utcnow()
+ model = benchmark.read_model(args.path_to_model)
+ topology_name = model.get_name()
+ duration_ms = f"{(datetime.utcnow() - start_time).total_seconds() * 1000:.2f}"
+ logger.info(f"Read model took {duration_ms} ms")
+ logger.info("Original model I/O parameters:")
+ print_inputs_and_outputs_info(model)
+
+ if statistics:
+ statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS,
+ [
+ ('read model time (ms)', duration_ms)
+ ])
+
+ # --------------------- 5. Resizing network to match image sizes and given batch ---------------------------
+ next_step()
+
+ app_inputs_info, reshape = get_inputs_info(args.shape, args.data_shape, args.layout, args.batch_size, args.scale_values, args.mean_values, model.inputs)
+
+ # use batch size according to provided layout and shapes
+ batch_size = get_network_batch_size(app_inputs_info)
+ logger.info(f'Model batch size: {batch_size}')
+
+ if reshape:
+ start_time = datetime.utcnow()
+ shapes = { info.name : info.partial_shape for info in app_inputs_info }
+ logger.info(
+ 'Reshaping model: {}'.format(', '.join("'{}': {}".format(k, str(v)) for k, v in shapes.items())))
+ model.reshape(shapes)
+ duration_ms = f"{(datetime.utcnow() - start_time).total_seconds() * 1000:.2f}"
+ logger.info(f"Reshape model took {duration_ms} ms")
+ if statistics:
+ statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS,
+ [
+ ('reshape model time (ms)', duration_ms)
+ ])
+
+ # --------------------- 6. Configuring inputs and outputs of the model --------------------------------------------------
+ next_step()
+
+ pre_post_processing(model, app_inputs_info, args.input_precision, args.output_precision, args.input_output_precision)
+ print_inputs_and_outputs_info(model)
+
+ # --------------------- 7. Loading the model to the device -------------------------------------------------
+ next_step()
+ start_time = datetime.utcnow()
+ compiled_model = benchmark.core.compile_model(model, benchmark.device, device_config)
+
+ duration_ms = f"{(datetime.utcnow() - start_time).total_seconds() * 1000:.2f}"
+ logger.info(f"Compile model took {duration_ms} ms")
+ if statistics:
+ statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS,
+ [
+ ('compile model time (ms)', duration_ms)
+ ])
+ else:
+ if args.mean_values or args.scale_values:
+ raise RuntimeError("--mean_values and --scale_values aren't supported for compiled model. "
+ "The values can be set via model_optimizer while generating xml")
+ next_step()
+ print("Skipping the step for compiled model")
+ next_step()
+ print("Skipping the step for compiled model")
+ next_step()
+ print("Skipping the step for compiled model")
+
+ # --------------------- 7. Loading the model to the device -------------------------------------------------
+ next_step()
+
+ start_time = datetime.utcnow()
+ try:
+ with open(args.path_to_model, "rb") as model_stream:
+ model_bytes = model_stream.read()
+ compiled_model = benchmark.core.import_model(model_bytes, device_name)
+ duration_ms = f"{(datetime.utcnow() - start_time).total_seconds() * 1000:.2f}"
+ logger.info(f"Import model took {duration_ms} ms")
+ if statistics:
+ statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS,
+ [
+ ('import model time (ms)', duration_ms)
+ ])
+ app_inputs_info, _ = get_inputs_info(args.shape, args.data_shape, args.layout, args.batch_size, args.scale_values, args.mean_values, compiled_model.inputs)
+ batch_size = get_network_batch_size(app_inputs_info)
+ except Exception as e:
+ raise RuntimeError(f"Cannot open or import compiled model file: {args.path_to_model}. Error: {str(e)}")
+
+ # --------------------- 8. Querying optimal runtime parameters --------------------------------------------------
+ next_step()
+
+ ## actual device-deduced settings
+ keys = compiled_model.get_property(properties.supported_properties())
+ logger.info("Model:")
+ for k in keys:
+ skip_keys = ('SUPPORTED_METRICS', 'SUPPORTED_CONFIG_KEYS', properties.supported_properties())
+ if k not in skip_keys:
+ value = compiled_model.get_property(k)
+ if k == properties.device.properties():
+ for device_key in value.keys():
+ logger.info(f' {device_key}:')
+ for k2, value2 in value.get(device_key).items():
+ if k2 not in skip_keys:
+ logger.info(f' {k2}: {value2}')
+ else:
+ logger.info(f' {k}: {value}')
+
+ # Update number of streams
+ for device in device_number_streams.keys():
+ try:
+ key = get_device_type_from_name(device) + '_THROUGHPUT_STREAMS'
+ device_number_streams[device] = compiled_model.get_property(key)
+ except:
+ key = 'NUM_STREAMS'
+ device_number_streams[device] = compiled_model.get_property(key)
+
+ # ------------------------------------ 9. Creating infer requests and preparing input data ----------------------
+ next_step()
+
+ # Create infer requests
+ requests = benchmark.create_infer_requests(compiled_model)
+
+ # Prepare input data
+ paths_to_input = list()
+ if args.paths_to_input:
+ for path in args.paths_to_input:
+ if ":" in next(iter(path), ""):
+ paths_to_input.extend(path)
+ else:
+ paths_to_input.append(os.path.abspath(*path))
+
+ data_queue = get_input_data(paths_to_input, app_inputs_info)
+
+ static_mode = check_for_static(app_inputs_info)
+ allow_inference_only_or_sync = can_measure_as_static(app_inputs_info)
+ if not allow_inference_only_or_sync and benchmark.api_type == 'sync':
+ raise Exception("Benchmarking of the model with dynamic shapes is available for async API only. "
+ "Please use -api async -hint latency -nireq 1 to emulate sync behavior.")
+
+ if benchmark.inference_only == None:
+ if static_mode:
+ benchmark.inference_only = True
+ else:
+ benchmark.inference_only = False
+ elif benchmark.inference_only and not allow_inference_only_or_sync:
+ raise Exception("Benchmarking dynamic model available with input filling in measurement loop only!")
+
+ # update batch size in case dynamic network with one data_shape
+ if allow_inference_only_or_sync and batch_size.is_dynamic:
+ batch_size = Dimension(data_queue.batch_sizes[data_queue.current_group_id])
+
+ benchmark.latency_groups = get_latency_groups(app_inputs_info)
+
+ if len(benchmark.latency_groups) > 1:
+ logger.info(f"Defined {len(benchmark.latency_groups)} tensor groups:")
+ for group in benchmark.latency_groups:
+ logger.info(f"\t{str(group)}")
+
+ # Iteration limit
+ benchmark.niter = get_number_iterations(benchmark.niter, benchmark.nireq, max(len(info.shapes) for info in app_inputs_info), benchmark.api_type)
+
+ # Set input tensors before first inference
+ for request in requests:
+ data_tensors = data_queue.get_next_input()
+ for port, data_tensor in data_tensors.items():
+ input_tensor = request.get_input_tensor(port)
+ if not static_mode:
+ input_tensor.shape = data_tensor.shape
+ if not len(input_tensor.shape):
+ input_tensor.data.flat[:] = data_tensor.data
+ else:
+ input_tensor.data[:] = data_tensor.data
+
+ if statistics:
+ statistics.add_parameters(StatisticsReport.Category.RUNTIME_CONFIG,
+ [
+ ('topology', topology_name),
+ ('target device', device_name),
+ ('API', args.api_type),
+ ('inference_only', benchmark.inference_only),
+ ('precision', "UNSPECIFIED"),
+ ('batch size', str(batch_size)),
+ ('number of iterations', str(benchmark.niter)),
+ ('number of parallel infer requests', str(benchmark.nireq)),
+ ('duration (ms)', str(get_duration_in_milliseconds(benchmark.duration_seconds))),
+ ])
+
+ for nstreams in device_number_streams.items():
+ statistics.add_parameters(StatisticsReport.Category.RUNTIME_CONFIG,
+ [
+ (f"number of {nstreams[0]} streams", str(nstreams[1])),
+ ])
+
+ # ------------------------------------ 10. Measuring performance -----------------------------------------------
+
+ output_string = process_help_inference_string(benchmark, device_number_streams)
+
+ next_step(additional_info=output_string)
+
+ if benchmark.inference_only:
+ logger.info("Benchmarking in inference only mode (inputs filling are not included in measurement loop).")
+ else:
+ logger.info("Benchmarking in full mode (inputs filling are included in measurement loop).")
+ duration_ms = f"{benchmark.first_infer(requests):.2f}"
+ logger.info(f"First inference took {duration_ms} ms")
+ if statistics:
+ statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS,
+ [
+ ('first inference time (ms)', duration_ms)
+ ])
+
+ pcseq = args.pcseq
+ if static_mode or len(benchmark.latency_groups) == 1:
+ pcseq = False
+
+ fps, median_latency_ms, avg_latency_ms, min_latency_ms, max_latency_ms, total_duration_sec, iteration = benchmark.main_loop(requests, data_queue, batch_size, args.latency_percentile, pcseq)
+
+ # ------------------------------------ 11. Dumping statistics report -------------------------------------------
+ next_step()
+
+ if args.dump_config:
+ dump_config(args.dump_config, config)
+ logger.info(f"OpenVINO configuration settings were dumped to {args.dump_config}")
+
+ if args.exec_graph_path:
+ dump_exec_graph(compiled_model, args.exec_graph_path)
+
+ if perf_counts:
+ perfs_count_list = []
+ for request in requests:
+ perfs_count_list.append(request.profiling_info)
+
+ if args.perf_counts_sort:
+ total_sorted_list = print_perf_counters_sort(perfs_count_list,sort_flag=args.perf_counts_sort)
+ if statistics:
+ statistics.dump_performance_counters_sorted(total_sorted_list)
+
+ elif args.perf_counts:
+ print_perf_counters(perfs_count_list)
+
+ if statistics:
+ # if not args.perf_counts_sort:
+ statistics.dump_performance_counters(perfs_count_list)
+
+ if statistics:
+ statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS,
+ [
+ ('total execution time (ms)', f'{get_duration_in_milliseconds(total_duration_sec):.2f}'),
+ ('total number of iterations', str(iteration)),
+ ])
+ if MULTI_DEVICE_NAME not in device_name:
+ latency_prefix = None
+ if args.latency_percentile == 50:
+ latency_prefix = 'latency (ms)'
+ elif args.latency_percentile != 50:
+ latency_prefix = 'latency (' + str(args.latency_percentile) + ' percentile) (ms)'
+ if latency_prefix:
+ statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS,
+ [
+ (latency_prefix, f'{median_latency_ms:.2f}'),
+ ])
+ statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS,
+ [
+ ("avg latency", f'{avg_latency_ms:.2f}'),
+ ])
+ statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS,
+ [
+ ("min latency", f'{min_latency_ms:.2f}'),
+ ])
+ statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS,
+ [
+ ("max latency", f'{max_latency_ms:.2f}'),
+ ])
+ if pcseq:
+ for group in benchmark.latency_groups:
+ statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS,
+ [
+ ("group", str(group)),
+ ])
+ statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS,
+ [
+ ("avg latency", f'{group.avg:.2f}'),
+ ])
+ statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS,
+ [
+ ("min latency", f'{group.min:.2f}'),
+ ])
+ statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS,
+ [
+ ("max latency", f'{group.max:.2f}'),
+ ])
+ statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS,
+ [
+ ('throughput', f'{fps:.2f}'),
+ ])
+ statistics.dump()
+
+ try:
+ exeDevice = compiled_model.get_property("EXECUTION_DEVICES")
+ logger.info(f'Execution Devices:{exeDevice}')
+ except:
+ exeDevice = None
+ logger.info(f'Count: {iteration} iterations')
+ logger.info(f'Duration: {get_duration_in_milliseconds(total_duration_sec):.2f} ms')
+ if MULTI_DEVICE_NAME not in device_name:
+ logger.info('Latency:')
+ if args.latency_percentile == 50:
+ logger.info(f' Median: {median_latency_ms:.2f} ms')
+ elif args.latency_percentile != 50:
+ logger.info(f' {args.latency_percentile} percentile: {median_latency_ms:.2f} ms')
+ logger.info(f' Average: {avg_latency_ms:.2f} ms')
+ logger.info(f' Min: {min_latency_ms:.2f} ms')
+ logger.info(f' Max: {max_latency_ms:.2f} ms')
+
+ if pcseq:
+ logger.info("Latency for each data shape group:")
+ for idx,group in enumerate(benchmark.latency_groups):
+ logger.info(f"{idx+1}.{str(group)}")
+ if args.latency_percentile == 50:
+ logger.info(f' Median: {group.median:.2f} ms')
+ elif args.latency_percentile != 50:
+ logger.info(f' {args.latency_percentile} percentile: {group.median:.2f} ms')
+ logger.info(f' Average: {group.avg:.2f} ms')
+ logger.info(f' Min: {group.min:.2f} ms')
+ logger.info(f' Max: {group.max:.2f} ms')
+
+ logger.info(f'Throughput: {fps:.2f} FPS')
+
+ del compiled_model
+
+ next_step.step_id = 0
+ except Exception as e:
+ logger.exception(e)
+
+ if statistics:
+ statistics.add_parameters(
+ StatisticsReport.Category.EXECUTION_RESULTS,
+ [('error', str(e))]
+ )
+ statistics.dump()
+ sys.exit(1)
\ No newline at end of file
diff --git a/python/openvino/runtime/python_demos/OpenVINO_classification_sample_async/README.md b/python/openvino/runtime/python_demos/OpenVINO_classification_sample_async/README.md
new file mode 100644
index 0000000..0b021b9
--- /dev/null
+++ b/python/openvino/runtime/python_demos/OpenVINO_classification_sample_async/README.md
@@ -0,0 +1,6 @@
+### OpenVINO Image Classification Async Python Sample
+---
+
+For detailed information on the OpenVINO Classification Sample Async Demo, please see the [README](https://github.com/openvinotoolkit/openvino/tree/2023.3.0/samples/python/classification_sample_async) in the OpenVINO repository. Make sure to match the git tag with your installed version of OpenVINO for compatibility.
+
+If you need examples of how to use the demo, check the [README](../README.md) in the parent directory for sample commands.
diff --git a/python/openvino/runtime/python_demos/OpenVINO_classification_sample_async/classification_sample_async.patch b/python/openvino/runtime/python_demos/OpenVINO_classification_sample_async/classification_sample_async.patch
new file mode 100644
index 0000000..28ae75c
--- /dev/null
+++ b/python/openvino/runtime/python_demos/OpenVINO_classification_sample_async/classification_sample_async.patch
@@ -0,0 +1,116 @@
+--- /nfs/site/disks/swip_dla_1/resources/inference_engine/2023.3.0_with_dev_tools/1/linux64/suse12/samples/python/classification_sample_async/classification_sample_async.py 2024-03-01 14:01:24.460131000 -0500
++++ ./runtime/python_demos/OpenVINO_classification_sample_async/classification_sample_async.py 2024-04-16 10:33:28.810439000 -0400
+@@ -1,15 +1,18 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+-# Copyright (C) 2018-2023 Intel Corporation
++# Copyright (C) 2018-2022 Intel Corporation
+ # SPDX-License-Identifier: Apache-2.0
+
+ import argparse
+ import logging as log
++import os
+ import sys
++import warnings
+
+ import cv2
+ import numpy as np
+-import openvino as ov
++from openvino.preprocess import PrePostProcessor
++from openvino.runtime import AsyncInferQueue, Core, InferRequest, Layout, Type
+
+
+ def parse_args() -> argparse.Namespace:
+@@ -24,14 +27,14 @@
+ args.add_argument('-i', '--input', type=str, required=True, nargs='+',
+ help='Required. Path to an image file(s).')
+ args.add_argument('-d', '--device', type=str, default='CPU',
+- help='Optional. Specify the target device to infer on; CPU, GPU, GNA or HETERO: '
++ help='Optional. Specify the target device to infer on; CPU, GPU, MYRIAD, HDDL or HETERO: '
+ 'is acceptable. The sample will look for a suitable plugin for device specified. '
+ 'Default value is CPU.')
+ # fmt: on
+ return parser.parse_args()
+
+
+-def completion_callback(infer_request: ov.InferRequest, image_path: str) -> None:
++def completion_callback(infer_request: InferRequest, image_path: str) -> None:
+ predictions = next(iter(infer_request.results.values()))
+
+ # Change a shape of a numpy.ndarray with results to get another one with one dimension
+@@ -60,7 +63,17 @@
+
+ # --------------------------- Step 1. Initialize OpenVINO Runtime Core ------------------------------------------------
+ log.info('Creating OpenVINO Runtime Core')
+- core = ov.Core()
++ dla_plugins = os.environ.get('DLA_PLUGINS', default='')
++ if dla_plugins == '':
++ # Backwards compatibility for old DLA_PLUGINS_XML_FILE
++ warnings.warn("DLA_PLUGINS_XML_FILE option is deprecated as of 2024.1, please use DLA_PLUGINS")
++ dla_plugins = os.environ.get('DLA_PLUGINS_XML_FILE', default='')
++ core = Core(dla_plugins)
++ if "FPGA" in args.device:
++ dla_arch_file = os.environ.get('DLA_ARCH_FILE')
++ if dla_arch_file is None:
++ raise Exception(f"To use FPGA, you need to specify the path to an arch_file!")
++ core.set_property(device_name="FPGA", properties={"ARCH_PATH": dla_arch_file})
+
+ # --------------------------- Step 2. Read a model --------------------------------------------------------------------
+ log.info(f'Reading the model: {args.model}')
+@@ -80,29 +93,38 @@
+ images = [cv2.imread(image_path) for image_path in args.input]
+
+ # Resize images to model input dims
+- _, _, h, w = model.input().shape
++ # Assuming we always have w=h, we will
++ # figure out the layout from the dimensions
++ # start with the assumption of NHWC (TF)
++ _, h, w, c = model.input().shape
++
++ if h != w:
++ c = h
++ h = w
++
+ resized_images = [cv2.resize(image, (w, h)) for image in images]
+
+ # Add N dimension
+ input_tensors = [np.expand_dims(image, 0) for image in resized_images]
+
++ # Transpose from NHWC to NCHW
++ input_tensors = [np.transpose(tensor, (0, 3, 1, 2)) for tensor in input_tensors]
++
+ # --------------------------- Step 4. Apply preprocessing -------------------------------------------------------------
+- ppp = ov.preprocess.PrePostProcessor(model)
++ ppp = PrePostProcessor(model)
+
+ # 1) Set input tensor information:
+ # - input() provides information about a single model input
+- # - precision of tensor is supposed to be 'u8'
+- # - layout of data is 'NHWC'
+- ppp.input().tensor() \
+- .set_element_type(ov.Type.u8) \
+- .set_layout(ov.Layout('NHWC')) # noqa: N400
++ # - layout of data is 'NCHW'
++ ppp.input().tensor().set_layout(Layout('NCHW')) # noqa: N400
+
+ # 2) Here we suppose model has 'NCHW' layout for input
+- ppp.input().model().set_layout(ov.Layout('NCHW'))
++ # DLA --> We let the demo select the layout based on the model
++ # ppp.input().model().set_layout(Layout('NCHW'))
+
+ # 3) Set output tensor information:
+ # - precision of tensor is supposed to be 'f32'
+- ppp.output().tensor().set_element_type(ov.Type.f32)
++ ppp.output().tensor().set_element_type(Type.f32)
+
+ # 4) Apply preprocessing modifing the original 'model'
+ model = ppp.build()
+@@ -114,7 +136,7 @@
+ # --------------------------- Step 6. Create infer request queue ------------------------------------------------------
+ log.info('Starting inference in asynchronous mode')
+ # create async queue with optimal number of infer requests
+- infer_queue = ov.AsyncInferQueue(compiled_model)
++ infer_queue = AsyncInferQueue(compiled_model)
+ infer_queue.set_callback(completion_callback)
+
+ # --------------------------- Step 7. Do inference --------------------------------------------------------------------
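The hunk above reworks the host-side preprocessing: the input shape is unpacked assuming square spatial dimensions (as the patch comment notes), images are resized with OpenCV, and the batched tensors are transposed from NHWC to NCHW to match the layout declared to the PrePostProcessor. A self-contained sketch of that tensor handling; the image size is a placeholder:

```python
# Minimal sketch of the NHWC -> NCHW handling added by the patch above.
# Assumes square spatial dimensions, as the patch comment itself notes.
import numpy as np

image = np.zeros((224, 224, 3), dtype=np.uint8)            # stand-in for a cv2.imread() result (HWC)
input_tensor = np.expand_dims(image, 0)                     # add batch dimension -> NHWC
input_tensor = np.transpose(input_tensor, (0, 3, 1, 2))     # NHWC -> NCHW
assert input_tensor.shape == (1, 3, 224, 224)
```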
diff --git a/python/openvino/runtime/python_demos/OpenVINO_classification_sample_async/classification_sample_async.py b/python/openvino/runtime/python_demos/OpenVINO_classification_sample_async/classification_sample_async.py
new file mode 100755
index 0000000..339c942
--- /dev/null
+++ b/python/openvino/runtime/python_demos/OpenVINO_classification_sample_async/classification_sample_async.py
@@ -0,0 +1,153 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# Copyright (C) 2018-2022 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+import argparse
+import logging as log
+import os
+import sys
+import warnings
+
+import cv2
+import numpy as np
+from openvino.preprocess import PrePostProcessor
+from openvino.runtime import AsyncInferQueue, Core, InferRequest, Layout, Type
+
+
+def parse_args() -> argparse.Namespace:
+ """Parse and return command line arguments."""
+ parser = argparse.ArgumentParser(add_help=False)
+ args = parser.add_argument_group('Options')
+ # fmt: off
+ args.add_argument('-h', '--help', action='help',
+ help='Show this help message and exit.')
+ args.add_argument('-m', '--model', type=str, required=True,
+ help='Required. Path to an .xml or .onnx file with a trained model.')
+ args.add_argument('-i', '--input', type=str, required=True, nargs='+',
+ help='Required. Path to an image file(s).')
+ args.add_argument('-d', '--device', type=str, default='CPU',
+ help='Optional. Specify the target device to infer on; CPU, GPU, MYRIAD, HDDL or HETERO: '
+ 'is acceptable. The sample will look for a suitable plugin for device specified. '
+ 'Default value is CPU.')
+ # fmt: on
+ return parser.parse_args()
+
+
+def completion_callback(infer_request: InferRequest, image_path: str) -> None:
+ predictions = next(iter(infer_request.results.values()))
+
+ # Change a shape of a numpy.ndarray with results to get another one with one dimension
+ probs = predictions.reshape(-1)
+
+ # Get an array of 10 class IDs in descending order of probability
+ top_10 = np.argsort(probs)[-10:][::-1]
+
+ header = 'class_id probability'
+
+ log.info(f'Image path: {image_path}')
+ log.info('Top 10 results: ')
+ log.info(header)
+ log.info('-' * len(header))
+
+ for class_id in top_10:
+ probability_indent = ' ' * (len('class_id') - len(str(class_id)) + 1)
+ log.info(f'{class_id}{probability_indent}{probs[class_id]:.7f}')
+
+ log.info('')
+
+
+def main() -> int:
+ log.basicConfig(format='[ %(levelname)s ] %(message)s', level=log.INFO, stream=sys.stdout)
+ args = parse_args()
+
+# --------------------------- Step 1. Initialize OpenVINO Runtime Core ------------------------------------------------
+ log.info('Creating OpenVINO Runtime Core')
+ dla_plugins = os.environ.get('DLA_PLUGINS', default='')
+ if dla_plugins == '':
+        # Backwards compatibility for the old DLA_PLUGINS_XML_FILE
+ warnings.warn("DLA_PLUGINS_XML_FILE option is deprecated as of 2024.1, Please use DLA_PLUGINS")
+ dla_plugins = os.environ.get('DLA_PLUGINS_XML_FILE', default='')
+ core = Core(dla_plugins)
+ if "FPGA" in args.device:
+ dla_arch_file = os.environ.get('DLA_ARCH_FILE')
+ if dla_arch_file is None:
+ raise Exception(f"To use FPGA, you need to specify the path to an arch_file!")
+ core.set_property(device_name="FPGA", properties={"ARCH_PATH": dla_arch_file})
+
+# --------------------------- Step 2. Read a model --------------------------------------------------------------------
+ log.info(f'Reading the model: {args.model}')
+ # (.xml and .bin files) or (.onnx file)
+ model = core.read_model(args.model)
+
+ if len(model.inputs) != 1:
+ log.error('Sample supports only single input topologies')
+ return -1
+
+ if len(model.outputs) != 1:
+ log.error('Sample supports only single output topologies')
+ return -1
+
+# --------------------------- Step 3. Set up input --------------------------------------------------------------------
+ # Read input images
+ images = [cv2.imread(image_path) for image_path in args.input]
+
+ # Resize images to model input dims
+    # Assuming the input is always square (h == w), figure out
+    # the layout from the dimensions, starting with the
+    # assumption of NHWC (TensorFlow)
+ _, h, w, c = model.input().shape
+
+ if h != w:
+ c = h
+ h = w
+
+ resized_images = [cv2.resize(image, (w, h)) for image in images]
+
+ # Add N dimension
+ input_tensors = [np.expand_dims(image, 0) for image in resized_images]
+
+ # Transpose from NHWC to NCHW
+ input_tensors = [np.transpose(tensor, (0, 3, 1, 2)) for tensor in input_tensors]
+
+# --------------------------- Step 4. Apply preprocessing -------------------------------------------------------------
+ ppp = PrePostProcessor(model)
+
+ # 1) Set input tensor information:
+ # - input() provides information about a single model input
+ # - layout of data is 'NCHW'
+ ppp.input().tensor().set_layout(Layout('NCHW')) # noqa: N400
+
+ # 2) Here we suppose model has 'NCHW' layout for input
+ # DLA --> We let the demo select the layout based on the model
+ # ppp.input().model().set_layout(Layout('NCHW'))
+
+ # 3) Set output tensor information:
+ # - precision of tensor is supposed to be 'f32'
+ ppp.output().tensor().set_element_type(Type.f32)
+
+ # 4) Apply preprocessing modifing the original 'model'
+ model = ppp.build()
+
+# --------------------------- Step 5. Loading model to the device -----------------------------------------------------
+ log.info('Loading the model to the plugin')
+ compiled_model = core.compile_model(model, args.device)
+
+# --------------------------- Step 6. Create infer request queue ------------------------------------------------------
+ log.info('Starting inference in asynchronous mode')
+ # create async queue with optimal number of infer requests
+ infer_queue = AsyncInferQueue(compiled_model)
+ infer_queue.set_callback(completion_callback)
+
+# --------------------------- Step 7. Do inference --------------------------------------------------------------------
+ for i, input_tensor in enumerate(input_tensors):
+ infer_queue.start_async({0: input_tensor}, args.input[i])
+
+ infer_queue.wait_all()
+# ----------------------------------------------------------------------------------------------------------------------
+ log.info('This sample is an API example, for any performance measurements please use the dedicated benchmark_app tool\n')
+ return 0
+
+
+if __name__ == '__main__':
+ sys.exit(main())
diff --git a/python/openvino/runtime/python_demos/README.md b/python/openvino/runtime/python_demos/README.md
new file mode 100644
index 0000000..2cf080b
--- /dev/null
+++ b/python/openvino/runtime/python_demos/README.md
@@ -0,0 +1,184 @@
+# CoreDLA Python API Usage
+
+This README documents how to use OpenVINO's Python API with the FPGA AI Suite.
+
+## OpenVINO Benchmark Python Tool (Just In Time Flow)
+
+A port of the OpenVINO Python benchmark_app is included in this directory. For more details on OpenVINO Python benchmark_app, see [README.md](./OpenVINO_benchmark_app/README.md). Note that this OpenVINO Python benchmark_app has slightly lower performance than the DLA C++ dla_benchmark in `runtime/dla_benchmark`.
+
+To run this Python implementation of benchmark_app:
+
+1. Follow instructions in the *FPGA AI Suite: Getting Started Guide* to program the bitstream onto the FPGA device.
+
+2. `export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$COREDLA_ROOT/lib:$COREDLA_WORK/runtime/build_Release`
+ - `$COREDLA_ROOT/lib` is needed to find `libcoreDLAHeteroPlugin.so`
+ - `$COREDLA_WORK/runtime/build_Release` is needed to find `libcoreDLARuntimePlugin.so`
+
+3. This step assumes that `$curarch` specifies the .arch file corresponding to the bitstream currently
+programmed onto the FPGA board (as is done in the *FPGA AI Suite: Getting Started Guide*).
+```bash
+imagedir=$COREDLA_WORK/demo/sample_images
+xmldir=$COREDLA_WORK/demo/models/public/
+DLA_PLUGINS=$COREDLA_WORK/runtime/plugins.xml \
+ DLA_ARCH_FILE=$curarch \
+ python $COREDLA_WORK/runtime/python_demos/OpenVINO_benchmark_app/benchmark_app.py \
+ -b=1 \
+ -m $xmldir/resnet-50-tf/FP32/resnet-50-tf.xml \
+ -d=HETERO:FPGA,CPU \
+ -niter=8 \
+ -api=async \
+ -nireq=4 \
+ -i $imagedir \
+ -ip=f32 \
+    -ip=f32
+
+ This command estimates the latency and throughput for resnet-50.
+
+Below is a fragment of sample output for HETERO:FPGA,CPU:
+
+```text
+[Step 10/11] Measuring performance (Start inference asynchronously, 4 inference requests using 4 streams for CPU, limits: 8 iterations)
+[ INFO ] First inference took <number> ms
+[Step 11/11] Dumping statistics report
+Count: 8 iterations
+Duration: <Duration> ms
+Latency: <Latency> ms
+Throughput: <Throughput> FPS
+```
+**Note**: When the target FPGA design uses JTAG to access the CSRs on the FPGA AI Suite IP (e.g. the Agilex 5E Premium Development Kit JTAG Design Example), the only supported value of *nireq* is 1.
+
+## OpenVINO Benchmark Python Tool (Ahead Of Time Flow)
+
+A port of the OpenVINO Python benchmark_app is included in this directory. For more details on OpenVINO Python benchmark_app, see [README.md](./OpenVINO_benchmark_app/README.md). Note that this OpenVINO Python benchmark_app has slightly lower performance than the DLA C++ dla_benchmark in `runtime/dla_benchmark`.
+
+To run this Python implementation of benchmark_app:
+
+1. Follow instructions in the *FPGA AI Suite: Getting Started Guide* to generate an AOT file. The architecture used should correspond to the bitstream programmed onto the FPGA device in step 2.
+
+2. Follow instructions in the *FPGA AI Suite: Getting Started Guide* to program the bitstream onto the FPGA device.
+
+3. `export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$COREDLA_ROOT/lib:$COREDLA_WORK/runtime/build_Release`
+ - `$COREDLA_ROOT/lib` is needed to find `libcoreDLAHeteroPlugin.so`
+ - `$COREDLA_WORK/runtime/build_Release` is needed to find `libcoreDLARuntimePlugin.so`
+
+4. This step assumes that:
+ - `$curarch` specifies the .arch file corresponding to the bitstream currently programmed onto the FPGA board (as is done in the FPGA AI Suite Getting Started Guide).
+ - `graph.bin` is the compiled graph from step 1.
+```bash
+imagedir=$COREDLA_WORK/demo/sample_images
+xmldir=$COREDLA_WORK/demo/models/public/
+DLA_PLUGINS=$COREDLA_WORK/runtime/plugins.xml \
+ DLA_ARCH_FILE=$curarch \
+ python $COREDLA_WORK/runtime/python_demos/OpenVINO_benchmark_app/benchmark_app.py \
+ -b=1 \
+ -m $COREDLA_WORK/graph.bin \
+ -d=HETERO:FPGA,CPU \
+ -niter=8 \
+ -api=async \
+ -nireq=4 \
+ -i $imagedir \
+    -ip=f32
+```
+
+ This command estimates the latency and throughput for resnet-50.
+
+Below is a fragment of sample output for HETERO:FPGA,CPU:
+
+```text
+[Step 10/11] Measuring performance (Start inference asynchronously, 4 inference requests using 4 streams for CPU, limits: 8 iterations)
+[ INFO ] First inference took <number> ms
+[Step 11/11] Dumping statistics report
+Count: 8 iterations
+Duration: <Duration> ms
+Latency: <Latency> ms
+Throughput: <Throughput> FPS
+```
+
+## OpenVINO Benchmark Python Tool Precision (AOT and JIT)
+
+The OpenVINO Python benchmark_app supports various input tensor precisions. However, the FPGA AI Suite input transformations module only supports the f16 and f32 precisions, so specify the desired precision explicitly using the `-ip` (or `--input_precision`) flag.
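+
+For example, a minimal sketch of the Just In Time command above with the input tensors requested in f16 instead of f32 (it reuses the `$imagedir`, `$xmldir`, and `$curarch` variables defined in the earlier examples):
+
+```bash
+# Same JIT benchmark run as above, but with f16 input precision
+DLA_PLUGINS=$COREDLA_WORK/runtime/plugins.xml \
+    DLA_ARCH_FILE=$curarch \
+    python $COREDLA_WORK/runtime/python_demos/OpenVINO_benchmark_app/benchmark_app.py \
+    -b=1 \
+    -m $xmldir/resnet-50-tf/FP32/resnet-50-tf.xml \
+    -d=HETERO:FPGA,CPU \
+    -niter=8 \
+    -api=async \
+    -nireq=4 \
+    -i $imagedir \
+    -ip=f16
+```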
+
+## OpenVINO Image Classification Async Python Sample
+
+Another example is a port of the OpenVINO Image Classification Async Python Sample. For more details, see its [README.md](./OpenVINO_classification_sample_async/README.md).
+
+To run this demo, follow steps 1 and 2 of the Just In Time flow above, then run:
+
+```bash
+imagedir=$COREDLA_WORK/demo/sample_images
+xmldir=$COREDLA_WORK/demo/models/public/
+DLA_PLUGINS=$COREDLA_WORK/runtime/plugins.xml \
+ DLA_ARCH_FILE=$curarch \
+ python $COREDLA_WORK/runtime/python_demos/OpenVINO_classification_sample_async/classification_sample_async.py \
+ -m $xmldir/resnet-50-tf/FP32/resnet-50-tf.xml \
+ -d=HETERO:FPGA,CPU \
+ -i $imagedir/val_00000000.bmp $imagedir/val_00000001.bmp
+```
+
+Below is a fragment of the output:
+
+```text
+[ INFO ] Starting inference in asynchronous mode
+[ INFO ] Infer request 0 returned 0
+[ INFO ] Image path: /absolute/path/of/demo/sample_images/val_00000000.bmp
+[ INFO ] Top 10 results:
+[ INFO ] classid probability
+[ INFO ] -------------------
+[ INFO ] 872 0.9995117
+[ INFO ] 999 0.0000000
+[ INFO ] 327 0.0000000
+[ INFO ] 340 0.0000000
+[ INFO ] 339 0.0000000
+[ INFO ] 338 0.0000000
+[ INFO ] 337 0.0000000
+[ INFO ] 336 0.0000000
+[ INFO ] 335 0.0000000
+[ INFO ] 334 0.0000000
+[ INFO ]
+[ INFO ] Infer request 1 returned 0
+[ INFO ] Image path: /absolute/path/of/demo/sample_images/val_00000001.bmp
+[ INFO ] Top 10 results:
+[ INFO ] classid probability
+[ INFO ] -------------------
+[ INFO ] 769 0.9672852
+[ INFO ] 845 0.0292053
+[ INFO ] 778 0.0005350
+[ INFO ] 798 0.0005350
+[ INFO ] 710 0.0003245
+[ INFO ] 767 0.0002230
+[ INFO ] 418 0.0001737
+[ INFO ] 587 0.0001533
+[ INFO ] 542 0.0000820
+[ INFO ] 600 0.0000820
+```
+
+## Running the Software Emulator Model
+
+1. All steps are the same as above, except that `DLA_PLUGINS` should be set to `$COREDLA_ROOT/bin/plugins_emulation.xml` (`DLA_PLUGINS=$COREDLA_ROOT/bin/plugins_emulation.xml`).
+
+**NOTE**: The software emulator model is slower than a hardware run, so it is highly recommended to run the commands above with the emulation `DLA_PLUGINS` setting and with `-niter=1` and `-nireq=1`, as illustrated below.
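+
+As an illustration, the Just In Time benchmark command above could be run against the emulator roughly as follows (this is a sketch that reuses the variables defined in the earlier examples):
+
+```bash
+# Software emulation: point DLA_PLUGINS at the emulation plugins file
+# and keep the workload small (-niter=1, -nireq=1)
+DLA_PLUGINS=$COREDLA_ROOT/bin/plugins_emulation.xml \
+    DLA_ARCH_FILE=$curarch \
+    python $COREDLA_WORK/runtime/python_demos/OpenVINO_benchmark_app/benchmark_app.py \
+    -b=1 \
+    -m $xmldir/resnet-50-tf/FP32/resnet-50-tf.xml \
+    -d=HETERO:FPGA,CPU \
+    -niter=1 \
+    -api=async \
+    -nireq=1 \
+    -i $imagedir \
+    -ip=f32
+```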
+
+## Modifications Needed
+
+OpenVINO's Python demos and benchmark_app require slight modifications to work with CoreDLA.
+
+Please see the `.patch` files for the exact changes applied to port the OpenVINO Python benchmark_app and samples to the FPGA AI Suite.
+
+These patches were created using:
+
+- `cd $COREDLA_WORK/runtime/python_demos/OpenVINO_benchmark_app/`
+- `diff -u $INTEL_OPENVINO_DIR/python/openvino/tools/benchmark/benchmark.py benchmark.py > benchmark.patch`
+- `diff -u $INTEL_OPENVINO_DIR/python/openvino/tools/benchmark/main.py main.py > main.patch`
+- `diff -u $INTEL_OPENVINO_DIR/samples/python/classification_sample_async/classification_sample_async.py classification_sample_async.py > classification_sample_async.patch`
+
+To run these demos and benchmark_app, pass the absolute paths of the plugins file and the arch file as environment variables, as shown in the examples above.
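+
+For example, instead of prefixing every command, the two variables can be exported once per shell session (using the same paths as the examples above):
+
+```bash
+# Export once, then run any of the demos in this README without the per-command prefixes
+export DLA_PLUGINS=$COREDLA_WORK/runtime/plugins.xml
+export DLA_ARCH_FILE=$curarch
+```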
+
+---
+
+**IMPORTANT**: OpenVINO's sample applications, tools, and demos are designed to work with images in BGR channel order by default. If your model was trained using images in RGB channel order, you will need to take additional steps to ensure compatibility:
+
+1. **Modify the Application**: Update the channel order within the sample or demo application code to match the RGB order expected by your model.
+
+2. **Convert the Model**: Alternatively, you can convert your trained model to expect BGR input by using the Model Optimizer tool. When doing so, include the `--reverse_input_channels` flag to adjust the channel order; a sketch of this conversion is shown below. For detailed guidance on this flag, consult the Model Optimizer documentation or run `mo --help` on the command line.
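+
+As a sketch of option 2, the Model Optimizer invocation might look like the following; the model path and output directory are placeholders, not files shipped with this repository:
+
+```bash
+# Convert an RGB-trained model so that the converted IR accepts BGR input
+mo --input_model /path/to/rgb_trained_model.onnx \
+   --reverse_input_channels \
+   --output_dir /path/to/converted_model
+```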
+
+---