summaryrefslogtreecommitdiff
path: root/python/openvino/runtime
diff options
context:
space:
mode:
Diffstat (limited to 'python/openvino/runtime')
-rw-r--r--python/openvino/runtime/.gitignore21
-rw-r--r--python/openvino/runtime/CMakeLists.txt588
-rw-r--r--python/openvino/runtime/CPPLINT.cfg17
-rwxr-xr-xpython/openvino/runtime/build_hpspackages.sh571
-rwxr-xr-xpython/openvino/runtime/build_runtime.sh446
-rw-r--r--python/openvino/runtime/classification_sample_async/CMakeLists.txt58
-rw-r--r--python/openvino/runtime/classification_sample_async/README.md13
-rw-r--r--python/openvino/runtime/classification_sample_async/classification_sample_async.h72
-rw-r--r--python/openvino/runtime/classification_sample_async/main.cpp259
-rw-r--r--python/openvino/runtime/common/CMakeLists.txt25
-rw-r--r--python/openvino/runtime/common/README.md7
-rw-r--r--python/openvino/runtime/common/demo_utils/CMakeLists.txt14
-rw-r--r--python/openvino/runtime/common/demo_utils/include/utils/args_helper.hpp43
-rw-r--r--python/openvino/runtime/common/demo_utils/include/utils/common.hpp190
-rw-r--r--python/openvino/runtime/common/demo_utils/include/utils/config_factory.h52
-rw-r--r--python/openvino/runtime/common/demo_utils/include/utils/default_flags.hpp21
-rw-r--r--python/openvino/runtime/common/demo_utils/include/utils/grid_mat.hpp127
-rw-r--r--python/openvino/runtime/common/demo_utils/include/utils/image_utils.h29
-rw-r--r--python/openvino/runtime/common/demo_utils/include/utils/images_capture.h53
-rw-r--r--python/openvino/runtime/common/demo_utils/include/utils/input_wrappers.hpp149
-rw-r--r--python/openvino/runtime/common/demo_utils/include/utils/kuhn_munkres.hpp57
-rw-r--r--python/openvino/runtime/common/demo_utils/include/utils/nms.hpp81
-rw-r--r--python/openvino/runtime/common/demo_utils/include/utils/ocv_common.hpp289
-rw-r--r--python/openvino/runtime/common/demo_utils/include/utils/performance_metrics.hpp92
-rw-r--r--python/openvino/runtime/common/demo_utils/include/utils/shared_tensor_allocator.hpp47
-rw-r--r--python/openvino/runtime/common/demo_utils/include/utils/slog.hpp99
-rw-r--r--python/openvino/runtime/common/demo_utils/include/utils/threads_common.hpp165
-rw-r--r--python/openvino/runtime/common/demo_utils/src/args_helper.cpp155
-rw-r--r--python/openvino/runtime/common/demo_utils/src/config_factory.cpp111
-rw-r--r--python/openvino/runtime/common/demo_utils/src/image_utils.cpp55
-rw-r--r--python/openvino/runtime/common/demo_utils/src/images_capture.cpp327
-rw-r--r--python/openvino/runtime/common/demo_utils/src/kuhn_munkres.cpp169
-rw-r--r--python/openvino/runtime/common/demo_utils/src/performance_metrics.cpp114
-rw-r--r--python/openvino/runtime/common/demo_utils/src/w_dirent.hpp114
-rw-r--r--python/openvino/runtime/common/format_reader/CMakeLists.txt55
-rw-r--r--python/openvino/runtime/common/format_reader/MnistUbyte.cpp66
-rw-r--r--python/openvino/runtime/common/format_reader/MnistUbyte.h58
-rw-r--r--python/openvino/runtime/common/format_reader/bmp.cpp64
-rw-r--r--python/openvino/runtime/common/format_reader/bmp.h75
-rw-r--r--python/openvino/runtime/common/format_reader/format_reader.cpp44
-rw-r--r--python/openvino/runtime/common/format_reader/format_reader.h95
-rw-r--r--python/openvino/runtime/common/format_reader/format_reader_ptr.h43
-rw-r--r--python/openvino/runtime/common/format_reader/opencv_wrapper.cpp83
-rw-r--r--python/openvino/runtime/common/format_reader/opencv_wrapper.h58
-rw-r--r--python/openvino/runtime/common/format_reader/register.h58
-rw-r--r--python/openvino/runtime/common/format_reader/yuv_nv12.cpp36
-rw-r--r--python/openvino/runtime/common/format_reader/yuv_nv12.h57
-rw-r--r--python/openvino/runtime/common/models/CMakeLists.txt15
-rw-r--r--python/openvino/runtime/common/models/include/models/associative_embedding_decoder.h94
-rw-r--r--python/openvino/runtime/common/models/include/models/classification_model.h57
-rw-r--r--python/openvino/runtime/common/models/include/models/deblurring_model.h52
-rw-r--r--python/openvino/runtime/common/models/include/models/detection_model.h51
-rw-r--r--python/openvino/runtime/common/models/include/models/detection_model_centernet.h59
-rw-r--r--python/openvino/runtime/common/models/include/models/detection_model_faceboxes.h55
-rw-r--r--python/openvino/runtime/common/models/include/models/detection_model_retinaface.h74
-rw-r--r--python/openvino/runtime/common/models/include/models/detection_model_retinaface_pt.h81
-rw-r--r--python/openvino/runtime/common/models/include/models/detection_model_ssd.h63
-rw-r--r--python/openvino/runtime/common/models/include/models/detection_model_yolo.h107
-rw-r--r--python/openvino/runtime/common/models/include/models/detection_model_yolov3_onnx.h50
-rw-r--r--python/openvino/runtime/common/models/include/models/detection_model_yolox.h54
-rw-r--r--python/openvino/runtime/common/models/include/models/hpe_model_associative_embedding.h89
-rw-r--r--python/openvino/runtime/common/models/include/models/hpe_model_openpose.h78
-rw-r--r--python/openvino/runtime/common/models/include/models/image_model.h49
-rw-r--r--python/openvino/runtime/common/models/include/models/input_data.h41
-rw-r--r--python/openvino/runtime/common/models/include/models/internal_model_data.h48
-rw-r--r--python/openvino/runtime/common/models/include/models/jpeg_restoration_model.h55
-rw-r--r--python/openvino/runtime/common/models/include/models/model_base.h77
-rw-r--r--python/openvino/runtime/common/models/include/models/openpose_decoder.h62
-rw-r--r--python/openvino/runtime/common/models/include/models/results.h122
-rw-r--r--python/openvino/runtime/common/models/include/models/segmentation_model.h50
-rw-r--r--python/openvino/runtime/common/models/include/models/style_transfer_model.h43
-rw-r--r--python/openvino/runtime/common/models/include/models/super_resolution_model.h49
-rw-r--r--python/openvino/runtime/common/models/src/associative_embedding_decoder.cpp201
-rw-r--r--python/openvino/runtime/common/models/src/classification_model.cpp196
-rw-r--r--python/openvino/runtime/common/models/src/deblurring_model.cpp158
-rw-r--r--python/openvino/runtime/common/models/src/detection_model.cpp52
-rw-r--r--python/openvino/runtime/common/models/src/detection_model_centernet.cpp302
-rw-r--r--python/openvino/runtime/common/models/src/detection_model_faceboxes.cpp261
-rw-r--r--python/openvino/runtime/common/models/src/detection_model_retinaface.cpp394
-rw-r--r--python/openvino/runtime/common/models/src/detection_model_retinaface_pt.cpp277
-rw-r--r--python/openvino/runtime/common/models/src/detection_model_ssd.cpp281
-rw-r--r--python/openvino/runtime/common/models/src/detection_model_yolo.cpp481
-rw-r--r--python/openvino/runtime/common/models/src/detection_model_yolov3_onnx.cpp188
-rw-r--r--python/openvino/runtime/common/models/src/detection_model_yolox.cpp194
-rw-r--r--python/openvino/runtime/common/models/src/hpe_model_associative_embedding.cpp264
-rw-r--r--python/openvino/runtime/common/models/src/hpe_model_openpose.cpp256
-rw-r--r--python/openvino/runtime/common/models/src/image_model.cpp57
-rw-r--r--python/openvino/runtime/common/models/src/jpeg_restoration_model.cpp167
-rw-r--r--python/openvino/runtime/common/models/src/model_base.cpp67
-rw-r--r--python/openvino/runtime/common/models/src/openpose_decoder.cpp345
-rw-r--r--python/openvino/runtime/common/models/src/segmentation_model.cpp157
-rw-r--r--python/openvino/runtime/common/models/src/style_transfer_model.cpp107
-rw-r--r--python/openvino/runtime/common/models/src/super_resolution_model.cpp207
-rw-r--r--python/openvino/runtime/common/monitors/CMakeLists.txt38
-rw-r--r--python/openvino/runtime/common/monitors/include/monitors/cpu_monitor.h28
-rw-r--r--python/openvino/runtime/common/monitors/include/monitors/memory_monitor.h34
-rw-r--r--python/openvino/runtime/common/monitors/include/monitors/presenter.h44
-rw-r--r--python/openvino/runtime/common/monitors/include/monitors/query_wrapper.h17
-rw-r--r--python/openvino/runtime/common/monitors/src/cpu_monitor.cpp206
-rw-r--r--python/openvino/runtime/common/monitors/src/memory_monitor.cpp213
-rw-r--r--python/openvino/runtime/common/monitors/src/presenter.cpp330
-rw-r--r--python/openvino/runtime/common/monitors/src/query_wrapper.cpp22
-rw-r--r--python/openvino/runtime/common/pipelines/CMakeLists.txt15
-rw-r--r--python/openvino/runtime/common/pipelines/include/pipelines/async_pipeline.h121
-rw-r--r--python/openvino/runtime/common/pipelines/include/pipelines/metadata.h51
-rw-r--r--python/openvino/runtime/common/pipelines/include/pipelines/requests_pool.h67
-rw-r--r--python/openvino/runtime/common/pipelines/src/async_pipeline.cpp166
-rw-r--r--python/openvino/runtime/common/pipelines/src/requests_pool.cpp94
-rw-r--r--python/openvino/runtime/common/utils/CMakeLists.txt61
-rw-r--r--python/openvino/runtime/common/utils/include/samples/args_helper.hpp112
-rw-r--r--python/openvino/runtime/common/utils/include/samples/classification_results.h205
-rw-r--r--python/openvino/runtime/common/utils/include/samples/common.hpp1429
-rw-r--r--python/openvino/runtime/common/utils/include/samples/console_progress.hpp107
-rw-r--r--python/openvino/runtime/common/utils/include/samples/csv_dumper.hpp98
-rw-r--r--python/openvino/runtime/common/utils/include/samples/latency_metrics.hpp42
-rw-r--r--python/openvino/runtime/common/utils/include/samples/ocv_common.hpp92
-rw-r--r--python/openvino/runtime/common/utils/include/samples/os/windows/w_dirent.h176
-rw-r--r--python/openvino/runtime/common/utils/include/samples/slog.hpp102
-rw-r--r--python/openvino/runtime/common/utils/include/samples/vpu/vpu_tools_common.hpp28
-rw-r--r--python/openvino/runtime/common/utils/src/args_helper.cpp390
-rw-r--r--python/openvino/runtime/common/utils/src/common.cpp24
-rw-r--r--python/openvino/runtime/common/utils/src/latency_metrics.cpp42
-rw-r--r--python/openvino/runtime/common/utils/src/slog.cpp43
-rw-r--r--python/openvino/runtime/coredla_device/inc/batch_job.h31
-rw-r--r--python/openvino/runtime/coredla_device/inc/coredla_batch_job.h88
-rw-r--r--python/openvino/runtime/coredla_device/inc/coredla_device.h144
-rw-r--r--python/openvino/runtime/coredla_device/inc/coredla_graph_job.h83
-rw-r--r--python/openvino/runtime/coredla_device/inc/device.h81
-rw-r--r--python/openvino/runtime/coredla_device/inc/device_memory_allocator.h61
-rw-r--r--python/openvino/runtime/coredla_device/inc/dla_dma_constants.h27
-rw-r--r--python/openvino/runtime/coredla_device/inc/graph_job.h28
-rw-r--r--python/openvino/runtime/coredla_device/inc/mmd_wrapper.h63
-rw-r--r--python/openvino/runtime/coredla_device/inc/stream_controller_comms.h69
-rw-r--r--python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/CMakeLists.txt62
-rwxr-xr-xpython/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/cmake/modules/FindNUMA.cmake34
-rwxr-xr-xpython/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/cmake/modules/FindOPAE.cmake44
-rw-r--r--python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/kernel_interrupt.cpp257
-rw-r--r--python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/kernel_interrupt.h68
-rw-r--r--python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd.cpp830
-rw-r--r--python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_device.cpp448
-rw-r--r--python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_device.h151
-rw-r--r--python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_dma.cpp573
-rw-r--r--python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_dma.h89
-rw-r--r--python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_helper.cpp163
-rw-r--r--python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_helper.h41
-rw-r--r--python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/include/aocl_mmd.h377
-rw-r--r--python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/.gitignore18
-rw-r--r--python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/CMakeLists.txt63
-rw-r--r--python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/cmake/modules/FindNUMA.cmake34
-rw-r--r--python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/cmake/modules/FindOPAE.cmake44
-rw-r--r--python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/.gitignore1
-rw-r--r--python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/afu_bbb_util.h123
-rw-r--r--python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/ccip_mmd.cpp655
-rw-r--r--python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/ccip_mmd_device.cpp579
-rw-r--r--python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/ccip_mmd_device.h187
-rw-r--r--python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/dma_work_thread.cpp151
-rw-r--r--python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/dma_work_thread.h73
-rw-r--r--python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/eventfd_wrapper.h74
-rw-r--r--python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/fpga_dma.c1313
-rw-r--r--python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/fpga_dma.h141
-rw-r--r--python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/fpga_dma_internal.h289
-rw-r--r--python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/kernel_interrupt.cpp278
-rw-r--r--python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/kernel_interrupt.h75
-rw-r--r--python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/memcpy_s_fast.c133
-rw-r--r--python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/memcpy_s_fast.h69
-rw-r--r--python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/mmd_dma.cpp434
-rw-r--r--python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/mmd_dma.h97
-rw-r--r--python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/x86-sse2.S269
-rw-r--r--python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/x86-sse2.h54
-rw-r--r--python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/include/aocl_mmd.h489
-rw-r--r--python/openvino/runtime/coredla_device/mmd/de10_agilex/.gitignore18
-rw-r--r--python/openvino/runtime/coredla_device/mmd/de10_agilex/.sync_master1
-rw-r--r--python/openvino/runtime/coredla_device/mmd/de10_agilex/CMakeLists.txt144
-rw-r--r--python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie.cpp951
-rw-r--r--python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie.h177
-rw-r--r--python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_config.cpp1049
-rw-r--r--python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_config.h109
-rw-r--r--python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_debug.cpp61
-rw-r--r--python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_debug.h64
-rw-r--r--python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_device.cpp2029
-rw-r--r--python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_device.h209
-rw-r--r--python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_dma.h37
-rw-r--r--python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_dma_linux.cpp141
-rw-r--r--python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_dma_linux.h75
-rw-r--r--python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_dma_windows.cpp1381
-rw-r--r--python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_dma_windows.h262
-rw-r--r--python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_hostch.cpp764
-rw-r--r--python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_hostch.h136
-rw-r--r--python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_mm_io.cpp556
-rw-r--r--python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_mm_io.h109
-rw-r--r--python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_timer.cpp67
-rw-r--r--python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_timer.h50
-rw-r--r--python/openvino/runtime/coredla_device/mmd/de10_agilex/host/version.h1
-rw-r--r--python/openvino/runtime/coredla_device/mmd/de10_agilex/include/aocl_mmd.h640
-rw-r--r--python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/access.h100
-rw-r--r--python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/buffer.h154
-rw-r--r--python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/dma.h144
-rw-r--r--python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/enum.h129
-rw-r--r--python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/event.h151
-rw-r--r--python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/flash.h87
-rw-r--r--python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/fpga.h60
-rw-r--r--python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/macrodefs.h70
-rw-r--r--python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/manage.h176
-rw-r--r--python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/mmio.h342
-rw-r--r--python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/properties.h689
-rw-r--r--python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/types.h173
-rw-r--r--python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/types_enum.h196
-rw-r--r--python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/umsg.h112
-rw-r--r--python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/utils.h54
-rw-r--r--python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/version.h79
-rw-r--r--python/openvino/runtime/coredla_device/mmd/de10_agilex/include/pkg_editor.h170
-rwxr-xr-xpython/openvino/runtime/coredla_device/mmd/de10_agilex/lib/win64/FpgaLib.libbin0 -> 21004 bytes
-rwxr-xr-xpython/openvino/runtime/coredla_device/mmd/de10_agilex/lib/win64/acl_check_sys_cmd.libbin0 -> 2246 bytes
-rw-r--r--python/openvino/runtime/coredla_device/mmd/hps_platform/.gitignore20
-rw-r--r--python/openvino/runtime/coredla_device/mmd/hps_platform/CMakeLists.txt59
-rw-r--r--python/openvino/runtime/coredla_device/mmd/hps_platform/host/acl_hps.cpp473
-rw-r--r--python/openvino/runtime/coredla_device/mmd/hps_platform/host/acl_hps.h111
-rw-r--r--python/openvino/runtime/coredla_device/mmd/hps_platform/host/dma_device.cpp120
-rw-r--r--python/openvino/runtime/coredla_device/mmd/hps_platform/host/dma_device.h56
-rw-r--r--python/openvino/runtime/coredla_device/mmd/hps_platform/host/hps_types.h44
-rw-r--r--python/openvino/runtime/coredla_device/mmd/hps_platform/host/mmd_device.cpp129
-rw-r--r--python/openvino/runtime/coredla_device/mmd/hps_platform/host/mmd_device.h75
-rw-r--r--python/openvino/runtime/coredla_device/mmd/hps_platform/host/uio_device.cpp469
-rw-r--r--python/openvino/runtime/coredla_device/mmd/hps_platform/host/uio_device.h162
-rw-r--r--python/openvino/runtime/coredla_device/mmd/hps_platform/include/aocl_mmd.h645
-rw-r--r--python/openvino/runtime/coredla_device/mmd/system_console/CMakeLists.txt2
-rw-r--r--python/openvino/runtime/coredla_device/mmd/system_console/mmd_wrapper.cpp320
-rw-r--r--python/openvino/runtime/coredla_device/mmd/system_console/system_console_script.tcl79
-rw-r--r--python/openvino/runtime/coredla_device/src/coredla_batch_job.cpp125
-rw-r--r--python/openvino/runtime/coredla_device/src/coredla_device.cpp574
-rw-r--r--python/openvino/runtime/coredla_device/src/coredla_graph_job.cpp279
-rw-r--r--python/openvino/runtime/coredla_device/src/device_memory_allocator.cpp80
-rw-r--r--python/openvino/runtime/coredla_device/src/mmd_wrapper.cpp172
-rw-r--r--python/openvino/runtime/coredla_device/src/stream_controller_comms.cpp274
-rw-r--r--python/openvino/runtime/coredla_device/stream_controller/app/dla_registers.h45
-rw-r--r--python/openvino/runtime/coredla_device/stream_controller/app/message_handlers.c80
-rw-r--r--python/openvino/runtime/coredla_device/stream_controller/app/message_handlers.h22
-rw-r--r--python/openvino/runtime/coredla_device/stream_controller/app/stream_controller.c426
-rw-r--r--python/openvino/runtime/coredla_device/stream_controller/app/stream_controller.h86
-rw-r--r--python/openvino/runtime/coredla_device/stream_controller/app/stream_controller_messages.h90
-rwxr-xr-xpython/openvino/runtime/coredla_device/stream_controller/build.sh54
-rwxr-xr-xpython/openvino/runtime/create_hps_image.sh488
-rw-r--r--python/openvino/runtime/devel_package/dla/compiler/core/src/fpga_inc/version_checksum.h29
-rw-r--r--python/openvino/runtime/dla/compiler/core/src/fpga_inc/version_checksum.h28
-rw-r--r--python/openvino/runtime/dla_aot_splitter/CMakeLists.txt71
-rw-r--r--python/openvino/runtime/dla_aot_splitter/CPPLINT.cfg8
-rw-r--r--python/openvino/runtime/dla_aot_splitter/README.md52
-rw-r--r--python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_example/CMakeLists.txt209
-rw-r--r--python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_example/src/main.cpp180
-rw-r--r--python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/CMakeLists.txt113
-rw-r--r--python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/inc/dla_aot_structs.h38
-rw-r--r--python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/inc/dla_aot_utils.h49
-rw-r--r--python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/inc/raw_batch_job.h79
-rw-r--r--python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/inc/raw_device.h81
-rw-r--r--python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/inc/raw_graph_job.h80
-rw-r--r--python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/plugins_aot_splitter.xml18
-rwxr-xr-xpython/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/plugins_aot_splitter_win.xml22
-rw-r--r--python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/src/CPPLINT.cfg4
-rw-r--r--python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/src/dla_aot_utils.cpp117
-rw-r--r--python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/src/raw_batch_job.cpp68
-rw-r--r--python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/src/raw_device.cpp67
-rw-r--r--python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/src/raw_graph_job.cpp89
-rw-r--r--python/openvino/runtime/dla_aot_splitter/inc/dla_aot_splitter.hpp130
-rw-r--r--python/openvino/runtime/dla_aot_splitter/sdl.cmake96
-rw-r--r--python/openvino/runtime/dla_aot_splitter/src/main.cpp475
-rw-r--r--python/openvino/runtime/dla_benchmark/CMakeLists.txt82
-rw-r--r--python/openvino/runtime/dla_benchmark/README.md179
-rw-r--r--python/openvino/runtime/dla_benchmark/average_precision.cpp696
-rw-r--r--python/openvino/runtime/dla_benchmark/average_precision.hpp156
-rwxr-xr-xpython/openvino/runtime/dla_benchmark/convert_annotations.py90
-rw-r--r--python/openvino/runtime/dla_benchmark/dla_benchmark.hpp495
-rw-r--r--python/openvino/runtime/dla_benchmark/infer_request_wrap.hpp168
-rw-r--r--python/openvino/runtime/dla_benchmark/inputs_filling.cpp885
-rw-r--r--python/openvino/runtime/dla_benchmark/inputs_filling.hpp45
-rw-r--r--python/openvino/runtime/dla_benchmark/main.cpp1575
-rw-r--r--python/openvino/runtime/dla_benchmark/progress_bar.hpp52
-rw-r--r--python/openvino/runtime/dla_benchmark/shared_tensor_allocator.hpp55
-rw-r--r--python/openvino/runtime/dla_benchmark/statistics_report.cpp149
-rw-r--r--python/openvino/runtime/dla_benchmark/statistics_report.hpp83
-rw-r--r--python/openvino/runtime/dla_benchmark/top1_top5.hpp222
-rw-r--r--python/openvino/runtime/dla_benchmark/utils.cpp689
-rw-r--r--python/openvino/runtime/dla_benchmark/utils.hpp249
-rw-r--r--python/openvino/runtime/fpga_jtag_reprogram/CMakeLists.txt18
-rw-r--r--python/openvino/runtime/fpga_jtag_reprogram/main.cpp101
-rw-r--r--python/openvino/runtime/object_detection_demo/CMakeLists.txt60
-rwxr-xr-xpython/openvino/runtime/object_detection_demo/CMakeLists.txt.orig11
-rw-r--r--python/openvino/runtime/object_detection_demo/README.md15
-rw-r--r--python/openvino/runtime/object_detection_demo/main.cpp598
-rw-r--r--python/openvino/runtime/object_detection_demo/models.lst55
-rw-r--r--python/openvino/runtime/patches/computelibrary.patch47
-rw-r--r--python/openvino/runtime/patches/flags.patch76
-rw-r--r--python/openvino/runtime/patches/openvino_5cee8bbf29797f4544b343e803de957e9f041f92_gcc11.3.0.patch37
-rw-r--r--python/openvino/runtime/plugins.xml18
-rwxr-xr-xpython/openvino/runtime/plugins_win.xml22
-rw-r--r--python/openvino/runtime/python_demos/OpenVINO_benchmark_app/README.md6
-rw-r--r--python/openvino/runtime/python_demos/OpenVINO_benchmark_app/benchmark.patch78
-rw-r--r--python/openvino/runtime/python_demos/OpenVINO_benchmark_app/benchmark.py202
-rw-r--r--python/openvino/runtime/python_demos/OpenVINO_benchmark_app/benchmark_app.patch14
-rw-r--r--python/openvino/runtime/python_demos/OpenVINO_benchmark_app/benchmark_app.py8
-rw-r--r--python/openvino/runtime/python_demos/OpenVINO_benchmark_app/main.patch106
-rw-r--r--python/openvino/runtime/python_demos/OpenVINO_benchmark_app/main.py703
-rw-r--r--python/openvino/runtime/python_demos/OpenVINO_classification_sample_async/README.md6
-rw-r--r--python/openvino/runtime/python_demos/OpenVINO_classification_sample_async/classification_sample_async.patch116
-rwxr-xr-xpython/openvino/runtime/python_demos/OpenVINO_classification_sample_async/classification_sample_async.py153
-rw-r--r--python/openvino/runtime/python_demos/README.md184
-rwxr-xr-xpython/openvino/runtime/scripts/hps/create_toolchain_file.sh108
-rwxr-xr-xpython/openvino/runtime/scripts/hps/setup_toolchain.sh35
-rw-r--r--python/openvino/runtime/segmentation_demo/CMakeLists.txt64
-rw-r--r--python/openvino/runtime/segmentation_demo/README.md16
-rw-r--r--python/openvino/runtime/segmentation_demo/main.cpp445
-rwxr-xr-xpython/openvino/runtime/segmentation_demo/models.lst14
-rw-r--r--python/openvino/runtime/streaming/ed0_streaming_example/README.md14
-rw-r--r--python/openvino/runtime/streaming/ed0_streaming_example/system_console_script.tcl365
-rw-r--r--python/openvino/runtime/streaming/ed0_streaming_example/system_console_script_perf.tcl190
-rw-r--r--python/openvino/runtime/streaming/image_streaming_app/CMakeLists.txt36
-rw-r--r--python/openvino/runtime/streaming/image_streaming_app/bmp_file.cpp277
-rw-r--r--python/openvino/runtime/streaming/image_streaming_app/bmp_file.h47
-rw-r--r--python/openvino/runtime/streaming/image_streaming_app/command_line.cpp72
-rw-r--r--python/openvino/runtime/streaming/image_streaming_app/command_line.h31
-rw-r--r--python/openvino/runtime/streaming/image_streaming_app/float16.h204
-rw-r--r--python/openvino/runtime/streaming/image_streaming_app/image_streaming_app.cpp306
-rw-r--r--python/openvino/runtime/streaming/image_streaming_app/image_streaming_app.h79
-rw-r--r--python/openvino/runtime/streaming/image_streaming_app/layout_transform/CMakeLists.txt35
-rw-r--r--python/openvino/runtime/streaming/image_streaming_app/layout_transform/include/ILayoutTransform.h38
-rw-r--r--python/openvino/runtime/streaming/image_streaming_app/layout_transform/source/LayoutTransform.cpp51
-rw-r--r--python/openvino/runtime/streaming/image_streaming_app/layout_transform/source/LayoutTransform.h38
-rw-r--r--python/openvino/runtime/streaming/image_streaming_app/raw_image.cpp225
-rw-r--r--python/openvino/runtime/streaming/image_streaming_app/raw_image.h52
-rw-r--r--python/openvino/runtime/streaming/image_streaming_app/uio/CMakeLists.txt35
-rw-r--r--python/openvino/runtime/streaming/image_streaming_app/uio/include/IUioDevice.h40
-rw-r--r--python/openvino/runtime/streaming/image_streaming_app/uio/source/UioDevice.cpp168
-rw-r--r--python/openvino/runtime/streaming/image_streaming_app/uio/source/UioDevice.h56
-rwxr-xr-xpython/openvino/runtime/streaming/runtime_scripts/run_image_stream.sh7
-rwxr-xr-xpython/openvino/runtime/streaming/runtime_scripts/run_inference_stream.sh14
-rw-r--r--python/openvino/runtime/streaming/streaming_inference_app/CMakeLists.txt29
-rw-r--r--python/openvino/runtime/streaming/streaming_inference_app/categories.txt1001
-rw-r--r--python/openvino/runtime/streaming/streaming_inference_app/command_line.cpp72
-rw-r--r--python/openvino/runtime/streaming/streaming_inference_app/command_line.h31
-rw-r--r--python/openvino/runtime/streaming/streaming_inference_app/streaming_inference_app.cpp413
-rw-r--r--python/openvino/runtime/streaming/streaming_inference_app/streaming_inference_app.h74
340 files changed, 58226 insertions, 0 deletions
diff --git a/python/openvino/runtime/.gitignore b/python/openvino/runtime/.gitignore
new file mode 100644
index 0000000..4c16775
--- /dev/null
+++ b/python/openvino/runtime/.gitignore
@@ -0,0 +1,21 @@
+*~
+*#
+*.marks
+release_build/
+build*/
+example_designs/mem_bandwidth/bin/
+example_designs/mem_bandwidth/simulation.tar.gz
+example_designs/mem_bandwidth/temp_simulation/
+linux64/lib/
+linux64/libexec/diagnose
+linux64/libexec/program
+ase/mpf_src
+*.pyc
+*.swp
+*.kwlp
+*.kwps
+temp_simulation/
+simulation.tar.gz
+embedded_arm_sdk
+hps_packages
+poky*.sh
diff --git a/python/openvino/runtime/CMakeLists.txt b/python/openvino/runtime/CMakeLists.txt
new file mode 100644
index 0000000..cdc4578
--- /dev/null
+++ b/python/openvino/runtime/CMakeLists.txt
@@ -0,0 +1,588 @@
+###############################################################################
+# Build file defining build targets for the PCIe and HPS example design packages
+#
+# The runtime can be built in two ways which can be selected with the -disable_jit flag
+# using the build_runtime.sh script. Adding the flag will build the AOT (ahead of time)
+# runtime ONLY (hence disabling jit/just in time). This build flow should be used if you
+# wish to build independently of the DLAC and/or want to port the runtime to your own system.
+#
+# Building without the -disable_jit flag will build a runtime that is dependent on
+# DLAC's libraries which also enables both AOT and JIT flow.
+#
+# Note: HPS builds default to -disable_jit
+##############################################################################
+
+cmake_minimum_required(VERSION 3.10)
+
+# CMake policies
+# Use <PackageName>_ROOT env. variable as a prefix
+if(POLICY CMP0074)
+ cmake_policy(SET CMP0074 NEW)
+endif()
+# MSVC runtime library flags are selected by an abstraction.
+#if(POLICY CMP0091)
+# cmake_policy(SET CMP0091 NEW)
+#endif()
+if(POLICY CMP0092)
+ # Disable passing /W3 by default on MSVC
+ cmake_policy(SET CMP0092 NEW)
+endif()
+# Honor visibility properties for all target types.
+if(POLICY CMP0063)
+ cmake_policy(SET CMP0063 NEW)
+endif()
+set(CMAKE_CXX_VISIBILITY_PRESET hidden)
+set(CMAKE_C_VISIBILITY_PRESET hidden)
+set(CMAKE_VISIBILITY_INLINES_HIDDEN ON)
+
+project(coredla_runtime)
+
+if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*|aarch64.*|AARCH64.*)")
+ set(ARM ON)
+endif()
+
+set (TARGET_NAME coreDlaRuntimePlugin)
+
+if (CMAKE_CXX_COMPILER_ID MATCHES MSVC)
+ set(CMAKE_CXX_STANDARD 14)
+else()
+ set(CMAKE_CXX_STANDARD 11)
+endif()
+set(CMAKE_CXX_EXTENSIONS OFF)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+# Checking for COREDLA_ROOT being set should be handled by build_runtime.sh
+# We just double-check here
+if (NOT DEFINED ENV{COREDLA_ROOT})
+ message(FATAL_ERROR "COREDLA_ROOT environment variable not set.")
+endif()
+
+if (WIN32)
+ set(Protobuf_USE_STATIC_LIBS ON)
+ list (APPEND CMAKE_PREFIX_PATH $ENV{PROTOBUF_HOME} $ENV{PROTOBUF_LIBS} )
+ include(FindProtobuf)
+ find_package(Protobuf REQUIRED)
+endif()
+
+#
+# Adds compiler flags to C / C++ sources
+#
+macro(dla_add_compiler_flags)
+  # Append every flag given in ARGN to both the C and C++ global flag strings.
+  # Implemented as a macro (not a function) so the CMAKE_*_FLAGS changes land
+  # in the caller's scope.
+  foreach(_dla_flag IN LISTS ARGN)
+    string(APPEND CMAKE_C_FLAGS " ${_dla_flag}")
+    string(APPEND CMAKE_CXX_FLAGS " ${_dla_flag}")
+  endforeach()
+endmacro()
+
+get_filename_component(CHK1 $ENV{COREDLA_ROOT}/Makefile ABSOLUTE)
+get_filename_component(CHK2 ${CMAKE_CURRENT_SOURCE_DIR}/Makefile ABSOLUTE)
+####################################################################
+## SDL required compiler flags
+####################################################################
+# Needed for all builds
+
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+
+if (ARM)
+ # Built static library to simplify ED4 files.
+ option(BUILD_SHARED_LIBS "Build as a static library" OFF)
+else()
+ option(BUILD_SHARED_LIBS "Build as a shared library" ON)
+endif()
+
+if (WIN32)
+
+ if (CMAKE_BUILD_TYPE STREQUAL "Release")
+ if(MSVC)
+ dla_add_compiler_flags(/sdl)
+ endif()
+ dla_add_compiler_flags("/guard:cf")
+ if (ENABLE_INTEGRITYCHECK)
+ set(CMAKE_SHARED_LINKER_FLAGS_RELEASE "${CMAKE_SHARED_LINKER_FLAGS_RELEASE} /INTEGRITYCHECK")
+ endif()
+ endif()
+
+ dla_add_compiler_flags(-D_CRT_SECURE_NO_WARNINGS -D_SCL_SECURE_NO_WARNINGS)
+ # CMake adds the following default compiler flags when generating projects for Visual Studio:
+ # /DWIN32 /D_WINDOWS /W3 /GR /EHsc
+ if (MSVC)
+ set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
+ set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>DLL")
+ else()
+ dla_add_compiler_flags(/EHsc) # no asynchronous structured exception handling
+ endif()
+ dla_add_compiler_flags(/Gy) # remove unreferenced functions: function level linking
+ set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /LARGEADDRESSAWARE")
+ # see https://msdn.microsoft.com/en-us/library/fwkeyyhe.aspx for details
+ # /ZI = include debug info
+ # /Wall = all warnings
+  add_compile_options("$<$<CONFIG:RELWITHDEBINFO>:/O2>")
+  # /Zi (PDB debug info) — the edit-and-continue format /ZI is rejected by cl
+  # when combined with /O2 (error D8016), so it cannot be used for an
+  # optimized RelWithDebInfo build.
+  add_compile_options("$<$<CONFIG:RELWITHDEBINFO>:/Zi>")
+  add_compile_options("$<$<CONFIG:RELEASE>:/O2>")
+  # NDEBUG must be defined as a single /DNDEBUG option; a bare "/D" with no
+  # macro name followed by a separate "/NDEBUG" is invalid MSVC syntax.
+  add_compile_options("$<$<CONFIG:RELEASE>:/DNDEBUG>")
+  add_compile_options("$<$<CONFIG:DEBUG>:/Od>")
+ # buffers security check
+ add_compile_options(/GS)
+
+ add_compile_options(/permissive-)
+
+ # Compiler specific flags
+ dla_add_compiler_flags(/bigobj)
+ dla_add_compiler_flags(/MP)
+
+ # Disable noisy warnings
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4308 /wd4703 /wd4244 /wd4819")
+ if (CMAKE_CXX_COMPILER_ID MATCHES MSVC)
+ set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4146 /wd4996")
+ set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4146 /wd4996")
+ # C4251 needs to have dll-interface to be used by clients of class
+ dla_add_compiler_flags(/wd4251)
+ # C4275 non dll-interface class used as base for dll-interface class
+ dla_add_compiler_flags(/wd4275)
+ # Because permissive is set
+ dla_add_compiler_flags(/wd5208)
+
+ # inline is not a keyword in visual studios old C version, allow its redefinition
+ add_definitions("-D_ALLOW_KEYWORD_MACROS")
+ endif()
+
+ # Debug information flags, by default CMake adds /Zi option
+ # but provides no way to specify CMAKE_COMPILE_PDB_NAME on root level
+ # In order to avoid issues with ninja we are replacing default flag instead of having two of them
+ # and observing warning D9025 about flag override
+ string(REPLACE "/Zi" "/Z7" CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG}")
+ string(REPLACE "/Zi" "/Z7" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
+ string(REPLACE "/Zi" "/Z7" CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO}")
+ string(REPLACE "/Zi" "/Z7" CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO}")
+else()
+ set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wformat -Wformat-security")
+
+ set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wformat -Wformat-security")
+
+ set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-deprecated-declarations")
+
+ set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -fPIE")
+ set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fPIE")
+
+  # Release build only
+  # NOTE(review): GCC_VERSION is never set anywhere in this file, so the
+  # version comparisons below always evaluate false and the
+  # -fstack-protector-all fallback branch is always taken — confirm whether
+  # GCC_VERSION was meant to be derived from CMAKE_CXX_COMPILER_VERSION.
+  set (CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -D_FORTIFY_SOURCE=2")
+  if (GCC_VERSION VERSION_GREATER 4.9 OR GCC_VERSION VERSION_EQUAL 4.9)
+    set (CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -fstack-protector-strong")
+    set (CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -z noexecstack -z relro -z now")
+
+    # These are for 8478-CT158 in the SDL process
+    # ( https://sdp-prod.intel.com/bunits/intel/coredla/coredla-ip-20212/tasks/phase/development/8478-CT158/ )
+  else()
+    set (CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -fstack-protector-all")
+  endif()
+
+  # Release build only
+  set (CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -D_FORTIFY_SOURCE=2")
+  if (GCC_VERSION VERSION_GREATER 4.9 OR GCC_VERSION VERSION_EQUAL 4.9)
+    set (CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -fstack-protector-strong")
+    set (CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -z noexecstack -z relro -z now")
+  else()
+    set (CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -fstack-protector-all")
+  endif()
+
+  # These are for 8478-CT158 in the SDL process
+  # ( https://sdp-prod.intel.com/bunits/intel/coredla/coredla-ip-20212/tasks/phase/development/8478-CT158/ )
+  # Append to the existing *_RELEASE flag sets. The previous version assigned
+  # both variables from ${CMAKE_CXX_FLAGS}, which (a) silently discarded the
+  # hardening flags accumulated above (-D_FORTIFY_SOURCE=2,
+  # -fstack-protector-*), (b) applied the C++ flag string to the C language,
+  # and (c) duplicated the CMAKE_CXX_FLAGS_RELEASE assignment.
+  set (CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -fno-strict-overflow -fno-delete-null-pointer-checks -fwrapv")
+  set (CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -fno-strict-overflow -fno-delete-null-pointer-checks -fwrapv")
+
+ ####################################################################
+
+ set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -O3")
+ set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3")
+
+ set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O0 -ggdb3")
+ set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -ggdb3")
+
+ #### Sanitizer settings ####
+ # Address
+ set(CMAKE_C_FLAGS_ASAN "-O1 -g -fsanitize=address -fno-omit-frame-pointer -fno-optimize-sibling-calls")
+ set(CMAKE_CXX_FLAGS_ASAN "-O1 -g -fsanitize=address -fno-omit-frame-pointer -fno-optimize-sibling-calls")
+
+ # Memory
+ set(CMAKE_C_FLAGS_MSAN "-O1 -g -fsanitize=memory -fno-omit-frame-pointer -fno-optimize-sibling-calls")
+ set(CMAKE_CXX_FLAGS_MSAN "-O1 -g -fsanitize=memory -fno-omit-frame-pointer -fno-optimize-sibling-calls")
+
+ # Thread
+ set(CMAKE_C_FLAGS_TSAN "-O1 -g -fsanitize=thread -fno-omit-frame-pointer -fno-optimize-sibling-calls")
+ set(CMAKE_CXX_FLAGS_TSAN "-O1 -g -fsanitize=thread -fno-omit-frame-pointer -fno-optimize-sibling-calls")
+
+
+ set (CMAKE_CXX_STANDARD 11)
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+ # Enable all warnings except unknown-pragmas. Wunknown-pragmas must be excluded because
+ # it is triggered by header file included from OpenCL runtime
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wno-unknown-pragmas")
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wno-unknown-pragmas")
+
+ # Might be too strict for wide deployment, but easy to disable if it causes problems.
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror")
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror")
+
+
+  # Edwinzha: With OV 2023.3.0 LTS a warning is thrown every time the old API is used. Suppress for now until we do a full uplift.
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-cpp")
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-deprecated-declarations")
+
+ # This is required on Ubuntu 18; the new linker behaviour transforms
+ # RPATH into RUNPATH (which can be seen in the output of 'readelf -d').
+ # However, RUNPATH does not work recursively, so when OpenVINO reads
+ # the plugins.xml file and searches for the specified libcoreDlaRuntimePlugin.so
+ # library, it fails. The --disable-new-dtags option causes the linker
+ # to keep RPATH as RPATH (rather than morphing to RUNPATH).
+ #
+ # References:
+ # https://stackoverflow.com/questions/52018092/how-to-set-rpath-and-runpath-with-gcc-ld
+ # https://stackoverflow.com/questions/59248421/c-secondary-dependency-resolution-with-runpath
+ #
+ # The solution below seems preferable to setting LD_LIBRARY_PATH, if only barely.
+ # For additional motivation, go ahead and throw away part of your day reading either
+ # of the screeds:
+ # http://xahlee.info/UnixResource_dir/_/ldpath.html
+ # https://gms.tf/ld_library_path-considered-harmful.html
+ # You may find that neither is fully convincing, of course.
+ set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,--disable-new-dtags")
+endif()
+
+# DLA specific modifications made to the MMD
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDLA_MMD")
+
+# Select the MMD backend for the requested hardware platform.
+# HW_BUILD_PLATFORM is expanded inside quotes: the previous unquoted
+# ${HW_BUILD_PLATFORM} made CMake fail with an "if given arguments" syntax
+# error whenever the variable was unset or empty, instead of falling through
+# to the emulation default.
+if ("${HW_BUILD_PLATFORM}" STREQUAL "SYSTEM_CONSOLE")
+  set (SYSTEM_CONSOLE_PLATFORM 1)
+  set (MMD_DIR_NAME system_console)
+  set (MMD_LIB_NAME system_console_mmd)
+elseif ("${HW_BUILD_PLATFORM}" STREQUAL "DE10_AGILEX")
+  set (DE10_AGILEX 1)
+  set (MMD_DIR_NAME de10_agilex)
+  set (MMD_LIB_NAME de10_agilex_mmd)
+elseif ("${HW_BUILD_PLATFORM}" STREQUAL "HPS_PLATFORM")
+  set (HPS_PLATFORM 1)
+  set (MMD_DIR_NAME hps_platform)
+  set (MMD_LIB_NAME hps_platform_mmd)
+elseif ("${HW_BUILD_PLATFORM}" STREQUAL "DCP_A10_PAC")
+  set (PAC_A10 1)
+  set (MMD_DIR_NAME dcp_a10_pac)
+  set (MMD_LIB_NAME intel_opae_mmd)
+elseif ("${HW_BUILD_PLATFORM}" STREQUAL "AGX7_I_DK")
+  set (AGX7_IDK 1)
+  set (MMD_DIR_NAME agx7_ofs_pcie)
+  set (MMD_LIB_NAME intel_opae_mmd)
+elseif ("${HW_BUILD_PLATFORM}" STREQUAL "AGX7_N6001")
+  set (AGX7_N6001 1)
+  set (MMD_DIR_NAME agx7_ofs_pcie)
+  set (MMD_LIB_NAME intel_opae_mmd)
+  add_definitions(-DUSE_N6001_BOARD)
+else()
+  # No recognized hardware platform: build the software emulation runtime.
+  set (EMULATION 1)
+endif()
+
+# Set HPS_AGX7 if building for HPS and Agilex 7 is selected
+if ("${HW_BUILD_PLATFORM}" STREQUAL "HPS_PLATFORM" AND "${HPS_BUILD_MACHINE}" STREQUAL "agilex7_dk_si_agi027fa")
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHPS_AGX7")
+endif()
+
+# Set HPS_PLATFORM if building for HPS
+if (HPS_PLATFORM)
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHPS_PLATFORM")
+endif()
+
+if (NOT HPS_PLATFORM)
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_OLD_COREDLA_DEVICE")
+endif()
+
+# Flag to disable JIT mode
+if (DISABLE_JIT)
+ set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDISABLE_JIT")
+endif()
+
+# Build the runtime variant of the DLA plugin
+add_definitions(-DRUNTIME_DLA_PLUGIN)
+
+##########
+if (NOT OpenVINO_DIR)
+ Set (OpenVINO_DIR $ENV{OpenVINO_DIR})
+endif()
+
+find_package(OpenVINO CONFIG REQUIRED)
+#########
+
+if (NOT CoreDLA_DIR)
+ set(CoreDLA_DIR $ENV{CoreDLA_DIR})
+endif()
+
+if (NOT DISABLE_JIT)
+ find_package(CoreDLA CONFIG REQUIRED)
+endif()
+
+file(TO_CMAKE_PATH $ENV{COREDLA_ROOT} COREDLA_ROOT)
+
+# Gets a build version of {git branch}-{git hash} as an identifier for the plugin build version. Defaults to "Custom Build"
+# Produce a "{git branch}-{short hash}" identifier for this checkout, or the
+# literal "Custom build" when the tree is not a usable git repository.
+function(get_build_version output_variable)
+  # Current branch name (quiet on error; handled by the status check below).
+  execute_process(
+    COMMAND git rev-parse --abbrev-ref HEAD
+    WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+    RESULT_VARIABLE branch_status
+    OUTPUT_VARIABLE git_branch
+    ERROR_QUIET
+    OUTPUT_STRIP_TRAILING_WHITESPACE
+  )
+  # Abbreviated (10-character) commit hash.
+  execute_process(
+    COMMAND git rev-parse --short=10 HEAD
+    WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+    RESULT_VARIABLE hash_status
+    OUTPUT_VARIABLE commit_hash
+    ERROR_QUIET
+    OUTPUT_STRIP_TRAILING_WHITESPACE
+  )
+  # Only trust the result when both git queries succeeded.
+  if(${branch_status} EQUAL 0 AND ${hash_status} EQUAL 0)
+    set(${output_variable} "${git_branch}-${commit_hash}" PARENT_SCOPE)
+  else()
+    set(${output_variable} "Custom build" PARENT_SCOPE)
+  endif()
+endfunction()
+# Call function to get Git branch name
+get_build_version(PLUGIN_BUILD_VERSION)
+
+# Pass PLUGIN_BUILD_VERSION to the source code via a define
+add_definitions(-DPLUGIN_BUILD_VERSION="${PLUGIN_BUILD_VERSION}")
+
+if (NOT ARM)
+ # ED4 OV is not built with tbb threading
+ find_package(TBB REQUIRED tbb)
+ if ( NOT DEFINED TBB_IMPORTED_TARGETS)
+ set (TBB_IMPORTED_TARGETS TBB::tbb TBB::tbbmalloc TBB::tbbmalloc_proxy)
+ endif()
+endif()
+
+SET (INFO 0)
+SET (WARNING 1)
+SET (ERROR 2)
+SET (FATAL 3)
+
+if(DEBUG_RUNTIME)
+ add_definitions(-DDEBUG_RUNTIME)
+endif(DEBUG_RUNTIME)
+if(DEBUG_RUNTIME_MEMORY_TEST)
+ add_definitions(-DDEBUG_RUNTIME_MEMORY_TEST)
+endif(DEBUG_RUNTIME_MEMORY_TEST)
+if(RUNTIME_VERBOSITY)
+  add_definitions(-DENABLE_LOGGING)
+  # Forward the requested verbosity level to the source. The previous double
+  # expansion ${${RUNTIME_VERBOSITY}} dereferenced the *value* as a variable
+  # name (e.g. -DRUNTIME_VERBOSITY=2 looked up a variable named "2"), which
+  # expands to nothing and defined the macro with an empty value.
+  add_definitions(-DRUNTIME_VERBOSITY=${RUNTIME_VERBOSITY})
+endif(RUNTIME_VERBOSITY)
+
+if(RUNTIME_POLLING)
+ add_definitions(-DCOREDLA_RUNTIME_POLLING)
+endif(RUNTIME_POLLING)
+
+# OpenVINO pre-requisites to build hetero & dla plugin
+add_subdirectory(${COREDLA_ROOT}/thirdparty/openvino_dev_api thirdparty/openvino_dev_api)
+add_subdirectory($ENV{COREDLA_XUTIL_DIR}/transformations ${CMAKE_CURRENT_BINARY_DIR}/transformations)
+
+# CoreDLA ships the hetero plugin shared library. If AOT only runtime, we re-compile from src and
+# do not use the shipped library.
+if (DISABLE_JIT)
+ add_subdirectory(${COREDLA_ROOT}/thirdparty/pugixml ${CMAKE_CURRENT_BINARY_DIR}/thirdparty/pugixml)
+ add_subdirectory(${COREDLA_ROOT}/util/hetero_plugin ${CMAKE_CURRENT_BINARY_DIR}/hetero_plugin)
+endif()
+
+# Required for dla_benchmark and demos
+add_subdirectory(common)
+
+# Build runtime plugin CPU io transforms
+add_subdirectory(${COREDLA_ROOT}/dla_plugin/io_transformations ${CMAKE_BINARY_DIR}/dla_plugin/io_transformations)
+
+# Build Runtime plugin (AGX7/A10/HPS/Emulator)
+if(NOT EMULATION)
+ file(GLOB SOURCES
+ ${CMAKE_CURRENT_SOURCE_DIR}/coredla_device/src/*.cpp
+ ${COREDLA_ROOT}/dla_plugin/src/*.cpp
+ )
+
+ if (SYSTEM_CONSOLE_PLATFORM)
+ list(REMOVE_ITEM SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/coredla_device/src/mmd_wrapper.cpp)
+ endif()
+
+ # We seem to have a partial copy of the compiled_result_reader_writer.cpp inside
+ # of plugin/src/dla_compiled_model.cpp. The duplicate code should probably be removed.
+ if (DISABLE_JIT)
+ list(APPEND SOURCES
+ # Only required if building runtime independently of dla
+ $ENV{COREDLA_ROOT}/util/src/dla_numeric_utils.cpp
+ $ENV{COREDLA_XUTIL_DIR}/compiled_result/src/compiled_result_reader_writer.cpp
+ )
+ endif()
+
+ file(GLOB HEADERS
+ ${COREDLA_ROOT}/dla_plugin/inc/dlia/*.hpp
+ ${COREDLA_ROOT}/dla_plugin/inc/*.hpp
+ ${COREDLA_ROOT}/dla_plugin/inc/*.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/coredla_device/inc/*.h
+ )
+ if (WIN32)
+ list(APPEND HEADERS ${COREDLA_ROOT}/fpga/dma/rtl/dla_dma_constants.svh)
+ endif()
+ add_library(${TARGET_NAME} SHARED ${SOURCES} ${HEADERS})
+
+ if (SYSTEM_CONSOLE_PLATFORM)
+ target_sources(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/coredla_device/mmd/system_console/mmd_wrapper.cpp)
+ add_custom_command(
+ TARGET ${TARGET_NAME} POST_BUILD
+ COMMAND ${CMAKE_COMMAND} -E copy
+ ${CMAKE_SOURCE_DIR}/coredla_device/mmd/system_console/system_console_script.tcl
+ ${CMAKE_CURRENT_BINARY_DIR}/system_console_script.tcl
+ )
+ target_compile_definitions(${TARGET_NAME} PRIVATE DLA_SYSCON_SOURCE_ROOT=${CMAKE_CURRENT_BINARY_DIR})
+ else()
+ target_sources(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/coredla_device/src/mmd_wrapper.cpp)
+ endif()
+endif()
+
+if (WIN32)
+ # Fix warning C4273: inconsistent dll linkage
+ target_compile_definitions(${TARGET_NAME} PRIVATE XBYAK_NO_OP_NAMES
+ IMPLEMENT_INFERENCE_ENGINE_PLUGIN
+ $<TARGET_PROPERTY:openvino::runtime,INTERFACE_COMPILE_DEFINITIONS>)
+endif()
+
+if (DLA_ALLOW_ENCRYPTION)
+ add_definitions(-DDLA_ALLOW_ENCRYPTION)
+ find_package(OpenSSL REQUIRED)
+ set (CRYPTO_LIB_NAME OpenSSL::Crypto)
+endif()
+
+if (NOT WIN32)
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -lrt" )
+endif()
+
+if(NOT EMULATION)
+ target_include_directories(${TARGET_NAME} PUBLIC
+ ${COREDLA_ROOT}/dla_plugin/inc
+ ${COREDLA_ROOT}/dla_plugin/
+ ${CMAKE_CURRENT_SOURCE_DIR}/coredla_device/inc
+ ${CMAKE_CURRENT_SOURCE_DIR}/coredla_device/mmd/${MMD_DIR_NAME}/host
+ ${CMAKE_CURRENT_SOURCE_DIR}/coredla_device/mmd/${MMD_DIR_NAME}/include
+ ${CMAKE_CURRENT_SOURCE_DIR}/coredla_device/stream_controller/app
+ ${COREDLA_ROOT}/util/inc
+ )
+
+ if (WIN32)
+ target_include_directories(${TARGET_NAME} PUBLIC ${Protobuf_INCLUDE_DIRS})
+ endif()
+
+ if (NOT HPS_PLATFORM)
+ find_package(Boost REQUIRED COMPONENTS filesystem)
+
+ target_link_libraries(${TARGET_NAME} PUBLIC
+ ${CMAKE_DL_LIBS}
+ ${TBB_IMPORTED_TARGETS}
+ openvino::runtime
+ openvino_dev_api
+ dliaPluginIOTransformations
+ dla_op_transformation
+ ${MMD_LIB_NAME}
+ ${CRYPTO_LIB_NAME}
+ Boost::filesystem
+ )
+
+ if (NOT DISABLE_JIT)
+ target_link_libraries(${TARGET_NAME} PUBLIC
+ archparam
+ dla_compiler_core
+ dla_compiled_result
+ lpsolve5525
+ )
+ endif()
+ else()
+ target_link_libraries(${TARGET_NAME} PUBLIC
+ ${CMAKE_DL_LIBS}
+ openvino::runtime
+ openvino_dev_api
+ dliaPluginIOTransformations
+ dla_op_transformation
+ ${MMD_LIB_NAME}
+ )
+ endif()
+
+ # Needed for coredla_device/inc/dla_dma_constants.h to find dla_dma_constants.svh, since
+ # the cmake description (find_package(CoreDLA)) does not know about this dependency.
+ #
+ # Also needed for a variety of .h files in the DISABLE_JIT case, where we do not
+ # use the CoreDLA package.
+ if (EXISTS ${COREDLA_ROOT}/inc)
+ target_include_directories(${TARGET_NAME} PUBLIC ${COREDLA_ROOT}/inc)
+ else()
+ target_include_directories(${TARGET_NAME} PUBLIC ${COREDLA_ROOT}/build/coredla/dla/inc)
+ endif()
+
+ get_property(dirs DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY INCLUDE_DIRECTORIES)
+ foreach(dir ${dirs})
+ message(STATUS "dir='${dir}'")
+ endforeach()
+ add_subdirectory(coredla_device/mmd/${MMD_DIR_NAME})
+ # For some reason, (${HW_BUILD_PLATFORM} STREQUAL "DE10_AGILEX") does not work in the line below
+ if (DE10_AGILEX)
+ add_subdirectory(fpga_jtag_reprogram)
+ endif()
+ if (WIN32)
+ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/plugins_win.xml ${CMAKE_CURRENT_BINARY_DIR}/plugins.xml COPYONLY)
+ else()
+ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/plugins.xml ${CMAKE_CURRENT_BINARY_DIR}/ COPYONLY)
+ endif()
+
+ # This is an ugly hack to keep internal regression tests happy
+ if (NOT HPS_PLATFORM)
+ if (DEFINED ENV{ARC_JOB_ID})
+ if (EXISTS "/p/psg/pac/release/rush_creek/adapt/19.1/367/linux64/sw/deps/lib/libjson-c.so.4")
+ target_link_libraries(${TARGET_NAME} PUBLIC "/p/psg/pac/release/rush_creek/adapt/19.1/367/linux64/sw/deps/lib/libjson-c.so.4")
+ endif()
+ endif()
+ endif()
+ if (NOT HPS_PLATFORM)
+ add_subdirectory(dla_aot_splitter)
+ endif()
+endif()
+
+add_subdirectory(dla_benchmark)
+
+# Runtime demos are not built by default.
+# Pass argument -build_demo to build_runtime.sh to build runtime demos.
+if (DEFINED BUILD_DEMO)
+ if (DISABLE_JIT)
+ message(FATAL_ERROR
+ "Error: BUILD_DEMO requires JIT support, but JIT compilation disabled via DISABLE_JIT."
+ " If you did not specify these options, then they may have been cached. Remove"
+ " ${CMAKE_BINARY_DIR} to clear the cache."
+ )
+ endif()
+ add_subdirectory(classification_sample_async)
+ add_subdirectory(object_detection_demo)
+ add_subdirectory(segmentation_demo)
+endif()
+
+if (HPS_PLATFORM)
+ add_subdirectory(streaming/streaming_inference_app)
+ add_subdirectory(streaming/image_streaming_app)
+endif()
+
+# Runtime install is applicable for Windows only
+if (WIN32)
+ install(TARGETS ${TARGET_NAME}
+ RUNTIME DESTINATION "dla/runtime/bin" COMPONENT RUNTIME
+ LIBRARY DESTINATION "dla/runtime/lib" COMPONENT RUNTIME
+ ARCHIVE DESTINATION "dla/runtime/lib" COMPONENT RUNTIME
+ )
+endif()
+
+# Add ed0 streaming example tcl script
+file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/streaming/ed0_streaming_example/system_console_script.tcl
+ DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/streaming/ed0_streaming_example/.)
+file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/streaming/ed0_streaming_example/system_console_script_perf.tcl
+ DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/streaming/ed0_streaming_example/.)
diff --git a/python/openvino/runtime/CPPLINT.cfg b/python/openvino/runtime/CPPLINT.cfg
new file mode 100644
index 0000000..ffe6c23
--- /dev/null
+++ b/python/openvino/runtime/CPPLINT.cfg
@@ -0,0 +1,17 @@
+set noparent
+filter=-build/header_guard,-runtime/explicit,-build/include_subdir,-runtime/references,-build/c++11,-runtime/int,-runtime/string,-runtime/printf,-build/namespaces,-readability/casting
+
+# Ignore build DIR
+exclude_files=build_Release
+exclude_files=build_Debug
+# Ignore DIRs that are largely copied from OpenVINO and not actively maintained by CoreDLA
+exclude_files=common
+exclude_files=classification_sample_async
+exclude_files=object_detection_demo
+exclude_files=segmentation_demo
+# Ignore DIRs from other orgs and not actively maintained by CoreDLA
+exclude_files=streaming
+exclude_files=coredla_device
+
+linelength=160
+headers=h,hpp
diff --git a/python/openvino/runtime/build_hpspackages.sh b/python/openvino/runtime/build_hpspackages.sh
new file mode 100755
index 0000000..dba9711
--- /dev/null
+++ b/python/openvino/runtime/build_hpspackages.sh
@@ -0,0 +1,571 @@
+#!/bin/bash
+set -x
+
+# Script to build extra packages for building and running on Linux based SoC FPGAs.
+# This script needs to be called prior to building the CoreDLA Runtime.
+# Typical Usage : ./build_hpspackages.sh -sb
+# For Help : ./build_hpspackages.sh -h
+
+##################################################################
+# Parameters
+SCRIPT_DIR=$(cd "$(dirname $0)" >/dev/null 2>&1 && pwd)
+RUNTIME_ROOT_DIR=$(cd "${SCRIPT_DIR}" >/dev/null 2>&1 && pwd)
+
+DEV_HOME=`pwd`
+BUILD_DIR=$DEV_HOME/hps_packages
+STAGING_DIR=$BUILD_DIR/armcpu_package
+
+YOCTO_SDK_NAME="embedded_arm_sdk"
+YOCTO_SDK="`pwd`/${YOCTO_SDK_NAME}"
+TOOLCHAIN_FILE="${YOCTO_SDK}/cmake/embedded.arm.cmake"
+TOOLCHAIN_PREFIX="${YOCTO_SDK}/sysroots/x86_64-pokysdk-linux/usr/bin/arm-poky-linux-gnueabi/arm-poky-linux-gnueabi-"
+SYSROOT="${YOCTO_SDK}/sysroots/armv7at2hf-neon-poky-linux-gnueabi"
+
+##############################################################
+# Print the cross-compiler prefix (directory + "arm-poky-linux-gnueabi-")
+# derived from the Yocto SDK's environment-setup script.
+function get_toolchain_prefix()
+{
+  # Subshell: the sourced SDK environment must not leak into the caller.
+  (
+  # The SDK script manipulates LD_LIBRARY_PATH; start from a clean slate.
+  unset LD_LIBRARY_PATH
+  source ${YOCTO_SDK}//environment-setup-*
+  # ${CC} is e.g. "arm-poky-linux-gnueabi-gcc <flags>"; split on spaces and
+  # keep only the first word (the compiler executable name).
+  IFS='\ ' array=(${CC})
+  CC_PATH=`which ${array[0]}`
+  # Drop the trailing "gcc" (3 chars) to leave the bare tool prefix.
+  echo ${CC_PATH::-3}
+  )
+}
+
+# Print the SDK target sysroot path exported by the Yocto environment script.
+function get_sdksysroot()
+{
+  # Subshell: keep the sourced SDK environment out of the caller's scope.
+  (
+  unset LD_LIBRARY_PATH
+  source ${YOCTO_SDK}//environment-setup-*
+  # OECORE_TARGET_SYSROOT is set by the environment-setup script.
+  echo ${OECORE_TARGET_SYSROOT}
+  )
+}
+
+#################################################################
+# Functions
+# Report the given failure reason and abort the whole script.
+function fail()
+{
+  printf 'Failed : %s\n' "$1"
+  exit 1
+}
+
+#################################################################
+get_git_repo()
+{
+ OUTPUT=$1
+ URL=$2
+ SUBMODULES=$3
+ TAG=$4
+ if [ ! -e ${OUTPUT} ]; then
+
+ COMMAND="git clone $URL"
+ if [ ! -z ${TAG} ]; then
+ COMMAND="$COMMAND -b ${TAG}"
+ else
+ echo "Please provide a version number for $URL"
+ exit 1
+ fi
+
+ if [ "${SUBMODULES}" == "true" ]; then
+ COMMAND="${COMMAND} --recurse-submodules"
+ fi
+
+ COMMAND="${COMMAND} ${OUTPUT}"
+ ${COMMAND}
+ else
+ echo "Repo already exists - $OUTPUT"
+ fi
+}
+
+
+#################################################################
+# Cross-compile a minimal OpenCV (core/imgproc/imgcodecs/highgui/videoio only)
+# with the Yocto toolchain, installing into $OPENCV_BUILD/install and staging
+# into $STAGING_DIR/opencv. Aborts via fail() on any cmake step error.
+function build_opencv()
+{
+  pushd $OPENCV_HOME
+  # Disable every module the runtime demos do not need to keep the ARM build
+  # small and fast; only the image/video I/O and processing modules stay on.
+  CMAKE_FLAGS="-DBUILD_opencv_apps:BOOL=OFF -DBUILD_opencv_calib3d:BOOL=OFF -DBUILD_opencv_core:BOOL=ON"
+  CMAKE_FLAGS="${CMAKE_FLAGS} -DBUILD_opencv_dnn:BOOL=OFF"
+  CMAKE_FLAGS="${CMAKE_FLAGS} -DBUILD_opencv_features2d:BOOL=OFF"
+  CMAKE_FLAGS="${CMAKE_FLAGS} -DBUILD_opencv_flann:BOOL=OFF"
+  CMAKE_FLAGS="${CMAKE_FLAGS} -DBUILD_opencv_gapi:BOOL=OFF"
+  CMAKE_FLAGS="${CMAKE_FLAGS} -DBUILD_opencv_highgui:BOOL=ON"
+  CMAKE_FLAGS="${CMAKE_FLAGS} -DBUILD_opencv_imgcodecs:BOOL=ON"
+  CMAKE_FLAGS="${CMAKE_FLAGS} -DBUILD_opencv_imgproc:BOOL=ON"
+  CMAKE_FLAGS="${CMAKE_FLAGS} -DBUILD_opencv_java_bindings_generator:BOOL=OFF"
+  CMAKE_FLAGS="${CMAKE_FLAGS} -DBUILD_opencv_js:BOOL=OFF"
+  CMAKE_FLAGS="${CMAKE_FLAGS} -DBUILD_opencv_js_bindings_generator:BOOL=OFF"
+  CMAKE_FLAGS="${CMAKE_FLAGS} -DBUILD_opencv_ml:BOOL=OFF"
+  CMAKE_FLAGS="${CMAKE_FLAGS} -DBUILD_opencv_objc_bindings_generator:BOOL=OFF"
+  CMAKE_FLAGS="${CMAKE_FLAGS} -DBUILD_opencv_objdetect:BOOL=OFF"
+  CMAKE_FLAGS="${CMAKE_FLAGS} -DBUILD_opencv_photo:BOOL=OFF"
+  CMAKE_FLAGS="${CMAKE_FLAGS} -DBUILD_opencv_python_bindings_generator:BOOL=OFF"
+  CMAKE_FLAGS="${CMAKE_FLAGS} -DBUILD_opencv_python_tests:BOOL=OFF"
+  CMAKE_FLAGS="${CMAKE_FLAGS} -DBUILD_opencv_stitching:BOOL=OFF"
+  CMAKE_FLAGS="${CMAKE_FLAGS} -DBUILD_opencv_ts:BOOL=OFF"
+  CMAKE_FLAGS="${CMAKE_FLAGS} -DBUILD_opencv_video:BOOL=OFF"
+  CMAKE_FLAGS="${CMAKE_FLAGS} -DBUILD_opencv_videoio:BOOL=ON"
+  CMAKE_FLAGS="${CMAKE_FLAGS} -DBUILD_opencv_world:BOOL=OFF"
+  CMAKE_FLAGS="${CMAKE_FLAGS} -DWITH_GTK:BOOL=OFF"
+  CMAKE_FLAGS="${CMAKE_FLAGS} -DWITH_GTK_2_X:BOOL=OFF"
+  CMAKE_FLAGS="${CMAKE_FLAGS} -DWITH_1394:BOOL=OFF"
+  CMAKE_FLAGS="${CMAKE_FLAGS} -DWITH_GSTREAMER:BOOL=OFF"
+  # PNG and TIFF codecs are built from OpenCV's bundled sources (BUILD_*=ON)
+  # rather than taken from the cross sysroot.
+  CMAKE_FLAGS="${CMAKE_FLAGS} -DWITH_PNG:BOOL=ON"
+  CMAKE_FLAGS="${CMAKE_FLAGS} -DBUILD_PNG:BOOL=ON"
+  CMAKE_FLAGS="${CMAKE_FLAGS} -DWITH_JPEG:BOOL=OFF"
+  CMAKE_FLAGS="${CMAKE_FLAGS} -DBUILD_JPEG:BOOL=OFF"
+  CMAKE_FLAGS="${CMAKE_FLAGS} -DWITH_WEBP:BOOL=OFF"
+  CMAKE_FLAGS="${CMAKE_FLAGS} -DWITH_TIFF:BOOL=ON"
+  CMAKE_FLAGS="${CMAKE_FLAGS} -DBUILD_TIFF:BOOL=ON"
+  # CMAKE_FLAGS="${CMAKE_FLAGS} -DWITH_TBB:BOOL=ON"
+  # CMAKE_FLAGS="${CMAKE_FLAGS} -DBUILD_TBB:BOOL=ON"
+  CMAKE_FLAGS="${CMAKE_FLAGS} -DBUILD_ZLIB:BOOL=ON"
+  CMAKE_FLAGS="${CMAKE_FLAGS} -DCMAKE_BUILD_TYPE=${OPENCV_BUILD_TYPE}"
+  CMAKE_FLAGS="${CMAKE_FLAGS} -DCMAKE_INSTALL_PREFIX=$OPENCV_BUILD/install"
+  CMAKE_FLAGS="${CMAKE_FLAGS} -DCMAKE_STAGING_PREFIX=$STAGING_DIR/opencv"
+
+  # For some reason, OpenCV uses the machine's native ccache.
+  # On SLES15, that ccache is too new for the older gcc version provided by
+  # the ARC shell, so disable ccache entirely.
+  CMAKE_FLAGS="${CMAKE_FLAGS} -DENABLE_CCACHE:BOOL=OFF"
+
+  cmake -B ${OPENCV_BUILD} -G "Ninja" -DCMAKE_TOOLCHAIN_FILE=${TOOLCHAIN_FILE} ${CMAKE_FLAGS}
+  if [ $? != 0 ]; then
+    fail "Failed to configure OPENCV"
+  fi
+  cmake --build ${OPENCV_BUILD} --parallel $(nproc)
+  if [ $? != 0 ]; then
+    fail "Failed to build OPENCV"
+  fi
+
+  cmake --install ${OPENCV_BUILD}
+  if [ $? != 0 ]; then
+    fail "Failed to install OPENCV"
+  fi
+  popd
+}
+
+#################################################################
+# Remove a temporary file or directory tree; installed as an EXIT trap by
+# build_openvino().
+function cleanup_tmpfile() {
+  rm -rf "$1"
+}
+
+#################################################################
+# Cross-compile OpenVINO for the HPS (ARM) target. Works around over-long
+# cmake command lines by building through a short mktemp path that symlinks
+# back to hps_packages; the symlink/tempdir is removed by an EXIT trap.
+function build_openvino()
+{
+  # The cmake option ENABLE_OPENVINO_DEBUG lets OV print rich debug info;
+  # turn it on here when the build type is Debug.
+  if [[ "$OPENVINO_BUILD_TYPE" == "Debug" ]]; then
+    OV_DEBUG_FLAG="ON"
+  else
+    OV_DEBUG_FLAG="OFF"
+  fi
+  # Arm plugin build options should be exported as an env variable
+  export toolchain_prefix=${TOOLCHAIN_PREFIX}
+  export exceptions=False
+  export reference_openmp=False
+  export validation_tests=False
+  export benchmark_tests=False
+  export extra_link_flags="--sysroot=${SYSROOT}"
+
+  # cmake throws an "Argument list too long" error if the path is too long.
+  # Use a temp directory symlinked to the runtime/hps_packages directory.
+  # If cmake was previously run, reuse the same temp directory.
+  echo "Checking for cached path"
+
+  # Path to generated cmake cache file that contains previously used cmake cache path
+  OPENVINO_CMAKE_CACHE_FILE=$OPENVINO_HOME/build_Release/CMakeCache.txt
+
+  # Check if CMakeCache.txt exists, indicating that cmake has previously run
+  if [ -e "$OPENVINO_CMAKE_CACHE_FILE" ]; then
+
+    # Read the first 2 lines of the file, which contain the previously used path
+    FIRST_LINES=$(head -n 2 $OPENVINO_CMAKE_CACHE_FILE)
+
+    # NOTE(review): \b, \s and \S are GNU glibc regex extensions, not POSIX
+    # ERE — this match works with glibc bash but should be verified on other
+    # C libraries.
+    PATH_REGEX='\bFor build in directory:\s+(\S+)\/hps_packages\/openvino\/build_Release'
+
+    if [[ $FIRST_LINES =~ $PATH_REGEX ]]; then
+      BUILD_DIR_TEMP=${BASH_REMATCH[1]}
+      echo "Using cached temp path: $BUILD_DIR_TEMP"
+
+      mkdir -p $BUILD_DIR_TEMP
+
+      if [ -d $BUILD_DIR_TEMP ]; then
+        # NOTE(review): ln -s without -f will fail (harmlessly, but noisily)
+        # when the symlink survived a previous run — confirm intended.
+        ln -s $BUILD_DIR $BUILD_DIR_TEMP
+        trap 'cleanup_tmpfile "$BUILD_DIR_TEMP"' EXIT
+      else
+        echo "mkdir command failed. Cannot create temporary build directory."
+      fi
+    else
+      echo "Could not read path from cmake_install.cmake"
+    fi
+  # If the cache file does not exist, then generate a new temp directory
+  else
+    # Create temporary directory (honoring $TEMPDIR if the caller set one)
+    if [ -n "$TEMPDIR" ]; then
+      BUILD_DIR_TEMP=$(mktemp -d -p "$TEMPDIR")
+    else
+      BUILD_DIR_TEMP=$(mktemp -d)
+    fi
+
+    echo "Creating new temp directory: $BUILD_DIR_TEMP"
+
+    if [ -z "$BUILD_DIR_TEMP" ]; then
+      echo "mktemp command failed. Cannot create temporary build directory."
+    else
+      ln -s $BUILD_DIR $BUILD_DIR_TEMP
+      trap 'cleanup_tmpfile "$BUILD_DIR_TEMP"' EXIT
+    fi
+  fi
+
+  # Use local versions of cmake variables; fall back to the direct (long)
+  # paths when no temp directory could be created.
+  if [ -d "$BUILD_DIR_TEMP" ]; then
+    OPENVINO_HOME_LOC=$BUILD_DIR_TEMP/hps_packages/openvino
+    OPENVINO_BUILD_LOC=$OPENVINO_HOME_LOC/build_Release
+    STAGING_DIR_LOC=$BUILD_DIR_TEMP/hps_packages/armcpu_package
+  else
+    OPENVINO_HOME_LOC=$OPENVINO_HOME
+    OPENVINO_BUILD_LOC=$OPENVINO_BUILD
+    STAGING_DIR_LOC=$STAGING_DIR
+  fi
+
+  # Disable OpenVINO hetero plugin. ED4 should use the CoreDLA Hetero
+  pushd $OPENVINO_HOME_LOC
+  cmake -G "Ninja" -B $OPENVINO_BUILD_LOC \
+    -DOpenCV_DIR=$STAGING_DIR_LOC/opencv/cmake -DENABLE_OPENCV=OFF \
+    -DENABLE_SAMPLES=OFF \
+    -DENABLE_HETERO=OFF \
+    -DENABLE_AUTO=OFF \
+    -DENABLE_INTEL_GNA=OFF \
+    -DENABLE_INTEL_GPU=OFF \
+    -DENABLE_INTEL_MYRIAD=OFF \
+    -DENABLE_CPPLINT=OFF \
+    -DENABLE_TEMPLATE=OFF \
+    -DENABLE_TESTS=OFF -DENABLE_BEH_TESTS=OFF -DENABLE_FUNCTIONAL_TESTS=OFF \
+    -DENABLE_GAPI_TESTS=OFF \
+    -DENABLE_DATA=OFF -DENABLE_PROFILING_ITT=OFF \
+    -DCMAKE_EXE_LINKER_FLAGS=-Wl,-rpath-link,$STAGING_DIR_LOC/opencv/lib -DCMAKE_INSTALL_LIBDIR=lib \
+    -DENABLE_SSE42=OFF -DENABLE_INTEL_MYRIAD=OFF -DENABLE_INTEL_MYRIAD_COMMON=OFF\
+    -DENABLE_OPENVINO_DEBUG="${OV_DEBUG_FLAG}" \
+    -DENABLE_SYSTEM_TBB=OFF \
+    -DTHREADING=SEQ -DENABLE_LTO=ON \
+    -DENABLE_PYTHON=OFF \
+    -DENABLE_TEMPLATE=OFF \
+    -DENABLE_OV_ONNX_FRONTEND=OFF \
+    -DENABLE_OV_PADDLE_FRONTEND=OFF \
+    -DENABLE_OV_TF_FRONTEND=OFF \
+    -DENABLE_SYSTEM_PUGIXML=OFF \
+    -DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \
+    -DARM_COMPUTE_TOOLCHAIN_PREFIX=${TOOLCHAIN_PREFIX} ${OPENCV_PREFIX} \
+    -DCMAKE_STAGING_PREFIX=$STAGING_DIR_LOC \
+    -DCMAKE_PREFIX_PATH=$STAGING_DIR_LOC \
+    -DCMAKE_BUILD_TYPE=$OPENVINO_BUILD_TYPE \
+    .
+  if [ $? != 0 ]; then
+    fail "Failed to configure OPENVINO"
+  fi
+
+  cmake --build $OPENVINO_BUILD_LOC --parallel $(nproc)
+  if [ $? != 0 ]; then
+    fail "Failed to build OPENVINO"
+  fi
+
+  cmake --install ${OPENVINO_BUILD_LOC}
+  if [ $? != 0 ]; then
+    fail "Failed to install OPENVINO"
+  fi
+  popd
+}
+
+#################################################################
+function build_protobuf()
+{
+ pushd $PROTOBUF_HOME/cmake
+ cmake -G "Ninja" -B $PROTOBUF_BUILD \
+ -Dprotobuf_BUILD_TESTS=OFF \
+ -Dprotobuf_BUILD_EXAMPLES=OFF \
+ -Dprotobuf_WITH_ZLIB=OFF \
+ -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+ -DCMAKE_INSTALL_PREFIX=$STAGING_DIR/protobuf \
+ -DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \
+ -DCMAKE_BUILD_TYPE=$PROTOBUF_BUILD_TYPE \
+ .
+ if [ $? != 0 ]; then
+ fail "Failed to configure PROTOBUF"
+ fi
+
+ cmake --build $PROTOBUF_BUILD --parallel $(nproc)
+ if [ $? != 0 ]; then
+ fail "Failed to build PROTOBUF"
+ fi
+
+ cmake --install ${PROTOBUF_BUILD}
+ if [ $? != 0 ]; then
+ fail "Failed to install PROTOBUF"
+ fi
+ popd
+}
+
+#################################################################
+#################################################################
+# Cross-compile gflags (static + shared) and stage it for the
+# runtime build. Aborts via fail() on any error.
+function build_gflags()
+{
+  pushd $GFLAGS_HOME
+  cmake -B $GFLAGS_BUILD \
+    -D BUILD_STATIC_LIBS=ON \
+    -D BUILD_SHARED_LIBS=ON \
+    -D BUILD_gflags_nothreads_LIBS=ON \
+    -D BUILD_gflags_LIBS=ON \
+    -D INSTALL_HEADERS=ON \
+    -DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \
+    -DCMAKE_INSTALL_PREFIX="${GFLAGS_HOME}/install" \
+    -DCMAKE_STAGING_PREFIX=$STAGING_DIR/gflags \
+    -DARM_COMPUTE_TOOLCHAIN_PREFIX=${TOOLCHAIN_PREFIX} ${OPENCV_PREFIX} \
+    -Dextra_link_flags=--sysroot=${SYSROOT} \
+    -DCMAKE_BUILD_TYPE=$GFLAGS_BUILD_TYPE \
+    $GFLAGS_PLUGIN_HOME
+  if [ $? != 0 ]; then
+    fail "Failed to configure GFLAGS"
+  fi
+
+  cmake --build $GFLAGS_BUILD --parallel $(nproc)
+  if [ $? != 0 ]; then
+    # Bug fix: this step previously reported "Failed to configure GFLAGS",
+    # masking which phase actually failed.
+    fail "Failed to build GFLAGS"
+  fi
+
+  cmake --install $GFLAGS_BUILD
+  if [ $? != 0 ]; then
+    fail "Failed to install GFLAGS"
+  fi
+  popd
+}
+#################################################################
+function usage
+{
+ echo "$script -schbvodp"
+ echo "Options:"
+ echo " -h Display usage"
+ echo " -s get sources"
+ echo " -b build"
+ echo " -d Build debug OpenVino"
+ echo " -c clean build directory"
+ echo " -v build only OpenCV"
+ echo " -o build only OpenVINO (Requires previous Protobuf, OpenCV build)"
+ echo " -p build only Protobuf"
+ echo " -x debug script"
+}
+
+##################################################################
+# Main Script
+get_source=0
+do_build=0
+do_clean=0
+
+do_opencv=0
+do_openvino=0
+do_protobuf=0
+do_gflags=0
+
+clean_staging=0
+
+BUILD_TYPE=Release
+
+while getopts "schbvoxpd" optname; do
+ case "$optname" in
+ h)
+ usage
+ exit 0
+ ;;
+ s)
+ get_source=1
+ ;;
+ b)
+ do_build=1
+ ;;
+ d)
+ BUILD_TYPE=Debug
+ ;;
+ c)
+ do_clean=1
+ ;;
+ v)
+ do_opencv=1
+ ;;
+ o)
+ do_openvino=1
+ ;;
+ g)
+ do_gflags=1
+ ;;
+ p)
+ do_protobuf=1
+ ;;
+ x)
+ set -x
+ ;;
+ esac
+done
+shift "$(($OPTIND -1))"
+
+
+#####################################################
+# Parameters
+OPENCV_BUILD_TYPE=Release
+OPENCV_HOME=$BUILD_DIR/opencv
+OPENCV_BUILD=$OPENCV_HOME/build_$OPENCV_BUILD_TYPE
+
+OPENVINO_BUILD_TYPE=$BUILD_TYPE
+OPENVINO_HOME=$BUILD_DIR/openvino
+OPENVINO_BUILD=$OPENVINO_HOME/build_$OPENVINO_BUILD_TYPE
+
+GFLAGS_BUILD_TYPE=Release
+GFLAGS_HOME=$BUILD_DIR/gflags
+GFLAGS_BUILD=$GFLAGS_HOME/build_$GFLAGS_BUILD_TYPE
+
+PROTOBUF_BUILD_TYPE=Release
+PROTOBUF_HOME=$BUILD_DIR/protobuf
+PROTOBUF_BUILD=$PROTOBUF_HOME/cmake/build_$PROTOBUF_BUILD_TYPE
+
+######################################################
+# Setup the Yocto Toolchain
+
+# Search the current directory explicitly: a bare `find -maxdepth 1` is a
+# GNU extension and errors out on POSIX find implementations.
+if find . -maxdepth 1 -type d -name "${YOCTO_SDK_NAME}" | grep -q .; then
+  echo "Using previous setup Yocto toolchain at: ${YOCTO_SDK}"
+else
+  if find ${DEV_HOME} -maxdepth 1 -type f -name "poky*.sh" | grep -q .; then
+    echo "Found poky SDK at ${DEV_HOME}"
+  elif [[ ! -z "${ED4_POKY_SDK_LOC}" ]]; then
+    echo "copying poky SDK in ${ED4_POKY_SDK_LOC} to ${DEV_HOME}"
+    cp ${ED4_POKY_SDK_LOC} ${DEV_HOME}/
+  else
+    echo "Poky SDK not found. You need to copy the poky SDK to ${DEV_HOME} or"
+    echo "do: export ED4_POKY_SDK_LOC=\"path_to_your_poky_sdk\""
+    exit 1
+  fi
+  ${RUNTIME_ROOT_DIR}/scripts/hps/setup_toolchain.sh poky*.sh
+fi
+
+# Bug fix: abort as soon as either toolchain query fails. The previous
+# `if [ $? != 0 ]` sat after two echo statements, so it tested the echo's
+# exit status and the failure check was dead code.
+TOOLCHAIN_PREFIX=`get_toolchain_prefix` || exit 1
+SYSROOT=`get_sdksysroot` || exit 1
+echo $TOOLCHAIN_PREFIX
+echo $SYSROOT
+
+# If not doing individual builds then enable all
+if [[ ($do_opencv -eq 0) && ($do_openvino -eq 0) && ($do_gflags -eq 0) && ($do_protobuf -eq 0) ]]; then
+  do_opencv=1
+  do_openvino=1
+  do_gflags=1
+  do_protobuf=1
+  # The shared staging dir is only wiped when cleaning a full build.
+  if [[ $do_clean -ne 0 ]]; then
+    clean_staging=1
+  fi
+fi
+
+
+# Fetch pinned versions of each dependency.
+# get_git_repo <dest> <url> <flag> <tag> — the third argument's meaning
+# (presumably submodules or shallow clone) is defined where get_git_repo
+# is declared, earlier in this script; confirm there before changing.
+if [[ $get_source -ne 0 ]]; then
+  get_git_repo $OPENCV_HOME https://github.com/opencv/opencv.git true 4.8.0
+  get_git_repo $OPENVINO_HOME https://github.com/openvinotoolkit/openvino.git true 2023.3.0
+  get_git_repo $GFLAGS_HOME https://github.com/gflags/gflags.git false v2.2.2
+  get_git_repo $PROTOBUF_HOME https://github.com/protocolbuffers/protobuf.git false v3.9.0
+fi
+
+if [[ $do_clean -ne 0 ]]; then
+  # Remove one build directory if it exists. Factors out the five
+  # identical copy-pasted blocks; arguments are quoted so paths with
+  # spaces no longer word-split into a dangerous `rm -r`.
+  clean_dir()
+  {
+    if [ -e "$1" ]; then
+      echo "Cleaning $1"
+      rm -r "$1"
+    fi
+  }
+
+  [[ $do_opencv -ne 0 ]]     && clean_dir "$OPENCV_BUILD"
+  [[ $do_openvino -ne 0 ]]   && clean_dir "$OPENVINO_BUILD"
+  [[ $do_gflags -ne 0 ]]     && clean_dir "$GFLAGS_BUILD"
+  [[ $do_protobuf -ne 0 ]]   && clean_dir "$PROTOBUF_BUILD"
+  # Staging dir is shared by all components; only cleaned on a full build.
+  [[ $clean_staging -ne 0 ]] && clean_dir "$STAGING_DIR"
+fi
+
+if [[ $do_build -ne 0 ]]; then
+  # Check we have the build sources
+  if [ ! -e $OPENCV_HOME ]; then
+    fail "OPENCV Source not available"
+  fi
+
+  if [ ! -e $OPENVINO_HOME ]; then
+    fail "OPENVINO Source not available"
+  fi
+
+  if [ ! -e $PROTOBUF_HOME ]; then
+    fail "PROTOBUF_HOME Source not available"
+  fi
+
+  # Apply patches to the build and check that each applied correctly
+  # NOTE(review): `popd` runs after `git apply`, so each `[ $? != 0 ]`
+  # below tests popd's exit status, not git apply's — and stderr is
+  # discarded, so a failed patch is silently ignored. This may be
+  # accidentally load-bearing (re-running `git apply` on an already
+  # patched tree fails on rebuilds); confirm intent before tightening.
+  pushd $OPENVINO_HOME >> //dev/null
+  git apply ${DEV_HOME}/patches/openvino_5cee8bbf29797f4544b343e803de957e9f041f92_gcc11.3.0.patch 2> /dev/null
+  popd >> //dev/null
+
+  if [ $? != 0 ]; then
+    fail "Failed to apply patch: ${DEV_HOME}/patches/openvino_5cee8bbf29797f4544b343e803de957e9f041f92_gcc11.3.0.patch"
+  fi
+
+  pushd $OPENVINO_HOME >> //dev/null
+  git apply ${DEV_HOME}/patches/flags.patch 2> /dev/null
+  popd >> //dev/null
+
+  if [ $? != 0 ]; then
+    fail "Failed to apply patch: ${DEV_HOME}/patches/flags.patch"
+  fi
+
+  pushd $OPENVINO_HOME/src/plugins/intel_cpu/thirdparty/ComputeLibrary >> //dev/null
+  git apply ${DEV_HOME}/patches/computelibrary.patch 2> /dev/null
+  popd >> //dev/null
+
+  if [ $? != 0 ]; then
+    fail "Failed to apply patch: ${DEV_HOME}/patches/computelibrary.patch"
+  fi
+
+  # gflags must be built (or at least staged) before the consumers below
+  # pick it up via gflags_ROOT.
+  if [[ $do_gflags -ne 0 ]]; then
+    build_gflags
+  fi
+  unset gflags_ROOT
+  export gflags_ROOT=${GFLAGS_HOME}/install
+
+  # Build the libraries
+  if [[ $do_protobuf -ne 0 ]]; then
+    build_protobuf
+  fi
+
+  if [[ $do_opencv -ne 0 ]]; then
+    build_opencv
+  else
+    # Skipping the OpenCV build: point the OpenVINO configure step at a
+    # previously built OpenCV tree instead.
+    OPENCV_PREFIX="-DCMAKE_PREFIX_PATH=$OPENCV_BUILD"
+  fi
+
+  if [[ $do_openvino -ne 0 ]]; then
+    build_openvino
+  fi
+fi
+exit
diff --git a/python/openvino/runtime/build_runtime.sh b/python/openvino/runtime/build_runtime.sh
new file mode 100755
index 0000000..93755e1
--- /dev/null
+++ b/python/openvino/runtime/build_runtime.sh
@@ -0,0 +1,446 @@
+#!/bin/bash
+
+#=============================================================================
+# Script to build runtime plugin
+#=============================================================================
+# GDB=1 selects a Debug (CMAKE_BUILD_TYPE=Debug) build; set by -cmake_debug.
+GDB=0
+
+# Print the public command-line help (hidden/internal flags are listed by
+# hidden_usage below).
+usage()
+{
+  echo "Build runtime plugin"
+  echo "Usage: ./build_runtime.sh [OPTIONS]"
+  echo "  -h                                   Show this help"
+  echo "  -cmake_debug                         Flag to compile in debug mode"
+  echo "  -verbosity=INFO|WARNING|ERROR|FATAL  Enable logging at desired verbosity"
+  echo "  -build_dir=<path>                    Location of the runtime build"
+  echo "  -disable_jit                         Disable JIT execution mode - this removes dependencies on precompiled DLA libraries"
+  echo "  -build_demo                          Build runtime demos"
+  echo "  -target_de10_agilex                  Target the DE10 Agilex board."
+  echo "  -target_agx7_i_dk                    Target the Agilex 7 Iseries board."
+  echo "  -target_agx7_n6001                   Target the Agilex N6001 board."
+  echo "  -hps_platform                        Target a HPS based hardware."
+  echo "                                       This option should be used only via the create_hps_image.sh script"
+  echo "  -hps_machine=<machine>               Target a specific machine. Used with -hps_platform. Options: arria10 (Arria 10 SoC),"
+  echo "                                       agilex7_dk_si_agi027fa (Agilex 7 SoC) [if not specified, default is Arria 10]"
+  echo "  -aot_splitter_example                Build the aot splitter example"
+  echo "  -target_emulation                    Target the software emulation (aka emulator) build."
+  echo "  -target_system_console               Target a device that communicates with the host via system-console."
+}
+
+# Print public usage plus internal-only flags (regtest packaging overrides,
+# encryption toggle, Klocwork dry-run, polling mode, EOL targets).
+hidden_usage()
+{
+  usage
+  echo "  -coredla_dir_cmake=<path>            Intended for regtest which packages coredla and runtime separately"
+  echo "  -coredla_dir_lib=<path>              Intended for regtest which packages coredla and runtime separately"
+  echo "  -encryption=<0|1>                    Without OpenSSL, can still build runtime without encryption support"
+  echo "  -no_make                             Skip final make command for Klocwork"
+  echo "  -polling                             Use polling instead of interrupts"
+  echo "  -target_a10_pac                      Target the Arria 10 PAC [EOL 2024.1 Release]."
+  echo "  -run_tests                           Runs short build tests. For Altera internal usage only."
+}
+
+# Option defaults; overridden by the argument loop below.
+OPT_RUNTIME_POLLING=false
+OPT_DISABLE_JIT=false
+OPT_BUILD_DEMO=false
+OPT_RUN_TESTS=false
+RUNTIME_VERBOSITY="-DRUNTIME_VERBOSITY=0"
+
+# Optional specific make target; empty means "build everything".
+TARGET=""
+
+#Terasic production BSP kernel space driver header files
+TERASIC_KERNEL_HEADER_FILES="hw_host_channel.h hw_pcie_constants.h pcie_linux_driver_exports.h"
+
+# Parse long-style single-dash options (both -opt and --opt accepted).
+for i in "$@"; do
+  case $i in
+    -h | --help ) usage
+                  exit
+                  ;;
+    -cmake_debug | --cmake_debug ) GDB=1
+                  ;;
+    -verbosity=* | --verbosity=* ) RUNTIME_VERBOSITY="-DRUNTIME_VERBOSITY=${i#*=}"
+                  shift # pass argument=value
+                  ;;
+    -build_dir=* | --build_dir=* ) BUILD_DIR_USER="${i#*=}"
+                  shift
+                  ;;
+    -disable_jit | --disable_jit ) OPT_DISABLE_JIT=true
+                  ;;
+    -build_demo | --build_demo ) OPT_BUILD_DEMO=true
+                  ;;
+    -target_de10_agilex | --target_de10_agilex ) PLATFORM_NAME="Terasic DE 10"
+                  BUILD_PLATFORM="-DHW_BUILD_PLATFORM=DE10_AGILEX"
+                  ;;
+    -target_a10_pac | --target_a10_pac ) PLATFORM_NAME="PAC A10"
+                  BUILD_PLATFORM="-DHW_BUILD_PLATFORM=DCP_A10_PAC"
+                  ;;
+    -target_agx7_i_dk | --target_agx7_i_dk ) PLATFORM_NAME="AGX7 ISERIES DK"
+                  BUILD_PLATFORM="-DHW_BUILD_PLATFORM=AGX7_I_DK"
+                  ;;
+    -target_agx7_n6001 | --target_agx7_n6001 ) PLATFORM_NAME="AGX7 N6001"
+                  BUILD_PLATFORM="-DHW_BUILD_PLATFORM=AGX7_N6001"
+                  ;;
+    -target_emulation | --target_emulation ) PLATFORM_NAME="EMULATION"
+                  BUILD_PLATFORM="-DHW_BUILD_PLATFORM=EMULATION"
+                  ;;
+    # system-console devices cannot deliver interrupts, so force polling.
+    -target_system_console | --target_system_console ) PLATFORM_NAME="SYSTEM CONSOLE"
+                  BUILD_PLATFORM="-DHW_BUILD_PLATFORM=SYSTEM_CONSOLE"
+                  OPT_RUNTIME_POLLING=true
+                  ;;
+    # If HPS then we disable the JIT.
+    -hps_platform | --hps_platform ) PLATFORM_NAME="ARM Soc FPGA Platform"
+                  BUILD_PLATFORM="-DHW_BUILD_PLATFORM=HPS_PLATFORM"
+                  OPT_DISABLE_JIT=true
+                  HPS_PLATFORM_BUILD=1
+                  OPT_RUNTIME_POLLING=true
+                  HPS_BUILD_MACHINE="-DHPS_BUILD_MACHINE=arria10"
+                  ;;
+    # Specify the HPS machine. Default is Arria 10.
+    # (Relies on -hps_platform appearing earlier on the command line.)
+    -hps_machine=* | --hps_machine=* ) if [ -z ${HPS_PLATFORM_BUILD} ]; then
+                  echo "Error: -hps_machine can only be specified with -hps_platform"
+                  exit 1
+                  fi
+                  HPS_BUILD_MACHINE="-DHPS_BUILD_MACHINE=${i#*=}"
+                  ;;
+    -aot_splitter_example | --aot_splitter_example ) TARGET="dla_aot_splitter_example"
+                  ;;
+    # NOTE(review): -ed3_streaming_example is not listed in usage() or
+    # hidden_usage(); confirm whether it should be documented.
+    -ed3_streaming_example | --ed3_streaming_example ) TARGET="ed3_streaming_example"
+                  ;;
+    # all options below are hidden features and therefore not listed in usage()
+    -hidden_help | --hidden_help ) hidden_usage
+                  exit
+                  ;;
+    -coredla_dir_cmake=* | --coredla_dir_cmake=* ) COREDLA_DIR_USER_CMAKE="${i#*=}"
+                  shift
+                  ;;
+    -coredla_dir_lib=* | --coredla_dir_lib=* ) COREDLA_DIR_USER_LIB="${i#*=}"
+                  shift
+                  ;;
+    -encryption=* | --encryption=* ) ENCRYPTION_USER="${i#*=}"
+                  shift
+                  ;;
+    -no_make | --no_make ) SKIP_MAKE=1
+                  ;;
+    -polling | --polling ) OPT_RUNTIME_POLLING=true
+                  ;;
+    -run_tests | --run_tests ) OPT_RUN_TESTS=true
+                  ;;
+    * ) echo "Error: Unrecognised argument: $i"
+                  usage
+                  exit 1
+  esac
+  # NOTE(review): `shift` does not affect the "$@" snapshot this for-loop
+  # iterates over, so these shifts are effectively no-ops here.
+  shift
+done
+
+# A platform target flag is mandatory.
+if [[ -z "${PLATFORM_NAME}" ]]; then
+  echo "Error: Please specify which platform to build the runtime for. Run ./build_runtime.sh -h to see usage"
+  exit 1
+fi
+
+# Currently runtime demos do not work with the AOT flow. Throw an error to remind the user.
+if $OPT_DISABLE_JIT && $OPT_BUILD_DEMO; then
+  echo "Error: Cannot build runtime demos with JIT disabled."
+  exit 1
+fi
+
+# set to 0 to remove OpenSSL dependency from customer flow
+# IMPORTANT (for Intel release manager): this must be consistent with ALLOW_ENCRYPTION in ${COREDLA_ROOT}/Makefile
+ALLOW_ENCRYPTION=0
+if [ ! -z "$ENCRYPTION_USER" ]; then
+  ALLOW_ENCRYPTION=${ENCRYPTION_USER}
+fi
+if [ "$ALLOW_ENCRYPTION" == "1" ]; then
+  DLA_ALLOW_ENCRYPTION="-DDLA_ALLOW_ENCRYPTION=1"
+fi
+
+# COREDLA_ROOT anchors every path below; init_env.sh exports it.
+if [[ -z "${COREDLA_ROOT}" ]]; then
+  echo "Error: COREDLA_ROOT environment variable not set. Run init_env.sh script first."
+  exit 1
+fi
+
+# if CoreDLA Config Directory is not under root check under build directory
+COREDLA_DIR_CMAKE=${COREDLA_ROOT}/cmake
+COREDLA_DIR_LIB=${COREDLA_ROOT}/lib
+
+echo ${COREDLA_DIR_USER_CMAKE}
+
+# Only need to check if cmake exists since COREDLA_ROOT/cmake and COREDLA_ROOT/lib are in same paths
+if [[ ! -d "${COREDLA_DIR_CMAKE}" && -d "${COREDLA_ROOT}/build/coredla/dla/cmake" ]]; then
+ COREDLA_DIR_CMAKE=${COREDLA_ROOT}/build/coredla/dla/cmake
+ COREDLA_DIR_LIB=${COREDLA_ROOT}/build/coredla/dla/lib
+fi
+if [ ! -z "$COREDLA_DIR_USER_CMAKE" ]; then
+ COREDLA_DIR_CMAKE=${COREDLA_DIR_USER_CMAKE}
+ COREDLA_DIR_LIB=${COREDLA_DIR_USER_LIB}
+fi
+if [ ! -d "$COREDLA_DIR_CMAKE" ]; then
+ # This error should not be possible in a correctly deployed build. It should
+ # only happen in a developer environment.
+ echo "Error: $COREDLA_DIR_CMAKE not found. Did you remember to do: cd \$COREDLA_ROOT && make"
+ exit 1
+fi
+
+# A deployed build has $COREDLA_ROOT/util/compiled_result/, $COREDLA_ROOT/util/transformations/,
+# whereas an Intel-internal developer build has these as just $COREDLA_ROOT/compiled_result/, etc.
+# Would prefer to syncrhonize these. They could be found within cmake by ${CoreDLA_DIR}/../util/,
+# but that would mean developer changes to compiled_result/ and transformations/ need a `make`
+# before they are visible to the runtime build.
+if [ -d "$COREDLA_ROOT"/util/compiled_result ]; then
+ COREDLA_XUTIL_DIR="$COREDLA_ROOT"/util
+else
+ COREDLA_XUTIL_DIR="$COREDLA_ROOT"
+fi
+export COREDLA_XUTIL_DIR
+
+if [ ! -z "${PLATFORM_NAME}" ]; then
+  echo "Building runtime for ${PLATFORM_NAME}"
+fi
+
+if $OPT_BUILD_DEMO; then
+  echo "Runtime demos will be built."
+  if [ "$GDB" == "1" ]; then
+    echo "To test the runtime demo performance, please use the release build instead."
+  fi
+fi
+
+# Resolve the absolute directory this script lives in; the runtime source
+# tree root is the same directory.
+SCRIPT_DIR=$(cd "$(dirname $0)" >/dev/null 2>&1 && pwd)
+RUNTIME_ROOT_DIR=$(cd "${SCRIPT_DIR}" >/dev/null 2>&1 && pwd)
+
+cd ${RUNTIME_ROOT_DIR}
+
+BUILD_TYPE=Release
+
+if [ "$GDB" == "1" ]; then
+  echo "Building in debug mode"
+  BUILD_TYPE=Debug
+fi
+
+# Default out-of-tree build dir (build_Release / build_Debug) unless the
+# user supplied -build_dir.
+BUILD_DIR=build_${BUILD_TYPE}
+if [ ! -z "$BUILD_DIR_USER" ]; then
+  BUILD_DIR=${BUILD_DIR_USER}
+fi
+
+
+if [ ! -d "$BUILD_DIR" ]; then
+  mkdir -p ${BUILD_DIR}
+fi
+
+# Checking the type of build
+# DE10: the Terasic BSP kernel driver headers must be copied into the MMD
+# source tree before the build can proceed.
+if [ "${PLATFORM_NAME}" = "Terasic DE 10" ]; then
+  if [ ! -z "${AOCL_BOARD_PACKAGE_ROOT}" ]; then
+    echo "Copying necessary header files from Terasic Production BSP"
+    for i in $TERASIC_KERNEL_HEADER_FILES; do
+      if ! cp "${AOCL_BOARD_PACKAGE_ROOT}/linux64/driver/${i}" "${RUNTIME_ROOT_DIR}/coredla_device/mmd/de10_agilex/host/"; then
+        echo "Error: Unable to copy ${i} from ${AOCL_BOARD_PACKAGE_ROOT}/linux64/driver"
+        exit 1
+      fi
+    done
+  else
+    echo "Error: Environment variable AOCL_BOARD_PACKAGE_ROOT must be set to the Terasic BSP path."
+    exit 1
+  fi
+fi
+
+# NOTE(review): this block and the AGX7 block below are identical apart
+# from the mmd path named in the comment — candidates for a shared helper.
+if [ "${PLATFORM_NAME}" = "PAC A10" ]; then
+  if [[ -z "${OPAE_SDK_ROOT}" ]]; then
+    echo "Error: OPAE_SDK_ROOT environment variable not set. Run OPAE setup script before."
+    echo "       If OPAE has been installed into the default location, use: export OPAE_SDK_ROOT=/usr"
+    exit 1
+  fi
+  # Some quick checks that OPAE exists where we expect it. Note that
+  # coredla_device/mmd/dcp_a10_pac/cmake/modules/FindOPAE.cmake has some search locations
+  # for OPAE hardcoded - so in reality, we will find it in /usr/ even if $OPAE_SDK_ROOT
+  # points to the wrong directory; but prefer to enforce a valid $OPAE_SDK_ROOT here.
+  if [ ! -f "$OPAE_SDK_ROOT/include/opae/fpga.h" -o \
+       ! \( -f "$OPAE_SDK_ROOT/lib/libopae-c.so" -o -f "$OPAE_SDK_ROOT/lib64/libopae-c.so" \) ]
+  then
+    echo "Error: OPAE not found at location specified by OPAE_SDK_ROOT."
+    exit 1
+  fi
+
+  OPAE_DIR="-DLIBOPAE-C_ROOT=${OPAE_SDK_ROOT}/"
+fi
+
+if [ "${PLATFORM_NAME}" = "AGX7 ISERIES DK" ] || [ "${PLATFORM_NAME}" = "AGX7 N6001" ]; then
+  if [[ -z "${OPAE_SDK_ROOT}" ]]; then
+    echo "Error: OPAE_SDK_ROOT environment variable not set. Run OPAE setup script before."
+    echo "       If OPAE has been installed into the default location, use: export OPAE_SDK_ROOT=/usr"
+    exit 1
+  fi
+  # Some quick checks that OPAE exists where we expect it. Note that
+  # coredla_device/mmd/agx7_ofs_pcie/cmake/modules/FindOPAE.cmake has some search locations
+  # for OPAE hardcoded - so in reality, we will find it in /usr/ even if $OPAE_SDK_ROOT
+  # points to the wrong directory; but prefer to enforce a valid $OPAE_SDK_ROOT here.
+  if [ ! -f "$OPAE_SDK_ROOT/include/opae/fpga.h" -o \
+       ! \( -f "$OPAE_SDK_ROOT/lib/libopae-c.so" -o -f "$OPAE_SDK_ROOT/lib64/libopae-c.so" \) ]
+  then
+    echo "Error: OPAE not found at location specified by OPAE_SDK_ROOT."
+    exit 1
+  fi
+
+  OPAE_DIR="-DLIBOPAE-C_ROOT=${OPAE_SDK_ROOT}/"
+fi
+
+if [ "${PLATFORM_NAME}" = "EMULATION" ]; then
+  # Bug fix: the original used `[ -z "...path..." ]`, which tests for an
+  # empty *string* — the interpolated path is never empty, so the guard
+  # could never fire. `! -f` actually checks that the shared library exists.
+  if [ ! -f "${COREDLA_DIR_LIB}/libdla_emulator.so" ]; then
+    # This should not happen in a correctly deployed build
+    echo "The software emulator shared library libdla_emulator.so does not exist in ${COREDLA_DIR_LIB}"
+    exit 1
+  fi
+fi
+
+if $OPT_RUNTIME_POLLING; then
+  echo "Warning: using polling instead of interrupts"
+fi
+
+if $OPT_DISABLE_JIT; then
+  echo "Building without just-in-time (JIT) execution functionality"
+fi
+
+# We must specify a default for $RUNTIME_POLLING so that cmake does a rebuild if
+# the polling option changes.
+RUNTIME_POLLING="-DRUNTIME_POLLING=0";
+$OPT_RUNTIME_POLLING && RUNTIME_POLLING="-DRUNTIME_POLLING=1"
+
+# We must specify a default for $DISABLE_JIT so that cmake does a rebuild if the
+# option changes.
+DISABLE_JIT="-DDISABLE_JIT=0";
+$OPT_DISABLE_JIT && DISABLE_JIT="-DDISABLE_JIT=1"
+
+# We use a default of "" for BUILD_DEMO. This means that cmake will not force a rebuild
+# if the -build_demo option is specified on a first build and then not specified on a second
+# build (unless something else forces a rebuild, of course).
+#
+BUILD_DEMO=""
+$OPT_BUILD_DEMO && BUILD_DEMO="-DBUILD_DEMO=1"
+
+# On Ubuntu18 devices demos may break with:
+# "Cannot load library ... libcoreDLARuntimePlugin.so ... undefined symbol: dla_mmd_ddr_write"
+# This is caused if /opt/intelFPGA_pro/quartus_19.2.0b57/hld/host/linux64/lib is in LD_LIBRARY_PATH
+# The known fix is to simply remove it
+os_release=$(lsb_release -rs)
+conflicting_dir="intelFPGA_pro/quartus_19.2.0b57/hld/host/linux64/lib"
+if [[ "$os_release" == "18."* && "$OPT_BUILD_DEMO" == true && ":$LD_LIBRARY_PATH:" == *"$conflicting_dir:"* ]]; then
+  echo -e "\e[91mError: Ubuntu18 runtime demo build detected. The demos may break with $conflicting_dir in the LD_LIBRARY_PATH. Please remove and recompile.\e[0m"
+  exit 1
+fi
+
+# Configure step. Native (non-HPS) builds use $COREDLA_GCC directly;
+# HPS builds cross-compile through the Yocto toolchain file and point
+# cmake at the pre-built hps_packages staging area.
+if [ -z ${HPS_PLATFORM_BUILD} ]; then
+  # When this happens (ie: differing g++ and gcc versions), the resulting link errors can be
+  # rather confusing. Perhaps testing for this is overkill? We used to allow an environment
+  # variable override of CXX, which is what made it easier to induce the version mismatch.
+  CXX=$(dirname $COREDLA_GCC)/g++
+  CC_VERSION=$($COREDLA_GCC --version | head -1 | awk '{print $NF}')
+  CXX_VERSION=$($CXX --version | head -1 | awk '{print $NF}')
+  if [ "$CC_VERSION" = "" -o "$CC_VERSION" != "$CXX_VERSION" ]; then
+    echo "Error: $COREDLA_GCC version is \"$CC_VERSION\" but $CXX version is \"$CXX_VERSION\""
+    echo "       Both compilers must have the same version number."
+    exit 1
+  fi
+
+  set -x
+
+  cd ${BUILD_DIR} || exit 1
+  # Runtime demos will not be built by default. Use the -build_demo flag to build them.
+  CC=$COREDLA_GCC CXX=$CXX cmake ${RUNTIME_VERBOSITY} ${RUNTIME_POLLING} ${BUILD_PLATFORM} ${OPAE_DIR} ${DLA_ALLOW_ENCRYPTION} -DCoreDLA_DIR=${COREDLA_DIR_CMAKE} -DCMAKE_BUILD_TYPE=${BUILD_TYPE} ${RUNTIME_ROOT_DIR} ${DISABLE_JIT} ${BUILD_DEMO}
+
+  cmake_exit_code=$?
+  set +x
+
+else
+  # Setup the Yocto Toolchain
+  ${RUNTIME_ROOT_DIR}/scripts/hps/setup_toolchain.sh poky*.sh
+  if [ $? != 0 ]; then
+    echo -e "\nNote: Directly calling build_runtime.sh --hps_platform is for internal only. "\
+            "If you are building runtime for ED4, use ${RUNTIME_ROOT_DIR}/create_hps_image.sh instead.\n"
+    exit 1
+  fi
+  TOOLCHAIN_FILE=${RUNTIME_ROOT_DIR}/embedded_arm_sdk/cmake/embedded.arm.cmake
+
+  # Pre-built cross-compiled dependencies produced by build_hpspackages.sh.
+  HPS_PACKAGES_DIR=`pwd`/hps_packages
+  HPS_INSTALL_PACKAGES=${HPS_PACKAGES_DIR}/armcpu_package
+
+  export INTEL_OPENVINO_DIR=${HPS_INSTALL_PACKAGES}
+  # Check that the Local OPENVINO build has been done
+  if [ ! -e ${INTEL_OPENVINO_DIR} ]; then
+    echo "Error: Pre-built openvino package not found."
+    echo "     : Run ./build_hpspackages.sh -sb"
+    echo -e "\nNote: Directly calling build_runtime.sh --hps_platform is for internal only. "\
+            "If you are building runtime for ED4, use ${RUNTIME_ROOT_DIR}/create_hps_image.sh instead.\n"
+    exit 1
+  fi
+  # in OpenVINO 2022.3, setupvars.sh sits in ${INTEL_OPENVINO_DIR}, not in S{INTEL_OPENVINO_DIR}/bin
+  source ${INTEL_OPENVINO_DIR}/setupvars.sh
+
+  unset gflags_ROOT
+  export gflags_ROOT=${HPS_INSTALL_PACKAGES}/gflags
+  CMAKE_OPTIONS="-DCMAKE_PREFIX_PATH=${HPS_INSTALL_PACKAGES}/opencv;${HPS_INSTALL_PACKAGES}/gflags;${HPS_INSTALL_PACKAGES}/protobuf"
+
+  set -x
+  cd ${BUILD_DIR} || exit 1
+  cmake -DCMAKE_TOOLCHAIN_FILE=${TOOLCHAIN_FILE} ${RUNTIME_VERBOSITY} ${RUNTIME_POLLING} ${BUILD_PLATFORM} ${HPS_BUILD_MACHINE} ${OPAE_DIR} ${DLA_ALLOW_ENCRYPTION} -DCoreDLA_DIR=${COREDLA_DIR_CMAKE} -DCMAKE_BUILD_TYPE=${BUILD_TYPE} ${RUNTIME_ROOT_DIR} ${DISABLE_JIT} ${BUILD_DEMO} ${CMAKE_OPTIONS}
+
+  cmake_exit_code=$?
+  set +x
+fi
+
+if [ $cmake_exit_code != 0 ]; then
+  echo "Error: cmake failed"
+  exit 1
+fi
+
+# Pack a dotted version string (up to 4 components) into one comparable
+# integer, e.g. 3.12.0 -> 3012000000.
+function version { echo "$@" | awk -F. '{ printf("%d%03d%03d%03d\n", $1,$2,$3,$4); }'; }
+cmake_dot_version=$(cmake --version | grep 'cmake version' | awk '{print $3}')
+CMAKE_PARALLEL=
+# `cmake --build --parallel` needs cmake >= 3.12; only enable it on hosts
+# with enough RAM for a parallel link.
+if [ $(version ${cmake_dot_version} ) -ge $(version "3.12.0") ]; then
+  # Check total_mem so that we are consistent between runs (even through free_mem is arguably
+  # more relevant).
+  total_mem=`free -g | grep Mem | awk '{print $2}'`
+  if [ "$total_mem" -gt 48 ]; then
+    CMAKE_PARALLEL="--parallel"
+  fi
+fi
+
+# Check if we should skip the make process
+if [ "$SKIP_MAKE" != "1" ]; then
+  # A non-empty $TARGET (e.g. dla_aot_splitter_example) builds just that
+  # target; otherwise build everything.
+  if [ "${TARGET}" != "" ]; then
+    set -x
+    cmake --build . --target "${TARGET}" ${CMAKE_PARALLEL}
+    make_result=$?
+    set +x
+  else
+    set -x
+    cmake --build . ${CMAKE_PARALLEL}
+    make_result=$?
+    set +x
+  fi
+
+  # If the build failed, exit with the make result
+  if [ $make_result -ne 0 ]; then
+    exit $make_result
+  fi
+fi
+
+# Check if tests should be run based on OPT_RUN_TESTS variable
+if [ "$OPT_RUN_TESTS" = "true" ]; then
+  if [[ -n "$GITHUB_REPOSITORY_OWNER" ]]; then
+    # Runs in GitHub
+    LINKER_TEST_SCRIPT="$GITHUB_WORKSPACE/runtime/scripts/internal/linker_test.sh"
+  else
+    # Runs locally
+    LINKER_TEST_SCRIPT="$RUNTIME_ROOT_DIR/scripts/internal/linker_test.sh"
+  fi
+  if [ -f "$LINKER_TEST_SCRIPT" ]; then
+    # Notify that the build was successful and tests are starting
+    echo -e "\033[1;33mBuild successful. Running linker test...\033[0m"
+    if ! "$LINKER_TEST_SCRIPT" "$BUILD_DIR" "$PLATFORM_NAME" "$OPT_DISABLE_JIT"; then
+      echo "Error: Linker test script failed with a non-zero return code." >&2
+      exit 1
+    fi
+  else
+    echo "Error: Tests not found."
+    exit 1
+  fi
+fi
+
+# If we reach this point, the build and any tests were successful
+exit 0
diff --git a/python/openvino/runtime/classification_sample_async/CMakeLists.txt b/python/openvino/runtime/classification_sample_async/CMakeLists.txt
new file mode 100644
index 0000000..96e5578
--- /dev/null
+++ b/python/openvino/runtime/classification_sample_async/CMakeLists.txt
@@ -0,0 +1,58 @@
+# Copyright (C) 2018-2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# Require C++11 via the portable CMake knobs. The previous manual
+# "-std=c++11" CMAKE_CXX_FLAGS injection (for non-Intel compilers) was
+# redundant with CMAKE_CXX_STANDARD and could conflict with
+# toolchain-provided standard flags.
+set (CMAKE_CXX_STANDARD 11)
+set (CMAKE_CXX_STANDARD_REQUIRED ON)
+
+set (TARGET_NAME "classification_sample_async")
+
+file (GLOB MAIN_SRC
+        ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp
+)
+
+file (GLOB MAIN_HEADERS
+        ${CMAKE_CURRENT_SOURCE_DIR}/classification_sample_async.h
+)
+
+source_group("src" FILES ${MAIN_SRC})
+source_group("include" FILES ${MAIN_HEADERS})
+
+# OpenCV is mandatory: the sample decodes and preprocesses input images.
+find_package(OpenCV COMPONENTS core highgui imgcodecs imgproc videoio REQUIRED)
+
+# Create executable from sources.
+add_executable(${TARGET_NAME} ${MAIN_SRC} ${MAIN_HEADERS})
+
+# dlopen() is required on POSIX to load OpenVINO plugins at runtime.
+if (NOT WIN32)
+    set (LIB_DL dl)
+endif()
+
+target_include_directories(${TARGET_NAME} PRIVATE
+    # Demo utils
+    ${CMAKE_CURRENT_SOURCE_DIR}/../common/demo_utils/include/utils
+    # FPGA plugin configs
+    $ENV{COREDLA_ROOT}/dla_plugin/inc
+)
+
+target_link_libraries(${TARGET_NAME} PRIVATE
+    coreDLAHeteroPlugin
+    openvino::runtime
+    ${OpenCV_LIBRARIES}
+    format_reader
+    ie_samples_utils
+)
+
+if(NOT WIN32)
+    target_link_libraries(${TARGET_NAME} PRIVATE ${LIB_DL} pthread)
+endif()
+
+# Resolve shared libraries relative to the installed binary location.
+set_target_properties(${TARGET_NAME} PROPERTIES INSTALL_RPATH "\$ORIGIN/../lib")
+
+# For libcoreDlaRuntimePlugin.so - typically specified by $COREDLA_ROOT/runtime/plugins.xml
+set_target_properties(${TARGET_NAME} PROPERTIES BUILD_RPATH "\$ORIGIN/..")
+
+install(TARGETS ${TARGET_NAME} RUNTIME DESTINATION "dla/bin" COMPONENT DEMO)
+install(TARGETS ${TARGET_NAME} RUNTIME DESTINATION "dla/not_shipped/bin" COMPONENT NOT_SHIPPED)
diff --git a/python/openvino/runtime/classification_sample_async/README.md b/python/openvino/runtime/classification_sample_async/README.md
new file mode 100644
index 0000000..95c5cde
--- /dev/null
+++ b/python/openvino/runtime/classification_sample_async/README.md
@@ -0,0 +1,13 @@
+# Image Classification C++ Sample Async
+
+### Running with CoreDLA
+In addition to the options described below, include the arguments:
+- `-plugins=<path to the plugins.xml>`, using the path to [plugins.xml](../plugins.xml)
+- `-d HETERO:FPGA,CPU`
+- `-arch_file <path to arch file>`, using the path to the architecture used when creating the FPGA bitstream
+
+Use the -build_demo option to the runtime/build_runtime.sh script to build this demo.
+
+See the documentation that is included with the example design.
+
+For detailed information on the OpenVINO Classification Sample Async Demo, please see the [README](https://github.com/openvinotoolkit/openvino/tree/2023.3.0/samples/cpp/classification_sample_async) in the OpenVINO repository. Make sure to match the git tag with your installed version of OpenVINO for compatibility.
diff --git a/python/openvino/runtime/classification_sample_async/classification_sample_async.h b/python/openvino/runtime/classification_sample_async/classification_sample_async.h
new file mode 100644
index 0000000..a61a04e
--- /dev/null
+++ b/python/openvino/runtime/classification_sample_async/classification_sample_async.h
@@ -0,0 +1,72 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "gflags/gflags.h"
+#include "dla_plugin_config.hpp"
+
+/// @brief message for help argument
+static const char help_message[] = "Print a usage message.";
+
+/// @brief message for model argument
+static const char model_message[] = "Required. Path to an .xml file with a trained model.";
+
+/// @brief message for images argument
+static const char image_message[] =
+    "Required. Path to a folder with images or path to an image files: a .ubyte file for LeNet"
+    " and a .bmp file for the other networks.";
+
+/// @brief message for assigning cnn calculation to device
+static const char target_device_message[] =
+    "Optional. Specify the target device to infer on (the list of available devices is shown below). "
+    "Default value is CPU. Use \"-d HETERO:<comma_separated_devices_list>\" format to specify HETERO plugin. "
+    "Sample will look for a suitable plugin for device specified.";
+
+/// @brief message for plugin messages
+static const char plugin_message[] = "Optional. Enables messages from a plugin";
+
+/// @brief message for the custom plugins_xml file option
+/// (was mislabelled "performance counters option")
+static const char plugins_message[] = "Optional. Select a custom plugins_xml file to use.";
+/// @brief message for architecture .arch file
+static const char arch_file_message[] = "Optional. Provide a path for the architecture .arch file.";
+
+/// @brief Define flag for showing help message <br>
+DEFINE_bool(h, false, help_message);
+
+/// @brief Define parameter for set image file <br>
+/// It is a required parameter
+DEFINE_string(i, "", image_message);
+
+/// @brief Define parameter for set model file <br>
+/// It is a required parameter
+DEFINE_string(m, "", model_message);
+
+/// @brief device the target device to infer on <br>
+/// It is an optional parameter
+DEFINE_string(d, "CPU", target_device_message);
+
+/// @brief Path to a plugins_xml file (empty string means "use the default")
+DEFINE_string(plugins, "", plugins_message);
+/// @brief Path to arch file
+DEFINE_string(arch_file, "", arch_file_message);
+
+/**
+ * @brief Prints a usage/help message listing every flag this sample defines.
+ */
+static void show_usage() {
+    std::cout << std::endl;
+    std::cout << "classification_sample_async [OPTION]" << std::endl;
+    std::cout << "Options:" << std::endl;
+    std::cout << std::endl;
+    std::cout << "    -h                      " << help_message << std::endl;
+    std::cout << "    -m \"<path>\"             " << model_message << std::endl;
+    std::cout << "    -i \"<path>\"             " << image_message << std::endl;
+    std::cout << "    -d \"<device>\"           " << target_device_message << std::endl;
+    // Fix: these two flags are declared above but were missing from the help.
+    std::cout << "    -plugins \"<path>\"       " << plugins_message << std::endl;
+    std::cout << "    -arch_file \"<path>\"     " << arch_file_message << std::endl;
+}
diff --git a/python/openvino/runtime/classification_sample_async/main.cpp b/python/openvino/runtime/classification_sample_async/main.cpp
new file mode 100644
index 0000000..929b109
--- /dev/null
+++ b/python/openvino/runtime/classification_sample_async/main.cpp
@@ -0,0 +1,259 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+/**
+ * @brief The entry point the OpenVINO Runtime sample application
+ * @file classification_sample_async/main.cpp
+ * @example classification_sample_async/main.cpp
+ */
+
+#include <sys/stat.h>
+
+#include <condition_variable>
+#include <fstream>
+#include <map>
+#include <memory>
+#include <mutex>
+#include <string>
+#include <vector>
+
+// clang-format off
+#include "openvino/openvino.hpp"
+
+#include "samples/args_helper.hpp"
+#include "samples/common.hpp"
+#include "samples/classification_results.h"
+#include "samples/slog.hpp"
+#include "format_reader_ptr.h"
+
+#include "classification_sample_async.h"
+// clang-format on
+
+constexpr auto N_TOP_RESULTS = 10;
+
+using namespace ov::preprocess;
+
/// @brief Returns true when a file or directory named @p name exists on disk.
bool exists_test(const std::string& name) {
    struct stat file_info;
    return stat(name.c_str(), &file_info) == 0;
}
+
+/**
+ * @brief Checks input args
+ * @param argc number of args
+ * @param argv list of input arguments
+ * @return bool status true(Success) or false(Fail)
+ */
+bool parse_and_check_command_line(int argc, char* argv[]) {
+ gflags::ParseCommandLineNonHelpFlags(&argc, &argv, true);
+ if (FLAGS_h) {
+ show_usage();
+ showAvailableDevices();
+ return false;
+ }
+ slog::info << "Parsing input parameters" << slog::endl;
+
+ if (FLAGS_m.empty()) {
+ show_usage();
+ throw std::logic_error("Model is required but not set. Please set -m option.");
+ }
+
+ if (FLAGS_i.empty()) {
+ show_usage();
+ throw std::logic_error("Input is required but not set. Please set -i option.");
+ }
+
+ if(!FLAGS_plugins.empty()) {
+ std::cout << "Using custom plugins xml file - " << FLAGS_plugins << std::endl;
+ }
+
+ if (!exists_test(FLAGS_plugins)) {
+ std::cout << "Error: plugins_xml file: " << FLAGS_plugins << " doesn't exist. Please provide a valid path." << std::endl;
+ throw std::logic_error("plugins_xml file path does not exist.");
+ }
+
+ return true;
+}
+
+int main(int argc, char* argv[]) {
+ try {
+ // -------- Get OpenVINO Runtime version --------
+ slog::info << ov::get_openvino_version() << slog::endl;
+
+ // -------- Parsing and validation of input arguments --------
+ if (!parse_and_check_command_line(argc, argv)) {
+ return EXIT_SUCCESS;
+ }
+
+ // -------- Read input --------
+ // This vector stores paths to the processed images
+ std::vector<std::string> image_names;
+ parseInputFilesArguments(image_names);
+ if (image_names.empty())
+ throw std::logic_error("No suitable images were found");
+
+ // -------- Step 1. Initialize OpenVINO Runtime Core --------
+ ov::Core core(FLAGS_plugins);
+
+ if(FLAGS_arch_file != "" && FLAGS_d.find("FPGA") != std::string::npos){
+ core.set_property("FPGA", { { DLIAPlugin::properties::arch_path.name(), FLAGS_arch_file } });
+ if (!exists_test(FLAGS_arch_file)) {
+ std::cout << "Error: architecture file: " << FLAGS_arch_file << " doesn't exist. Please provide a valid path." << std::endl;
+ throw std::logic_error("architecture file path does not exist.");
+ }
+ }
+ // -------- Step 2. Read a model --------
+ slog::info << "Loading model files:" << slog::endl << FLAGS_m << slog::endl;
+ std::shared_ptr<ov::Model> model = core.read_model(FLAGS_m);
+ printInputAndOutputsInfo(*model);
+
+ OPENVINO_ASSERT(model->inputs().size() == 1, "Sample supports models with 1 input only");
+ OPENVINO_ASSERT(model->outputs().size() == 1, "Sample supports models with 1 output only");
+
+ // -------- Step 3. Configure preprocessing --------
+ const ov::Layout tensor_layout{"NHWC"};
+
+ ov::preprocess::PrePostProcessor ppp(model);
+ // 1) input() with no args assumes a model has a single input
+ ov::preprocess::InputInfo& input_info = ppp.input();
+ // 2) Set input tensor information:
+ // - precision of tensor is supposed to be 'u8'
+ // - layout of data is 'NHWC'
+ input_info.tensor().set_element_type(ov::element::u8).set_layout(tensor_layout);
+ // 3) Here we suppose model has 'NCHW' layout for input
+ // DLA --> We let the demo select the layout based on the model
+ // input_info.model().set_layout("NCHW");
+ // 4) output() with no args assumes a model has a single result
+ // - output() with no args assumes a model has a single result
+ // - precision of tensor is supposed to be 'f32'
+ ppp.output().tensor().set_element_type(ov::element::f32);
+
+ // 5) Once the build() method is called, the pre(post)processing steps
+ // for layout and precision conversions are inserted automatically
+ model = ppp.build();
+
+ // -------- Step 4. read input images --------
+ slog::info << "Read input images" << slog::endl;
+
+ ov::Shape input_shape = model->input().get_shape();
+ const size_t width = input_shape[ov::layout::width_idx(tensor_layout)];
+ const size_t height = input_shape[ov::layout::height_idx(tensor_layout)];
+
+ std::vector<std::shared_ptr<unsigned char>> images_data;
+ std::vector<std::string> valid_image_names;
+ for (const auto& i : image_names) {
+ FormatReader::ReaderPtr reader(i.c_str());
+ if (reader.get() == nullptr) {
+ slog::warn << "Image " + i + " cannot be read!" << slog::endl;
+ continue;
+ }
+ // Collect image data
+ std::shared_ptr<unsigned char> data(reader->getData(width, height, FormatReader::Reader::ResizeType::RESIZE));
+ if (data != nullptr) {
+ images_data.push_back(data);
+ valid_image_names.push_back(i);
+ }
+ }
+ if (images_data.empty() || valid_image_names.empty())
+ throw std::logic_error("Valid input images were not found!");
+
+ // -------- Step 5. Loading model to the device --------
+ // Setting batch size using image count
+ const size_t batchSize = images_data.size();
+ slog::info << "Set batch size " << std::to_string(batchSize) << slog::endl;
+ ov::set_batch(model, batchSize);
+ printInputAndOutputsInfo(*model);
+
+ // -------- Step 6. Loading model to the device --------
+ slog::info << "Loading model to the device " << FLAGS_d << slog::endl;
+ ov::CompiledModel compiled_model = core.compile_model(model, FLAGS_d);
+
+ // -------- Step 7. Create infer request --------
+ slog::info << "Create infer request" << slog::endl;
+ ov::InferRequest infer_request = compiled_model.create_infer_request();
+
+ // -------- Step 8. Combine multiple input images as batch --------
+ ov::Tensor input_tensor = infer_request.get_input_tensor();
+
+ for (size_t image_id = 0; image_id < images_data.size(); ++image_id) {
+ const size_t image_size = shape_size(model->input().get_shape()) / batchSize;
+ std::memcpy(input_tensor.data<std::uint8_t>() + image_id * image_size,
+ images_data[image_id].get(),
+ image_size);
+ }
+
+ // -------- Step 9. Do asynchronous inference --------
+ size_t num_iterations = 10;
+ size_t cur_iteration = 0;
+ std::condition_variable condVar;
+ std::mutex mutex;
+ std::exception_ptr exception_var;
+ // -------- Step 10. Do asynchronous inference --------
+ infer_request.set_callback([&](std::exception_ptr ex) {
+ std::lock_guard<std::mutex> l(mutex);
+ if (ex) {
+ exception_var = ex;
+ condVar.notify_all();
+ return;
+ }
+
+ cur_iteration++;
+ slog::info << "Completed " << cur_iteration << " async request execution" << slog::endl;
+ if (cur_iteration < num_iterations) {
+ // here a user can read output containing inference results and put new
+ // input to repeat async request again
+ infer_request.start_async();
+ } else {
+ // continue sample execution after last Asynchronous inference request
+ // execution
+ condVar.notify_one();
+ }
+ });
+
+ // Start async request for the first time
+ slog::info << "Start inference (asynchronous executions)" << slog::endl;
+ infer_request.start_async();
+
+ // Wait all iterations of the async request
+ std::unique_lock<std::mutex> lock(mutex);
+ condVar.wait(lock, [&] {
+ if (exception_var) {
+ std::rethrow_exception(exception_var);
+ }
+
+ return cur_iteration == num_iterations;
+ });
+
+ slog::info << "Completed async requests execution" << slog::endl;
+
+ // -------- Step 11. Process output --------
+ ov::Tensor output = infer_request.get_output_tensor();
+
+ // Read labels from file (e.x. AlexNet.labels)
+ std::string labelFileName = fileNameNoExt(FLAGS_m) + ".labels";
+ std::vector<std::string> labels;
+
+ std::ifstream inputFile;
+ inputFile.open(labelFileName, std::ios::in);
+ if (inputFile.is_open()) {
+ std::string strLine;
+ while (std::getline(inputFile, strLine)) {
+ trim(strLine);
+ labels.push_back(strLine);
+ }
+ }
+
+ // Prints formatted classification results
+ ClassificationResult classificationResult(output, valid_image_names, batchSize, N_TOP_RESULTS, labels);
+ classificationResult.show();
+ } catch (const std::exception& ex) {
+ slog::err << ex.what() << slog::endl;
+ return EXIT_FAILURE;
+ } catch (...) {
+ slog::err << "Unknown/internal exception happened." << slog::endl;
+ return EXIT_FAILURE;
+ }
+
+ return EXIT_SUCCESS;
+}
diff --git a/python/openvino/runtime/common/CMakeLists.txt b/python/openvino/runtime/common/CMakeLists.txt
new file mode 100644
index 0000000..8ea3028
--- /dev/null
+++ b/python/openvino/runtime/common/CMakeLists.txt
@@ -0,0 +1,25 @@
# Copyright (C) 2018-2020 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
#

# Add dependencies for the following modules
find_package(OpenCV COMPONENTS core REQUIRED)

# Helper libraries built for every configuration (plugin API utils,
# image format readers, monitors).
add_subdirectory(utils)
add_subdirectory(format_reader)
add_subdirectory(monitors)

# Demo-only dependencies, enabled when the build defines BUILD_DEMO.
if(DEFINED BUILD_DEMO)
  # This dependency defines CNN prototypes used by text-detection demos.
  include_directories("$ENV{COREDLA_ROOT}/transformations/inc/")
  add_subdirectory(demo_utils)
  add_subdirectory(models)
  # This dependency is needed for runtime demos. The config_factory is used
  # to produce hardware configurations and is required by pipelines.
  # (utils is already added unconditionally above, hence commented out.)
  #add_subdirectory(utils)

  # Following steps compile and link the pipelines library from OpenVINO 2021.4 installation folder.
  # This dependency is required by segmentation demo. It implements a pipeline for sending streaming input and output for inference.
  add_subdirectory(pipelines)
endif()
diff --git a/python/openvino/runtime/common/README.md b/python/openvino/runtime/common/README.md
new file mode 100644
index 0000000..1953fed
--- /dev/null
+++ b/python/openvino/runtime/common/README.md
@@ -0,0 +1,7 @@
+## Patch Log
+
+This README documents the changes made to `runtime/common` so that they can be preserved and reapplied in future OpenVINO uplifts or updates.
+
+| Patch Name | PR Number | Description |
+| ------------------------- | ------------------------- | ------------------------- |
+| Make dla_benchmark less chatty | #3065 | Set the maximum number of printed warnings |
diff --git a/python/openvino/runtime/common/demo_utils/CMakeLists.txt b/python/openvino/runtime/common/demo_utils/CMakeLists.txt
new file mode 100644
index 0000000..b79d72a
--- /dev/null
+++ b/python/openvino/runtime/common/demo_utils/CMakeLists.txt
@@ -0,0 +1,14 @@
# Copyright (C) 2018-2020 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
#

# Collect every header and source file of the demo utils library.
file(GLOB_RECURSE HEADERS "${CMAKE_CURRENT_SOURCE_DIR}/include/*")
file(GLOB_RECURSE SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/src/*")

# Group files for IDE project generators.
source_group("src" FILES ${SOURCES})
source_group("include" FILES ${HEADERS})

add_library(utils STATIC ${HEADERS} ${SOURCES})
# Consumers see the public headers plus the DLA plugin API headers.
target_include_directories(utils PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include"
    "$ENV{COREDLA_ROOT}/dla_plugin/inc/")
target_link_libraries(utils PRIVATE openvino::runtime opencv_core opencv_imgcodecs opencv_videoio ie_samples_utils)
diff --git a/python/openvino/runtime/common/demo_utils/include/utils/args_helper.hpp b/python/openvino/runtime/common/demo_utils/include/utils/args_helper.hpp
new file mode 100644
index 0000000..7a638cc
--- /dev/null
+++ b/python/openvino/runtime/common/demo_utils/include/utils/args_helper.hpp
@@ -0,0 +1,43 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+/**
+ * @brief a header file with common samples functionality
+ * @file args_helper.hpp
+ */
+
+#pragma once
+
+#include <map>
+#include <set>
+#include <string>
+#include <vector>
+
+#include <opencv2/core/types.hpp>
+#include <openvino/openvino.hpp>
+
/**
* @brief This function checks input args and existence of specified files in a given folder
* @param files vector of verified input files, appended to in place
* @param arg path to a file or folder to be checked for existence
*/
void readInputFilesArguments(std::vector<std::string>& files, const std::string& arg);

/**
* @brief This function finds -i/--i key in input args
* It's necessary to process multiple values for single key
* @param files vector of verified input files, filled in place
*/
void parseInputFilesArguments(std::vector<std::string>& files);

/// @brief Splits @p s at every occurrence of @p delim.
std::vector<std::string> split(const std::string& s, char delim);

/// @brief Splits a device string (e.g. the "HETERO:<dev1>,<dev2>" form) into
/// individual device names.
std::vector<std::string> parseDevices(const std::string& device_string);

/// @brief Parses a per-device values string into a device -> value map for the
/// given set of devices.
std::map<std::string, int32_t> parseValuePerDevice(const std::set<std::string>& devices,
                                                   const std::string& values_string);

/// @brief Converts a size string into a cv::Size (exact accepted format is
/// defined by the implementation -- confirm in the .cpp).
cv::Size stringToSize(const std::string& str);

/// @brief Parses a layout string into a tensor-name -> ov::Layout map.
std::map<std::string, ov::Layout> parseLayoutString(const std::string& layout_string);
diff --git a/python/openvino/runtime/common/demo_utils/include/utils/common.hpp b/python/openvino/runtime/common/demo_utils/include/utils/common.hpp
new file mode 100644
index 0000000..dbe7cf0
--- /dev/null
+++ b/python/openvino/runtime/common/demo_utils/include/utils/common.hpp
@@ -0,0 +1,190 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+/**
+ * @brief a header file with common samples functionality
+ * @file common.hpp
+ */
+
+#pragma once
+
+#include <iostream>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include <openvino/openvino.hpp>
+#include "utils/slog.hpp"
+#include "utils/args_helper.hpp"
+
+#ifndef UNUSED
+#ifdef _WIN32
+#define UNUSED
+#else
+#define UNUSED __attribute__((unused))
+#endif
+#endif
+
/// @brief Compile-time element count of a built-in array.
template <typename T, std::size_t N>
constexpr std::size_t arraySize(const T (&arr)[N]) noexcept {
    return sizeof(arr) / sizeof(arr[0]);
}
+
/// @brief Last-resort error reporter (suitable e.g. as a terminate handler --
/// confirm at call sites): logs the currently active exception, if any, and
/// exits with status 1; aborts when called with no exception in flight.
static inline void catcher() noexcept {
    if (std::current_exception()) {
        try {
            // Rethrow so we can dispatch on the concrete exception type.
            std::rethrow_exception(std::current_exception());
        } catch (const std::exception& error) {
            slog::err << error.what() << slog::endl;
        } catch (...) {
            slog::err << "Non-exception object thrown" << slog::endl;
        }
        // Terminate after reporting; never returns to the caller.
        std::exit(1);
    }
    // Called without an active exception: nothing to report, fail hard.
    std::abort();
}
+
/// @brief Restricts @p value to the inclusive range [low, high].
template <typename T>
T clamp(T value, T low, T high) {
    if (value < low) {
        return low;
    }
    if (value > high) {
        return high;
    }
    return value;
}
+
/// @brief Pretty-prints an OpenVINO version: the compile-time
/// major.minor.patch numbers plus the runtime build number from @p version.
inline slog::LogStream& operator<<(slog::LogStream& os, const ov::Version& version) {
    return os << "OpenVINO" << slog::endl
              << "\tversion: " << OPENVINO_VERSION_MAJOR << "." << OPENVINO_VERSION_MINOR << "." << OPENVINO_VERSION_PATCH << slog::endl
              << "\tbuild: " << version.buildNumber;
}
+
/**
 * @class Color
 * @brief An immutable RGB triple, one byte per channel.
 */
class Color {
public:
    /**
     * Constructs a color from individual channel values.
     * @param r value for red channel
     * @param g value for green channel
     * @param b value for blue channel
     */
    Color(unsigned char r, unsigned char g, unsigned char b)
        : _r(r), _g(g), _b(b) {}

    /// @brief Red channel value.
    inline unsigned char red() const { return _r; }

    /// @brief Blue channel value.
    inline unsigned char blue() const { return _b; }

    /// @brief Green channel value.
    inline unsigned char green() const { return _g; }

private:
    unsigned char _r;
    unsigned char _g;
    unsigned char _b;
};
+
// Known colors for training classes from the Cityscapes dataset.
// NOTE(review): the channel order appears to be BGR (OpenCV convention) --
// e.g. {232, 35, 244} matches the Cityscapes "sidewalk" RGB (244, 35, 232) --
// confirm against the consumers before relying on it.
static UNUSED const Color CITYSCAPES_COLORS[] = {
    { 128, 64,  128 },
    { 232, 35,  244 },
    { 70,  70,  70 },
    { 156, 102, 102 },
    { 153, 153, 190 },
    { 153, 153, 153 },
    { 30,  170, 250 },
    { 0,   220, 220 },
    { 35,  142, 107 },
    { 152, 251, 152 },
    { 180, 130, 70 },
    { 60,  20,  220 },
    { 0,   0,   255 },
    { 142, 0,   0 },
    { 70,  0,   0 },
    { 100, 60,  0 },
    { 90,  0,   0 },
    { 230, 0,   0 },
    { 32,  11,  119 },
    { 0,   74,  111 },
    { 81,  0,   81 }
};
+
+inline void showAvailableDevices() {
+ ov::Core core;
+ std::vector<std::string> devices = core.get_available_devices();
+
+ std::cout << "Available devices:";
+ for (const auto& device : devices) {
+ std::cout << ' ' << device;
+ }
+ std::cout << std::endl;
+}
+
/**
 * @brief Strips the trailing extension from a file path.
 *
 * Fix: only a '.' that appears after the last path separator counts as an
 * extension delimiter, so directory names containing dots (e.g.
 * "dir.v2/model") are no longer truncated.
 * @param filepath path to process
 * @return the path without its extension, or unchanged when there is none
 */
inline std::string fileNameNoExt(const std::string& filepath) {
    const auto dotPos = filepath.rfind('.');
    if (dotPos == std::string::npos) return filepath;
    const auto sepPos = filepath.find_last_of("/\\");
    // The last dot belongs to a directory component, not to the file name.
    if (sepPos != std::string::npos && sepPos > dotPos) return filepath;
    return filepath.substr(0, dotPos);
}
+
/// @brief Logs which device(s) @p modelName was loaded to and, for every
/// non-AUTO device, the stream count (plus the thread count on CPU) reported
/// by the compiled model.
inline void logCompiledModelInfo(
    const ov::CompiledModel& compiledModel,
    const std::string& modelName,
    const std::string& deviceName,
    const std::string& modelType = "") {
    slog::info << "The " << modelType << (modelType.empty() ? "" : " ") << "model " << modelName << " is loaded to " << deviceName << slog::endl;
    // Deduplicate the devices named in a possibly composite device string.
    std::set<std::string> devices;
    for (const std::string& device : parseDevices(deviceName)) {
        devices.insert(device);
    }

    if (devices.find("AUTO") == devices.end()) { // do not print info for AUTO device
        for (const auto& device : devices) {
            try {
                slog::info << "\tDevice: " << device << slog::endl;
                int32_t nstreams = compiledModel.get_property(ov::streams::num);
                slog::info << "\t\tNumber of streams: " << nstreams << slog::endl;
                if (device == "CPU") {
                    int32_t nthreads = compiledModel.get_property(ov::inference_num_threads);
                    slog::info << "\t\tNumber of threads: " << (nthreads == 0 ? "AUTO" : std::to_string(nthreads)) << slog::endl;
                }
            }
            // Device does not expose the property; skip it silently.
            catch (const ov::Exception&) {}
        }
    }
}
+
+inline void logBasicModelInfo(const std::shared_ptr<ov::Model>& model) {
+ slog::info << "Model name: " << model->get_friendly_name() << slog::endl;
+
+ // Dump information about model inputs/outputs
+ ov::OutputVector inputs = model->inputs();
+ ov::OutputVector outputs = model->outputs();
+
+ slog::info << "\tInputs: " << slog::endl;
+ for (const ov::Output<ov::Node>& input : inputs) {
+ const std::string name = input.get_any_name();
+ const ov::element::Type type = input.get_element_type();
+ const ov::PartialShape shape = input.get_partial_shape();
+ const ov::Layout layout = ov::layout::get_layout(input);
+
+ slog::info << "\t\t" << name << ", " << type << ", " << shape << ", " << layout.to_string() << slog::endl;
+ }
+
+ slog::info << "\tOutputs: " << slog::endl;
+ for (const ov::Output<ov::Node>& output : outputs) {
+ const std::string name = output.get_any_name();
+ const ov::element::Type type = output.get_element_type();
+ const ov::PartialShape shape = output.get_partial_shape();
+ const ov::Layout layout = ov::layout::get_layout(output);
+
+ slog::info << "\t\t" << name << ", " << type << ", " << shape << ", " << layout.to_string() << slog::endl;
+ }
+
+ return;
+}
diff --git a/python/openvino/runtime/common/demo_utils/include/utils/config_factory.h b/python/openvino/runtime/common/demo_utils/include/utils/config_factory.h
new file mode 100644
index 0000000..c7440b5
--- /dev/null
+++ b/python/openvino/runtime/common/demo_utils/include/utils/config_factory.h
@@ -0,0 +1,52 @@
+/*
+// Copyright (C) 2020-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+#include <stdint.h>
+
+#include <map>
+#include <set>
+#include <string>
+
+#include <openvino/openvino.hpp>
+
/// @brief Aggregates the settings needed to compile and run a model on a device.
struct ModelConfig {
    std::string deviceName;          ///< target device string (may name several devices)
    std::string cpuExtensionsPath;   ///< path to a CPU extensions library, if any
    std::string clKernelsConfigPath; ///< path to a custom OpenCL kernels config, if any
    std::string fpgaArchPath;        ///< path to the FPGA .arch file, if any
    unsigned int maxAsyncRequests;   ///< maximum number of parallel infer requests
    ov::AnyMap compiledModelConfig;  ///< properties for model compilation

    /// @brief Individual devices named in deviceName (semantics defined in the .cpp).
    std::set<std::string> getDevices();
    /// @brief Legacy string-to-string form of the compilation config.
    std::map<std::string, std::string> getLegacyConfig();

protected:
    // Presumably caches the result of getDevices() -- confirm in the .cpp.
    std::set<std::string> devices;
};
+
/// @brief Builds ModelConfig instances from command-line style flag values.
class ConfigFactory {
public:
    /// @brief Config derived from user-supplied flags (device, request count,
    /// streams, threads, and FPGA arch file path).
    static ModelConfig getUserConfig(const std::string& flags_d,
                                     uint32_t flags_nireq,
                                     const std::string& flags_nstreams,
                                     uint32_t flags_nthreads,
                                     const std::string &flags_arch);
    /// @brief Config aimed at minimal latency on the given device.
    static ModelConfig getMinLatencyConfig(const std::string& flags_d, uint32_t flags_nireq);

protected:
    /// @brief Base configuration shared by the public factory methods.
    static ModelConfig getCommonConfig(const std::string& flags_d, uint32_t flags_nireq);
};
diff --git a/python/openvino/runtime/common/demo_utils/include/utils/default_flags.hpp b/python/openvino/runtime/common/demo_utils/include/utils/default_flags.hpp
new file mode 100644
index 0000000..83c32c2
--- /dev/null
+++ b/python/openvino/runtime/common/demo_utils/include/utils/default_flags.hpp
@@ -0,0 +1,21 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <gflags/gflags.h>
+
// Shared gflags definitions for demo input sources (-i and -loop).
#define DEFINE_INPUT_FLAGS \
DEFINE_string(i, "", input_message); \
DEFINE_bool(loop, false, loop_message);

// Shared gflags definitions for demo output sinks (-o and -limit).
#define DEFINE_OUTPUT_FLAGS \
DEFINE_string(o, "", output_message); \
DEFINE_int32(limit, 1000, limit_message);

// Help texts referenced by the flag definitions above.
static const char input_message[] = "Required. An input to process. The input must be a single image, a folder of "
                                    "images, video file or camera id.";
static const char loop_message[] = "Optional. Enable reading the input in a loop.";
static const char output_message[] = "Optional. Name of the output file(s) to save.";
static const char limit_message[] = "Optional. Number of frames to store in output. If 0 is set, all frames are stored.";
diff --git a/python/openvino/runtime/common/demo_utils/include/utils/grid_mat.hpp b/python/openvino/runtime/common/demo_utils/include/utils/grid_mat.hpp
new file mode 100644
index 0000000..7d46d2b
--- /dev/null
+++ b/python/openvino/runtime/common/demo_utils/include/utils/grid_mat.hpp
@@ -0,0 +1,127 @@
+// Copyright (C) 2018-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <algorithm>
+#include <set>
+#include <string>
+#include <vector>
+
+#include <opencv2/core/core.hpp>
+
+class GridMat {
+public:
+ cv::Mat outimg;
+
+ explicit GridMat(const std::vector<cv::Size>& sizes, const cv::Size maxDisp = cv::Size{1920, 1080}) {
+ size_t maxWidth = 0;
+ size_t maxHeight = 0;
+ for (size_t i = 0; i < sizes.size(); i++) {
+ maxWidth = std::max(maxWidth, static_cast<size_t>(sizes[i].width));
+ maxHeight = std::max(maxHeight, static_cast<size_t>(sizes[i].height));
+ }
+ if (0 == maxWidth || 0 == maxHeight) {
+ throw std::invalid_argument("Input resolution must not be zero.");
+ }
+
+ size_t nGridCols = static_cast<size_t>(ceil(sqrt(static_cast<float>(sizes.size()))));
+ size_t nGridRows = (sizes.size() - 1) / nGridCols + 1;
+ size_t gridMaxWidth = static_cast<size_t>(maxDisp.width/nGridCols);
+ size_t gridMaxHeight = static_cast<size_t>(maxDisp.height/nGridRows);
+
+ float scaleWidth = static_cast<float>(gridMaxWidth) / maxWidth;
+ float scaleHeight = static_cast<float>(gridMaxHeight) / maxHeight;
+ float scaleFactor = std::min(1.f, std::min(scaleWidth, scaleHeight));
+
+ cellSize.width = static_cast<int>(maxWidth * scaleFactor);
+ cellSize.height = static_cast<int>(maxHeight * scaleFactor);
+
+ for (size_t i = 0; i < sizes.size(); i++) {
+ cv::Point p;
+ p.x = cellSize.width * (i % nGridCols);
+ p.y = cellSize.height * (i / nGridCols);
+ points.push_back(p);
+ }
+
+ outimg.create(cellSize.height * nGridRows, cellSize.width * nGridCols, CV_8UC3);
+ outimg.setTo(0);
+ clear();
+ }
+
+ cv::Size getCellSize() {
+ return cellSize;
+ }
+
+ void fill(std::vector<cv::Mat>& frames) {
+ if (frames.size() > points.size()) {
+ throw std::logic_error("Cannot display " + std::to_string(frames.size()) + " channels in a grid with " + std::to_string(points.size()) + " cells");
+ }
+
+ for (size_t i = 0; i < frames.size(); i++) {
+ cv::Mat cell = outimg(cv::Rect(points[i].x, points[i].y, cellSize.width, cellSize.height));
+
+ if ((cellSize.width == frames[i].cols) && (cellSize.height == frames[i].rows)) {
+ frames[i].copyTo(cell);
+ } else if ((cellSize.width > frames[i].cols) && (cellSize.height > frames[i].rows)) {
+ frames[i].copyTo(cell(cv::Rect(0, 0, frames[i].cols, frames[i].rows)));
+ } else {
+ cv::resize(frames[i], cell, cellSize);
+ }
+ }
+ unupdatedSourceIDs.clear();
+ }
+
+ void update(const cv::Mat& frame, const size_t sourceID) {
+ const cv::Mat& cell = outimg(cv::Rect(points[sourceID], cellSize));
+
+ if ((cellSize.width == frame.cols) && (cellSize.height == frame.rows)) {
+ frame.copyTo(cell);
+ } else if ((cellSize.width > frame.cols) && (cellSize.height > frame.rows)) {
+ frame.copyTo(cell(cv::Rect(0, 0, frame.cols, frame.rows)));
+ } else {
+ cv::resize(frame, cell, cellSize);
+ }
+ unupdatedSourceIDs.erase(unupdatedSourceIDs.find(sourceID));
+ }
+
+ bool isFilled() const noexcept {
+ return unupdatedSourceIDs.empty();
+ }
+ void clear() {
+ size_t counter = 0;
+ std::generate_n(std::inserter(unupdatedSourceIDs, unupdatedSourceIDs.end()), points.size(), [&counter]{return counter++;});
+ }
+ std::set<size_t> getUnupdatedSourceIDs() const noexcept {
+ return unupdatedSourceIDs;
+ }
+ cv::Mat getMat() const noexcept {
+ return outimg;
+ }
+
+private:
+ cv::Size cellSize;
+ std::set<size_t> unupdatedSourceIDs;
+ std::vector<cv::Point> points;
+};
+
+void fillROIColor(cv::Mat& displayImage, cv::Rect roi, cv::Scalar color, double opacity) {
+ if (opacity > 0) {
+ roi = roi & cv::Rect(0, 0, displayImage.cols, displayImage.rows);
+ cv::Mat textROI = displayImage(roi);
+ cv::addWeighted(color, opacity, textROI, 1.0 - opacity , 0.0, textROI);
+ }
+}
+
+void putTextOnImage(cv::Mat& displayImage, std::string str, cv::Point p,
+ cv::HersheyFonts font, double fontScale, cv::Scalar color,
+ int thickness = 1, cv::Scalar bgcolor = cv::Scalar(),
+ double opacity = 0) {
+ int baseline = 0;
+ cv::Size textSize = cv::getTextSize(str, font, 0.5, 1, &baseline);
+ fillROIColor(displayImage, cv::Rect(cv::Point(p.x, p.y + baseline),
+ cv::Point(p.x + textSize.width, p.y - textSize.height)),
+ bgcolor, opacity);
+ cv::putText(displayImage, str, p, font, fontScale, color, thickness);
+}
diff --git a/python/openvino/runtime/common/demo_utils/include/utils/image_utils.h b/python/openvino/runtime/common/demo_utils/include/utils/image_utils.h
new file mode 100644
index 0000000..2731a9a
--- /dev/null
+++ b/python/openvino/runtime/common/demo_utils/include/utils/image_utils.h
@@ -0,0 +1,29 @@
+/*
+// Copyright (C) 2021-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+
+#include <opencv2/opencv.hpp>
+
/// @brief Strategies for fitting an image into a target size.
enum RESIZE_MODE {
    RESIZE_FILL,                  ///< stretch to the target size, ignoring aspect ratio
    RESIZE_KEEP_ASPECT,           ///< preserve aspect ratio (exact fit rules -- confirm in the implementation)
    RESIZE_KEEP_ASPECT_LETTERBOX  ///< preserve aspect ratio, padding borders with BorderConstant
};

/// @brief Resizes @p mat to width x height using the given mode and
/// interpolation; @p roi, when non-null, receives the region the source image
/// occupies in the result (behavior defined in the implementation).
cv::Mat resizeImageExt(const cv::Mat& mat, int width, int height, RESIZE_MODE resizeMode = RESIZE_FILL,
                       cv::InterpolationFlags interpolationMode = cv::INTER_LINEAR, cv::Rect* roi = nullptr,
                       cv::Scalar BorderConstant = cv::Scalar(0, 0, 0));
diff --git a/python/openvino/runtime/common/demo_utils/include/utils/images_capture.h b/python/openvino/runtime/common/demo_utils/include/utils/images_capture.h
new file mode 100644
index 0000000..f2afdfc
--- /dev/null
+++ b/python/openvino/runtime/common/demo_utils/include/utils/images_capture.h
@@ -0,0 +1,53 @@
+// Copyright (C) 2020-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+#include <stddef.h>
+
+#include <limits>
+#include <memory>
+#include <string>
+
+#include <opencv2/core.hpp>
+
+#include "utils/performance_metrics.hpp"
+
+enum class read_type { efficient, safe };
+
/// @brief Abstract frame source interface (still images, folders, video
/// files, or cameras -- see openImagesCapture below).
class ImagesCapture {
public:
    const bool loop;  // when true, the source restarts after its last frame

    ImagesCapture(bool loop) : loop{loop} {}
    /// @brief Nominal frames-per-second of the source.
    virtual double fps() const = 0;
    /// @brief Reads the next frame (end-of-input signaling is implementation-defined -- confirm in subclasses).
    virtual cv::Mat read() = 0;
    /// @brief Human-readable source type name.
    virtual std::string getType() const = 0;
    /// @brief Metrics accumulated while reading frames.
    const PerformanceMetrics& getMetrics() {
        return readerMetrics;
    }
    virtual ~ImagesCapture() = default;

protected:
    PerformanceMetrics readerMetrics;  // filled by implementations during read()
};
+
+// An advanced version of
+// try {
+// return cv::VideoCapture(std::stoi(input));
+// } catch (const std::invalid_argument&) {
+// return cv::VideoCapture(input);
+// } catch (const std::out_of_range&) {
+// return cv::VideoCapture(input);
+// }
// Some VideoCapture backends continue owning the video buffer under cv::Mat. The safe read_type forces
// read() to return a copy
+// https://github.com/opencv/opencv/blob/46e1560678dba83d25d309d8fbce01c40f21b7be/modules/gapi/include/opencv2/gapi/streaming/cap.hpp#L72-L76
/// @brief Factory: opens @p input as an image, image folder, video file, or
/// numeric camera id (see the comment above for backend handling).
/// @param loop restart the source after the last frame
/// @param type safe mode forces read() to return an independent frame copy
/// @param initialImageId index of the first frame to deliver
/// @param readLengthLimit maximum number of frames to deliver
/// @param cameraResolution requested capture resolution for camera inputs
std::unique_ptr<ImagesCapture> openImagesCapture(
    const std::string& input,
    bool loop,
    read_type type = read_type::efficient,
    size_t initialImageId = 0,
    size_t readLengthLimit = std::numeric_limits<size_t>::max(), // General option
    cv::Size cameraResolution = {1280, 720}
    );
diff --git a/python/openvino/runtime/common/demo_utils/include/utils/input_wrappers.hpp b/python/openvino/runtime/common/demo_utils/include/utils/input_wrappers.hpp
new file mode 100644
index 0000000..eff38a7
--- /dev/null
+++ b/python/openvino/runtime/common/demo_utils/include/utils/input_wrappers.hpp
@@ -0,0 +1,149 @@
+// Copyright (C) 2018-2021 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <list>
+#include <memory>
+#include <set>
+#include <thread>
+#include <vector>
+#include <queue>
+
+#include <opencv2/opencv.hpp>
+
+class InputChannel;
+
/// @brief Interface for a frame source shared by several InputChannel consumers.
class IInputSource {
public:
    /// @brief Reads the next frame on behalf of @p caller; returns false when exhausted.
    virtual bool read(cv::Mat& mat, const std::shared_ptr<InputChannel>& caller) = 0;
    /// @brief Registers a channel interested in this source's frames.
    virtual void addSubscriber(const std::weak_ptr<InputChannel>& inputChannel) = 0;
    /// @brief Frame dimensions produced by this source.
    virtual cv::Size getSize() = 0;
    // lock()/unlock() serialize read() calls across competing channels.
    virtual void lock() {
        sourceLock.lock();
    }
    virtual void unlock() {
        sourceLock.unlock();
    }
    virtual ~IInputSource() = default;
private:
    std::mutex sourceLock;
};
+
/// @brief Per-consumer view of a shared IInputSource. Frames read by one
/// channel may be pushed into the queues of the other subscribed channels
/// (see VideoCaptureSource::read).
class InputChannel: public std::enable_shared_from_this<InputChannel> { // note: public inheritance
public:
    InputChannel(const InputChannel&) = delete;
    InputChannel& operator=(const InputChannel&) = delete;
    /// @brief Creates a channel and subscribes it to @p source.
    static std::shared_ptr<InputChannel> create(const std::shared_ptr<IInputSource>& source) {
        auto tmp = std::shared_ptr<InputChannel>(new InputChannel(source));
        source->addSubscriber(tmp);
        return tmp;
    }
    /// @brief Pops a queued frame, or pulls a fresh one from the source when
    /// the queue is empty. Returns false when the source is exhausted.
    bool read(cv::Mat& mat) {
        readQueueMutex.lock();
        if (readQueue.empty()) {
            // Release our queue lock before taking the source lock (presumably
            // to avoid lock-order inversion with channels pushing into us --
            // confirm against IInputSource implementations), then re-check:
            // another channel may have pushed a frame in the meantime.
            readQueueMutex.unlock();
            source->lock();
            readQueueMutex.lock();
            if (readQueue.empty()) {
                bool res = source->read(mat, shared_from_this());
                readQueueMutex.unlock();
                source->unlock();
                return res;
            } else {
                source->unlock();
            }
        }
        mat = readQueue.front().clone();
        readQueue.pop();
        readQueueMutex.unlock();
        return true;
    }
    /// @brief Queues a frame delivered on behalf of another channel's read.
    void push(const cv::Mat& mat) {
        readQueueMutex.lock();
        readQueue.push(mat);
        readQueueMutex.unlock();
    }
    cv::Size getSize() {
        return source->getSize();
    }

private:
    explicit InputChannel(const std::shared_ptr<IInputSource>& source): source{source} {}
    std::shared_ptr<IInputSource> source;
    std::queue<cv::Mat, std::list<cv::Mat>> readQueue;  // frames pushed by sibling channels
    std::mutex readQueueMutex;                          // guards readQueue
};
+
/// @brief IInputSource backed by a cv::VideoCapture (video file or camera).
class VideoCaptureSource: public IInputSource {
public:
    VideoCaptureSource(const cv::VideoCapture& videoCapture, bool loop): videoCapture{videoCapture}, loop{loop},
        imSize{static_cast<int>(videoCapture.get(cv::CAP_PROP_FRAME_WIDTH)), static_cast<int>(videoCapture.get(cv::CAP_PROP_FRAME_HEIGHT))} {}
    /// @brief Reads the next frame, rewinding first when looping. When more
    /// than one channel is subscribed, the frame is also pushed (as a shared
    /// clone) to every subscriber other than @p caller.
    bool read(cv::Mat& mat, const std::shared_ptr<InputChannel>& caller) override {
        if (!videoCapture.read(mat)) {
            if (loop) {
                videoCapture.set(cv::CAP_PROP_POS_FRAMES, 0);
                videoCapture.read(mat);
            } else {
                return false;
            }
        }
        if (1 != subscribedInputChannels.size()) {
            // Clone once; all other subscribers share the same copy.
            cv::Mat shared = mat.clone();
            for (const std::weak_ptr<InputChannel>& weakInputChannel : subscribedInputChannels) {
                try {
                    std::shared_ptr<InputChannel> sharedInputChannel = std::shared_ptr<InputChannel>(weakInputChannel);
                    if (caller != sharedInputChannel) {
                        sharedInputChannel->push(shared);
                    }
                } catch (const std::bad_weak_ptr&) {}  // subscriber already destroyed; skip it
            }
        }
        return true;
    }
    void addSubscriber(const std::weak_ptr<InputChannel>& inputChannel) override {
        subscribedInputChannels.push_back(inputChannel);
    }
    cv::Size getSize() override {
        return imSize;
    }

private:
    std::vector<std::weak_ptr<InputChannel>> subscribedInputChannels;
    cv::VideoCapture videoCapture;
    bool loop;        // rewind to frame 0 at end of stream
    cv::Size imSize;  // frame size reported by the capture at construction
};
+
/// @brief IInputSource that serves a single still image.
class ImageSource: public IInputSource {
public:
    ImageSource(const cv::Mat& im, bool loop): im{im.clone()}, loop{loop} {} // clone to avoid image changing
    /// @brief In loop mode, always returns the image; otherwise each
    /// subscribed channel receives the image exactly once and then false.
    bool read(cv::Mat& mat, const std::shared_ptr<InputChannel>& caller) override {
        if (!loop) {
            auto subscribedInputChannelsIt = subscribedInputChannels.find(caller);
            if (subscribedInputChannels.end() == subscribedInputChannelsIt) {
                // Caller already consumed its one copy (it was erased below).
                return false;
            } else {
                subscribedInputChannels.erase(subscribedInputChannelsIt);
                mat = im;
                return true;
            }
        } else {
            mat = im;
            return true;
        }
    }
    void addSubscriber(const std::weak_ptr<InputChannel>& inputChannel) override {
        if (false == subscribedInputChannels.insert(inputChannel).second)
            throw std::invalid_argument("The insertion did not take place");
    }
    cv::Size getSize() override {
        return im.size();
    }

private:
    // owner_less ordering lets weak_ptrs serve as set keys.
    std::set<std::weak_ptr<InputChannel>, std::owner_less<std::weak_ptr<InputChannel>>> subscribedInputChannels;
    cv::Mat im;
    bool loop;
};
diff --git a/python/openvino/runtime/common/demo_utils/include/utils/kuhn_munkres.hpp b/python/openvino/runtime/common/demo_utils/include/utils/kuhn_munkres.hpp
new file mode 100644
index 0000000..6e6ac51
--- /dev/null
+++ b/python/openvino/runtime/common/demo_utils/include/utils/kuhn_munkres.hpp
@@ -0,0 +1,57 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "opencv2/core.hpp"
+
+#include <memory>
+#include <vector>
+
+
+///
+/// \brief The KuhnMunkres class
+///
+/// Solves the assignment problem.
+///
class KuhnMunkres {
public:
    ///
    /// \brief Initializes the class for assignment problem solving.
    /// \param[in] greedy If a faster greedy matching algorithm should be used.
    explicit KuhnMunkres(bool greedy = false);

    ///
    /// \brief Solves the assignment problem for given dissimilarity matrix.
    /// It returns a vector that where each element is a column index for
    /// corresponding row (e.g. result[0] stores optimal column index for very
    /// first row in the dissimilarity matrix).
    /// \param dissimilarity_matrix CV_32F dissimilarity matrix.
    /// \return Optimal column index for each row. -1 means that there is no
    /// column for row.
    ///
    std::vector<size_t> Solve(const cv::Mat &dissimilarity_matrix);

private:
    // Cell markers used by the Hungarian algorithm's marking matrix.
    static constexpr int kStar = 1;
    static constexpr int kPrime = 2;

    cv::Mat dm_;                        // working copy of the dissimilarity matrix
    cv::Mat marked_;                    // star/prime marks per cell
    std::vector<cv::Point> points_;     // augmenting path of marked cells

    std::vector<int> is_row_visited_;   // row cover flags
    std::vector<int> is_col_visited_;   // column cover flags

    int n_;                             // size of the (square) working matrix
    bool greedy_;                       // use the approximate greedy variant

    void TrySimpleCase();
    bool CheckIfOptimumIsFound();
    cv::Point FindUncoveredMinValPos();
    void UpdateDissimilarityMatrix(float val);
    int FindInRow(int row, int what);
    int FindInCol(int col, int what);
    void Run();
};
diff --git a/python/openvino/runtime/common/demo_utils/include/utils/nms.hpp b/python/openvino/runtime/common/demo_utils/include/utils/nms.hpp
new file mode 100644
index 0000000..1fd475f
--- /dev/null
+++ b/python/openvino/runtime/common/demo_utils/include/utils/nms.hpp
@@ -0,0 +1,81 @@
+/*
+// Copyright (C) 2021-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+
#include <algorithm>
#include <cmath>
#include <numeric>
#include <vector>

#include "opencv2/core.hpp"
+
// Axis-aligned box in inclusive pixel coordinates: [left, right] x [top, bottom].
struct Anchor {
    float left;
    float top;
    float right;
    float bottom;

    // Inclusive extent: a box with left == right is one pixel wide.
    float getWidth() const {
        const float span = right - left;
        return span + 1.0f;
    }
    float getHeight() const {
        const float span = bottom - top;
        return span + 1.0f;
    }
    // Center coordinates derived from the inclusive extents above.
    float getXCenter() const {
        const float w = getWidth();
        return left + (w - 1.0f) / 2.0f;
    }
    float getYCenter() const {
        const float h = getHeight();
        return top + (h - 1.0f) / 2.0f;
    }
};
+
/**
 * @brief Greedy non-maximum suppression over scored boxes.
 *
 * AnchorT must expose float members left/top/right/bottom. The template
 * parameter is named AnchorT (not Anchor) so it does not shadow the Anchor
 * struct declared above.
 *
 * @param boxes candidate boxes.
 * @param scores one score per box; entries with a negative score are ignored.
 * @param thresh IoU threshold at or above which a lower-scored box is suppressed.
 * @param includeBoundaries add 1 px to each side length (inclusive-coordinate convention).
 * @return indices of the kept boxes, ordered by decreasing score.
 */
template <typename AnchorT>
std::vector<int> nms(const std::vector<AnchorT>& boxes, const std::vector<float>& scores,
                     const float thresh, bool includeBoundaries=false) {
    std::vector<float> areas(boxes.size());
    for (size_t i = 0; i < boxes.size(); ++i) {
        areas[i] = (boxes[i].right - boxes[i].left + includeBoundaries) * (boxes[i].bottom - boxes[i].top + includeBoundaries);
    }
    // Visit boxes in decreasing score order; suppressed entries are marked with -1.
    std::vector<int> order(scores.size());
    std::iota(order.begin(), order.end(), 0);
    std::sort(order.begin(), order.end(), [&scores](int o1, int o2) { return scores[o1] > scores[o2]; });

    // Only the leading run of non-negative scores participates.
    size_t ordersNum = 0;
    for (; ordersNum < order.size() && scores[order[ordersNum]] >= 0; ordersNum++);

    std::vector<int> keep;
    bool shouldContinue = true;
    for (size_t i = 0; shouldContinue && i < ordersNum; ++i) {
        auto idx1 = order[i];
        if (idx1 >= 0) {
            keep.push_back(idx1);
            shouldContinue = false;  // stop early unless an unsuppressed box remains
            for (size_t j = i + 1; j < ordersNum; ++j) {
                auto idx2 = order[j];
                if (idx2 >= 0) {
                    shouldContinue = true;
                    auto overlappingWidth = std::fminf(boxes[idx1].right, boxes[idx2].right) - std::fmaxf(boxes[idx1].left, boxes[idx2].left);
                    auto overlappingHeight = std::fminf(boxes[idx1].bottom, boxes[idx2].bottom) - std::fmaxf(boxes[idx1].top, boxes[idx2].top);
                    auto intersection = overlappingWidth > 0 && overlappingHeight > 0 ? overlappingWidth * overlappingHeight : 0;
                    auto overlap = intersection / (areas[idx1] + areas[idx2] - intersection);

                    if (overlap >= thresh) {
                        order[j] = -1;  // IoU with a kept box exceeds the threshold: suppress
                    }
                }
            }
        }
    }
    return keep;
}
diff --git a/python/openvino/runtime/common/demo_utils/include/utils/ocv_common.hpp b/python/openvino/runtime/common/demo_utils/include/utils/ocv_common.hpp
new file mode 100644
index 0000000..ebb5e14
--- /dev/null
+++ b/python/openvino/runtime/common/demo_utils/include/utils/ocv_common.hpp
@@ -0,0 +1,289 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+/**
+ * @brief a header file with common samples functionality using OpenCV
+ * @file ocv_common.hpp
+ */
+
+#pragma once
+
+#include <opencv2/opencv.hpp>
+#include <openvino/openvino.hpp>
+
+#include "utils/common.hpp"
+#include "utils/shared_tensor_allocator.hpp"
+
+/**
+* @brief Get cv::Mat value in the correct format.
+*/
+template <typename T>
+const T getMatValue(const cv::Mat& mat, size_t h, size_t w, size_t c) {
+ switch (mat.type()) {
+ case CV_8UC1: return (T)mat.at<uchar>(h, w);
+ case CV_8UC3: return (T)mat.at<cv::Vec3b>(h, w)[c];
+ case CV_32FC1: return (T)mat.at<float>(h, w);
+ case CV_32FC3: return (T)mat.at<cv::Vec3f>(h, w)[c];
+ }
+ throw std::runtime_error("cv::Mat type is not recognized");
+};
+
+/**
+* @brief Resize and copy image data from cv::Mat object to a given Tensor object.
+* @param mat - given cv::Mat object with an image data.
+* @param tensor - Tensor object which to be filled by an image data.
+* @param batchIndex - batch index of an image inside of the blob.
+*/
+static UNUSED void matToTensor(const cv::Mat& mat, const ov::Tensor& tensor, int batchIndex = 0) {
+ ov::Shape tensorShape = tensor.get_shape();
+ static const ov::Layout layout("NCHW");
+ const size_t width = tensorShape[ov::layout::width_idx(layout)];
+ const size_t height = tensorShape[ov::layout::height_idx(layout)];
+ const size_t channels = tensorShape[ov::layout::channels_idx(layout)];
+ if (static_cast<size_t>(mat.channels()) != channels) {
+ throw std::runtime_error("The number of channels for model input and image must match");
+ }
+ if (channels != 1 && channels != 3) {
+ throw std::runtime_error("Unsupported number of channels");
+ }
+ int batchOffset = batchIndex * width * height * channels;
+
+ cv::Mat resizedMat;
+ if (static_cast<int>(width) != mat.size().width || static_cast<int>(height) != mat.size().height) {
+ cv::resize(mat, resizedMat, cv::Size(width, height));
+ } else {
+ resizedMat = mat;
+ }
+
+ if (tensor.get_element_type() == ov::element::f32) {
+ float_t* tensorData = tensor.data<float_t>();
+ for (size_t c = 0; c < channels; c++)
+ for (size_t h = 0; h < height; h++)
+ for (size_t w = 0; w < width; w++)
+ tensorData[batchOffset + c * width * height + h * width + w] =
+ getMatValue<float_t>(resizedMat, h, w, c);
+ } else {
+ uint8_t* tensorData = tensor.data<uint8_t>();
+ if (resizedMat.depth() == CV_32F) {
+ throw std::runtime_error("Conversion of cv::Mat from float_t to uint8_t is forbidden");
+ }
+ for (size_t c = 0; c < channels; c++)
+ for (size_t h = 0; h < height; h++)
+ for (size_t w = 0; w < width; w++)
+ tensorData[batchOffset + c * width * height + h * width + w] =
+ getMatValue<uint8_t>(resizedMat, h, w, c);
+ }
+}
+
// Wraps a dense 8U/32F cv::Mat into an NHWC ov::Tensor without copying pixels:
// the custom allocator hands out the Mat's own buffer (and the allocator's
// stored cv::Mat keeps that buffer alive).
static UNUSED ov::Tensor wrapMat2Tensor(const cv::Mat& mat) {
    auto matType = mat.type() & CV_MAT_DEPTH_MASK;
    if (matType != CV_8U && matType != CV_32F) {
        throw std::runtime_error("Unsupported mat type for wrapping");
    }
    bool isMatFloat = matType == CV_32F;

    const size_t channels = mat.channels();
    const size_t height = mat.rows;
    const size_t width = mat.cols;

    // step.buf[0]/[1]: byte strides of one row / one pixel.
    const size_t strideH = mat.step.buf[0];
    const size_t strideW = mat.step.buf[1];

    // Zero-copy wrapping is only valid when the data is contiguous (no row padding).
    const bool isDense = !isMatFloat ? (strideW == channels && strideH == channels * width) :
        (strideW == channels * sizeof(float) && strideH == channels * width * sizeof(float));
    if (!isDense) {
        throw std::runtime_error("Doesn't support conversion from not dense cv::Mat");
    }
    auto precision = isMatFloat ? ov::element::f32 : ov::element::u8;
    auto allocator = std::make_shared<SharedTensorAllocator>(mat);
    return ov::Tensor(precision, ov::Shape{ 1, height, width, channels }, ov::Allocator(allocator));
}
+
// Resizes mat directly into the tensor's preallocated buffer: the destination
// cv::Mat is a non-owning view over tensor.data(). Tensor must be u8, NHWC,
// batch 1, 3 channels (checked with asserts only).
static inline void resize2tensor(const cv::Mat& mat, const ov::Tensor& tensor) {
    static const ov::Layout layout{"NHWC"};
    const ov::Shape& shape = tensor.get_shape();
    cv::Size size{int(shape[ov::layout::width_idx(layout)]), int(shape[ov::layout::height_idx(layout)])};
    assert(tensor.get_element_type() == ov::element::u8);
    assert(shape.size() == 4);
    assert(shape[ov::layout::batch_idx(layout)] == 1);
    assert(shape[ov::layout::channels_idx(layout)] == 3);
    cv::resize(mat, cv::Mat{size, CV_8UC3, tensor.data()}, size);
}
+
+static inline ov::Layout getLayoutFromShape(const ov::Shape& shape) {
+ if (shape.size() == 2) {
+ return "NC";
+ }
+ else if (shape.size() == 3) {
+ return (shape[0] >= 1 && shape[0] <= 4) ? "CHW" :
+ "HWC";
+ }
+ else if (shape.size() == 4) {
+ return (shape[1] >= 1 && shape[1] <= 4) ? "NCHW" :
+ "NHWC";
+ }
+ else {
+ throw std::runtime_error("Usupported " + std::to_string(shape.size()) + "D shape");
+ }
+}
+
+/**
+ * @brief Puts text message on the frame, highlights the text with a white border to make it distinguishable from
+ * the background.
+ * @param frame - frame to put the text on.
+ * @param message - text of the message.
+ * @param position - bottom-left corner of the text string in the image.
+ * @param fontFace - font type.
+ * @param fontScale - font scale factor that is multiplied by the font-specific base size.
+ * @param color - text color.
+ * @param thickness - thickness of the lines used to draw a text.
+ */
+inline void putHighlightedText(const cv::Mat& frame,
+ const std::string& message,
+ cv::Point position,
+ int fontFace,
+ double fontScale,
+ cv::Scalar color,
+ int thickness) {
+ cv::putText(frame, message, position, fontFace, fontScale, cv::Scalar(255, 255, 255), thickness + 1);
+ cv::putText(frame, message, position, fontFace, fontScale, color, thickness);
+}
+
+// TODO: replace with Size::empty() after OpenCV3 is dropped
+static inline bool isSizeEmpty(const cv::Size& size) {
+ return size.width <= 0 || size.height <= 0;
+}
+
+// TODO: replace with Rect::empty() after OpenCV3 is dropped
+static inline bool isRectEmpty(const cv::Rect& rect) {
+ return rect.width <= 0 || rect.height <= 0;
+}
+
// Scales frames (and coordinates drawn on them) to fit a target output
// resolution while preserving aspect ratio.
class OutputTransform {
public:
    OutputTransform() : doResize(false), scaleFactor(1) {}

    OutputTransform(cv::Size inputSize, cv::Size outputResolution) :
        doResize(true), scaleFactor(1), inputSize(inputSize), outputResolution(outputResolution) {}

    // Computes the largest size with the input's aspect ratio that fits inside
    // outputResolution, and caches the corresponding scale factor.
    cv::Size computeResolution() {
        float inputWidth = static_cast<float>(inputSize.width);
        float inputHeight = static_cast<float>(inputSize.height);
        scaleFactor = std::min(outputResolution.height / inputHeight, outputResolution.width / inputWidth);
        newResolution = cv::Size{static_cast<int>(inputWidth * scaleFactor), static_cast<int>(inputHeight * scaleFactor)};
        return newResolution;
    }

    // Resizes the image in place; recomputes the scale when the frame size changes.
    // The exact float comparison with 1 is deliberate: it skips work only when the
    // scale is exactly 1 (e.g. never computed, or a perfect fit).
    void resize(cv::Mat& image) {
        if (!doResize) { return; }
        cv::Size currSize = image.size();
        if (currSize != inputSize) {
            inputSize = currSize;
            computeResolution();
        }
        if (scaleFactor == 1) { return; }
        cv::resize(image, image, newResolution);
    }

    // Maps a point (any type with x/y) from input-frame to output coordinates.
    template<typename T>
    void scaleCoord(T& coord) {
        if (!doResize || scaleFactor == 1) { return; }
        coord.x = std::floor(coord.x * scaleFactor);
        coord.y = std::floor(coord.y * scaleFactor);
    }

    // Maps a rectangle (any type with x/y/width/height) to output coordinates.
    template<typename T>
    void scaleRect(T& rect) {
        if (!doResize || scaleFactor == 1) { return; }
        scaleCoord(rect);
        rect.width = std::floor(rect.width * scaleFactor);
        rect.height = std::floor(rect.height * scaleFactor);
    }

    bool doResize;

private:
    float scaleFactor;
    cv::Size inputSize;
    cv::Size outputResolution;
    cv::Size newResolution;
};
+
// Applies optional BGR->RGB channel reversal and per-channel mean/scale
// normalization to input frames (the --reverse_input_channels, --mean_values
// and --scale_values demo options).
class InputTransform {
public:
    InputTransform() : reverseInputChannels(false), isTrivial(true) {}

    InputTransform(bool reverseInputChannels, const std::string& meanValues, const std::string& scaleValues) :
        reverseInputChannels(reverseInputChannels),
        isTrivial(!reverseInputChannels && meanValues.empty() && scaleValues.empty()),
        means(meanValues.empty() ? cv::Scalar(0.0, 0.0, 0.0) : string2Vec(meanValues)),
        stdScales(scaleValues.empty() ? cv::Scalar(1.0, 1.0, 1.0) : string2Vec(scaleValues)) {
    }

    // Parses a space-separated triple like "123.6 117.0 104.0" into a cv::Scalar.
    // Throws std::runtime_error on malformed input or a wrong element count.
    cv::Scalar string2Vec(const std::string& string) {
        const auto& strValues = split(string, ' ');
        std::vector<float> values;
        try {
            for (auto& str : strValues)
                values.push_back(std::stof(str));
        }
        catch (const std::invalid_argument&) {
            throw std::runtime_error("Invalid parameter --mean_values or --scale_values is provided.");
        }
        if (values.size() != 3) {
            throw std::runtime_error("InputTransform expects 3 values per channel, but get \"" + string + "\".");
        }
        return cv::Scalar(values[0], values[1], values[2]);
    }

    // u8 input suffices when no normalization happens; otherwise f32 is required.
    void setPrecision(ov::preprocess::PrePostProcessor& ppp, const std::string& tensorName) {
        const auto precision = isTrivial ? ov::element::u8 : ov::element::f32;
        ppp.input(tensorName).tensor().
            set_element_type(precision);
    }

    // Returns the normalized frame; pass-through (no copy, no conversion) when trivial.
    cv::Mat operator()(const cv::Mat& inputs) {
        if (isTrivial) { return inputs; }
        cv::Mat result;
        inputs.convertTo(result, CV_32F);
        if (reverseInputChannels) {
            cv::cvtColor(result, result, cv::COLOR_BGR2RGB);
        }
        // TODO: merge the two following lines after OpenCV3 is dropped
        result -= means;
        result /= cv::Mat{stdScales};
        return result;
    }

private:
    bool reverseInputChannels;
    bool isTrivial;
    cv::Scalar means;
    cv::Scalar stdScales;
};
+
// VideoWriter that opens the output file lazily on the first written frame
// (the frame size is unknown until then) and stops writing after 'lim' frames
// (0 means no limit).
class LazyVideoWriter {
    cv::VideoWriter writer;
    unsigned nwritten;
public:
    const std::string filenames;
    const double fps;
    const unsigned lim;

    // nwritten starts at 1: the frame written right after open() below is not
    // counted by the increment branch, so it is pre-counted here.
    LazyVideoWriter(const std::string& filenames, double fps, unsigned lim) :
        nwritten{1}, filenames{filenames}, fps{fps}, lim{lim} {}
    void write(const cv::Mat& im) {
        if (writer.isOpened() && (nwritten < lim || 0 == lim)) {
            writer.write(im);
            ++nwritten;
            return;
        }
        if (!writer.isOpened() && !filenames.empty()) {
            if (!writer.open(filenames, cv::VideoWriter::fourcc('M', 'J', 'P', 'G'), fps, im.size())) {
                throw std::runtime_error("Can't open video writer");
            }
            writer.write(im);
        }
    }
};
diff --git a/python/openvino/runtime/common/demo_utils/include/utils/performance_metrics.hpp b/python/openvino/runtime/common/demo_utils/include/utils/performance_metrics.hpp
new file mode 100644
index 0000000..6c728b0
--- /dev/null
+++ b/python/openvino/runtime/common/demo_utils/include/utils/performance_metrics.hpp
@@ -0,0 +1,92 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+/**
+ * @brief a header file for performance metrics calculation class
+ * @file performance_metrics.hpp
+ */
+
+#pragma once
+
+#include <chrono>
+#include <iomanip>
+#include <iostream>
+#include <sstream>
+
+#include "utils/ocv_common.hpp"
+
// Collects FPS and latency statistics over a sliding time window as well as
// totals for the whole run, and can render them onto frames.
class PerformanceMetrics {
public:
    using Clock = std::chrono::steady_clock;
    using TimePoint = std::chrono::time_point<Clock>;
    using Duration = Clock::duration;
    using Ms = std::chrono::duration<double, std::ratio<1, 1000>>;
    using Sec = std::chrono::duration<double, std::ratio<1, 1>>;

    // Snapshot of the two reported numbers.
    struct Metrics {
        double latency;  // milliseconds
        double fps;
    };

    // Selects which numbers update()/paintMetrics() draw on the frame.
    enum MetricTypes {
        ALL,
        FPS,
        LATENCY
    };

    PerformanceMetrics(Duration timeWindow = std::chrono::seconds(1));
    // Records one finished request and paints the current metrics on the frame.
    void update(TimePoint lastRequestStartTime,
                const cv::Mat& frame,
                cv::Point position = {15, 30},
                int fontFace = cv::FONT_HERSHEY_COMPLEX,
                double fontScale = 0.75,
                cv::Scalar color = {200, 10, 10},
                int thickness = 2, MetricTypes metricType = ALL);
    // Records one finished request without painting anything.
    void update(TimePoint lastRequestStartTime);

    /// Paints metrics over provided mat
    /// @param frame frame to paint over
    /// @param position left top corner of text block
    /// @param fontScale font scale
    /// @param color font color
    /// @param thickness font thickness
    void paintMetrics(const cv::Mat& frame,
                      cv::Point position = { 15, 30 },
                      int fontFace = cv::FONT_HERSHEY_COMPLEX,
                      double fontScale = 0.75,
                      cv::Scalar color = { 200, 10, 10 },
                      int thickness = 2, MetricTypes metricType = ALL) const;

    Metrics getLast() const;   // metrics of the last completed time window
    Metrics getTotal() const;  // metrics over the whole run
    void logTotal() const;

private:
    // Accumulated latency/period/frame-count over some span of time.
    struct Statistic {
        Duration latency;
        Duration period;
        int frameCount;

        Statistic() {
            latency = Duration::zero();
            period = Duration::zero();
            frameCount = 0;
        }

        void combine(const Statistic& other) {
            latency += other.latency;
            period += other.period;
            frameCount += other.frameCount;
        }
    };

    Duration timeWindowSize;            // length of the moving-average window
    Statistic lastMovingStatistic;      // stats of the last completed window
    Statistic currentMovingStatistic;   // stats of the window in progress
    Statistic totalStatistic;           // stats since construction
    TimePoint lastUpdateTime;
    bool firstFrameProcessed;
};

// Logs a per-stage latency breakdown (values are in milliseconds).
void logLatencyPerStage(double readLat, double preprocLat, double inferLat, double postprocLat, double renderLat);
diff --git a/python/openvino/runtime/common/demo_utils/include/utils/shared_tensor_allocator.hpp b/python/openvino/runtime/common/demo_utils/include/utils/shared_tensor_allocator.hpp
new file mode 100644
index 0000000..f74e8d0
--- /dev/null
+++ b/python/openvino/runtime/common/demo_utils/include/utils/shared_tensor_allocator.hpp
@@ -0,0 +1,47 @@
+/*
+// Copyright (C) 2021-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+
+#include <opencv2/core.hpp>
+#include <openvino/runtime/allocator.hpp>
+
+// To prevent false-positive clang compiler warning
+// (https://github.com/openvinotoolkit/openvino/pull/11092#issuecomment-1073846256):
+// warning: destructor called on non-final 'SharedTensorAllocator' that has virtual functions
+// but non-virtual destructor [-Wdelete-non-abstract-non-virtual-dtor]
+// SharedTensorAllocator class declared as final
+
class SharedTensorAllocator final : public ov::AllocatorImpl {
public:
    // Holds the Mat by value: cv::Mat's internal refcount keeps the pixel
    // buffer alive for as long as a tensor built on this allocator exists.
    SharedTensorAllocator(const cv::Mat& img) : img(img) {}

    ~SharedTensorAllocator() = default;

    // Hands out the Mat's own buffer instead of allocating; returns nullptr
    // when the requested size exceeds the Mat's storage.
    void* allocate(const size_t bytes, const size_t) override {
        return bytes <= img.rows * img.step[0] ? img.data : nullptr;
    }

    // Nothing to free: the cv::Mat owns the memory.
    void deallocate(void* handle, const size_t bytes, const size_t) override {}

    // Two allocators are equal only when they are the same object.
    bool is_equal(const AllocatorImpl& other) const override {
        auto other_tensor_allocator = dynamic_cast<const SharedTensorAllocator*>(&other);
        return other_tensor_allocator != nullptr && other_tensor_allocator == this;
    }

private:
    const cv::Mat img;
};
diff --git a/python/openvino/runtime/common/demo_utils/include/utils/slog.hpp b/python/openvino/runtime/common/demo_utils/include/utils/slog.hpp
new file mode 100644
index 0000000..316b98d
--- /dev/null
+++ b/python/openvino/runtime/common/demo_utils/include/utils/slog.hpp
@@ -0,0 +1,99 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+/**
+ * @brief a header file with logging facility for common samples
+ * @file log.hpp
+ */
+
+#pragma once
+
+#include <iostream>
+#include <string>
+
+namespace slog {
+
+/**
+ * @class LogStreamEndLine
+ * @brief The LogStreamEndLine class implements an end line marker for a log stream
+ */
class LogStreamEndLine { };

// Tag value: streaming slog::endl ends the current line and re-arms the "[ PREFIX ]" header.
static constexpr LogStreamEndLine endl;
+
+
+/**
+ * @class LogStreamBoolAlpha
+ * @brief The LogStreamBoolAlpha class implements bool printing for a log stream
+ */
class LogStreamBoolAlpha { };

// Tag value: streaming slog::boolalpha makes subsequent bools print as true/false.
static constexpr LogStreamBoolAlpha boolalpha;
+
+
+/**
+ * @class LogStream
+ * @brief The LogStream class implements a stream for sample logging
+ */
class LogStream {
    std::string _prefix;        // printed as "[ prefix ] " at the start of each line
    std::ostream* _log_stream;  // non-owning; the target stream must outlive this object
    bool _new_line;             // true when the next insertion starts a new line

public:
    /**
     * @brief A constructor. Creates a LogStream object
     * @param prefix The prefix to print
     */
    LogStream(const std::string &prefix, std::ostream& log_stream)
            : _prefix(prefix), _new_line(true) {
        _log_stream = &log_stream;
    }

    /**
     * @brief A stream output operator to be used within the logger
     * @param arg Object for serialization in the logger message
     */
    template<class T>
    LogStream &operator<<(const T &arg) {
        if (_new_line) {
            (*_log_stream) << "[ " << _prefix << " ] ";
            _new_line = false;
        }

        (*_log_stream) << arg;
        return *this;
    }

    // Specializing for LogStreamEndLine to support slog::endl
    LogStream& operator<< (const LogStreamEndLine &/*arg*/) {
        _new_line = true;

        (*_log_stream) << std::endl;
        return *this;
    }

    // Specializing for LogStreamBoolAlpha to support slog::boolalpha
    LogStream& operator<< (const LogStreamBoolAlpha &/*arg*/) {
        (*_log_stream) << std::boolalpha;
        return *this;
    }

    // Specializing for std::vector and std::list: one prefixed line per element
    template<template<class, class> class Container, class T>
    LogStream& operator<< (const Container<T, std::allocator<T>>& container) {
        for (const auto& el : container) {
            *this << el << slog::endl;
        }
        return *this;
    }
};
+
+
// Shared logger instances. Being header-static, every translation unit that
// includes this file gets its own copies.
static LogStream info("INFO", std::cout);
static LogStream debug("DEBUG", std::cout);
static LogStream warn("WARNING", std::cout);
static LogStream err("ERROR", std::cerr);
+
+} // namespace slog
diff --git a/python/openvino/runtime/common/demo_utils/include/utils/threads_common.hpp b/python/openvino/runtime/common/demo_utils/include/utils/threads_common.hpp
new file mode 100644
index 0000000..f0e5cbf
--- /dev/null
+++ b/python/openvino/runtime/common/demo_utils/include/utils/threads_common.hpp
@@ -0,0 +1,165 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <algorithm>
+#include <atomic>
+#include <condition_variable>
+#include <memory>
+#include <mutex>
+#include <utility>
+#include <set>
+#include <string>
+#include <thread>
+#include <vector>
+
+#include <opencv2/core/core.hpp>
+#include "utils/performance_metrics.hpp"
+
+// VideoFrame can represent not a single image but the whole grid
class VideoFrame {
public:
    typedef std::shared_ptr<VideoFrame> Ptr;

    VideoFrame(unsigned sourceID, int64_t frameId, const cv::Mat& frame = cv::Mat()) :
        sourceID{sourceID}, frameId{frameId}, frame{frame} {}
    virtual ~VideoFrame() = default;  // A user has to define how it is reconstructed

    const unsigned sourceID;  // which input source produced the frame
    const int64_t frameId;    // frame ordinal within its source
    cv::Mat frame;

    // Capture time; used for latency measurement.
    PerformanceMetrics::TimePoint timestamp;
};
+
+class Worker;
+
// Unit of work scheduled on a Worker; subclasses define readiness and the action.
class Task {
public:
    explicit Task(VideoFrame::Ptr sharedVideoFrame, float priority = 0):
        sharedVideoFrame{sharedVideoFrame}, priority{priority} {}
    virtual bool isReady() = 0;  // true when the task may run right now
    virtual void process() = 0;  // performs the work; runs on a pool thread
    virtual ~Task() = default;

    std::string name;
    VideoFrame::Ptr sharedVideoFrame;  // it is possible that two tasks try to draw on the same cvMat
    const float priority;
};
+
// Strict weak ordering for Worker's task set: higher priority first, then older
// frames first, then shared_ptr identity as a final tie-breaker so distinct
// tasks never compare equivalent (and thus are never dropped by the std::set).
struct HigherPriority {
    bool operator()(const std::shared_ptr<Task>& lhs, const std::shared_ptr<Task>& rhs) const {
        return lhs->priority > rhs->priority
            || (lhs->priority == rhs->priority && lhs->sharedVideoFrame->frameId < rhs->sharedVideoFrame->frameId)
            || (lhs->priority == rhs->priority && lhs->sharedVideoFrame->frameId == rhs->sharedVideoFrame->frameId && lhs < rhs);
    }
};
+
// Fixed-size thread pool executing prioritized Tasks. Each pool thread picks
// the highest-priority task whose isReady() returns true.
class Worker {
public:
    explicit Worker(unsigned threadNum):
        threadPool(threadNum), running{false} {}
    ~Worker() {
        stop();
    }
    // Starts the pool threads; call once before pushing tasks.
    void runThreads() {
        running = true;
        for (std::thread& t : threadPool) {
            t = std::thread(&Worker::threadFunc, this);
        }
    }
    void push(std::shared_ptr<Task> task) {
        tasksMutex.lock();
        tasks.insert(task);
        tasksMutex.unlock();
        tasksCondVar.notify_one();
    }
    void threadFunc() {
        while (running) {
            std::unique_lock<std::mutex> lk(tasksMutex);
            while (running && tasks.empty()) {
                tasksCondVar.wait(lk);
            }
            try {
                // Take the first (highest-priority) task that is ready to run.
                // NOTE(review): when tasks exist but none is ready, this loop
                // re-checks without blocking — appears to be a deliberate spin
                // until some task becomes ready; confirm against callers.
                auto it = std::find_if(tasks.begin(), tasks.end(), [](const std::shared_ptr<Task>& task){return task->isReady();});
                if (tasks.end() != it) {
                    const std::shared_ptr<Task> task = std::move(*it);
                    tasks.erase(it);
                    lk.unlock();  // run the task without holding the queue lock
                    task->process();
                }
            } catch (...) {
                // Remember only the first exception and shut the pool down;
                // join() rethrows it on the caller's thread.
                std::lock_guard<std::mutex> lock{exceptionMutex};
                if (nullptr == currentException) {
                    currentException = std::current_exception();
                    stop();
                }
            }
        }
    }
    void stop() {
        running = false;
        tasksCondVar.notify_all();
    }
    // Joins all pool threads; rethrows the first exception raised by any task.
    void join() {
        for (auto& t : threadPool) {
            t.join();
        }
        if (nullptr != currentException) {
            std::rethrow_exception(currentException);
        }
    }

private:
    std::condition_variable tasksCondVar;
    std::set<std::shared_ptr<Task>, HigherPriority> tasks;  // pending tasks, guarded by tasksMutex
    std::mutex tasksMutex;
    std::vector<std::thread> threadPool;
    std::atomic<bool> running;
    std::exception_ptr currentException;  // first task exception, guarded by exceptionMutex
    std::mutex exceptionMutex;
};
+
+void tryPush(const std::weak_ptr<Worker>& worker, std::shared_ptr<Task>&& task) {
+ try {
+ std::shared_ptr<Worker>(worker)->push(task);
+ } catch (const std::bad_weak_ptr&) {}
+}
+
// Thin mutex-guarded wrapper around a sequence container C. Direct access to
// 'container' is allowed but then the caller must lock 'mutex' itself.
template <class C> class ConcurrentContainer {
public:
    C container;
    mutable std::mutex mutex;

    bool lockedEmpty() const noexcept {
        std::lock_guard<std::mutex> lock{mutex};
        return container.empty();
    }
    typename C::size_type lockedSize() const noexcept {
        std::lock_guard<std::mutex> lock{mutex};
        return container.size();
    }
    void lockedPushBack(const typename C::value_type& value) {
        std::lock_guard<std::mutex> lock{mutex};
        container.push_back(value);
    }
    // Pops the most recently pushed element into 'value'; returns false when empty.
    // RAII lock_guard replaces the previous manual lock()/unlock() pairs, which
    // would leak the lock if the copy assignment of value_type threw.
    bool lockedTryPop(typename C::value_type& value) {
        std::lock_guard<std::mutex> lock{mutex};
        if (container.empty()) {
            return false;
        }
        value = container.back();
        container.pop_back();
        return true;
    }

    // Returns a snapshot copy of the underlying container.
    operator C() const {
        std::lock_guard<std::mutex> lock{mutex};
        return container;
    }
};
diff --git a/python/openvino/runtime/common/demo_utils/src/args_helper.cpp b/python/openvino/runtime/common/demo_utils/src/args_helper.cpp
new file mode 100644
index 0000000..8f4bc35
--- /dev/null
+++ b/python/openvino/runtime/common/demo_utils/src/args_helper.cpp
@@ -0,0 +1,155 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "utils/args_helper.hpp"
+#include "utils/slog.hpp"
+
+#ifdef _WIN32
+#include "w_dirent.hpp"
+#else
+#include <dirent.h>
+#endif
+
+#include <gflags/gflags.h>
+
+#include <sys/stat.h>
+#include <map>
+
+#include <algorithm>
+#include <cctype>
+#include <sstream>
+
+void readInputFilesArguments(std::vector<std::string>& files, const std::string& arg) {
+ struct stat sb;
+ if (stat(arg.c_str(), &sb) != 0) {
+ if (arg.compare(0, 5, "rtsp:") != 0) {
+ slog::warn << "File " << arg << " cannot be opened!" << slog::endl;
+ return;
+ }
+ }
+ if (S_ISDIR(sb.st_mode)) {
+ DIR *dp;
+ dp = opendir(arg.c_str());
+ if (dp == nullptr) {
+ slog::warn << "Directory " << arg << " cannot be opened!" << slog::endl;
+ return;
+ }
+
+ struct dirent *ep;
+ while (nullptr != (ep = readdir(dp))) {
+ std::string fileName = ep->d_name;
+ if (fileName == "." || fileName == "..") continue;
+ files.push_back(arg + "/" + ep->d_name);
+ }
+ closedir(dp);
+ } else {
+ files.push_back(arg);
+ }
+}
+
// Collects every input path that follows a "-i"/"--i" flag in the raw argv
// list (gflags preserves positional arguments), stopping at the next flag.
void parseInputFilesArguments(std::vector<std::string>& files) {
    std::vector<std::string> args = gflags::GetArgvs();
    bool readArguments = false;
    for (size_t i = 0; i < args.size(); i++) {
        if (args.at(i) == "-i" || args.at(i) == "--i") {
            readArguments = true;
            continue;
        }
        if (!readArguments) {
            continue;
        }
        // The first argument starting with '-' terminates the input-file list.
        if (args.at(i).c_str()[0] == '-') {
            break;
        }
        readInputFilesArguments(files, args.at(i));
    }
}
+
// Splits s on delim. Consecutive delimiters yield empty tokens; a trailing
// delimiter does not append a trailing empty token (std::getline semantics).
std::vector<std::string> split(const std::string& s, char delim) {
    std::vector<std::string> tokens;
    std::istringstream stream(s);
    for (std::string token; std::getline(stream, token, delim);) {
        tokens.push_back(std::move(token));
    }
    return tokens;
}
+
+std::vector<std::string> parseDevices(const std::string& device_string) {
+ const std::string::size_type colon_position = device_string.find(":");
+ if (colon_position != std::string::npos) {
+ std::string device_type = device_string.substr(0, colon_position);
+ if (device_type == "HETERO" || device_type == "MULTI") {
+ std::string comma_separated_devices = device_string.substr(colon_position + 1);
+ std::vector<std::string> devices = split(comma_separated_devices, ',');
+ for (auto& device : devices)
+ device = device.substr(0, device.find("("));
+ return devices;
+ }
+ }
+ return {device_string};
+}
+
+// Format: <device1>:<value1>,<device2>:<value2> or just <value>
+std::map<std::string, int32_t> parseValuePerDevice(const std::set<std::string>& devices,
+ const std::string& values_string) {
+ auto values_string_upper = values_string;
+ std::transform(values_string_upper.begin(),
+ values_string_upper.end(),
+ values_string_upper.begin(),
+ [](unsigned char c){ return std::toupper(c); });
+ std::map<std::string, int32_t> result;
+ auto device_value_strings = split(values_string_upper, ',');
+ for (auto& device_value_string : device_value_strings) {
+ auto device_value_vec = split(device_value_string, ':');
+ if (device_value_vec.size() == 2) {
+ auto it = std::find(devices.begin(), devices.end(), device_value_vec.at(0));
+ if (it != devices.end()) {
+ result[device_value_vec.at(0)] = std::stoi(device_value_vec.at(1));
+ }
+ } else if (device_value_vec.size() == 1) {
+ uint32_t value = std::stoi(device_value_vec.at(0));
+ for (const auto& device : devices) {
+ result[device] = value;
+ }
+ } else if (device_value_vec.size() != 0) {
+ throw std::runtime_error("Unknown string format: " + values_string);
+ }
+ }
+ return result;
+}
+
+cv::Size stringToSize(const std::string& str) {
+ std::vector<std::string> strings = split(str, 'x');
+ if (strings.size() != 2) {
+ throw std::invalid_argument("Can't convert std::string to cv::Size. The string must contain exactly one x");
+ }
+ return {std::stoi(strings[0]), std::stoi(strings[1])};
+}
+
std::map<std::string, ov::Layout> parseLayoutString(const std::string& layout_string) {
    // Parse parameter string like "input0:NCHW,input1:NC" or "NCHW" (applied to all
    // inputs)
    std::map<std::string, ov::Layout> layouts;
    // A bare layout with no ':' gets one prepended so it parses as an empty
    // input name (meaning "all inputs") with that layout.
    std::string searchStr = (layout_string.find_last_of(':') == std::string::npos && !layout_string.empty() ?
        ":" : "") + layout_string;
    // Walk right to left, peeling one trailing "name:LAYOUT" pair per iteration.
    auto colonPos = searchStr.find_last_of(':');
    while (colonPos != std::string::npos) {
        auto startPos = searchStr.find_last_of(',');
        auto inputName = searchStr.substr(startPos + 1, colonPos - startPos - 1);
        auto inputLayout = searchStr.substr(colonPos + 1);
        layouts[inputName] = ov::Layout(inputLayout);
        searchStr = searchStr.substr(0, startPos + 1);
        if (searchStr.empty() || searchStr.back() != ',') {
            break;
        }
        searchStr.pop_back();
        colonPos = searchStr.find_last_of(':');
    }
    // Any unconsumed remainder means the string was malformed.
    if (!searchStr.empty()) {
        throw std::invalid_argument("Can't parse input layout string: " + layout_string);
    }
    return layouts;
}
diff --git a/python/openvino/runtime/common/demo_utils/src/config_factory.cpp b/python/openvino/runtime/common/demo_utils/src/config_factory.cpp
new file mode 100644
index 0000000..2e9a442
--- /dev/null
+++ b/python/openvino/runtime/common/demo_utils/src/config_factory.cpp
@@ -0,0 +1,111 @@
+/*
+// Copyright (C) 2020-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "utils/config_factory.h"
+
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include <openvino/runtime/intel_gpu/properties.hpp>
+#include "dla_plugin_config.hpp"
+#include "utils/args_helper.hpp"
+#include <sys/stat.h>
+
+std::set<std::string> ModelConfig::getDevices() {
+ if (devices.empty()) {
+ for (const std::string& device : parseDevices(deviceName)) {
+ devices.insert(device);
+ }
+ }
+
+ return devices;
+}
+
// Builds a ModelConfig from the user's command-line flags: target devices
// (flags_d), number of infer requests (flags_nireq), per-device stream counts
// (flags_nstreams, e.g. "CPU:4,GPU:2"), a CPU thread cap (flags_nthreads) and,
// for FPGA, the path to a DLA architecture file (flags_arch).
ModelConfig ConfigFactory::getUserConfig(const std::string& flags_d,
                                         uint32_t flags_nireq,
                                         const std::string& flags_nstreams,
                                         uint32_t flags_nthreads,
                                         const std::string &flags_arch) {
    auto config = getCommonConfig(flags_d, flags_nireq);

    std::map<std::string, int> deviceNstreams = parseValuePerDevice(config.getDevices(), flags_nstreams);
    for (const auto& device : config.getDevices()) {
        if (flags_arch != "" && device == "FPGA") {
            // Validate the architecture file path before handing it to the plugin.
            struct stat buffer;
            if (stat(flags_arch.c_str(), &buffer) != 0) {
                std::cout << "Error: architecture file: " << flags_arch << " doesn't exist. Please provide a valid path." << std::endl;
                throw std::logic_error("architecture file path does not exist.");
            }
            config.compiledModelConfig.emplace(DLIAPlugin::properties::arch_path.name(), flags_arch);
        } else if (device == "CPU") {  // CPU supports a few special performance-oriented keys
            // limit threading for CPU portion of inference
            if (flags_nthreads != 0)
                config.compiledModelConfig.emplace(ov::inference_num_threads.name(), flags_nthreads);

            // No thread pinning; let threads migrate between cores.
            config.compiledModelConfig.emplace(ov::affinity.name(), ov::Affinity::NONE);

            // Use the user's stream count when given, otherwise let OV decide.
            ov::streams::Num nstreams =
                deviceNstreams.count(device) > 0 ? ov::streams::Num(deviceNstreams[device]) : ov::streams::AUTO;
            config.compiledModelConfig.emplace(ov::streams::num.name(), nstreams);
        } else if (device == "GPU") {
            ov::streams::Num nstreams =
                deviceNstreams.count(device) > 0 ? ov::streams::Num(deviceNstreams[device]) : ov::streams::AUTO;
            config.compiledModelConfig.emplace(ov::streams::num.name(), nstreams);
            if (flags_d.find("MULTI") != std::string::npos &&
                config.getDevices().find("CPU") != config.getDevices().end()) {
                // multi-device execution with the CPU + GPU performs best with GPU throttling hint,
                // which releases another CPU thread (that is otherwise used by the GPU driver for active polling)
                config.compiledModelConfig.emplace(ov::intel_gpu::hint::queue_throttle.name(),
                                                   ov::intel_gpu::hint::ThrottleLevel(1));
            }
        }
    }
    return config;
}
+
+ModelConfig ConfigFactory::getMinLatencyConfig(const std::string& flags_d, uint32_t flags_nireq) {
+ auto config = getCommonConfig(flags_d, flags_nireq);
+ for (const auto& device : config.getDevices()) {
+ if (device == "CPU") { // CPU supports a few special performance-oriented keys
+ config.compiledModelConfig.emplace(ov::streams::num.name(), 1);
+ } else if (device == "GPU") {
+ config.compiledModelConfig.emplace(ov::streams::num.name(), 1);
+ }
+ }
+ return config;
+}
+
+ModelConfig ConfigFactory::getCommonConfig(const std::string& flags_d, uint32_t flags_nireq) {
+ ModelConfig config;
+
+ if (!flags_d.empty()) {
+ config.deviceName = flags_d;
+ }
+
+ config.maxAsyncRequests = flags_nireq;
+
+ return config;
+}
+
+std::map<std::string, std::string> ModelConfig::getLegacyConfig() {
+ std::map<std::string, std::string> config;
+ for (const auto& item : compiledModelConfig) {
+ config[item.first] = item.second.as<std::string>();
+ }
+ return config;
+}
diff --git a/python/openvino/runtime/common/demo_utils/src/image_utils.cpp b/python/openvino/runtime/common/demo_utils/src/image_utils.cpp
new file mode 100644
index 0000000..039dd66
--- /dev/null
+++ b/python/openvino/runtime/common/demo_utils/src/image_utils.cpp
@@ -0,0 +1,55 @@
+/*
+// Copyright (C) 2021-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "utils/image_utils.h"
+
+cv::Mat resizeImageExt(const cv::Mat& mat, int width, int height, RESIZE_MODE resizeMode,
+ cv::InterpolationFlags interpolationMode, cv::Rect* roi, cv::Scalar BorderConstant) {
+ if (width == mat.cols && height == mat.rows) {
+ return mat;
+ }
+
+ cv::Mat dst;
+
+ switch (resizeMode) {
+ case RESIZE_FILL:
+ {
+ cv::resize(mat, dst, cv::Size(width, height), interpolationMode);
+ if (roi) {
+ *roi = cv::Rect(0, 0, width, height);
+ }
+ break;
+ }
+ case RESIZE_KEEP_ASPECT:
+ case RESIZE_KEEP_ASPECT_LETTERBOX:
+ {
+ double scale = std::min(static_cast<double>(width) / mat.cols, static_cast<double>(height) / mat.rows);
+ cv::Mat resizedImage;
+ cv::resize(mat, resizedImage, cv::Size(0, 0), scale, scale, interpolationMode);
+
+ int dx = resizeMode == RESIZE_KEEP_ASPECT ? 0 : (width - resizedImage.cols) / 2;
+ int dy = resizeMode == RESIZE_KEEP_ASPECT ? 0 : (height - resizedImage.rows) / 2;
+
+ cv::copyMakeBorder(resizedImage, dst, dy, height - resizedImage.rows - dy,
+ dx, width - resizedImage.cols - dx, cv::BORDER_CONSTANT, BorderConstant);
+ if (roi) {
+ *roi = cv::Rect(dx, dy, resizedImage.cols, resizedImage.rows);
+ }
+ break;
+ }
+ }
+ return dst;
+}
diff --git a/python/openvino/runtime/common/demo_utils/src/images_capture.cpp b/python/openvino/runtime/common/demo_utils/src/images_capture.cpp
new file mode 100644
index 0000000..febcdd7
--- /dev/null
+++ b/python/openvino/runtime/common/demo_utils/src/images_capture.cpp
@@ -0,0 +1,327 @@
+// Copyright (C) 2020-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+#include "utils/images_capture.h"
+
+#include <string.h>
+
+#ifdef _WIN32
+# include "w_dirent.hpp"
+#else
+# include <dirent.h> // for closedir, dirent, opendir, readdir, DIR
+#endif
+
+#include <algorithm>
+#include <chrono>
+#include <fstream>
+#include <memory>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include <opencv2/imgcodecs.hpp>
+#include <opencv2/videoio.hpp>
+
// Thrown when the input doesn't match the reader type at all (e.g. the path is
// not an image); openImagesCapture uses it to fall through to the next reader.
class InvalidInput : public std::runtime_error {
public:
    explicit InvalidInput(const std::string& message) noexcept : std::runtime_error(message) {}
};
+
// Thrown when the input matched the reader type but could not be opened or
// decoded; these messages take priority in openImagesCapture's final error.
class OpenError : public std::runtime_error {
public:
    explicit OpenError(const std::string& message) noexcept : std::runtime_error(message) {}
};
+
// Single-image "capture": decodes the image once at construction and serves it
// from read() — exactly once when loop is false, indefinitely when loop is true.
class ImreadWrapper : public ImagesCapture {
    cv::Mat img;
    bool canRead;  // true until the single image has been handed out (non-loop mode)

public:
    // Throws InvalidInput when the file doesn't exist (so openImagesCapture can
    // try the next reader) and OpenError when it exists but can't be decoded.
    ImreadWrapper(const std::string& input, bool loop) : ImagesCapture{loop}, canRead{true} {
        auto startTime = std::chrono::steady_clock::now();

        std::ifstream file(input.c_str());
        if (!file.good())
            throw InvalidInput("Can't find the image by " + input);

        img = cv::imread(input);
        if (!img.data)
            throw OpenError("Can't open the image from " + input);
        else
            readerMetrics.update(startTime);
    }

    // A still image has no real frame rate; report a nominal 1 fps.
    double fps() const override {
        return 1.0;
    }

    std::string getType() const override {
        return "IMAGE";
    }

    // Returns a fresh clone each call so callers may modify the frame freely;
    // an empty Mat signals end of input.
    cv::Mat read() override {
        if (loop)
            return img.clone();
        if (canRead) {
            canRead = false;
            return img.clone();
        }
        return cv::Mat{};
    }
};
+
// Serves images from a directory in lexicographic filename order. Entries
// OpenCV cannot decode are silently skipped. The constructor pre-scans the
// listing to position on the initialImageId-th readable image.
class DirReader : public ImagesCapture {
    std::vector<std::string> names;   // sorted directory entries
    size_t fileId;                    // index into names of the next file to try
    size_t nextImgId;                 // images returned since the last (re)start
    const size_t initialImageId;      // readable images to skip before the first frame
    const size_t readLengthLimit;     // max images to return per pass
    const std::string input;

public:
    DirReader(const std::string& input, bool loop, size_t initialImageId, size_t readLengthLimit)
        : ImagesCapture{loop},
          fileId{0},
          nextImgId{0},
          initialImageId{initialImageId},
          readLengthLimit{readLengthLimit},
          input{input} {
        DIR* dir = opendir(input.c_str());
        if (!dir)
            throw InvalidInput("Can't find the dir by " + input);
        while (struct dirent* ent = readdir(dir))
            if (strcmp(ent->d_name, ".") && strcmp(ent->d_name, ".."))
                names.emplace_back(ent->d_name);
        closedir(dir);
        if (names.empty())
            throw OpenError("The dir " + input + " is empty");
        sort(names.begin(), names.end());
        // Advance fileId to the initialImageId-th decodable image.
        size_t readImgs = 0;
        while (fileId < names.size()) {
            cv::Mat img = cv::imread(input + '/' + names[fileId]);
            if (img.data) {
                ++readImgs;
                if (readImgs - 1 >= initialImageId)
                    return;
            }
            ++fileId;
        }
        throw OpenError("Can't read the first image from " + input);
    }

    double fps() const override {
        return 1.0;
    }

    std::string getType() const override {
        return "DIR";
    }

    cv::Mat read() override {
        auto startTime = std::chrono::steady_clock::now();

        // Return the next decodable file until the list or the limit runs out.
        while (fileId < names.size() && nextImgId < readLengthLimit) {
            cv::Mat img = cv::imread(input + '/' + names[fileId]);
            ++fileId;
            if (img.data) {
                ++nextImgId;
                readerMetrics.update(startTime);
                return img;
            }
        }

        // Exhausted: restart from initialImageId when looping, otherwise
        // signal end of input with an empty Mat.
        if (loop) {
            fileId = 0;
            size_t readImgs = 0;
            while (fileId < names.size()) {
                cv::Mat img = cv::imread(input + '/' + names[fileId]);
                ++fileId;
                if (img.data) {
                    ++readImgs;
                    if (readImgs - 1 >= initialImageId) {
                        nextImgId = 1;
                        readerMetrics.update(startTime);
                        return img;
                    }
                }
            }
        }
        return cv::Mat{};
    }
};
+
// Wraps cv::VideoCapture for video files: seeks to frame initialImageId,
// returns up to readLengthLimit frames per pass, and rewinds when loop is set.
class VideoCapWrapper : public ImagesCapture {
    cv::VideoCapture cap;
    bool first_read;                  // true until the first read() attempt
    const read_type type;             // read_type::safe clones every returned frame
    size_t nextImgId;                 // frames returned since the last (re)start
    const double initialImageId;      // start frame, as CAP_PROP_POS_FRAMES expects
    size_t readLengthLimit;

public:
    VideoCapWrapper(const std::string& input, bool loop, read_type type, size_t initialImageId, size_t readLengthLimit)
        : ImagesCapture{loop},
          first_read{true},
          type{type},
          nextImgId{0},
          initialImageId{static_cast<double>(initialImageId)} {
        if (0 == readLengthLimit) {
            throw std::runtime_error("readLengthLimit must be positive");
        }
        if (cap.open(input)) {
            this->readLengthLimit = readLengthLimit;
            if (!cap.set(cv::CAP_PROP_POS_FRAMES, this->initialImageId))
                throw OpenError("Can't set the frame to begin with");
            return;
        }
        throw InvalidInput("Can't open the video from " + input);
    }

    double fps() const override {
        return cap.get(cv::CAP_PROP_FPS);
    }

    std::string getType() const override {
        return "VIDEO";
    }

    // Returns the next frame, or an empty Mat at end of stream (non-loop);
    // throws if even the very first frame cannot be decoded.
    cv::Mat read() override {
        auto startTime = std::chrono::steady_clock::now();

        if (nextImgId >= readLengthLimit) {
            // Frame budget reached: rewind and restart when looping.
            if (loop && cap.set(cv::CAP_PROP_POS_FRAMES, initialImageId)) {
                nextImgId = 1;
                cv::Mat img;
                cap.read(img);
                if (type == read_type::safe) {
                    img = img.clone();
                }
                readerMetrics.update(startTime);
                return img;
            }
            return cv::Mat{};
        }
        cv::Mat img;
        bool success = cap.read(img);
        if (!success && first_read) {
            throw std::runtime_error("The first image can't be read");
        }
        first_read = false;
        if (!success && loop && cap.set(cv::CAP_PROP_POS_FRAMES, initialImageId)) {
            // Hit end of file mid-stream: rewind and re-read the start frame.
            nextImgId = 1;
            cap.read(img);
        } else {
            ++nextImgId;
        }
        if (type == read_type::safe) {
            img = img.clone();
        }
        readerMetrics.update(startTime);
        return img;
    }
};
+
// Wraps cv::VideoCapture for live cameras. `input` must be a numeric camera
// index; non-numeric input raises InvalidInput so the factory can move on.
class CameraCapWrapper : public ImagesCapture {
    cv::VideoCapture cap;
    const read_type type;   // read_type::safe clones every returned frame
    size_t nextImgId;       // frames returned so far
    size_t readLengthLimit;

public:
    CameraCapWrapper(const std::string& input,
                     bool loop,
                     read_type type,
                     size_t readLengthLimit,
                     cv::Size cameraResolution)
        : ImagesCapture{loop},
          type{type},
          nextImgId{0} {
        if (0 == readLengthLimit) {
            throw std::runtime_error("readLengthLimit must be positive");
        }
        try {
            if (cap.open(std::stoi(input))) {
                // A live camera can't rewind, so loop means "no frame limit".
                this->readLengthLimit = loop ? std::numeric_limits<size_t>::max() : readLengthLimit;
                cap.set(cv::CAP_PROP_BUFFERSIZE, 1);
                cap.set(cv::CAP_PROP_FRAME_WIDTH, cameraResolution.width);
                cap.set(cv::CAP_PROP_FRAME_HEIGHT, cameraResolution.height);
                cap.set(cv::CAP_PROP_AUTOFOCUS, true);
                cap.set(cv::CAP_PROP_FOURCC, cv::VideoWriter::fourcc('M', 'J', 'P', 'G'));
                return;
            }
            throw OpenError("Can't open the camera from " + input);
        } catch (const std::invalid_argument&) {
            // std::stoi failed: input is not a camera index.
            throw InvalidInput("Can't find the camera " + input);
        } catch (const std::out_of_range&) { throw InvalidInput("Can't find the camera " + input); }
    }

    // Some backends report 0 fps for live cameras; fall back to 30.
    double fps() const override {
        return cap.get(cv::CAP_PROP_FPS) > 0 ? cap.get(cv::CAP_PROP_FPS) : 30;
    }

    std::string getType() const override {
        return "CAMERA";
    }

    cv::Mat read() override {
        auto startTime = std::chrono::steady_clock::now();

        if (nextImgId >= readLengthLimit) {
            return cv::Mat{};
        }
        cv::Mat img;
        if (!cap.read(img)) {
            throw std::runtime_error("The image can't be captured from the camera");
        }
        if (type == read_type::safe) {
            img = img.clone();
        }
        ++nextImgId;

        readerMetrics.update(startTime);
        return img;
    }
};
+
// Factory: interprets `input` as, in order, a single image, an image directory,
// a video file, and finally a camera index, returning the first reader that
// accepts it. InvalidInput means "wrong kind of input, try the next reader";
// OpenError means the type matched but opening failed. If every reader refuses,
// the collected messages are raised in a single exception, with open errors
// taking priority over type mismatches.
std::unique_ptr<ImagesCapture> openImagesCapture(const std::string& input,
                                                 bool loop,
                                                 read_type type,
                                                 size_t initialImageId,
                                                 size_t readLengthLimit,
                                                 cv::Size cameraResolution
                                                 ) {
    if (readLengthLimit == 0)
        throw std::runtime_error{"Read length limit must be positive"};
    std::vector<std::string> invalidInputs, openErrors;
    try {
        return std::unique_ptr<ImagesCapture>(new ImreadWrapper{input, loop});
    } catch (const InvalidInput& e) { invalidInputs.push_back(e.what()); } catch (const OpenError& e) {
        openErrors.push_back(e.what());
    }

    try {
        return std::unique_ptr<ImagesCapture>(new DirReader{input, loop, initialImageId, readLengthLimit});
    } catch (const InvalidInput& e) { invalidInputs.push_back(e.what()); } catch (const OpenError& e) {
        openErrors.push_back(e.what());
    }

    try {
        return std::unique_ptr<ImagesCapture>(new VideoCapWrapper{input, loop, type, initialImageId, readLengthLimit});
    } catch (const InvalidInput& e) { invalidInputs.push_back(e.what()); } catch (const OpenError& e) {
        openErrors.push_back(e.what());
    }

    try {
        return std::unique_ptr<ImagesCapture>(
            new CameraCapWrapper{input, loop, type, readLengthLimit, cameraResolution});
    } catch (const InvalidInput& e) { invalidInputs.push_back(e.what()); } catch (const OpenError& e) {
        openErrors.push_back(e.what());
    }

    std::vector<std::string> errorMessages = openErrors.empty() ? invalidInputs : openErrors;
    std::string errorsInfo;
    for (const auto& message : errorMessages) {
        errorsInfo.append(message + "\n");
    }
    throw std::runtime_error(errorsInfo);
}
diff --git a/python/openvino/runtime/common/demo_utils/src/kuhn_munkres.cpp b/python/openvino/runtime/common/demo_utils/src/kuhn_munkres.cpp
new file mode 100644
index 0000000..7d612c1
--- /dev/null
+++ b/python/openvino/runtime/common/demo_utils/src/kuhn_munkres.cpp
@@ -0,0 +1,169 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <algorithm>
+#include <limits>
+#include <vector>
+
+#include <utils/kuhn_munkres.hpp>
+
+KuhnMunkres::KuhnMunkres(bool greedy) : n_(), greedy_(greedy) {}
+
+std::vector<size_t> KuhnMunkres::Solve(const cv::Mat& dissimilarity_matrix) {
+ CV_Assert(dissimilarity_matrix.type() == CV_32F);
+ double min_val;
+ cv::minMaxLoc(dissimilarity_matrix, &min_val);
+
+ n_ = std::max(dissimilarity_matrix.rows, dissimilarity_matrix.cols);
+ dm_ = cv::Mat(n_, n_, CV_32F, cv::Scalar(0));
+ marked_ = cv::Mat(n_, n_, CV_8S, cv::Scalar(0));
+ points_ = std::vector<cv::Point>(n_ * 2);
+
+ dissimilarity_matrix.copyTo(dm_(
+ cv::Rect(0, 0, dissimilarity_matrix.cols, dissimilarity_matrix.rows)));
+
+ is_row_visited_ = std::vector<int>(n_, 0);
+ is_col_visited_ = std::vector<int>(n_, 0);
+
+ Run();
+
+ std::vector<size_t> results(dissimilarity_matrix.rows, -1);
+ for (int i = 0; i < dissimilarity_matrix.rows; i++) {
+ const auto ptr = marked_.ptr<char>(i);
+ for (int j = 0; j < dissimilarity_matrix.cols; j++) {
+ if (ptr[j] == kStar) {
+ results[i] = (size_t)j;
+ }
+ }
+ }
+ return results;
+}
+
// Munkres step 1: subtract each row's minimum (row reduction), then greedily
// star zeros that have no starred zero in their row or column yet.
void KuhnMunkres::TrySimpleCase() {
    auto is_row_visited = std::vector<int>(n_, 0);
    auto is_col_visited = std::vector<int>(n_, 0);

    for (int row = 0; row < n_; row++) {
        auto ptr = dm_.ptr<float>(row);
        auto marked_ptr = marked_.ptr<char>(row);
        auto min_val = *std::min_element(ptr, ptr + n_);
        for (int col = 0; col < n_; col++) {
            ptr[col] -= min_val;
            if (ptr[col] == 0 && !is_col_visited[col] && !is_row_visited[row]) {
                marked_ptr[col] = kStar;
                is_col_visited[col] = 1;
                is_row_visited[row] = 1;
            }
        }
    }
}
+
+bool KuhnMunkres::CheckIfOptimumIsFound() {
+ int count = 0;
+ for (int i = 0; i < n_; i++) {
+ const auto marked_ptr = marked_.ptr<char>(i);
+ for (int j = 0; j < n_; j++) {
+ if (marked_ptr[j] == kStar) {
+ is_col_visited_[j] = 1;
+ count++;
+ }
+ }
+ }
+
+ return count >= n_;
+}
+
// Finds the position of the smallest element not covered by a visited row or
// visited column; returns (-1, -1) when every cell is covered.
cv::Point KuhnMunkres::FindUncoveredMinValPos() {
    auto min_val = std::numeric_limits<float>::max();
    cv::Point min_val_pos(-1, -1);
    for (int i = 0; i < n_; i++) {
        if (!is_row_visited_[i]) {
            auto dm_ptr = dm_.ptr<float>(i);
            for (int j = 0; j < n_; j++) {
                if (!is_col_visited_[j] && dm_ptr[j] < min_val) {
                    min_val = dm_ptr[j];
                    min_val_pos = cv::Point(j, i);
                }
            }
        }
    }
    return min_val_pos;
}
+
// Munkres matrix adjustment: adds `val` to every covered (visited) row and
// subtracts it from every uncovered column, which creates at least one new
// uncovered zero without changing the optimal assignment.
void KuhnMunkres::UpdateDissimilarityMatrix(float val) {
    for (int i = 0; i < n_; i++) {
        auto dm_ptr = dm_.ptr<float>(i);
        for (int j = 0; j < n_; j++) {
            if (is_row_visited_[i]) dm_ptr[j] += val;
            if (!is_col_visited_[j]) dm_ptr[j] -= val;
        }
    }
}
+
+int KuhnMunkres::FindInRow(int row, int what) {
+ for (int j = 0; j < n_; j++) {
+ if (marked_.at<char>(row, j) == what) {
+ return j;
+ }
+ }
+ return -1;
+}
+
+int KuhnMunkres::FindInCol(int col, int what) {
+ for (int i = 0; i < n_; i++) {
+ if (marked_.at<char>(i, col) == what) {
+ return i;
+ }
+ }
+ return -1;
+}
+
// Main Munkres loop. After the greedy initialization, repeatedly prime
// uncovered zeros; when none exists, shift the matrix to create one. When a
// primed zero has no star in its row, build an alternating star/prime path
// and flip it, increasing the number of starred zeros by one, until every
// row/column is assigned.
void KuhnMunkres::Run() {
    TrySimpleCase();
    if (greedy_)
        return;
    while (!CheckIfOptimumIsFound()) {
        while (true) {
            auto point = FindUncoveredMinValPos();
            auto min_val = dm_.at<float>(point.y, point.x);
            if (min_val > 0) {
                // No uncovered zero yet: adjust the matrix to produce one.
                UpdateDissimilarityMatrix(min_val);
            } else {
                marked_.at<char>(point.y, point.x) = kPrime;
                int col = FindInRow(point.y, kStar);
                if (col >= 0) {
                    // The primed zero's row holds a star: cover the row,
                    // uncover the star's column, and keep searching.
                    is_row_visited_[point.y] = 1;
                    is_col_visited_[col] = 0;
                } else {
                    // No star in this row: trace the alternating path of
                    // stars and primes starting from the primed zero.
                    int count = 0;
                    points_[count] = point;

                    while (true) {
                        int row = FindInCol(points_[count].x, kStar);
                        if (row >= 0) {
                            count++;
                            points_[count] = cv::Point(points_[count - 1].x, row);
                            int col = FindInRow(points_[count].y, kPrime);
                            count++;
                            points_[count] = cv::Point(col, points_[count - 1].y);
                        } else {
                            break;
                        }
                    }

                    // Flip the path: stars become unstarred, primes become stars.
                    for (int i = 0; i < count + 1; i++) {
                        auto& mark = marked_.at<char>(points_[i].y, points_[i].x);
                        mark = mark == kStar ? 0 : kStar;
                    }

                    // Reset all covers and erase remaining primes for the next round.
                    is_row_visited_ = std::vector<int>(n_, 0);
                    is_col_visited_ = std::vector<int>(n_, 0);

                    marked_.setTo(0, marked_ == kPrime);
                    break;
                }
            }
        }
    }
}
diff --git a/python/openvino/runtime/common/demo_utils/src/performance_metrics.cpp b/python/openvino/runtime/common/demo_utils/src/performance_metrics.cpp
new file mode 100644
index 0000000..d1e494e
--- /dev/null
+++ b/python/openvino/runtime/common/demo_utils/src/performance_metrics.cpp
@@ -0,0 +1,114 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <limits>
+#include "utils/performance_metrics.hpp"
+#include "utils/slog.hpp"
+
// timeWindow is the length of the sliding window over which the "current"
// latency/FPS values reported by getLast() are averaged.
PerformanceMetrics::PerformanceMetrics(Duration timeWindow)
    : timeWindowSize(timeWindow)
    , firstFrameProcessed(false)
{}
+
// Convenience overload: records a finished request and immediately draws the
// current metrics onto `frame` at `position` (see paintMetrics for drawing).
void PerformanceMetrics::update(TimePoint lastRequestStartTime,
                                const cv::Mat& frame,
                                cv::Point position,
                                int fontFace,
                                double fontScale,
                                cv::Scalar color,
                                int thickness,
                                MetricTypes metricType) {
    update(lastRequestStartTime);
    paintMetrics(frame, position, fontFace, fontScale, color, thickness, metricType);
}
+
// Accumulates one sample (latency = now - lastRequestStartTime) into the
// current moving window; once the window exceeds timeWindowSize it is promoted
// to lastMovingStatistic and folded into the running total.
void PerformanceMetrics::update(TimePoint lastRequestStartTime) {
    TimePoint currentTime = Clock::now();

    if (!firstFrameProcessed) {
        // First sample: anchor the window start at the first request's start time.
        lastUpdateTime = lastRequestStartTime;
        firstFrameProcessed = true;
    }

    currentMovingStatistic.latency += currentTime - lastRequestStartTime;
    currentMovingStatistic.period = currentTime - lastUpdateTime;
    currentMovingStatistic.frameCount++;

    if (currentTime - lastUpdateTime > timeWindowSize) {
        lastMovingStatistic = currentMovingStatistic;
        totalStatistic.combine(lastMovingStatistic);
        currentMovingStatistic = Statistic();

        lastUpdateTime = currentTime;
    }
}
+
// Draws the latest latency and/or FPS values onto `frame` at `position`.
// NaN values (no completed measurement window yet) are skipped.
void PerformanceMetrics::paintMetrics(const cv::Mat& frame, cv::Point position, int fontFace,
        double fontScale, cv::Scalar color, int thickness, MetricTypes metricType) const {
    // Draw performance stats over frame
    Metrics metrics = getLast();

    std::ostringstream out;
    if (!std::isnan(metrics.latency) &&
        (metricType == PerformanceMetrics::MetricTypes::LATENCY || metricType == PerformanceMetrics::MetricTypes::ALL)) {
        out << "Latency: " << std::fixed << std::setprecision(1) << metrics.latency << " ms";
        putHighlightedText(frame, out.str(), position, fontFace, fontScale, color, thickness);
    }
    if (!std::isnan(metrics.fps) &&
        (metricType == PerformanceMetrics::MetricTypes::FPS || metricType == PerformanceMetrics::MetricTypes::ALL)) {
        out.str("");
        out << "FPS: " << std::fixed << std::setprecision(1) << metrics.fps;
        // Place the FPS line below the latency line when both are shown.
        int offset = metricType == PerformanceMetrics::MetricTypes::ALL ? 30 : 0;
        putHighlightedText(frame, out.str(), {position.x, position.y + offset}, fontFace, fontScale, color, thickness);
    }
}
+
// Metrics averaged over the last *completed* moving window. Fields are NaN
// until the first window completes (or when that window saw no frames).
PerformanceMetrics::Metrics PerformanceMetrics::getLast() const {
    Metrics metrics;

    metrics.latency = lastMovingStatistic.frameCount != 0
                          ? std::chrono::duration_cast<Ms>(lastMovingStatistic.latency).count()
                              / lastMovingStatistic.frameCount
                          : std::numeric_limits<double>::signaling_NaN();
    metrics.fps = lastMovingStatistic.period != Duration::zero()
                      ? lastMovingStatistic.frameCount
                          / std::chrono::duration_cast<Sec>(lastMovingStatistic.period).count()
                      : std::numeric_limits<double>::signaling_NaN();

    return metrics;
}
+
// Whole-run averages, combining all completed windows with the in-progress
// one; fields are NaN before the first frame has been recorded.
PerformanceMetrics::Metrics PerformanceMetrics::getTotal() const {
    Metrics metrics;

    int frameCount = totalStatistic.frameCount + currentMovingStatistic.frameCount;
    if (frameCount != 0) {
        metrics.latency = std::chrono::duration_cast<Ms>(
            totalStatistic.latency + currentMovingStatistic.latency).count() / frameCount;
        metrics.fps = frameCount / std::chrono::duration_cast<Sec>(
            totalStatistic.period + currentMovingStatistic.period).count();
    } else {
        metrics.latency = std::numeric_limits<double>::signaling_NaN();
        metrics.fps = std::numeric_limits<double>::signaling_NaN();
    }

    return metrics;
}
+
// Logs the whole-run average latency and FPS via slog.
void PerformanceMetrics::logTotal() const {
    Metrics metrics = getTotal();

    slog::info << "\tLatency: " << std::fixed << std::setprecision(1) << metrics.latency << " ms" << slog::endl;
    slog::info << "\tFPS: " << metrics.fps << slog::endl;
}
+
+void logLatencyPerStage(double readLat, double preprocLat, double inferLat, double postprocLat, double renderLat) {
+ slog::info << "\tDecoding:\t" << std::fixed << std::setprecision(1) <<
+ readLat << " ms" << slog::endl;
+ slog::info << "\tPreprocessing:\t" << preprocLat << " ms" << slog::endl;
+ slog::info << "\tInference:\t" << inferLat << " ms" << slog::endl;
+ slog::info << "\tPostprocessing:\t" << postprocLat << " ms" << slog::endl;
+ slog::info << "\tRendering:\t" << renderLat << " ms" << slog::endl;
+}
diff --git a/python/openvino/runtime/common/demo_utils/src/w_dirent.hpp b/python/openvino/runtime/common/demo_utils/src/w_dirent.hpp
new file mode 100644
index 0000000..0df8636
--- /dev/null
+++ b/python/openvino/runtime/common/demo_utils/src/w_dirent.hpp
@@ -0,0 +1,114 @@
+// Copyright (C) 2018-2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#if defined(_WIN32)
+
+#ifndef NOMINMAX
+# define NOMINMAX
+#endif
+
+#include <WinSock2.h>
+#include <Windows.h>
+#include <stdlib.h>
+
+#else
+
+#include <unistd.h>
+#include <cstdlib>
+#include <string.h>
+
+#endif
+
+#include <string>
+
+#include <sys/stat.h>
+
+#if defined(WIN32)
+ // Copied from linux libc sys/stat.h:
+ #define S_ISREG(m) (((m) & S_IFMT) == S_IFREG)
+ #define S_ISDIR(m) (((m) & S_IFMT) == S_IFDIR)
+#endif
+
// Minimal POSIX-dirent lookalike for Windows: owns a narrow (multibyte) copy
// of the wide file name in d_name, freed in the destructor.
struct dirent {
    char *d_name;

    explicit dirent(const wchar_t *wsFilePath) {
        size_t i;
        auto slen = wcslen(wsFilePath);
        // NOTE(review): allocates one byte per wide character; names whose
        // multibyte encoding is longer would be truncated by wcstombs_s —
        // confirm this is acceptable for the expected (ASCII) file names.
        d_name = static_cast<char*>(malloc(slen + 1));
        wcstombs_s(&i, d_name, slen + 1, wsFilePath, slen);
    }

    ~dirent() {
        free(d_name);
    }
};
+
+class DIR {
+ WIN32_FIND_DATAA FindFileData;
+ HANDLE hFind;
+ dirent *next;
+
+ static inline bool endsWith(const std::string &src, const char *with) {
+ int wl = static_cast<int>(strlen(with));
+ int so = static_cast<int>(src.length()) - wl;
+ if (so < 0) return false;
+ return 0 == strncmp(with, &src[so], wl);
+ }
+
+public:
+ explicit DIR(const char *dirPath) : next(nullptr) {
+ std::string ws = dirPath;
+ if (endsWith(ws, "\\"))
+ ws += "*";
+ else
+ ws += "\\*";
+ hFind = FindFirstFileA(ws.c_str(), &FindFileData);
+ FindFileData.dwReserved0 = hFind != INVALID_HANDLE_VALUE;
+ }
+
+ ~DIR() {
+ if (!next) delete next;
+ FindClose(hFind);
+ }
+
+ bool isValid() const {
+ return (hFind != INVALID_HANDLE_VALUE && FindFileData.dwReserved0);
+ }
+
+ dirent* nextEnt() {
+ if (next != nullptr) delete next;
+ next = nullptr;
+
+ if (!FindFileData.dwReserved0) return nullptr;
+
+ wchar_t wbuf[4096];
+
+ size_t outSize;
+ mbstowcs_s(&outSize, wbuf, 4094, FindFileData.cFileName, 4094);
+ next = new dirent(wbuf);
+ FindFileData.dwReserved0 = FindNextFileA(hFind, &FindFileData);
+ return next;
+ }
+};
+
+
// POSIX-style opendir shim: returns nullptr when the directory can't be
// opened; a non-null result must be released with closedir().
static DIR *opendir(const char* dirPath) {
    auto dp = new DIR(dirPath);
    if (!dp->isValid()) {
        delete dp;
        return nullptr;
    }
    return dp;
}
+
// POSIX-style readdir shim: the returned entry is owned by `dp` and is
// invalidated by the next readdir() call on the same handle.
static struct dirent *readdir(DIR *dp) {
    return dp->nextEnt();
}
+
// POSIX-style closedir shim: releases the handle and its last entry.
static void closedir(DIR *dp) {
    delete dp;
}
diff --git a/python/openvino/runtime/common/format_reader/CMakeLists.txt b/python/openvino/runtime/common/format_reader/CMakeLists.txt
new file mode 100644
index 0000000..3daab96
--- /dev/null
+++ b/python/openvino/runtime/common/format_reader/CMakeLists.txt
@@ -0,0 +1,55 @@
+# Copyright (C) 2018-2022 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+
set (TARGET_NAME "format_reader")

file (GLOB MAIN_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp)
file (GLOB LIBRARY_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/*.h)

# Create named folders for the sources within the .vcproj
# Empty name lists them directly under the .vcproj
# Fix: sources are globbed into MAIN_SRC above; the previous ${LIBRARY_SRC}
# was never defined, leaving the "src" IDE folder silently empty.
source_group("src" FILES ${MAIN_SRC})
source_group("include" FILES ${LIBRARY_HEADERS})

# Create library file from sources.
add_library(${TARGET_NAME} SHARED ${MAIN_SRC} ${LIBRARY_HEADERS})

# Find OpenCV components if exist
find_package(OpenCV QUIET COMPONENTS core imgproc imgcodecs)
if(NOT OpenCV_FOUND)
    message(WARNING "OpenCV is disabled or not found, ${TARGET_NAME} will be built without OpenCV support")
else()
    target_link_libraries(${TARGET_NAME} PRIVATE ${OpenCV_LIBRARIES} ie_samples_utils)
    if(UNIX AND NOT APPLE)
        # Workaround issue that rpath-link is missing for PRIVATE dependencies
        # Fixed in cmake 3.16.0 https://gitlab.kitware.com/cmake/cmake/issues/19556
        target_link_libraries(${TARGET_NAME} INTERFACE "-Wl,-rpath-link,${OpenCV_INSTALL_PATH}/lib")
    endif()
    # Make this definition public so that it's also seen by dla benchmark. As dla benchmark
    # uses this macro to identify which image extensions are supported by the image reader
    target_compile_definitions(${TARGET_NAME} PUBLIC USE_OPENCV)
endif()

target_compile_definitions(${TARGET_NAME} PRIVATE IMPLEMENT_FORMAT_READER)

target_include_directories(${TARGET_NAME} PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}"
                                                 "${CMAKE_CURRENT_SOURCE_DIR}/..")

set_target_properties(${TARGET_NAME} PROPERTIES COMPILE_PDB_NAME ${TARGET_NAME}
                                                FOLDER cpp_samples)

if(COMMAND add_clang_format_target)
    add_clang_format_target(${TARGET_NAME}_clang FOR_TARGETS ${TARGET_NAME})
endif()

install(
    TARGETS ${TARGET_NAME}
    RUNTIME DESTINATION samples_bin/ COMPONENT samples_bin EXCLUDE_FROM_ALL
    LIBRARY DESTINATION samples_bin/ COMPONENT samples_bin EXCLUDE_FROM_ALL
)

install(TARGETS ${TARGET_NAME}
        RUNTIME DESTINATION "dla/bin" COMPONENT EMUTEST
        LIBRARY DESTINATION "dla/lib" COMPONENT EMUTEST
        ARCHIVE DESTINATION "dla/lib" COMPONENT EMUTEST)
diff --git a/python/openvino/runtime/common/format_reader/MnistUbyte.cpp b/python/openvino/runtime/common/format_reader/MnistUbyte.cpp
new file mode 100644
index 0000000..182ef99
--- /dev/null
+++ b/python/openvino/runtime/common/format_reader/MnistUbyte.cpp
@@ -0,0 +1,66 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+// clang-format off
+#include <fstream>
+#include <iostream>
+#include <string>
+
+#include "MnistUbyte.h"
+// clang-format on
+
+using namespace FormatReader;
+
+int MnistUbyte::reverseInt(int i) {
+ unsigned char ch1, ch2, ch3, ch4;
+ ch1 = (unsigned char)(i & 255);
+ ch2 = (unsigned char)((i >> 8) & 255);
+ ch3 = (unsigned char)((i >> 16) & 255);
+ ch4 = (unsigned char)((i >> 24) & 255);
+ return (static_cast<int>(ch1) << 24) + (static_cast<int>(ch2) << 16) + (static_cast<int>(ch3) << 8) + ch4;
+}
+
// Loads the first image from an MNIST idx3-ubyte file. On any format problem
// (unopenable file, wrong magic number) the constructor returns early, leaving
// the reader in its empty default state.
MnistUbyte::MnistUbyte(const std::string& filename) {
    std::ifstream file(filename, std::ios::binary);
    if (!file.is_open()) {
        return;
    }
    int magic_number = 0;
    int number_of_images = 0;
    int n_rows = 0;
    int n_cols = 0;
    // Header fields are big-endian; reverseInt converts them to host order.
    file.read(reinterpret_cast<char*>(&magic_number), sizeof(magic_number));
    magic_number = reverseInt(magic_number);
    // 2051 (0x00000803) is the magic number of idx3-ubyte image files.
    if (magic_number != 2051) {
        return;
    }
    file.read(reinterpret_cast<char*>(&number_of_images), sizeof(number_of_images));
    number_of_images = reverseInt(number_of_images);
    file.read(reinterpret_cast<char*>(&n_rows), sizeof(n_rows));
    n_rows = reverseInt(n_rows);
    _height = (size_t)n_rows;
    file.read(reinterpret_cast<char*>(&n_cols), sizeof(n_cols));
    n_cols = reverseInt(n_cols);
    _width = (size_t)n_cols;
    if (number_of_images > 1) {
        std::cout << "[MNIST] Warning: number_of_images in mnist file equals " << number_of_images
                  << ". Only a first image will be read." << std::endl;
    }

    // Single-channel (grayscale) image.
    size_t size = _width * _height * 1;

    _data.reset(new unsigned char[size], std::default_delete<unsigned char[]>());
    // NOTE(review): stream state is not checked during the pixel loop — a
    // truncated file yields zero-filled trailing pixels; confirm acceptable.
    size_t count = 0;
    if (0 < number_of_images) {
        for (int r = 0; r < n_rows; ++r) {
            for (int c = 0; c < n_cols; ++c) {
                unsigned char temp = 0;
                file.read(reinterpret_cast<char*>(&temp), sizeof(temp));
                _data.get()[count++] = temp;
            }
        }
    }

    file.close();
}
diff --git a/python/openvino/runtime/common/format_reader/MnistUbyte.h b/python/openvino/runtime/common/format_reader/MnistUbyte.h
new file mode 100644
index 0000000..8991166
--- /dev/null
+++ b/python/openvino/runtime/common/format_reader/MnistUbyte.h
@@ -0,0 +1,58 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+/**
+ * \brief Mnist reader
+ * \file MnistUbyte.h
+ */
+#pragma once
+
+#include <memory>
+#include <string>
+
+// clang-format off
+#include "format_reader.h"
+#include "register.h"
+// clang-format on
+
+namespace FormatReader {
+/**
+ * \class MnistUbyte
+ * \brief Reader for mnist db files
+ */
+class MnistUbyte : public Reader {
+private:
+ int reverseInt(int i);
+
+ // Self-registration hook: defined in format_reader.cpp, its constructor
+ // adds this reader's creator to the Registry.
+ static Register<MnistUbyte> reg;
+
+public:
+ /**
+ * \brief Constructor of Mnist reader
+ * @param filename - path to input data
+ * @return MnistUbyte reader object
+ */
+ explicit MnistUbyte(const std::string& filename);
+ virtual ~MnistUbyte() {}
+
+ /**
+ * \brief Get size
+ * @return image size in bytes (width * height, single grayscale channel)
+ */
+ size_t size() const override {
+ return _width * _height * 1;
+ }
+
+ // langsu: ResizeType was added by us to support custom resizing functions (only in opencv_wrapper).
+ // format_reader is copied from openvino samples/cpp/common/format_reader/
+ // this might need special care when doing a OV uplift
+ // Resizing is not supported by this reader: if a different size than the
+ // file's own is requested, a warning is printed and nullptr is returned.
+ std::shared_ptr<unsigned char> getData(size_t width, size_t height, ResizeType resize_type) override {
+ if ((width * height != 0) && (_width * _height != width * height)) {
+ std::cout << "[ WARNING ] Image won't be resized! Please use OpenCV.\n";
+ return nullptr;
+ }
+ return _data;
+ }
+};
+} // namespace FormatReader
diff --git a/python/openvino/runtime/common/format_reader/bmp.cpp b/python/openvino/runtime/common/format_reader/bmp.cpp
new file mode 100644
index 0000000..240d13f
--- /dev/null
+++ b/python/openvino/runtime/common/format_reader/bmp.cpp
@@ -0,0 +1,64 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+// clang-format off
+#include <fstream>
+#include <iostream>
+
+#include "bmp.h"
+// clang-format on
+
+using namespace std;
+using namespace FormatReader;
+
+// Parse a 24-bpp uncompressed BMP file. On failure the constructor returns
+// early; _width/_height may remain zero so size() == 0 rejects the file.
+BitMap::BitMap(const string& filename) {
+ BmpHeader header;
+ BmpInfoHeader infoHeader;
+
+ ifstream input(filename, ios::binary);
+ if (!input) {
+ return;
+ }
+
+ // Header fields are read one by one to avoid struct padding/alignment issues.
+ input.read(reinterpret_cast<char*>(&header.type), 2);
+
+ // The BMP magic is the ASCII pair "BM"; on a little-endian read that is
+ // 'M' * 256 + 'B'.
+ if (header.type != 'M' * 256 + 'B') {
+ std::cerr << "[BMP] file is not bmp type\n";
+ return;
+ }
+
+ input.read(reinterpret_cast<char*>(&header.size), 4);
+ input.read(reinterpret_cast<char*>(&header.reserved), 4);
+ input.read(reinterpret_cast<char*>(&header.offset), 4);
+
+ input.read(reinterpret_cast<char*>(&infoHeader), sizeof(BmpInfoHeader));
+
+ // A negative height means the rows are stored top-down instead of the
+ // default bottom-up order.
+ bool rowsReversed = infoHeader.height < 0;
+ _width = infoHeader.width;
+ _height = abs(infoHeader.height);
+
+ if (infoHeader.bits != 24) {
+ cerr << "[BMP] 24bpp only supported. But input has:" << infoHeader.bits << "\n";
+ return;
+ }
+
+ // NOTE(review): execution continues after this warning, so pixel data of a
+ // compressed BMP would be misread — consider returning here; verify intent.
+ if (infoHeader.compression != 0) {
+ cerr << "[BMP] compression not supported\n";
+ }
+
+ // Each row is padded to a 4-byte boundary; with 3 bytes per pixel the pad
+ // length (4 - 3*width % 4) % 4 works out to exactly width % 4.
+ int padSize = _width & 3;
+ char pad[3];
+ size_t size = _width * _height * 3;
+
+ _data.reset(new unsigned char[size], std::default_delete<unsigned char[]>());
+
+ input.seekg(header.offset, ios::beg);
+
+ // reading by rows in invert vertically
+ for (uint32_t i = 0; i < _height; i++) {
+ uint32_t storeAt = rowsReversed ? i : (uint32_t)_height - 1 - i;
+ input.read(reinterpret_cast<char*>(_data.get()) + _width * 3 * storeAt, _width * 3);
+ input.read(pad, padSize);
+ }
+}
diff --git a/python/openvino/runtime/common/format_reader/bmp.h b/python/openvino/runtime/common/format_reader/bmp.h
new file mode 100644
index 0000000..ac3ff31
--- /dev/null
+++ b/python/openvino/runtime/common/format_reader/bmp.h
@@ -0,0 +1,75 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+/**
+ * \brief BMP reader
+ * \file bmp.h
+ */
+#pragma once
+
+#include <memory>
+#include <string>
+
+// clang-format off
+#include "format_reader.h"
+#include "register.h"
+// clang-format on
+
+namespace FormatReader {
+/**
+ * \class BitMap
+ * \brief Reader for bmp files
+ */
+class BitMap : public Reader {
+private:
+ // Self-registration hook: defined in format_reader.cpp (only when OpenCV
+ // is unavailable, since OCVReader also handles BMP).
+ static Register<BitMap> reg;
+
+ // File header; fields are read individually in the constructor, so this
+ // struct's layout/padding does not need to match the on-disk layout.
+ typedef struct BmpHeaderType {
+ unsigned short type = 0u; /* Magic identifier */
+ unsigned int size = 0u; /* File size in bytes */
+ unsigned int reserved = 0u;
+ unsigned int offset = 0u; /* Offset to image data, bytes */
+ } BmpHeader;
+
+ // BITMAPINFOHEADER-style info block, read in one go in the constructor.
+ typedef struct BmpInfoHeaderType {
+ unsigned int size = 0u; /* Header size in bytes */
+ int width = 0, height = 0; /* Width and height of image */
+ unsigned short planes = 0u; /* Number of colour planes */
+ unsigned short bits = 0u; /* Bits per pixel */
+ unsigned int compression = 0u; /* Compression type */
+ unsigned int imagesize = 0u; /* Image size in bytes */
+ int xresolution = 0, yresolution = 0; /* Pixels per meter */
+ unsigned int ncolours = 0u; /* Number of colours */
+ unsigned int importantcolours = 0u; /* Important colours */
+ } BmpInfoHeader;
+
+public:
+ /**
+ * \brief Constructor of BMP reader
+ * @param filename - path to input data
+ * @return BitMap reader object
+ */
+ explicit BitMap(const std::string& filename);
+ virtual ~BitMap() {}
+
+ /**
+ * \brief Get size
+ * @return image size in bytes (width * height * 3 BGR channels)
+ */
+ size_t size() const override {
+ return _width * _height * 3;
+ }
+
+ // langsu: ResizeType was added by us to support custom resizing functions (only in opencv_wrapper).
+ // format_reader is copied from openvino samples/cpp/common/format_reader/
+ // this might need special care when doing a OV uplift
+ // Resizing is not supported by this reader: if a different size than the
+ // file's own is requested, a warning is printed and nullptr is returned.
+ std::shared_ptr<unsigned char> getData(size_t width, size_t height, ResizeType resize_type) override {
+ if ((width * height != 0) && (_width * _height != width * height)) {
+ std::cout << "[ WARNING ] Image won't be resized! Please use OpenCV.\n";
+ return nullptr;
+ }
+ return _data;
+ }
+};
+} // namespace FormatReader
diff --git a/python/openvino/runtime/common/format_reader/format_reader.cpp b/python/openvino/runtime/common/format_reader/format_reader.cpp
new file mode 100644
index 0000000..94a8441
--- /dev/null
+++ b/python/openvino/runtime/common/format_reader/format_reader.cpp
@@ -0,0 +1,44 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <iostream>
+
+// clang-format off
+#include "bmp.h"
+#include "MnistUbyte.h"
+#include "yuv_nv12.h"
+#include "opencv_wrapper.h"
+#include "format_reader.h"
+// clang-format on
+
+using namespace FormatReader;
+
+// Storage for the registry of reader-creation functions (one per format).
+std::vector<Registry::CreatorFunction> Registry::_data;
+
+// Instantiating a Register<T> pushes T's creator into the registry at static
+// initialization time. With OpenCV available, OCVReader replaces BitMap
+// because it handles BMP (and more) itself.
+Register<MnistUbyte> MnistUbyte::reg;
+Register<YUV_NV12> YUV_NV12::reg;
+#ifdef USE_OPENCV
+Register<OCVReader> OCVReader::reg;
+#else
+Register<BitMap> BitMap::reg;
+#endif
+
+// Try each registered reader in turn. A reader signals "not my format" by
+// constructing with size() == 0; such candidates are destroyed and skipped.
+// The caller owns the returned pointer (nullptr if no reader matched).
+Reader* Registry::CreateReader(const char* filename) {
+ for (const auto &maker : _data) {
+ Reader* ol = maker(filename);
+ if (ol != nullptr && ol->size() != 0)
+ return ol;
+ if (ol != nullptr)
+ delete ol;
+ }
+ return nullptr;
+}
+
+// Append a creation function; called from Register<T> constructors.
+void Registry::RegisterReader(CreatorFunction f) {
+ _data.push_back(f);
+}
+
+// C-linkage entry point exported from the format_reader library.
+FORMAT_READER_API(Reader*) CreateFormatReader(const char* filename) {
+ return Registry::CreateReader(filename);
+}
diff --git a/python/openvino/runtime/common/format_reader/format_reader.h b/python/openvino/runtime/common/format_reader/format_reader.h
new file mode 100644
index 0000000..99fc573
--- /dev/null
+++ b/python/openvino/runtime/common/format_reader/format_reader.h
@@ -0,0 +1,95 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+/**
+ * \brief Format reader abstract class implementation
+ * \file format_reader.h
+ */
+#pragma once
+
+#include <iostream>
+#include <memory>
+#include <string>
+#include <vector>
+
+#if defined(_WIN32)
+# ifdef IMPLEMENT_FORMAT_READER
+# define FORMAT_READER_API(type) extern "C" __declspec(dllexport) type
+# else
+# define FORMAT_READER_API(type) extern "C" type
+# endif
+#elif (__GNUC__ >= 4)
+# ifdef IMPLEMENT_FORMAT_READER
+# define FORMAT_READER_API(type) extern "C" __attribute__((visibility("default"))) type
+# else
+# define FORMAT_READER_API(type) extern "C" type
+# endif
+#else
+# define FORMAT_READER_API(TYPE) extern "C" TYPE
+#endif
+
+namespace FormatReader {
+/**
+ * \class FormatReader
+ * \brief This is an abstract class for reading input data
+ */
+class Reader {
+protected:
+ /// \brief height
+ size_t _height = 0;
+ /// \brief width
+ size_t _width = 0;
+ /// \brief data (shared so getData() can hand out the internal buffer)
+ std::shared_ptr<unsigned char> _data;
+
+public:
+ virtual ~Reader() = default;
+
+ // langsu: ResizeType was added by us to support custom resizing functions (only in opencv_wrapper).
+ // format_reader is copied from openvino samples/cpp/common/format_reader/
+ // this might need special care when doing a OV uplift
+ enum ResizeType {
+ RESIZE, // resize the image to target (height, width)
+ PAD_RESIZE, // pad the image into a squared image and then resize the image to target (height, width)
+ };
+
+ /**
+ * \brief Get width
+ * @return width
+ */
+ size_t width() const {
+ return _width;
+ }
+
+ /**
+ * \brief Get height
+ * @return height
+ */
+ size_t height() const {
+ return _height;
+ }
+
+ /**
+ * \brief Get input data ptr
+ * @return shared pointer with input data
+ * @In case of using OpenCV, parameters width and height will be used for image resizing
+ */
+ // langsu: ResizeType was added by us to support custom resizing functions (only in opencv_wrapper).
+ // Needs special care when doing a OV uplift
+ // width/height of 0 mean "native size"; non-OpenCV readers return nullptr
+ // when a size differing from the file's own is requested.
+ virtual std::shared_ptr<unsigned char> getData(size_t width = 0, size_t height = 0,
+ ResizeType resize_type = ResizeType::RESIZE) = 0;
+
+ /**
+ * \brief Get size
+ * @return size of the data buffer in bytes (0 means the file was rejected)
+ */
+ virtual size_t size() const = 0;
+};
+} // namespace FormatReader
+
+/**
+ * \brief Function for create reader
+ * @return FormatReader pointer (caller owns it; nullptr if no format matched)
+ */
+FORMAT_READER_API(FormatReader::Reader*) CreateFormatReader(const char* filename);
diff --git a/python/openvino/runtime/common/format_reader/format_reader_ptr.h b/python/openvino/runtime/common/format_reader/format_reader_ptr.h
new file mode 100644
index 0000000..eb9bf8e
--- /dev/null
+++ b/python/openvino/runtime/common/format_reader/format_reader_ptr.h
@@ -0,0 +1,43 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+/**
+ * \brief Implementation of smart pointer for Reader class
+ * \file format_reader_ptr.h
+ */
+#pragma once
+
+#include <functional>
+#include <memory>
+
+#include "format_reader.h"
+
+namespace FormatReader {
+// Owning smart-pointer wrapper around a Reader created by CreateFormatReader;
+// the unique_ptr releases the reader when the ReaderPtr goes out of scope.
+class ReaderPtr {
+public:
+ explicit ReaderPtr(const char* imageName) : reader(CreateFormatReader(imageName)) {}
+ /**
+ * @brief dereference operator overload
+ * @return Reader (may be nullptr if no reader matched the file)
+ */
+ Reader* operator->() const noexcept {
+ return reader.get();
+ }
+
+ /**
+ * @brief dereference operator overload
+ * @return Reader (may be nullptr if no reader matched the file)
+ */
+ Reader* operator*() const noexcept {
+ return reader.get();
+ }
+
+ // Raw, non-owning access; used e.g. for null checks after construction.
+ Reader* get() {
+ return reader.get();
+ }
+
+protected:
+ std::unique_ptr<Reader> reader;
+};
+} // namespace FormatReader
diff --git a/python/openvino/runtime/common/format_reader/opencv_wrapper.cpp b/python/openvino/runtime/common/format_reader/opencv_wrapper.cpp
new file mode 100644
index 0000000..b8ebeef
--- /dev/null
+++ b/python/openvino/runtime/common/format_reader/opencv_wrapper.cpp
@@ -0,0 +1,83 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#ifdef USE_OPENCV
+# include <fstream>
+# include <iostream>
+
+// clang-format off
+# include <opencv2/opencv.hpp>
+
+# include "samples/slog.hpp"
+# include "opencv_wrapper.h"
+// clang-format on
+
+using namespace std;
+using namespace FormatReader;
+
+// Load an image via cv::imread. An unreadable/unsupported file leaves
+// _size == 0, which makes the registry reject this reader for the file.
+OCVReader::OCVReader(const string& filename) {
+ img = cv::imread(filename);
+ _size = 0;
+
+ if (img.empty()) {
+ return;
+ }
+
+ _size = img.size().width * img.size().height * img.channels();
+ _width = img.size().width;
+ _height = img.size().height;
+}
+
+// Set the maximum number of printed warnings; large image directories can otherwise be overwhelming
+static size_t resize_warning_count = 0;
+const size_t max_resize_warnings = 5;
+
+// Default arguments here mirror those on the Reader::getData declaration.
+// width/height of 0 mean "keep the native image size".
+std::shared_ptr<unsigned char> OCVReader::getData(size_t width = 0, size_t height = 0, ResizeType resize_type = ResizeType::RESIZE) {
+ if (width == 0)
+ width = img.cols;
+
+ if (height == 0)
+ height = img.rows;
+
+ size_t size = width * height * img.channels();
+ _data.reset(new unsigned char[size], std::default_delete<unsigned char[]>());
+
+ if (width != static_cast<size_t>(img.cols) || height != static_cast<size_t>(img.rows)) {
+ if (resize_warning_count < max_resize_warnings) {
+ slog::warn << "Image is resized from (" << img.cols << ", " << img.rows << ") to (" << width << ", " << height
+ << ")" << slog::endl;
+ resize_warning_count++;
+ } else if (resize_warning_count == max_resize_warnings) {
+ slog::warn << "Additional image resizing messages have been suppressed." << slog::endl;
+ resize_warning_count++;
+ }
+ }
+
+ cv::Mat resized;
+ if (resize_type == ResizeType::RESIZE) {
+ // Wrap _data in a Mat header so cv::resize writes straight into the
+ // shared buffer — no extra copy needed afterwards.
+ resized = cv::Mat(cv::Size(width, height), img.type(), _data.get());
+ // cv::resize() just copy data to output image if sizes are the same
+ cv::resize(img, resized, cv::Size(width, height));
+ } else if (resize_type == ResizeType::PAD_RESIZE)
+ {
+ cv::Mat padded;
+ // Find the larger side out of width and height of the image
+ int max_dim = std::max(img.rows, img.cols);
+ // Calculate padding for shorter dimension
+ int top = (max_dim - img.rows) / 2;
+ int bottom = (max_dim - img.rows + 1) / 2;
+ int left = (max_dim - img.cols) / 2;
+ int right = (max_dim - img.cols + 1) / 2;
+ // Add padding (0, i.e., black) to make the image a square
+ cv::copyMakeBorder(img, padded, top, bottom, left, right, cv::BORDER_CONSTANT, cv::Scalar());
+ cv::resize(padded, resized, cv::Size(width, height));
+ // Here cv::resize allocated its own output, so copy into the shared buffer.
+ std::memcpy(_data.get(), resized.data, resized.total() * resized.elemSize());
+ } else {
+ slog::err << "Specified resize type is not implemented." << slog::endl;
+ std::exit(1);
+ }
+
+ return _data;
+}
+#endif
diff --git a/python/openvino/runtime/common/format_reader/opencv_wrapper.h b/python/openvino/runtime/common/format_reader/opencv_wrapper.h
new file mode 100644
index 0000000..c402e8d
--- /dev/null
+++ b/python/openvino/runtime/common/format_reader/opencv_wrapper.h
@@ -0,0 +1,58 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+/**
+ * \brief Image reader
+ * \file opencv_wrapper.h
+ */
+#pragma once
+
+#ifdef USE_OPENCV
+# include <memory>
+# include <string>
+
+// clang-format off
+# include <opencv2/opencv.hpp>
+
+# include "format_reader.h"
+# include "register.h"
+// clang-format on
+
+namespace FormatReader {
+/**
+ * \class OCVReader
+ * \brief OpenCV-backed reader for any format cv::imread supports
+ */
+class OCVReader : public Reader {
+private:
+ cv::Mat img;
+ // Total buffer size in bytes (width * height * channels); 0 on load failure.
+ size_t _size;
+ // Self-registration hook: defined in format_reader.cpp.
+ static Register<OCVReader> reg;
+
+public:
+ /**
+ * \brief Constructor of the OpenCV reader
+ * @param filename - path to input data
+ * @return OCVReader reader object
+ */
+ explicit OCVReader(const std::string& filename);
+ virtual ~OCVReader() {}
+
+ /**
+ * \brief Get size
+ * @return image size in bytes (0 if the file could not be decoded)
+ */
+ size_t size() const override {
+ return _size;
+ }
+
+ // langsu: ResizeType was added by us to support custom resizing functions (only in opencv_wrapper).
+ // format_reader is copied from openvino samples/cpp/common/format_reader/
+ // this might need special care when doing a OV uplift
+ std::shared_ptr<unsigned char> getData(size_t width,
+ size_t height,
+ ResizeType resize_type) override;
+};
+} // namespace FormatReader
+#endif
diff --git a/python/openvino/runtime/common/format_reader/register.h b/python/openvino/runtime/common/format_reader/register.h
new file mode 100644
index 0000000..781eca3
--- /dev/null
+++ b/python/openvino/runtime/common/format_reader/register.h
@@ -0,0 +1,58 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+/**
+ * \brief Register for readers
+ * \file register.h
+ */
+#pragma once
+
+#include <functional>
+#include <string>
+#include <vector>
+
+#include "format_reader.h"
+
+namespace FormatReader {
+/**
+ * \class Registry
+ * \brief Factory registry that creates a matching reader for a given file
+ */
+class Registry {
+private:
+ typedef std::function<Reader*(const std::string& filename)> CreatorFunction;
+ // All registered creators, tried in registration order by CreateReader.
+ static std::vector<CreatorFunction> _data;
+
+public:
+ /**
+ * \brief Create reader
+ * @param filename - path to input data
+ * @return Reader for input data or nullptr (caller owns the pointer)
+ */
+ static Reader* CreateReader(const char* filename);
+
+ /**
+ * \brief Registers a reader in the factory
+ * @param f - a creation function
+ */
+ static void RegisterReader(CreatorFunction f);
+};
+
+/**
+ * \class Register
+ * \brief Registers reader type To in the factory; instantiate one static
+ * Register<To> per reader class (see format_reader.cpp)
+ */
+template <typename To>
+class Register {
+public:
+ /**
+ * \brief Constructor creates a creation function and registers it
+ * @return Register object
+ */
+ Register() {
+ Registry::RegisterReader([](const std::string& filename) -> Reader* {
+ return new To(filename);
+ });
+ }
+};
+} // namespace FormatReader
diff --git a/python/openvino/runtime/common/format_reader/yuv_nv12.cpp b/python/openvino/runtime/common/format_reader/yuv_nv12.cpp
new file mode 100644
index 0000000..f25c5cb
--- /dev/null
+++ b/python/openvino/runtime/common/format_reader/yuv_nv12.cpp
@@ -0,0 +1,36 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+// clang-format off
+#include <fstream>
+#include <iostream>
+#include <string>
+
+#include "yuv_nv12.h"
+// clang-format on
+
+using namespace FormatReader;
+
+// Load a raw NV12 frame. The file is accepted only if its name ends in
+// ".yuv"; the buffer size is simply the file length (no header to parse).
+// On rejection _size stays 0 so the registry skips this reader.
+YUV_NV12::YUV_NV12(const std::string& filename) {
+ auto pos = filename.rfind('.');
+ if (pos == std::string::npos)
+ return;
+ if (filename.substr(pos + 1) != "yuv")
+ return;
+
+ std::ifstream file(filename, std::ios::binary);
+ if (!file.is_open()) {
+ return;
+ }
+
+ // Determine the file length by seeking to the end.
+ file.seekg(0, file.end);
+ _size = file.tellg();
+ file.seekg(0, file.beg);
+
+ _data.reset(new unsigned char[_size], std::default_delete<unsigned char[]>());
+
+ file.read(reinterpret_cast<char*>(_data.get()), _size);
+
+ file.close();
+}
diff --git a/python/openvino/runtime/common/format_reader/yuv_nv12.h b/python/openvino/runtime/common/format_reader/yuv_nv12.h
new file mode 100644
index 0000000..dd74c7b
--- /dev/null
+++ b/python/openvino/runtime/common/format_reader/yuv_nv12.h
@@ -0,0 +1,57 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+/**
+ * \brief YUV NV12 reader
+ * \file yuv_nv12.h
+ */
+#pragma once
+
+#include <memory>
+#include <string>
+
+// clang-format off
+#include "format_reader.h"
+#include "register.h"
+// clang-format on
+
+namespace FormatReader {
+/**
+ * \class YUV_NV12
+ * \brief Reader for YUV NV12 files
+ */
+class YUV_NV12 : public Reader {
+private:
+ // Self-registration hook: defined in format_reader.cpp.
+ static Register<YUV_NV12> reg;
+ // Raw file length in bytes; 0 when the file was rejected.
+ size_t _size = 0;
+
+public:
+ /**
+ * \brief Constructor of YUV NV12 reader
+ * @param filename - path to input data
+ * @return YUV_NV12 reader object
+ */
+ explicit YUV_NV12(const std::string& filename);
+ virtual ~YUV_NV12() {}
+
+ /**
+ * \brief Get size
+ * @return raw frame size in bytes
+ */
+ size_t size() const override {
+ return _size;
+ }
+
+ // langsu: ResizeType was added by us to support custom resizing functions (only in opencv_wrapper).
+ // format_reader is copied from openvino samples/cpp/common/format_reader/
+ // this might need special care when doing a OV uplift
+ // NV12 stores 12 bits per pixel, hence the width * height * 3 / 2 check:
+ // the raw file length must match the caller-supplied dimensions exactly.
+ std::shared_ptr<unsigned char> getData(size_t width, size_t height, Reader::ResizeType resize_type) override {
+ if ((width * height * 3 / 2 != size())) {
+ std::cout << "Image dimensions not match with NV12 file size \n";
+ return nullptr;
+ }
+ return _data;
+ }
+};
+} // namespace FormatReader
diff --git a/python/openvino/runtime/common/models/CMakeLists.txt b/python/openvino/runtime/common/models/CMakeLists.txt
new file mode 100644
index 0000000..07c8da3
--- /dev/null
+++ b/python/openvino/runtime/common/models/CMakeLists.txt
@@ -0,0 +1,15 @@
+# Copyright (C) 2018-2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+file(GLOB SOURCES ./src/*.cpp)
+file(GLOB HEADERS ./include/models/*.h)
+
+# Create named folders for the sources within the .vcproj
+# Empty name lists them directly under the .vcproj
+source_group("src" FILES ${SOURCES})
+source_group("include" FILES ${HEADERS})
+
+add_library(models STATIC ${SOURCES} ${HEADERS})
+target_include_directories(models PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include")
+
+target_link_libraries(models PRIVATE openvino::runtime utils opencv_core opencv_imgproc)
diff --git a/python/openvino/runtime/common/models/include/models/associative_embedding_decoder.h b/python/openvino/runtime/common/models/include/models/associative_embedding_decoder.h
new file mode 100644
index 0000000..94afbda
--- /dev/null
+++ b/python/openvino/runtime/common/models/include/models/associative_embedding_decoder.h
@@ -0,0 +1,94 @@
+/*
+// Copyright (C) 2021-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+#include <stddef.h>
+
+#include <memory>
+#include <vector>
+
+#include <opencv2/core.hpp>
+
+// A single candidate keypoint: its image location, confidence score, and the
+// associative-embedding tag used to group peaks into person instances.
+// The (-1, -1) default keypoint marks an unset/invalid peak.
+struct Peak {
+ explicit Peak(const cv::Point2f& keypoint = cv::Point2f(-1, -1), const float score = 0.0f, const float tag = 0.0f)
+ : keypoint(keypoint),
+ score(score),
+ tag(tag) {}
+
+ cv::Point2f keypoint;
+ float score;
+ float tag;
+};
+
+// A pose under construction: one Peak slot per joint, plus running aggregates
+// (mean tag and mean center) over the joints added so far.
+class Pose {
+public:
+ explicit Pose(size_t numJoints) : peaks(numJoints) {}
+
+ // Store a peak at the given joint index and update the running means:
+ // multiply back by the old count, add the new sample, divide by the new
+ // count — an incremental arithmetic-mean update for tag and center.
+ void add(size_t index, Peak peak) {
+ peaks[index] = peak;
+ sum += peak.score;
+ poseTag = poseTag * static_cast<float>(validPointsNum) + peak.tag;
+ poseCenter = poseCenter * static_cast<float>(validPointsNum) + peak.keypoint;
+ validPointsNum += 1;
+ poseTag = poseTag / static_cast<float>(validPointsNum);
+ poseCenter = poseCenter / static_cast<float>(validPointsNum);
+ }
+
+ // Mean embedding tag over the joints added so far.
+ float getPoseTag() const {
+ return poseTag;
+ }
+
+ // NOTE(review): averaged over the total joint count (size()), not over
+ // validPointsNum — unfilled joints contribute 0; confirm this is intended.
+ float getMeanScore() const {
+ return sum / static_cast<float>(size());
+ }
+
+ Peak& getPeak(size_t index) {
+ return peaks[index];
+ }
+
+ cv::Point2f& getPoseCenter() {
+ return poseCenter;
+ }
+
+ // Total number of joint slots (not the number of filled joints).
+ size_t size() const {
+ return peaks.size();
+ }
+
+private:
+ std::vector<Peak> peaks;
+ cv::Point2f poseCenter = cv::Point2f(0.f, 0.f);
+ int validPointsNum = 0;
+ float poseTag = 0;
+ float sum = 0;
+};
+
+void findPeaks(const std::vector<cv::Mat>& nmsHeatMaps,
+ const std::vector<cv::Mat>& aembdsMaps,
+ std::vector<std::vector<Peak>>& allPeaks,
+ size_t jointId,
+ size_t maxNumPeople,
+ float detectionThreshold);
+
+std::vector<Pose> matchByTag(std::vector<std::vector<Peak>>& allPeaks,
+ size_t maxNumPeople,
+ size_t numJoints,
+ float tagThreshold);
+
+void adjustAndRefine(std::vector<Pose>& allPoses,
+ const std::vector<cv::Mat>& heatMaps,
+ const std::vector<cv::Mat>& aembdsMaps,
+ int poseId,
+ float delta);
diff --git a/python/openvino/runtime/common/models/include/models/classification_model.h b/python/openvino/runtime/common/models/include/models/classification_model.h
new file mode 100644
index 0000000..6d32e44
--- /dev/null
+++ b/python/openvino/runtime/common/models/include/models/classification_model.h
@@ -0,0 +1,57 @@
+/*
+// Copyright (C) 2020-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+#include <stddef.h>
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "models/image_model.h"
+
+namespace ov {
+class Model;
+} // namespace ov
+struct InferenceResult;
+struct ResultBase;
+
+// Image-classification wrapper: runs the model and reports the nTop most
+// confident classes.
+class ClassificationModel : public ImageModel {
+public:
+ /// Constructor
+ /// @param modelFileName name of model to load.
+ /// @param nTop - number of top results.
+ /// Only the nTop highest-confidence classes are kept in the result.
+ /// @param useAutoResize - if true, image will be resized by openvino.
+ /// Otherwise, image will be preprocessed and resized using OpenCV routines.
+ /// @param labels - array of labels for every class.
+ /// @param layout - model input layout
+ ClassificationModel(const std::string& modelFileName,
+ size_t nTop,
+ bool useAutoResize,
+ const std::vector<std::string>& labels,
+ const std::string& layout = "");
+
+ std::unique_ptr<ResultBase> postprocess(InferenceResult& infResult) override;
+
+ // Reads one label per line from labelFilename.
+ static std::vector<std::string> loadLabels(const std::string& labelFilename);
+
+protected:
+ size_t nTop;
+ std::vector<std::string> labels;
+
+ void prepareInputsOutputs(std::shared_ptr<ov::Model>& model) override;
+};
diff --git a/python/openvino/runtime/common/models/include/models/deblurring_model.h b/python/openvino/runtime/common/models/include/models/deblurring_model.h
new file mode 100644
index 0000000..33f5542
--- /dev/null
+++ b/python/openvino/runtime/common/models/include/models/deblurring_model.h
@@ -0,0 +1,52 @@
+/*
+// Copyright (C) 2021-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+#include <stddef.h>
+
+#include <memory>
+#include <string>
+
+#include <opencv2/core/types.hpp>
+
+#include "models/image_model.h"
+
+namespace ov {
+class InferRequest;
+class Model;
+} // namespace ov
+struct InferenceResult;
+struct InputData;
+struct InternalModelData;
+struct ResultBase;
+
+// Image-deblurring wrapper: reshapes the network to the input image size and
+// produces a deblurred image result.
+class DeblurringModel : public ImageModel {
+public:
+ /// Constructor
+ /// @param modelFileName name of model to load
+ /// @param inputImgSize size of image to set model input shape
+ /// @param layout - model input layout
+ DeblurringModel(const std::string& modelFileName, const cv::Size& inputImgSize, const std::string& layout = "");
+
+ std::shared_ptr<InternalModelData> preprocess(const InputData& inputData, ov::InferRequest& request) override;
+ std::unique_ptr<ResultBase> postprocess(InferenceResult& infResult) override;
+
+protected:
+ void prepareInputsOutputs(std::shared_ptr<ov::Model>& model) override;
+ void changeInputSize(std::shared_ptr<ov::Model>& model);
+
+ // Presumably the network requires input dimensions divisible by this
+ // stride (used when reshaping) — TODO confirm against the .cpp.
+ static const size_t stride = 32;
+};
diff --git a/python/openvino/runtime/common/models/include/models/detection_model.h b/python/openvino/runtime/common/models/include/models/detection_model.h
new file mode 100644
index 0000000..4d57a61
--- /dev/null
+++ b/python/openvino/runtime/common/models/include/models/detection_model.h
@@ -0,0 +1,51 @@
+/*
+// Copyright (C) 2020-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+#include <stddef.h>
+
+#include <string>
+#include <vector>
+
+#include "models/image_model.h"
+
+// Common base for object-detection wrappers: holds the confidence threshold
+// and the class-label table shared by all detection model adapters.
+class DetectionModel : public ImageModel {
+public:
+ /// Constructor
+ /// @param modelFileName name of model to load
+ /// @param confidenceThreshold - threshold to eliminate low-confidence detections.
+ /// Any detected object with confidence lower than this threshold will be ignored.
+ /// @param useAutoResize - if true, image will be resized by openvino.
+ /// Otherwise, image will be preprocessed and resized using OpenCV routines.
+ /// @param labels - array of labels for every class. If this array is empty or contains less elements
+ /// than actual classes number, default "Label #N" will be shown for missing items.
+ /// @param layout - model input layout
+ DetectionModel(const std::string& modelFileName,
+ float confidenceThreshold,
+ bool useAutoResize,
+ const std::vector<std::string>& labels,
+ const std::string& layout = "");
+
+ // Reads one label per line from labelFilename.
+ static std::vector<std::string> loadLabels(const std::string& labelFilename);
+
+protected:
+ float confidenceThreshold;
+ std::vector<std::string> labels;
+
+ // Maps a class ID to its label, falling back to "Label #N" when the table
+ // is too short; the size_t cast makes negative IDs take the fallback too.
+ std::string getLabelName(int labelID) {
+ return (size_t)labelID < labels.size() ? labels[labelID] : std::string("Label #") + std::to_string(labelID);
+ }
+};
diff --git a/python/openvino/runtime/common/models/include/models/detection_model_centernet.h b/python/openvino/runtime/common/models/include/models/detection_model_centernet.h
new file mode 100644
index 0000000..db9ebdb
--- /dev/null
+++ b/python/openvino/runtime/common/models/include/models/detection_model_centernet.h
@@ -0,0 +1,59 @@
+/*
+// Copyright (C) 2020-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "models/detection_model.h"
+
+namespace ov {
+class InferRequest;
+class Model;
+} // namespace ov
+struct InferenceResult;
+struct InputData;
+struct InternalModelData;
+struct ResultBase;
+
+// CenterNet object-detection adapter.
+class ModelCenterNet : public DetectionModel {
+public:
+ // Axis-aligned box with inclusive pixel coordinates, hence the +1 in the
+ // width/height computations.
+ struct BBox {
+ float left;
+ float top;
+ float right;
+ float bottom;
+
+ float getWidth() const {
+ return (right - left) + 1.0f;
+ }
+ float getHeight() const {
+ return (bottom - top) + 1.0f;
+ }
+ };
+ // Initial reserve() size for detection vectors to limit reallocations.
+ static const int INIT_VECTOR_SIZE = 200;
+
+ ModelCenterNet(const std::string& modelFileName,
+ float confidenceThreshold,
+ const std::vector<std::string>& labels = std::vector<std::string>(),
+ const std::string& layout = "");
+ std::shared_ptr<InternalModelData> preprocess(const InputData& inputData, ov::InferRequest& request) override;
+ std::unique_ptr<ResultBase> postprocess(InferenceResult& infResult) override;
+
+protected:
+ void prepareInputsOutputs(std::shared_ptr<ov::Model>& model) override;
+};
diff --git a/python/openvino/runtime/common/models/include/models/detection_model_faceboxes.h b/python/openvino/runtime/common/models/include/models/detection_model_faceboxes.h
new file mode 100644
index 0000000..8ec2b21
--- /dev/null
+++ b/python/openvino/runtime/common/models/include/models/detection_model_faceboxes.h
@@ -0,0 +1,55 @@
+/*
+// Copyright (C) 2020-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+#include <stddef.h>
+
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include <utils/nms.hpp>
+
+#include "models/detection_model.h"
+
+namespace ov {
+class Model;
+} // namespace ov
+struct InferenceResult;
+struct ResultBase;
+
+class ModelFaceBoxes : public DetectionModel {
+public:
+ static const int INIT_VECTOR_SIZE = 200;
+
+ ModelFaceBoxes(const std::string& modelFileName,
+ float confidenceThreshold,
+ bool useAutoResize,
+ float boxIOUThreshold,
+ const std::string& layout = "");
+ std::unique_ptr<ResultBase> postprocess(InferenceResult& infResult) override;
+
+protected:
+ size_t maxProposalsCount;
+ const float boxIOUThreshold;
+ const std::vector<float> variance;
+ const std::vector<int> steps;
+ const std::vector<std::vector<int>> minSizes;
+ std::vector<Anchor> anchors;
+ void prepareInputsOutputs(std::shared_ptr<ov::Model>& model) override;
+ void priorBoxes(const std::vector<std::pair<size_t, size_t>>& featureMaps);
+};
diff --git a/python/openvino/runtime/common/models/include/models/detection_model_retinaface.h b/python/openvino/runtime/common/models/include/models/detection_model_retinaface.h
new file mode 100644
index 0000000..ac2c235
--- /dev/null
+++ b/python/openvino/runtime/common/models/include/models/detection_model_retinaface.h
@@ -0,0 +1,74 @@
+/*
+// Copyright (C) 2020-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include <utils/nms.hpp>
+
+#include "models/detection_model.h"
+
+namespace ov {
+class Model;
+} // namespace ov
+struct InferenceResult;
+struct ResultBase;
+
+class ModelRetinaFace : public DetectionModel {
+public:
+ static const int LANDMARKS_NUM = 5;
+ static const int INIT_VECTOR_SIZE = 200;
+ /// Loads model and performs required initialization
+ /// @param model_name name of model to load
+ /// @param confidenceThreshold - threshold to eliminate low-confidence detections.
+ /// Any detected object with confidence lower than this threshold will be ignored.
+ /// @param useAutoResize - if true, image will be resized by openvino.
+ /// @param boxIOUThreshold - threshold for NMS boxes filtering, varies in [0.0, 1.0] range.
+ /// @param layout - model input layout
+ ModelRetinaFace(const std::string& model_name,
+ float confidenceThreshold,
+ bool useAutoResize,
+ float boxIOUThreshold,
+ const std::string& layout = "");
+ std::unique_ptr<ResultBase> postprocess(InferenceResult& infResult) override;
+
+protected:
+ struct AnchorCfgLine {
+ int stride;
+ std::vector<int> scales;
+ int baseSize;
+ std::vector<int> ratios;
+ };
+
+ bool shouldDetectMasks;
+ bool shouldDetectLandmarks;
+ const float boxIOUThreshold;
+ const float maskThreshold;
+ float landmarkStd;
+
+ enum OutputType { OUT_BOXES, OUT_SCORES, OUT_LANDMARKS, OUT_MASKSCORES, OUT_MAX };
+
+ std::vector<std::string> separateOutputsNames[OUT_MAX];
+ const std::vector<AnchorCfgLine> anchorCfg;
+ std::map<int, std::vector<Anchor>> anchorsFpn;
+ std::vector<std::vector<Anchor>> anchors;
+
+ void generateAnchorsFpn();
+ void prepareInputsOutputs(std::shared_ptr<ov::Model>& model) override;
+};
diff --git a/python/openvino/runtime/common/models/include/models/detection_model_retinaface_pt.h b/python/openvino/runtime/common/models/include/models/detection_model_retinaface_pt.h
new file mode 100644
index 0000000..68cc907
--- /dev/null
+++ b/python/openvino/runtime/common/models/include/models/detection_model_retinaface_pt.h
@@ -0,0 +1,81 @@
+/*
+// Copyright (C) 2021-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+#include <stddef.h>
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include <opencv2/core/types.hpp>
+#include <utils/nms.hpp>
+
+#include "models/detection_model.h"
+
+namespace ov {
+class Model;
+class Tensor;
+} // namespace ov
+struct InferenceResult;
+struct ResultBase;
+
+class ModelRetinaFacePT : public DetectionModel {
+public:
+    struct Box {
+        float cX;
+        float cY;
+        float width;
+        float height;
+    };
+
+    /// Loads model and performs required initialization
+    /// @param model_name name of model to load
+    /// @param confidenceThreshold - threshold to eliminate low-confidence detections.
+    /// Any detected object with confidence lower than this threshold will be ignored.
+    /// @param useAutoResize - if true, image will be resized by openvino.
+    /// @param boxIOUThreshold - threshold for NMS boxes filtering, varies in [0.0, 1.0] range.
+    /// @param layout - model input layout
+    ModelRetinaFacePT(const std::string& modelFileName,
+                      float confidenceThreshold,
+                      bool useAutoResize,
+                      float boxIOUThreshold,
+                      const std::string& layout = "");
+    std::unique_ptr<ResultBase> postprocess(InferenceResult& infResult) override;
+
+protected:
+    size_t landmarksNum;
+    const float boxIOUThreshold;
+    float variance[2] = {0.1f, 0.2f};
+
+    enum OutputType { OUT_BOXES, OUT_SCORES, OUT_LANDMARKS, OUT_MAX };
+
+    std::vector<ModelRetinaFacePT::Box> priors;
+
+    std::vector<size_t> filterByScore(const ov::Tensor& scoresTensor, const float confidenceThreshold);
+    std::vector<float> getFilteredScores(const ov::Tensor& scoresTensor, const std::vector<size_t>& indices);
+    std::vector<cv::Point2f> getFilteredLandmarks(const ov::Tensor& landmarksTensor,
+                                                  const std::vector<size_t>& indices,
+                                                  int imgWidth,
+                                                  int imgHeight);
+    std::vector<ModelRetinaFacePT::Box> generatePriorData();
+    std::vector<Anchor> getFilteredProposals(const ov::Tensor& boxesTensor,
+                                             const std::vector<size_t>& indices,
+                                             int imgWidth,
+                                             int imgHeight);
+
+    void prepareInputsOutputs(std::shared_ptr<ov::Model>& model) override;
+};
diff --git a/python/openvino/runtime/common/models/include/models/detection_model_ssd.h b/python/openvino/runtime/common/models/include/models/detection_model_ssd.h
new file mode 100644
index 0000000..646d7b0
--- /dev/null
+++ b/python/openvino/runtime/common/models/include/models/detection_model_ssd.h
@@ -0,0 +1,63 @@
+/*
+// Copyright (C) 2020-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+#include <stddef.h>
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "models/detection_model.h"
+
+namespace ov {
+class InferRequest;
+class Model;
+} // namespace ov
+struct InferenceResult;
+struct InputData;
+struct InternalModelData;
+struct ResultBase;
+
+class ModelSSD : public DetectionModel {
+public:
+ /// Constructor
+ /// @param modelFileName name of model to load
+ /// @param confidenceThreshold - threshold to eliminate low-confidence detections.
+ /// Any detected object with confidence lower than this threshold will be ignored.
+ /// @param useAutoResize - if true, image will be resized by openvino.
+ /// Otherwise, image will be preprocessed and resized using OpenCV routines.
+ /// @param labels - array of labels for every class. If this array is empty or contains less elements
+ /// than actual classes number, default "Label #N" will be shown for missing items.
+ /// @param layout - model input layout
+ ModelSSD(const std::string& modelFileName,
+ float confidenceThreshold,
+ bool useAutoResize,
+ const std::vector<std::string>& labels = std::vector<std::string>(),
+ const std::string& layout = "");
+
+ std::shared_ptr<InternalModelData> preprocess(const InputData& inputData, ov::InferRequest& request) override;
+ std::unique_ptr<ResultBase> postprocess(InferenceResult& infResult) override;
+
+protected:
+ std::unique_ptr<ResultBase> postprocessSingleOutput(InferenceResult& infResult);
+ std::unique_ptr<ResultBase> postprocessMultipleOutputs(InferenceResult& infResult);
+ void prepareInputsOutputs(std::shared_ptr<ov::Model>& model) override;
+ void prepareSingleOutput(std::shared_ptr<ov::Model>& model);
+ void prepareMultipleOutputs(std::shared_ptr<ov::Model>& model);
+ size_t objectSize = 0;
+ size_t detectionsNumId = 0;
+};
diff --git a/python/openvino/runtime/common/models/include/models/detection_model_yolo.h b/python/openvino/runtime/common/models/include/models/detection_model_yolo.h
new file mode 100644
index 0000000..38b0b64
--- /dev/null
+++ b/python/openvino/runtime/common/models/include/models/detection_model_yolo.h
@@ -0,0 +1,107 @@
+/*
+// Copyright (C) 2020-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+#include <stddef.h>
+#include <stdint.h>
+
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include <openvino/op/region_yolo.hpp>
+#include <openvino/openvino.hpp>
+
+#include "models/detection_model.h"
+
+struct DetectedObject;
+struct InferenceResult;
+struct ResultBase;
+
+class ModelYolo : public DetectionModel {
+protected:
+    class Region {
+    public:
+        int num = 0;
+        size_t classes = 0;
+        int coords = 0;
+        std::vector<float> anchors;
+        size_t outputWidth = 0;
+        size_t outputHeight = 0;
+
+        Region(const std::shared_ptr<ov::op::v0::RegionYolo>& regionYolo);
+        Region(size_t classes,
+               int coords,
+               const std::vector<float>& anchors,
+               const std::vector<int64_t>& masks,
+               size_t outputWidth,
+               size_t outputHeight);
+    };
+
+public:
+    enum YoloVersion { YOLO_V1V2, YOLO_V3, YOLO_V4, YOLO_V4_TINY, YOLOF };
+
+    /// Constructor.
+    /// @param modelFileName name of model to load
+    /// @param confidenceThreshold - threshold to eliminate low-confidence detections.
+    /// Any detected object with confidence lower than this threshold will be ignored.
+    /// @param useAutoResize - if true, image will be resized by openvino.
+    /// Otherwise, image will be preprocessed and resized using OpenCV routines.
+    /// @param useAdvancedPostprocessing - if true, an advanced algorithm for filtering/postprocessing will be used
+    /// (with better processing of multiple crossing objects). Otherwise, classic algorithm will be used.
+    /// @param boxIOUThreshold - threshold to treat separate output regions as one object for filtering
+    /// during postprocessing (only one of them should stay). The default value is 0.5
+    /// @param labels - array of labels for every class. If this array is empty or contains less elements
+    /// than actual classes number, default "Label #N" will be shown for missing items.
+    /// @param anchors - vector of anchors coordinates. Required for YOLOv4, for other versions it may be omitted.
+    /// @param masks - vector of masks values. Required for YOLOv4, for other versions it may be omitted.
+    /// @param layout - model input layout
+    ModelYolo(const std::string& modelFileName,
+              float confidenceThreshold,
+              bool useAutoResize,
+              bool useAdvancedPostprocessing = true,
+              float boxIOUThreshold = 0.5f,
+              const std::vector<std::string>& labels = std::vector<std::string>(),
+              const std::vector<float>& anchors = std::vector<float>(),
+              const std::vector<int64_t>& masks = std::vector<int64_t>(),
+              const std::string& layout = "");
+
+    std::unique_ptr<ResultBase> postprocess(InferenceResult& infResult) override;
+
+protected:
+    void prepareInputsOutputs(std::shared_ptr<ov::Model>& model) override;
+
+    void parseYOLOOutput(const std::string& output_name,
+                         const ov::Tensor& tensor,
+                         const unsigned long resized_im_h,
+                         const unsigned long resized_im_w,
+                         const unsigned long original_im_h,
+                         const unsigned long original_im_w,
+                         std::vector<DetectedObject>& objects);
+
+    static int calculateEntryIndex(int entriesNum, int lcoords, size_t lclasses, int location, int entry);
+    static double intersectionOverUnion(const DetectedObject& o1, const DetectedObject& o2);
+
+    std::map<std::string, Region> regions;
+    double boxIOUThreshold;
+    bool useAdvancedPostprocessing;
+    bool isObjConf = true;
+    YoloVersion yoloVersion;
+    const std::vector<float> presetAnchors;
+    const std::vector<int64_t> presetMasks;
+    ov::Layout yoloRegionLayout = "NCHW";
+};
diff --git a/python/openvino/runtime/common/models/include/models/detection_model_yolov3_onnx.h b/python/openvino/runtime/common/models/include/models/detection_model_yolov3_onnx.h
new file mode 100644
index 0000000..66c4f03
--- /dev/null
+++ b/python/openvino/runtime/common/models/include/models/detection_model_yolov3_onnx.h
@@ -0,0 +1,50 @@
+/*
+// Copyright (C) 2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include <openvino/openvino.hpp>
+
+#include "models/detection_model.h"
+
+class ModelYoloV3ONNX : public DetectionModel {
+public:
+    /// Constructor.
+    /// @param modelFileName name of model to load
+    /// @param confidenceThreshold - threshold to eliminate low-confidence detections.
+    /// Any detected object with confidence lower than this threshold will be ignored.
+    /// @param labels - array of labels for every class. If this array is empty or contains less elements
+    /// than actual classes number, default "Label #N" will be shown for missing items.
+    /// @param layout - model input layout
+    ModelYoloV3ONNX(const std::string& modelFileName,
+                    float confidenceThreshold,
+                    const std::vector<std::string>& labels = std::vector<std::string>(),
+                    const std::string& layout = "");
+
+    std::unique_ptr<ResultBase> postprocess(InferenceResult& infResult) override;
+    std::shared_ptr<InternalModelData> preprocess(const InputData& inputData, ov::InferRequest& request) override;
+
+protected:
+    void prepareInputsOutputs(std::shared_ptr<ov::Model>& model) override;
+
+    std::string boxesOutputName;
+    std::string scoresOutputName;
+    std::string indicesOutputName;
+    static const int numberOfClasses = 80;
+};
diff --git a/python/openvino/runtime/common/models/include/models/detection_model_yolox.h b/python/openvino/runtime/common/models/include/models/detection_model_yolox.h
new file mode 100644
index 0000000..d7e4ea3
--- /dev/null
+++ b/python/openvino/runtime/common/models/include/models/detection_model_yolox.h
@@ -0,0 +1,54 @@
+/*
+// Copyright (C) 2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+#include <memory>
+#include <string>
+#include <vector>
+
+#include <openvino/openvino.hpp>
+
+#include "models/detection_model.h"
+
+class ModelYoloX : public DetectionModel {
+public:
+    /// Constructor.
+    /// @param modelFileName name of model to load
+    /// @param confidenceThreshold - threshold to eliminate low-confidence detections.
+    /// Any detected object with confidence lower than this threshold will be ignored.
+    /// @param boxIOUThreshold - threshold to treat separate output regions as one object for filtering
+    /// during postprocessing (only one of them should stay). The default value is 0.5
+    /// @param labels - array of labels for every class. If this array is empty or contains less elements
+    /// than actual classes number, default "Label #N" will be shown for missing items.
+    /// @param layout - model input layout
+    ModelYoloX(const std::string& modelFileName,
+               float confidenceThreshold,
+               float boxIOUThreshold = 0.5f,
+               const std::vector<std::string>& labels = std::vector<std::string>(),
+               const std::string& layout = "");
+
+    std::unique_ptr<ResultBase> postprocess(InferenceResult& infResult) override;
+    std::shared_ptr<InternalModelData> preprocess(const InputData& inputData, ov::InferRequest& request) override;
+
+protected:
+    void prepareInputsOutputs(std::shared_ptr<ov::Model>& model) override;
+    void setStridesGrids();
+
+    double boxIOUThreshold;
+    std::vector<std::pair<size_t, size_t>> grids;
+    std::vector<size_t> expandedStrides;
+    static const size_t numberOfClasses = 80;
+};
diff --git a/python/openvino/runtime/common/models/include/models/hpe_model_associative_embedding.h b/python/openvino/runtime/common/models/include/models/hpe_model_associative_embedding.h
new file mode 100644
index 0000000..66e217e
--- /dev/null
+++ b/python/openvino/runtime/common/models/include/models/hpe_model_associative_embedding.h
@@ -0,0 +1,89 @@
+/*
+// Copyright (C) 2021-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+#include <memory>
+#include <string>
+#include <vector>
+
+#include <opencv2/core.hpp>
+
+#include <utils/image_utils.h>
+
+#include "models/image_model.h"
+
+namespace ov {
+class InferRequest;
+class Model;
+class Shape;
+} // namespace ov
+struct HumanPose;
+struct InferenceResult;
+struct InputData;
+struct InternalModelData;
+struct ResultBase;
+
+class HpeAssociativeEmbedding : public ImageModel {
+public:
+ /// Constructor
+ /// @param modelFileName name of model to load
+ /// @param aspectRatio - the ratio of input width to its height.
+ /// @param targetSize - the length of a short image side used for model reshaping.
+ /// @param confidenceThreshold - threshold to eliminate low-confidence poses.
+ /// Any pose with confidence lower than this threshold will be ignored.
+ /// @param layout - model input layout
+ HpeAssociativeEmbedding(const std::string& modelFileName,
+ double aspectRatio,
+ int targetSize,
+ float confidenceThreshold,
+ const std::string& layout = "",
+ float delta = 0.0,
+ RESIZE_MODE resizeMode = RESIZE_KEEP_ASPECT);
+
+ std::unique_ptr<ResultBase> postprocess(InferenceResult& infResult) override;
+
+ std::shared_ptr<InternalModelData> preprocess(const InputData& inputData, ov::InferRequest& request) override;
+
+protected:
+ void prepareInputsOutputs(std::shared_ptr<ov::Model>& model) override;
+
+ cv::Size inputLayerSize;
+ double aspectRatio;
+ int targetSize;
+ float confidenceThreshold;
+ float delta;
+
+ std::string embeddingsTensorName;
+ std::string heatmapsTensorName;
+ std::string nmsHeatmapsTensorName;
+
+ static const int numJoints = 17;
+ static const int stride = 32;
+ static const int maxNumPeople = 30;
+ static const cv::Vec3f meanPixel;
+ static const float detectionThreshold;
+ static const float tagThreshold;
+
+ void changeInputSize(std::shared_ptr<ov::Model>& model);
+
+ std::string findTensorByName(const std::string& tensorName, const std::vector<std::string>& outputsNames);
+
+ std::vector<cv::Mat> split(float* data, const ov::Shape& shape);
+
+ std::vector<HumanPose> extractPoses(std::vector<cv::Mat>& heatMaps,
+ const std::vector<cv::Mat>& aembdsMaps,
+ const std::vector<cv::Mat>& nmsHeatMaps) const;
+};
diff --git a/python/openvino/runtime/common/models/include/models/hpe_model_openpose.h b/python/openvino/runtime/common/models/include/models/hpe_model_openpose.h
new file mode 100644
index 0000000..d5e1ce7
--- /dev/null
+++ b/python/openvino/runtime/common/models/include/models/hpe_model_openpose.h
@@ -0,0 +1,78 @@
+/*
+// Copyright (C) 2020-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+#include <stddef.h>
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include <opencv2/core.hpp>
+
+#include "models/image_model.h"
+
+namespace ov {
+class InferRequest;
+class Model;
+} // namespace ov
+struct HumanPose;
+struct InferenceResult;
+struct InputData;
+struct InternalModelData;
+struct ResultBase;
+
+class HPEOpenPose : public ImageModel {
+public:
+ /// Constructor
+ /// @param modelFileName name of model to load
+ /// @param aspectRatio - the ratio of input width to its height.
+ /// @param targetSize - the height used for model reshaping.
+ /// @param confidenceThreshold - threshold to eliminate low-confidence keypoints.
+ /// @param layout - model input layout
+ HPEOpenPose(const std::string& modelFileName,
+ double aspectRatio,
+ int targetSize,
+ float confidenceThreshold,
+ const std::string& layout = "");
+
+ std::unique_ptr<ResultBase> postprocess(InferenceResult& infResult) override;
+
+ std::shared_ptr<InternalModelData> preprocess(const InputData& inputData, ov::InferRequest& request) override;
+
+ static const size_t keypointsNumber = 18;
+
+protected:
+ void prepareInputsOutputs(std::shared_ptr<ov::Model>& model) override;
+
+ static const int minJointsNumber = 3;
+ static const int stride = 8;
+ static const int upsampleRatio = 4;
+ static const cv::Vec3f meanPixel;
+ static const float minPeaksDistance;
+ static const float midPointsScoreThreshold;
+ static const float foundMidPointsRatioThreshold;
+ static const float minSubsetScore;
+ cv::Size inputLayerSize;
+ double aspectRatio;
+ int targetSize;
+ float confidenceThreshold;
+
+ std::vector<HumanPose> extractPoses(const std::vector<cv::Mat>& heatMaps, const std::vector<cv::Mat>& pafs) const;
+ void resizeFeatureMaps(std::vector<cv::Mat>& featureMaps) const;
+
+ void changeInputSize(std::shared_ptr<ov::Model>& model);
+};
diff --git a/python/openvino/runtime/common/models/include/models/image_model.h b/python/openvino/runtime/common/models/include/models/image_model.h
new file mode 100644
index 0000000..b18daa1
--- /dev/null
+++ b/python/openvino/runtime/common/models/include/models/image_model.h
@@ -0,0 +1,49 @@
+/*
+// Copyright (C) 2021-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+#include <stddef.h>
+
+#include <memory>
+#include <string>
+
+#include "models/model_base.h"
+#include "utils/image_utils.h"
+
+namespace ov {
+class InferRequest;
+} // namespace ov
+struct InputData;
+struct InternalModelData;
+
+class ImageModel : public ModelBase {
+public:
+ /// Constructor
+ /// @param modelFileName name of model to load
+ /// @param useAutoResize - if true, image is resized by openvino
+ /// @param layout - model input layout
+ ImageModel(const std::string& modelFileName, bool useAutoResize, const std::string& layout = "");
+
+ std::shared_ptr<InternalModelData> preprocess(const InputData& inputData, ov::InferRequest& request) override;
+
+protected:
+ bool useAutoResize;
+
+ size_t netInputHeight = 0;
+ size_t netInputWidth = 0;
+ cv::InterpolationFlags interpolationMode = cv::INTER_LINEAR;
+ RESIZE_MODE resizeMode = RESIZE_FILL;
+};
diff --git a/python/openvino/runtime/common/models/include/models/input_data.h b/python/openvino/runtime/common/models/include/models/input_data.h
new file mode 100644
index 0000000..bff9fa5
--- /dev/null
+++ b/python/openvino/runtime/common/models/include/models/input_data.h
@@ -0,0 +1,41 @@
+/*
+// Copyright (C) 2020-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+#include <opencv2/opencv.hpp>
+
+struct InputData {
+ virtual ~InputData() {}
+
+ template <class T>
+ T& asRef() {
+ return dynamic_cast<T&>(*this);
+ }
+
+ template <class T>
+ const T& asRef() const {
+ return dynamic_cast<const T&>(*this);
+ }
+};
+
+struct ImageInputData : public InputData {
+ cv::Mat inputImage;
+
+ ImageInputData() {}
+ ImageInputData(const cv::Mat& img) {
+ inputImage = img;
+ }
+};
diff --git a/python/openvino/runtime/common/models/include/models/internal_model_data.h b/python/openvino/runtime/common/models/include/models/internal_model_data.h
new file mode 100644
index 0000000..61d7744
--- /dev/null
+++ b/python/openvino/runtime/common/models/include/models/internal_model_data.h
@@ -0,0 +1,48 @@
+/*
+// Copyright (C) 2020-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+
+struct InternalModelData {
+ virtual ~InternalModelData() {}
+
+ template <class T>
+ T& asRef() {
+ return dynamic_cast<T&>(*this);
+ }
+
+ template <class T>
+ const T& asRef() const {
+ return dynamic_cast<const T&>(*this);
+ }
+};
+
+struct InternalImageModelData : public InternalModelData {
+ InternalImageModelData(int width, int height) : inputImgWidth(width), inputImgHeight(height) {}
+
+ int inputImgWidth;
+ int inputImgHeight;
+};
+
+struct InternalScaleData : public InternalImageModelData {
+ InternalScaleData(int width, int height, float scaleX, float scaleY)
+ : InternalImageModelData(width, height),
+ scaleX(scaleX),
+ scaleY(scaleY) {}
+
+ float scaleX;
+ float scaleY;
+};
diff --git a/python/openvino/runtime/common/models/include/models/jpeg_restoration_model.h b/python/openvino/runtime/common/models/include/models/jpeg_restoration_model.h
new file mode 100644
index 0000000..8b22ac2
--- /dev/null
+++ b/python/openvino/runtime/common/models/include/models/jpeg_restoration_model.h
@@ -0,0 +1,55 @@
+/*
+// Copyright (C) 2021-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+
+#include <memory>
+#include <string>
+
+#include <opencv2/core/types.hpp>
+
+#include "models/image_model.h"
+
+namespace ov {
+class InferRequest;
+class Model;
+} // namespace ov
+struct InferenceResult;
+struct InputData;
+struct InternalModelData;
+struct ResultBase;
+class JPEGRestorationModel : public ImageModel {
+public:
+ /// Constructor
+ /// @param modelFileName name of model to load
+ /// @param inputImgSize size of image to set model input shape
+ /// @param jpegCompression flag allows to perform compression before the inference
+ /// @param layout - model input layout
+ JPEGRestorationModel(const std::string& modelFileName,
+ const cv::Size& inputImgSize,
+ bool jpegCompression,
+ const std::string& layout = "");
+
+ std::shared_ptr<InternalModelData> preprocess(const InputData& inputData, ov::InferRequest& request) override;
+ std::unique_ptr<ResultBase> postprocess(InferenceResult& infResult) override;
+
+protected:
+ void prepareInputsOutputs(std::shared_ptr<ov::Model>& model) override;
+ void changeInputSize(std::shared_ptr<ov::Model>& model);
+
+ static const size_t stride = 8;
+ bool jpegCompression = false;
+};
diff --git a/python/openvino/runtime/common/models/include/models/model_base.h b/python/openvino/runtime/common/models/include/models/model_base.h
new file mode 100644
index 0000000..c6d9cc1
--- /dev/null
+++ b/python/openvino/runtime/common/models/include/models/model_base.h
@@ -0,0 +1,77 @@
+/*
+// Copyright (C) 2020-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include <openvino/openvino.hpp>
+
+#include <utils/args_helper.hpp>
+#include <utils/config_factory.h>
+#include <utils/ocv_common.hpp>
+
+struct InferenceResult;
+struct InputData;
+struct InternalModelData;
+struct ResultBase;
+
+class ModelBase {
+public:
+ ModelBase(const std::string& modelFileName, const std::string& layout = "")
+ : modelFileName(modelFileName),
+ inputsLayouts(parseLayoutString(layout)) {}
+
+ virtual ~ModelBase() {}
+
+ virtual std::shared_ptr<InternalModelData> preprocess(const InputData& inputData, ov::InferRequest& request) = 0;
+ virtual ov::CompiledModel compileModel(const ModelConfig& config, ov::Core& core);
+ virtual void onLoadCompleted(const std::vector<ov::InferRequest>& requests) {}
+ virtual std::unique_ptr<ResultBase> postprocess(InferenceResult& infResult) = 0;
+
+ const std::vector<std::string>& getOutputsNames() const {
+ return outputsNames;
+ }
+ const std::vector<std::string>& getInputsNames() const {
+ return inputsNames;
+ }
+
+ std::string getModelFileName() {
+ return modelFileName;
+ }
+
+ void setInputsPreprocessing(bool reverseInputChannels,
+ const std::string& meanValues,
+ const std::string& scaleValues) {
+ this->inputTransform = InputTransform(reverseInputChannels, meanValues, scaleValues);
+ }
+
+protected:
+ virtual void prepareInputsOutputs(std::shared_ptr<ov::Model>& model) = 0;
+
+ std::shared_ptr<ov::Model> prepareModel(ov::Core& core);
+
+ InputTransform inputTransform = InputTransform();
+ std::vector<std::string> inputsNames;
+ std::vector<std::string> outputsNames;
+ ov::CompiledModel compiledModel;
+ std::string modelFileName;
+ ModelConfig config = {};
+ std::map<std::string, ov::Layout> inputsLayouts;
+ ov::Layout getInputLayout(const ov::Output<ov::Node>& input);
+};
diff --git a/python/openvino/runtime/common/models/include/models/openpose_decoder.h b/python/openvino/runtime/common/models/include/models/openpose_decoder.h
new file mode 100644
index 0000000..d40e56e
--- /dev/null
+++ b/python/openvino/runtime/common/models/include/models/openpose_decoder.h
@@ -0,0 +1,62 @@
+/*
+// Copyright (C) 2020-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+#include <stddef.h>
+
+#include <vector>
+
+#include <opencv2/core.hpp>
+
+struct HumanPose;
+
+struct Peak {
+ Peak(const int id = -1, const cv::Point2f& pos = cv::Point2f(), const float score = 0.0f);
+
+ int id;
+ cv::Point2f pos;
+ float score;
+};
+
+struct HumanPoseByPeaksIndices {
+ explicit HumanPoseByPeaksIndices(const int keypointsNumber);
+
+ std::vector<int> peaksIndices;
+ int nJoints;
+ float score;
+};
+
+struct TwoJointsConnection {
+ TwoJointsConnection(const int firstJointIdx, const int secondJointIdx, const float score);
+
+ int firstJointIdx;
+ int secondJointIdx;
+ float score;
+};
+
+void findPeaks(const std::vector<cv::Mat>& heatMaps,
+ const float minPeaksDistance,
+ std::vector<std::vector<Peak>>& allPeaks,
+ int heatMapId,
+ float confidenceThreshold);
+
+std::vector<HumanPose> groupPeaksToPoses(const std::vector<std::vector<Peak>>& allPeaks,
+ const std::vector<cv::Mat>& pafs,
+ const size_t keypointsNumber,
+ const float midPointsScoreThreshold,
+ const float foundMidPointsRatioThreshold,
+ const int minJointsNumber,
+ const float minSubsetScore);
diff --git a/python/openvino/runtime/common/models/include/models/results.h b/python/openvino/runtime/common/models/include/models/results.h
new file mode 100644
index 0000000..6b3a89d
--- /dev/null
+++ b/python/openvino/runtime/common/models/include/models/results.h
@@ -0,0 +1,122 @@
+/*
+// Copyright (C) 2020-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include <opencv2/core.hpp>
+#include <openvino/openvino.hpp>
+
+#include "internal_model_data.h"
+
+struct MetaData;
+struct ResultBase {
+ ResultBase(int64_t frameId = -1, const std::shared_ptr<MetaData>& metaData = nullptr)
+ : frameId(frameId),
+ metaData(metaData) {}
+ virtual ~ResultBase() {}
+
+ int64_t frameId;
+
+ std::shared_ptr<MetaData> metaData;
+ bool IsEmpty() {
+ return frameId < 0;
+ }
+
+ template <class T>
+ T& asRef() {
+ return dynamic_cast<T&>(*this);
+ }
+
+ template <class T>
+ const T& asRef() const {
+ return dynamic_cast<const T&>(*this);
+ }
+};
+
+struct InferenceResult : public ResultBase {
+ std::shared_ptr<InternalModelData> internalModelData;
+ std::map<std::string, ov::Tensor> outputsData;
+
+ /// Returns the first output tensor
+ /// This function is a useful addition to direct access to outputs list as many models have only one output
+ /// @returns first output tensor
+ ov::Tensor getFirstOutputTensor() {
+ if (outputsData.empty()) {
+ throw std::out_of_range("Outputs map is empty.");
+ }
+ return outputsData.begin()->second;
+ }
+
+ /// Returns true if object contains no valid data
+ /// @returns true if object contains no valid data
+ bool IsEmpty() {
+ return outputsData.empty();
+ }
+};
+
+struct ClassificationResult : public ResultBase {
+ ClassificationResult(int64_t frameId = -1, const std::shared_ptr<MetaData>& metaData = nullptr)
+ : ResultBase(frameId, metaData) {}
+
+ struct Classification {
+ unsigned int id;
+ std::string label;
+ float score;
+
+ Classification(unsigned int id, const std::string& label, float score) : id(id), label(label), score(score) {}
+ };
+
+ std::vector<Classification> topLabels;
+};
+
+struct DetectedObject : public cv::Rect2f {
+ unsigned int labelID;
+ std::string label;
+ float confidence;
+};
+
+struct DetectionResult : public ResultBase {
+ DetectionResult(int64_t frameId = -1, const std::shared_ptr<MetaData>& metaData = nullptr)
+ : ResultBase(frameId, metaData) {}
+ std::vector<DetectedObject> objects;
+};
+
+struct RetinaFaceDetectionResult : public DetectionResult {
+ RetinaFaceDetectionResult(int64_t frameId = -1, const std::shared_ptr<MetaData>& metaData = nullptr)
+ : DetectionResult(frameId, metaData) {}
+ std::vector<cv::Point2f> landmarks;
+};
+
+struct ImageResult : public ResultBase {
+ ImageResult(int64_t frameId = -1, const std::shared_ptr<MetaData>& metaData = nullptr)
+ : ResultBase(frameId, metaData) {}
+ cv::Mat resultImage;
+};
+
+struct HumanPose {
+ std::vector<cv::Point2f> keypoints;
+ float score;
+};
+
+struct HumanPoseResult : public ResultBase {
+ HumanPoseResult(int64_t frameId = -1, const std::shared_ptr<MetaData>& metaData = nullptr)
+ : ResultBase(frameId, metaData) {}
+ std::vector<HumanPose> poses;
+};
diff --git a/python/openvino/runtime/common/models/include/models/segmentation_model.h b/python/openvino/runtime/common/models/include/models/segmentation_model.h
new file mode 100644
index 0000000..9d4d2cb
--- /dev/null
+++ b/python/openvino/runtime/common/models/include/models/segmentation_model.h
@@ -0,0 +1,50 @@
+/*
+// Copyright (C) 2020-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "models/image_model.h"
+
+namespace ov {
+class Model;
+} // namespace ov
+struct InferenceResult;
+struct ResultBase;
+
+#pragma once
+class SegmentationModel : public ImageModel {
+public:
+ /// Constructor
+ /// @param modelFileName name of model to load
+ /// @param useAutoResize - if true, image will be resized by openvino.
+ /// Otherwise, image will be preprocessed and resized using OpenCV routines.
+ /// @param layout - model input layout
+ SegmentationModel(const std::string& modelFileName, bool useAutoResize, const std::string& layout = "");
+
+ static std::vector<std::string> loadLabels(const std::string& labelFilename);
+
+ std::unique_ptr<ResultBase> postprocess(InferenceResult& infResult) override;
+
+protected:
+ void prepareInputsOutputs(std::shared_ptr<ov::Model>& model) override;
+
+ int outHeight = 0;
+ int outWidth = 0;
+ int outChannels = 0;
+};
diff --git a/python/openvino/runtime/common/models/include/models/style_transfer_model.h b/python/openvino/runtime/common/models/include/models/style_transfer_model.h
new file mode 100644
index 0000000..9bcc541
--- /dev/null
+++ b/python/openvino/runtime/common/models/include/models/style_transfer_model.h
@@ -0,0 +1,43 @@
+/*
+// Copyright (C) 2021-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+#include <memory>
+#include <string>
+
+#include "models/image_model.h"
+
+namespace ov {
+class InferRequest;
+class Model;
+} // namespace ov
+struct InferenceResult;
+struct InputData;
+struct InternalModelData;
+struct ResultBase;
+
+class StyleTransferModel : public ImageModel {
+public:
+ /// Constructor
+ /// @param modelFileName name of model to load
+ /// @param layout - model input layout
+ StyleTransferModel(const std::string& modelFileName, const std::string& layout = "");
+
+ std::unique_ptr<ResultBase> postprocess(InferenceResult& infResult) override;
+
+protected:
+ void prepareInputsOutputs(std::shared_ptr<ov::Model>& model) override;
+};
diff --git a/python/openvino/runtime/common/models/include/models/super_resolution_model.h b/python/openvino/runtime/common/models/include/models/super_resolution_model.h
new file mode 100644
index 0000000..773b5c3
--- /dev/null
+++ b/python/openvino/runtime/common/models/include/models/super_resolution_model.h
@@ -0,0 +1,49 @@
+/*
+// Copyright (C) 2021-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+#include <memory>
+#include <string>
+
+#include <opencv2/core/types.hpp>
+
+#include "models/image_model.h"
+
+namespace ov {
+class InferRequest;
+class Model;
+} // namespace ov
+struct InferenceResult;
+struct InputData;
+struct InternalModelData;
+struct ResultBase;
+
+class SuperResolutionModel : public ImageModel {
+public:
+ /// Constructor
+ /// @param modelFileName name of model to load
+ /// @param layout - model input layout
+ SuperResolutionModel(const std::string& modelFileName,
+ const cv::Size& inputImgSize,
+ const std::string& layout = "");
+
+ std::shared_ptr<InternalModelData> preprocess(const InputData& inputData, ov::InferRequest& request) override;
+ std::unique_ptr<ResultBase> postprocess(InferenceResult& infResult) override;
+
+protected:
+ void changeInputSize(std::shared_ptr<ov::Model>& model, int coeff);
+ void prepareInputsOutputs(std::shared_ptr<ov::Model>& model) override;
+};
diff --git a/python/openvino/runtime/common/models/src/associative_embedding_decoder.cpp b/python/openvino/runtime/common/models/src/associative_embedding_decoder.cpp
new file mode 100644
index 0000000..b1e8285
--- /dev/null
+++ b/python/openvino/runtime/common/models/src/associative_embedding_decoder.cpp
@@ -0,0 +1,201 @@
+/*
+// Copyright (C) 2021-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "models/associative_embedding_decoder.h"
+
+#include <algorithm>
+#include <iterator>
+#include <limits>
+#include <numeric>
+#include <vector>
+
+#include <utils/kuhn_munkres.hpp>
+
+void findPeaks(const std::vector<cv::Mat>& nmsHeatMaps,
+ const std::vector<cv::Mat>& aembdsMaps,
+ std::vector<std::vector<Peak>>& allPeaks,
+ size_t jointId,
+ size_t maxNumPeople,
+ float detectionThreshold) {
+ const cv::Mat& nmsHeatMap = nmsHeatMaps[jointId];
+ const float* heatMapData = nmsHeatMap.ptr<float>();
+ cv::Size outputSize = nmsHeatMap.size();
+
+ std::vector<int> indices(outputSize.area());
+ std::iota(std::begin(indices), std::end(indices), 0);
+ std::partial_sort(std::begin(indices),
+ std::begin(indices) + maxNumPeople,
+ std::end(indices),
+ [heatMapData](int l, int r) {
+ return heatMapData[l] > heatMapData[r];
+ });
+
+ for (size_t personId = 0; personId < maxNumPeople; personId++) {
+ int index = indices[personId];
+ int x = index / outputSize.width;
+ int y = index % outputSize.width;
+ float tag = aembdsMaps[jointId].at<float>(x, y);
+ float score = heatMapData[index];
+ allPeaks[jointId].reserve(maxNumPeople);
+ if (score > detectionThreshold) {
+ allPeaks[jointId].emplace_back(Peak{cv::Point2f(static_cast<float>(x), static_cast<float>(y)), score, tag});
+ }
+ }
+}
+
+std::vector<Pose> matchByTag(std::vector<std::vector<Peak>>& allPeaks,
+ size_t maxNumPeople,
+ size_t numJoints,
+ float tagThreshold) {
+ size_t jointOrder[]{0, 1, 2, 3, 4, 5, 6, 11, 12, 7, 8, 9, 10, 13, 14, 15, 16};
+ std::vector<Pose> allPoses;
+ for (size_t jointId : jointOrder) {
+ std::vector<Peak>& jointPeaks = allPeaks[jointId];
+ std::vector<float> tags;
+ for (auto& peak : jointPeaks) {
+ tags.push_back(peak.tag);
+ }
+ if (allPoses.empty()) {
+ for (size_t personId = 0; personId < jointPeaks.size(); personId++) {
+ Peak peak = jointPeaks[personId];
+ Pose pose = Pose(numJoints);
+ pose.add(jointId, peak);
+ allPoses.push_back(pose);
+ }
+ continue;
+ }
+ if (jointPeaks.empty() || (allPoses.size() == maxNumPeople)) {
+ continue;
+ }
+ std::vector<float> posesTags;
+ std::vector<cv::Point2f> posesCenters;
+ for (auto& pose : allPoses) {
+ posesTags.push_back(pose.getPoseTag());
+ posesCenters.push_back(pose.getPoseCenter());
+ }
+ size_t numAdded = tags.size();
+ size_t numGrouped = posesTags.size();
+ cv::Mat tagsDiff(numAdded, numGrouped, CV_32F);
+ cv::Mat matchingCost(numAdded, numGrouped, CV_32F);
+ std::vector<float> dists(numAdded);
+ for (size_t j = 0; j < numGrouped; j++) {
+ float minDist = std::numeric_limits<float>::max();
+ // Compute euclidean distance (in spatial space) between the pose center and all joints.
+ const cv::Point2f center = posesCenters.at(j);
+ for (size_t i = 0; i < numAdded; i++) {
+ cv::Point2f v = jointPeaks.at(i).keypoint - center;
+ float dist = std::sqrt(v.x * v.x + v.y * v.y);
+ dists[i] = dist;
+ minDist = std::min(dist, minDist);
+ }
+ // Compute semantic distance (in embedding space) between the pose tag and all joints
+ // and corresponding matching costs.
+ auto poseTag = posesTags[j];
+ for (size_t i = 0; i < numAdded; i++) {
+ float diff = static_cast<float>(cv::norm(tags[i] - poseTag));
+ tagsDiff.at<float>(i, j) = diff;
+ if (diff < tagThreshold) {
+ diff *= dists[i] / (minDist + 1e-10f);
+ }
+ matchingCost.at<float>(i, j) = std::round(diff) * 100 - jointPeaks[i].score;
+ }
+ }
+
+ if (numAdded > numGrouped) {
+ cv::copyMakeBorder(matchingCost,
+ matchingCost,
+ 0,
+ 0,
+ 0,
+ numAdded - numGrouped,
+ cv::BORDER_CONSTANT,
+ 10000000);
+ }
+ // Get pairs
+ auto res = KuhnMunkres().Solve(matchingCost);
+ for (size_t row = 0; row < res.size(); row++) {
+ size_t col = res[row];
+ if (row < numAdded && col < numGrouped && tagsDiff.at<float>(row, col) < tagThreshold) {
+ allPoses[col].add(jointId, jointPeaks[row]);
+ } else {
+ Pose pose = Pose(numJoints);
+ pose.add(jointId, jointPeaks[row]);
+ allPoses.push_back(pose);
+ }
+ }
+ }
+ return allPoses;
+}
+
+namespace {
+cv::Point2f adjustLocation(const int x, const int y, const cv::Mat& heatMap) {
+ cv::Point2f delta(0.f, 0.f);
+ int width = heatMap.cols;
+ int height = heatMap.rows;
+ if ((1 < x) && (x < width - 1) && (1 < y) && (y < height - 1)) {
+ auto diffX = heatMap.at<float>(y, x + 1) - heatMap.at<float>(y, x - 1);
+ auto diffY = heatMap.at<float>(y + 1, x) - heatMap.at<float>(y - 1, x);
+ delta.x = diffX > 0 ? 0.25f : -0.25f;
+ delta.y = diffY > 0 ? 0.25f : -0.25f;
+ }
+ return delta;
+}
+} // namespace
+
+void adjustAndRefine(std::vector<Pose>& allPoses,
+ const std::vector<cv::Mat>& heatMaps,
+ const std::vector<cv::Mat>& aembdsMaps,
+ int poseId,
+ const float delta) {
+ Pose& pose = allPoses[poseId];
+ float poseTag = pose.getPoseTag();
+ for (size_t jointId = 0; jointId < pose.size(); jointId++) {
+ Peak& peak = pose.getPeak(jointId);
+ const cv::Mat& heatMap = heatMaps[jointId];
+ const cv::Mat& aembds = aembdsMaps[jointId];
+
+ if (peak.score > 0) {
+ // Adjust
+ int x = static_cast<int>(peak.keypoint.x);
+ int y = static_cast<int>(peak.keypoint.y);
+ peak.keypoint += adjustLocation(x, y, heatMap);
+ if (delta) {
+ peak.keypoint.x += delta;
+ peak.keypoint.y += delta;
+ }
+ } else {
+ // Refine
+ // Get position with the closest tag value to the pose tag
+ cv::Mat diff = cv::abs(aembds - poseTag);
+ diff.convertTo(diff, CV_32S, 1.0, 0.0);
+ diff.convertTo(diff, CV_32F);
+ diff -= heatMap;
+ double min;
+ cv::Point2i minLoc;
+ cv::minMaxLoc(diff, &min, 0, &minLoc);
+ int x = minLoc.x;
+ int y = minLoc.y;
+ float val = heatMap.at<float>(y, x);
+ if (val > 0) {
+ peak.keypoint.x = static_cast<float>(x);
+ peak.keypoint.y = static_cast<float>(y);
+ peak.keypoint += adjustLocation(x, y, heatMap);
+ // Peak score is assigned directly, so it does not affect the pose score.
+ peak.score = val;
+ }
+ }
+ }
+}
diff --git a/python/openvino/runtime/common/models/src/classification_model.cpp b/python/openvino/runtime/common/models/src/classification_model.cpp
new file mode 100644
index 0000000..90bc0d5
--- /dev/null
+++ b/python/openvino/runtime/common/models/src/classification_model.cpp
@@ -0,0 +1,196 @@
+/*
+// Copyright (C) 2020-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "models/classification_model.h"
+
+#include <algorithm>
+#include <fstream>
+#include <iterator>
+#include <map>
+#include <stdexcept>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include <openvino/op/softmax.hpp>
+#include <openvino/op/topk.hpp>
+#include <openvino/openvino.hpp>
+
+#include <utils/slog.hpp>
+
+#include "models/results.h"
+
+ClassificationModel::ClassificationModel(const std::string& modelFileName,
+ size_t nTop,
+ bool useAutoResize,
+ const std::vector<std::string>& labels,
+ const std::string& layout)
+ : ImageModel(modelFileName, useAutoResize, layout),
+ nTop(nTop),
+ labels(labels) {}
+
+std::unique_ptr<ResultBase> ClassificationModel::postprocess(InferenceResult& infResult) {
+ const ov::Tensor& indicesTensor = infResult.outputsData.find(outputsNames[0])->second;
+ const int* indicesPtr = indicesTensor.data<int>();
+ const ov::Tensor& scoresTensor = infResult.outputsData.find(outputsNames[1])->second;
+ const float* scoresPtr = scoresTensor.data<float>();
+
+ ClassificationResult* result = new ClassificationResult(infResult.frameId, infResult.metaData);
+ auto retVal = std::unique_ptr<ResultBase>(result);
+
+ result->topLabels.reserve(scoresTensor.get_size());
+ for (size_t i = 0; i < scoresTensor.get_size(); ++i) {
+ int ind = indicesPtr[i];
+ if (ind < 0 || ind >= static_cast<int>(labels.size())) {
+ throw std::runtime_error("Invalid index for the class label is found during postprocessing");
+ }
+ result->topLabels.emplace_back(ind, labels[ind], scoresPtr[i]);
+ }
+
+ return retVal;
+}
+
+std::vector<std::string> ClassificationModel::loadLabels(const std::string& labelFilename) {
+ std::vector<std::string> labels;
+
+ /* Read labels */
+ std::ifstream inputFile(labelFilename);
+ if (!inputFile.is_open())
+ throw std::runtime_error("Can't open the labels file: " + labelFilename);
+ std::string labelsLine;
+ while (std::getline(inputFile, labelsLine)) {
+ size_t labelBeginIdx = labelsLine.find(' ');
+ size_t labelEndIdx = labelsLine.find(','); // can be npos when class has only one label
+ if (labelBeginIdx == std::string::npos) {
+ throw std::runtime_error("The labels file has incorrect format.");
+ }
+ labels.push_back(labelsLine.substr(labelBeginIdx + 1, labelEndIdx - (labelBeginIdx + 1)));
+ }
+ if (labels.empty())
+ throw std::logic_error("File is empty: " + labelFilename);
+
+ return labels;
+}
+
+void ClassificationModel::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) {
+ // --------------------------- Configure input & output -------------------------------------------------
+ // --------------------------- Prepare input ------------------------------------------------------
+ if (model->inputs().size() != 1) {
+ throw std::logic_error("Classification model wrapper supports topologies with only 1 input");
+ }
+ const auto& input = model->input();
+ inputsNames.push_back(input.get_any_name());
+
+ const ov::Shape& inputShape = input.get_shape();
+ const ov::Layout& inputLayout = getInputLayout(input);
+
+ if (inputShape.size() != 4 || inputShape[ov::layout::channels_idx(inputLayout)] != 3) {
+ throw std::logic_error("3-channel 4-dimensional model's input is expected");
+ }
+
+ const auto width = inputShape[ov::layout::width_idx(inputLayout)];
+ const auto height = inputShape[ov::layout::height_idx(inputLayout)];
+ if (height != width) {
+ throw std::logic_error("Model input has incorrect image shape. Must be NxN square."
+ " Got " +
+ std::to_string(height) + "x" + std::to_string(width) + ".");
+ }
+
+ ov::preprocess::PrePostProcessor ppp(model);
+ ppp.input().tensor().set_element_type(ov::element::u8).set_layout({"NHWC"});
+
+ if (useAutoResize) {
+ ppp.input().tensor().set_spatial_dynamic_shape();
+
+ ppp.input()
+ .preprocess()
+ .convert_element_type(ov::element::f32)
+ .resize(ov::preprocess::ResizeAlgorithm::RESIZE_LINEAR);
+ }
+
+ ppp.input().model().set_layout(inputLayout);
+
+ // --------------------------- Prepare output -----------------------------------------------------
+ if (model->outputs().size() != 1) {
+ throw std::logic_error("Classification model wrapper supports topologies with only 1 output");
+ }
+
+ const ov::Shape& outputShape = model->output().get_shape();
+ if (outputShape.size() != 2 && outputShape.size() != 4) {
+ throw std::logic_error("Classification model wrapper supports topologies only with"
+ " 2-dimensional or 4-dimensional output");
+ }
+
+ const ov::Layout outputLayout("NCHW");
+ if (outputShape.size() == 4 && (outputShape[ov::layout::height_idx(outputLayout)] != 1 ||
+ outputShape[ov::layout::width_idx(outputLayout)] != 1)) {
+ throw std::logic_error("Classification model wrapper supports topologies only"
+ " with 4-dimensional output which has last two dimensions of size 1");
+ }
+
+ size_t classesNum = outputShape[ov::layout::channels_idx(outputLayout)];
+ if (nTop > classesNum) {
+ throw std::logic_error("The model provides " + std::to_string(classesNum) + " classes, but " +
+ std::to_string(nTop) + " labels are requested to be predicted");
+ }
+ if (classesNum == labels.size() + 1) {
+ labels.insert(labels.begin(), "other");
+ slog::warn << "Inserted 'other' label as first." << slog::endl;
+ } else if (classesNum != labels.size()) {
+ throw std::logic_error("Model's number of classes and parsed labels must match (" +
+ std::to_string(outputShape[1]) + " and " + std::to_string(labels.size()) + ')');
+ }
+
+ ppp.output().tensor().set_element_type(ov::element::f32);
+ model = ppp.build();
+
+ // --------------------------- Adding softmax and topK output ---------------------------
+ auto nodes = model->get_ops();
+ auto softmaxNodeIt = std::find_if(std::begin(nodes), std::end(nodes), [](const std::shared_ptr<ov::Node>& op) {
+ return std::string(op->get_type_name()) == "Softmax";
+ });
+
+ std::shared_ptr<ov::Node> softmaxNode;
+ if (softmaxNodeIt == nodes.end()) {
+ auto logitsNode = model->get_output_op(0)->input(0).get_source_output().get_node();
+ softmaxNode = std::make_shared<ov::op::v1::Softmax>(logitsNode->output(0), 1);
+ } else {
+ softmaxNode = *softmaxNodeIt;
+ }
+ const auto k = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{}, std::vector<size_t>{nTop});
+ std::shared_ptr<ov::Node> topkNode = std::make_shared<ov::op::v3::TopK>(softmaxNode,
+ k,
+ 1,
+ ov::op::v3::TopK::Mode::MAX,
+ ov::op::v3::TopK::SortType::SORT_VALUES);
+
+ auto indices = std::make_shared<ov::op::v0::Result>(topkNode->output(0));
+ auto scores = std::make_shared<ov::op::v0::Result>(topkNode->output(1));
+ ov::ResultVector res({scores, indices});
+ model = std::make_shared<ov::Model>(res, model->get_parameters(), "classification");
+
+ // manually set output tensors name for created topK node
+ model->outputs()[0].set_names({"indices"});
+ outputsNames.push_back("indices");
+ model->outputs()[1].set_names({"scores"});
+ outputsNames.push_back("scores");
+
+ // set output precisions
+ ppp = ov::preprocess::PrePostProcessor(model);
+ ppp.output("indices").tensor().set_element_type(ov::element::i32);
+ ppp.output("scores").tensor().set_element_type(ov::element::f32);
+ model = ppp.build();
+}
diff --git a/python/openvino/runtime/common/models/src/deblurring_model.cpp b/python/openvino/runtime/common/models/src/deblurring_model.cpp
new file mode 100644
index 0000000..261efb3
--- /dev/null
+++ b/python/openvino/runtime/common/models/src/deblurring_model.cpp
@@ -0,0 +1,158 @@
+/*
+// Copyright (C) 2021-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "models/deblurring_model.h"
+
+#include <algorithm>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include <opencv2/imgproc.hpp>
+#include <openvino/openvino.hpp>
+
+#include <utils/ocv_common.hpp>
+#include <utils/slog.hpp>
+
+#include "models/input_data.h"
+#include "models/internal_model_data.h"
+#include "models/results.h"
+
+DeblurringModel::DeblurringModel(const std::string& modelFileName,
+ const cv::Size& inputImgSize,
+ const std::string& layout)
+ : ImageModel(modelFileName, false, layout) {
+ netInputHeight = inputImgSize.height;
+ netInputWidth = inputImgSize.width;
+}
+
+void DeblurringModel::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) {
+ // --------------------------- Configure input & output -------------------------------------------------
+ // --------------------------- Prepare input ------------------------------------------------------
+ if (model->inputs().size() != 1) {
+ throw std::logic_error("Deblurring model wrapper supports topologies with only 1 input");
+ }
+
+ inputsNames.push_back(model->input().get_any_name());
+
+ const ov::Shape& inputShape = model->input().get_shape();
+ const ov::Layout& inputLayout = getInputLayout(model->input());
+
+ if (inputShape.size() != 4 || inputShape[ov::layout::batch_idx(inputLayout)] != 1 ||
+ inputShape[ov::layout::channels_idx(inputLayout)] != 3) {
+ throw std::logic_error("3-channel 4-dimensional model's input is expected");
+ }
+
+ ov::preprocess::PrePostProcessor ppp(model);
+ ppp.input().tensor().set_element_type(ov::element::u8).set_layout("NHWC");
+
+ ppp.input().model().set_layout(inputLayout);
+
+ // --------------------------- Prepare output -----------------------------------------------------
+ if (model->outputs().size() != 1) {
+ throw std::logic_error("Deblurring model wrapper supports topologies with only 1 output");
+ }
+
+ outputsNames.push_back(model->output().get_any_name());
+
+ const ov::Shape& outputShape = model->output().get_shape();
+ const ov::Layout outputLayout("NCHW");
+ if (outputShape.size() != 4 || outputShape[ov::layout::batch_idx(outputLayout)] != 1 ||
+ outputShape[ov::layout::channels_idx(outputLayout)] != 3) {
+ throw std::logic_error("3-channel 4-dimensional model's output is expected");
+ }
+
+ ppp.output().tensor().set_element_type(ov::element::f32);
+ model = ppp.build();
+
+ changeInputSize(model);
+}
+
+void DeblurringModel::changeInputSize(std::shared_ptr<ov::Model>& model) {
+ const ov::Layout& layout = ov::layout::get_layout(model->input());
+ ov::Shape inputShape = model->input().get_shape();
+
+ const auto batchId = ov::layout::batch_idx(layout);
+ const auto heightId = ov::layout::height_idx(layout);
+ const auto widthId = ov::layout::width_idx(layout);
+
+ if (inputShape[heightId] % stride || inputShape[widthId] % stride) {
+ throw std::logic_error("Model input shape HxW = " + std::to_string(inputShape[heightId]) + "x" +
+ std::to_string(inputShape[widthId]) + "must be divisible by stride " +
+ std::to_string(stride));
+ }
+
+ netInputHeight = static_cast<int>((netInputHeight + stride - 1) / stride) * stride;
+ netInputWidth = static_cast<int>((netInputWidth + stride - 1) / stride) * stride;
+
+ inputShape[batchId] = 1;
+ inputShape[heightId] = netInputHeight;
+ inputShape[widthId] = netInputWidth;
+
+ model->reshape(inputShape);
+}
+
+std::shared_ptr<InternalModelData> DeblurringModel::preprocess(const InputData& inputData, ov::InferRequest& request) {
+ auto& image = inputData.asRef<ImageInputData>().inputImage;
+ size_t h = image.rows;
+ size_t w = image.cols;
+ cv::Mat resizedImage;
+
+ if (netInputHeight - stride < h && h <= netInputHeight && netInputWidth - stride < w && w <= netInputWidth) {
+ int bottom = netInputHeight - h;
+ int right = netInputWidth - w;
+ cv::copyMakeBorder(image, resizedImage, 0, bottom, 0, right, cv::BORDER_CONSTANT, 0);
+ } else {
+ slog::warn << "\tChosen model aspect ratio doesn't match image aspect ratio" << slog::endl;
+ cv::resize(image, resizedImage, cv::Size(netInputWidth, netInputHeight));
+ }
+ request.set_input_tensor(wrapMat2Tensor(resizedImage));
+
+ return std::make_shared<InternalImageModelData>(image.cols, image.rows);
+}
+
+std::unique_ptr<ResultBase> DeblurringModel::postprocess(InferenceResult& infResult) {
+ ImageResult* result = new ImageResult;
+ *static_cast<ResultBase*>(result) = static_cast<ResultBase&>(infResult);
+
+ const auto& inputImgSize = infResult.internalModelData->asRef<InternalImageModelData>();
+ const auto outputData = infResult.getFirstOutputTensor().data<float>();
+
+ std::vector<cv::Mat> imgPlanes;
+ const ov::Shape& outputShape = infResult.getFirstOutputTensor().get_shape();
+ const ov::Layout outputLayout("NCHW");
+ size_t outHeight = static_cast<int>((outputShape[ov::layout::height_idx(outputLayout)]));
+ size_t outWidth = static_cast<int>((outputShape[ov::layout::width_idx(outputLayout)]));
+ size_t numOfPixels = outWidth * outHeight;
+ imgPlanes = std::vector<cv::Mat>{cv::Mat(outHeight, outWidth, CV_32FC1, &(outputData[0])),
+ cv::Mat(outHeight, outWidth, CV_32FC1, &(outputData[numOfPixels])),
+ cv::Mat(outHeight, outWidth, CV_32FC1, &(outputData[numOfPixels * 2]))};
+ cv::Mat resultImg;
+ cv::merge(imgPlanes, resultImg);
+
+ if (netInputHeight - stride < static_cast<size_t>(inputImgSize.inputImgHeight) &&
+ static_cast<size_t>(inputImgSize.inputImgHeight) <= netInputHeight &&
+ netInputWidth - stride < static_cast<size_t>(inputImgSize.inputImgWidth) &&
+ static_cast<size_t>(inputImgSize.inputImgWidth) <= netInputWidth) {
+ result->resultImage = resultImg(cv::Rect(0, 0, inputImgSize.inputImgWidth, inputImgSize.inputImgHeight));
+ } else {
+ cv::resize(resultImg, result->resultImage, cv::Size(inputImgSize.inputImgWidth, inputImgSize.inputImgHeight));
+ }
+
+ result->resultImage.convertTo(result->resultImage, CV_8UC3, 255);
+
+ return std::unique_ptr<ResultBase>(result);
+}
diff --git a/python/openvino/runtime/common/models/src/detection_model.cpp b/python/openvino/runtime/common/models/src/detection_model.cpp
new file mode 100644
index 0000000..83e2d22
--- /dev/null
+++ b/python/openvino/runtime/common/models/src/detection_model.cpp
@@ -0,0 +1,52 @@
+/*
+// Copyright (C) 2020-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "models/detection_model.h"
+
+#include <fstream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include "models/image_model.h"
+
+// Base class for detection wrappers: stores the confidence threshold used by
+// subclasses' postprocess() and the label list for class-id -> name mapping.
+DetectionModel::DetectionModel(const std::string& modelFileName,
+ float confidenceThreshold,
+ bool useAutoResize,
+ const std::vector<std::string>& labels,
+ const std::string& layout)
+ : ImageModel(modelFileName, useAutoResize, layout),
+ confidenceThreshold(confidenceThreshold),
+ labels(labels) {}
+
+// Reads one label per line from `labelFilename`. An empty filename yields an
+// empty list; an unreadable or empty file is treated as an error.
+// Throws std::runtime_error if the file can't be opened and std::logic_error
+// if it opens but contains no lines.
+std::vector<std::string> DetectionModel::loadLabels(const std::string& labelFilename) {
+ std::vector<std::string> labelsList;
+
+ /* Read labels (if any) */
+ if (!labelFilename.empty()) {
+ std::ifstream inputFile(labelFilename);
+ if (!inputFile.is_open())
+ throw std::runtime_error("Can't open the labels file: " + labelFilename);
+ std::string label;
+ while (std::getline(inputFile, label)) {
+ labelsList.push_back(label);
+ }
+ if (labelsList.empty())
+ throw std::logic_error("File is empty: " + labelFilename);
+ }
+
+ return labelsList;
+}
diff --git a/python/openvino/runtime/common/models/src/detection_model_centernet.cpp b/python/openvino/runtime/common/models/src/detection_model_centernet.cpp
new file mode 100644
index 0000000..eac42a7
--- /dev/null
+++ b/python/openvino/runtime/common/models/src/detection_model_centernet.cpp
@@ -0,0 +1,302 @@
+/*
+// Copyright (C) 2020-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "models/detection_model_centernet.h"
+
+#include <stddef.h>
+
+#include <algorithm>
+#include <cmath>
+#include <map>
+#include <stdexcept>
+#include <utility>
+
+#include <opencv2/core.hpp>
+#include <opencv2/imgproc.hpp>
+#include <openvino/openvino.hpp>
+
+#include <utils/common.hpp>
+#include <utils/image_utils.h>
+#include <utils/ocv_common.hpp>
+
+#include "models/input_data.h"
+#include "models/internal_model_data.h"
+#include "models/results.h"
+
+// CenterNet wrapper; auto-resize is disabled because preprocess() does its own
+// letterbox resize to preserve aspect ratio.
+ModelCenterNet::ModelCenterNet(const std::string& modelFileName,
+ float confidenceThreshold,
+ const std::vector<std::string>& labels,
+ const std::string& layout)
+ : DetectionModel(modelFileName, confidenceThreshold, false, labels, layout) {}
+
+// Validates the model topology (1 input, 3 channels, 3 outputs), installs the
+// pre/post-processing steps, and records input/output tensor names and sizes.
+void ModelCenterNet::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) {
+ // --------------------------- Configure input & output -------------------------------------------------
+ // --------------------------- Prepare input ------------------------------------------------------
+ if (model->inputs().size() != 1) {
+ throw std::logic_error("CenterNet model wrapper expects models that have only 1 input");
+ }
+
+ const ov::Shape& inputShape = model->input().get_shape();
+ const ov::Layout& inputLayout = getInputLayout(model->input());
+
+ if (inputShape[ov::layout::channels_idx(inputLayout)] != 3) {
+ throw std::logic_error("Expected 3-channel input");
+ }
+
+ ov::preprocess::PrePostProcessor ppp(model);
+ inputTransform.setPrecision(ppp, model->input().get_any_name());
+ // Input frames come as OpenCV HWC images; the model layout may differ.
+ ppp.input().tensor().set_layout("NHWC");
+
+ ppp.input().model().set_layout(inputLayout);
+
+ // --------------------------- Reading image input parameters -------------------------------------------
+ inputsNames.push_back(model->input().get_any_name());
+ netInputWidth = inputShape[ov::layout::width_idx(inputLayout)];
+ netInputHeight = inputShape[ov::layout::height_idx(inputLayout)];
+
+ // --------------------------- Prepare output -----------------------------------------------------
+ if (model->outputs().size() != 3) {
+ throw std::logic_error("CenterNet model wrapper expects models that have 3 outputs");
+ }
+
+ const ov::Layout outLayout{"NCHW"};
+ for (const auto& output : model->outputs()) {
+ auto outTensorName = output.get_any_name();
+ outputsNames.push_back(outTensorName);
+ ppp.output(outTensorName).tensor().set_element_type(ov::element::f32).set_layout(outLayout);
+ }
+ // Sorted names give postprocess() a deterministic order:
+ // [0] heatmap, [1] regression, [2] width/height (by name — verify per model).
+ std::sort(outputsNames.begin(), outputsNames.end());
+ model = ppp.build();
+}
+
+// Rotates `srcPoint` by `rotRadius` radians around the origin
+// (standard 2D rotation matrix applied to the point).
+cv::Point2f getDir(const cv::Point2f& srcPoint, float rotRadius) {
+ float sn = sinf(rotRadius);
+ float cs = cosf(rotRadius);
+
+ cv::Point2f srcResult(0.0f, 0.0f);
+ srcResult.x = srcPoint.x * cs - srcPoint.y * sn;
+ srcResult.y = srcPoint.x * sn + srcPoint.y * cs;
+
+ return srcResult;
+}
+
+// Returns the third corner of a right triangle: b plus the vector (a - b)
+// rotated 90 degrees. Used to pin down an affine transform from 2 points.
+cv::Point2f get3rdPoint(const cv::Point2f& a, const cv::Point2f& b) {
+ cv::Point2f direct = a - b;
+ return b + cv::Point2f(-direct.y, direct.x);
+}
+
+// Builds the 2x3 affine transform mapping a square crop of side `srcW`
+// centered at (centerX, centerY), rotated by `rot` degrees, onto an
+// outputWidth x outputHeight rectangle. With inv=true the inverse mapping
+// (output -> source) is returned instead.
+cv::Mat getAffineTransform(float centerX,
+ float centerY,
+ int srcW,
+ float rot,
+ size_t outputWidth,
+ size_t outputHeight,
+ bool inv = false) {
+ float rotRad = static_cast<float>(CV_PI) * rot / 180.0f;
+ auto srcDir = getDir({0.0f, -0.5f * srcW}, rotRad);
+ cv::Point2f dstDir(0.0f, -0.5f * outputWidth);
+ // Three point pairs fully determine an affine transform.
+ std::vector<cv::Point2f> src(3, {0.0f, 0.0f});
+ std::vector<cv::Point2f> dst(3, {0.0f, 0.0f});
+
+ src[0] = {centerX, centerY};
+ src[1] = srcDir + src[0];
+ src[2] = get3rdPoint(src[0], src[1]);
+
+ dst[0] = {outputWidth * 0.5f, outputHeight * 0.5f};
+ dst[1] = dst[0] + dstDir;
+ dst[2] = get3rdPoint(dst[0], dst[1]);
+
+ cv::Mat trans;
+ if (inv) {
+ trans = cv::getAffineTransform(dst, src);
+ } else {
+ trans = cv::getAffineTransform(src, dst);
+ }
+
+ return trans;
+}
+
+// Letterbox-resizes the frame to the network input size (keeping aspect
+// ratio), feeds it to the request, and records the original size so
+// postprocess() can map boxes back.
+std::shared_ptr<InternalModelData> ModelCenterNet::preprocess(const InputData& inputData, ov::InferRequest& request) {
+ auto& img = inputData.asRef<ImageInputData>().inputImage;
+ const auto& resizedImg = resizeImageExt(img, netInputWidth, netInputHeight, RESIZE_KEEP_ASPECT_LETTERBOX);
+
+ request.set_input_tensor(wrapMat2Tensor(inputTransform(resizedImg)));
+ return std::make_shared<InternalImageModelData>(img.cols, img.rows);
+}
+
+// Peak-picking over the CenterNet heatmap: applies a sigmoid to every score,
+// thresholds, and keeps a cell only if it is the maximum inside a
+// kernel x kernel window (emulating maxpool2d-based NMS).
+// NOTE: mutates scoresPtr in place (the raw logits become probabilities).
+// Returns (flat index into the C*H*W tensor, score) pairs.
+std::vector<std::pair<size_t, float>> nms(float* scoresPtr, const ov::Shape& shape, float threshold, int kernel = 3) {
+ std::vector<std::pair<size_t, float>> scores;
+ scores.reserve(ModelCenterNet::INIT_VECTOR_SIZE);
+ auto chSize = shape[2] * shape[3];
+
+ // In-place sigmoid: exp(x) / (1 + exp(x)).
+ for (size_t i = 0; i < shape[1] * shape[2] * shape[3]; ++i) {
+ scoresPtr[i] = expf(scoresPtr[i]) / (1 + expf(scoresPtr[i]));
+ }
+
+ for (size_t ch = 0; ch < shape[1]; ++ch) {
+ for (size_t w = 0; w < shape[2]; ++w) {
+ for (size_t h = 0; h < shape[3]; ++h) {
+ float max = scoresPtr[chSize * ch + shape[2] * w + h];
+
+ // --------------------- filter on threshold--------------------------------------
+ if (max < threshold) {
+ continue;
+ }
+
+ // --------------------- store index and score------------------------------------
+ // Tentatively accept; the window scan below may pop it again.
+ scores.push_back({chSize * ch + shape[2] * w + h, max});
+
+ bool next = true;
+ // ---------------------- maxpool2d -----------------------------------------------
+ for (int i = -kernel / 2; i < kernel / 2 + 1 && next; ++i) {
+ for (int j = -kernel / 2; j < kernel / 2 + 1; ++j) {
+ // Unsigned w/h: for negative offsets w+i wraps around, which
+ // fails the `< shape[...]` check, so out-of-bounds neighbors
+ // fall through to the else branch.
+ if (w + i >= 0 && w + i < shape[2] && h + j >= 0 && h + j < shape[3]) {
+ if (scoresPtr[chSize * ch + shape[2] * (w + i) + h + j] > max) {
+ scores.pop_back();
+ next = false;
+ break;
+ }
+ } else {
+ // Border cells are compared against an implicit 0 pad.
+ if (max < 0) {
+ scores.pop_back();
+ next = false;
+ break;
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ return scores;
+}
+
+// Thin wrapper: runs the heatmap NMS above with the default 3x3 kernel.
+// Note nms() modifies the tensor's data in place.
+static std::vector<std::pair<size_t, float>> filterScores(const ov::Tensor& scoresTensor, float threshold) {
+ auto shape = scoresTensor.get_shape();
+ float* scoresPtr = scoresTensor.data<float>();
+
+ return nms(scoresPtr, shape, threshold);
+}
+
+// Gathers the (x, y) center-offset regression values for each kept score.
+// The regression tensor holds x-offsets in channel 0 and y-offsets in
+// channel 1; `s.first % chSize` strips the class channel from the flat index.
+std::vector<std::pair<float, float>> filterReg(const ov::Tensor& regressionTensor,
+ const std::vector<std::pair<size_t, float>>& scores,
+ size_t chSize) {
+ const float* regPtr = regressionTensor.data<float>();
+ std::vector<std::pair<float, float>> reg;
+
+ for (auto s : scores) {
+ reg.push_back({regPtr[s.first % chSize], regPtr[chSize + s.first % chSize]});
+ }
+
+ return reg;
+}
+
+// Gathers the (width, height) predictions for each kept score, using the same
+// two-channel layout and index arithmetic as filterReg().
+std::vector<std::pair<float, float>> filterWH(const ov::Tensor& whTensor,
+ const std::vector<std::pair<size_t, float>>& scores,
+ size_t chSize) {
+ const float* whPtr = whTensor.data<float>();
+ std::vector<std::pair<float, float>> wh;
+
+ for (auto s : scores) {
+ wh.push_back({whPtr[s.first % chSize], whPtr[chSize + s.first % chSize]});
+ }
+
+ return wh;
+}
+
+// Combines heatmap peak positions, center offsets (reg) and box sizes (wh)
+// into corner-format boxes, still in heatmap (feature-map) coordinates.
+std::vector<ModelCenterNet::BBox> calcBoxes(const std::vector<std::pair<size_t, float>>& scores,
+ const std::vector<std::pair<float, float>>& reg,
+ const std::vector<std::pair<float, float>>& wh,
+ const ov::Shape& shape) {
+ std::vector<ModelCenterNet::BBox> boxes(scores.size());
+
+ for (size_t i = 0; i < boxes.size(); ++i) {
+ // Recover the (x, y) cell from the flat per-channel index.
+ size_t chIdx = scores[i].first % (shape[2] * shape[3]);
+ auto xCenter = chIdx % shape[3];
+ auto yCenter = chIdx / shape[3];
+
+ boxes[i].left = xCenter + reg[i].first - wh[i].first / 2.0f;
+ boxes[i].top = yCenter + reg[i].second - wh[i].second / 2.0f;
+ boxes[i].right = xCenter + reg[i].first + wh[i].first / 2.0f;
+ boxes[i].bottom = yCenter + reg[i].second + wh[i].second / 2.0f;
+ }
+
+ return boxes;
+}
+
+// Maps boxes from heatmap coordinates back to original-image coordinates by
+// applying (in place) the inverse of the letterbox affine transform.
+void transform(std::vector<ModelCenterNet::BBox>& boxes,
+ const ov::Shape& shape,
+ int scale,
+ float centerX,
+ float centerY) {
+ // inv=true: transform from output (heatmap) space back to source space.
+ cv::Mat1f trans = getAffineTransform(centerX, centerY, scale, 0, shape[2], shape[3], true);
+
+ for (auto& b : boxes) {
+ ModelCenterNet::BBox newbb;
+
+ // Apply the 2x3 affine matrix to both corners of the box.
+ newbb.left = trans.at<float>(0, 0) * b.left + trans.at<float>(0, 1) * b.top + trans.at<float>(0, 2);
+ newbb.top = trans.at<float>(1, 0) * b.left + trans.at<float>(1, 1) * b.top + trans.at<float>(1, 2);
+ newbb.right = trans.at<float>(0, 0) * b.right + trans.at<float>(0, 1) * b.bottom + trans.at<float>(0, 2);
+ newbb.bottom = trans.at<float>(1, 0) * b.right + trans.at<float>(1, 1) * b.bottom + trans.at<float>(1, 2);
+
+ b = newbb;
+ }
+}
+
+// Full CenterNet decode: threshold+NMS the heatmap, gather the matching
+// offset/size predictions, build boxes, map them back to image coordinates
+// and emit a DetectionResult clamped to the image bounds.
+std::unique_ptr<ResultBase> ModelCenterNet::postprocess(InferenceResult& infResult) {
+ // --------------------------- Filter data and get valid indices ---------------------------------
+ // outputsNames is sorted in prepareInputsOutputs():
+ // [0] heatmap, [1] regression, [2] width/height.
+ const auto& heatmapTensor = infResult.outputsData[outputsNames[0]];
+ const auto& heatmapTensorShape = heatmapTensor.get_shape();
+ const auto chSize = heatmapTensorShape[2] * heatmapTensorShape[3];
+ const auto scores = filterScores(heatmapTensor, confidenceThreshold);
+
+ const auto& regressionTensor = infResult.outputsData[outputsNames[1]];
+ const auto reg = filterReg(regressionTensor, scores, chSize);
+
+ const auto& whTensor = infResult.outputsData[outputsNames[2]];
+ const auto wh = filterWH(whTensor, scores, chSize);
+
+ // --------------------------- Calculate bounding boxes & apply inverse affine transform ----------
+ auto boxes = calcBoxes(scores, reg, wh, heatmapTensorShape);
+
+ const auto imgWidth = infResult.internalModelData->asRef<InternalImageModelData>().inputImgWidth;
+ const auto imgHeight = infResult.internalModelData->asRef<InternalImageModelData>().inputImgHeight;
+ // The letterbox crop side equals the larger image dimension.
+ const auto scale = std::max(imgWidth, imgHeight);
+ const float centerX = imgWidth / 2.0f;
+ const float centerY = imgHeight / 2.0f;
+
+ transform(boxes, heatmapTensorShape, scale, centerX, centerY);
+
+ // --------------------------- Create detection result objects ------------------------------------
+ DetectionResult* result = new DetectionResult(infResult.frameId, infResult.metaData);
+
+ result->objects.reserve(scores.size());
+ for (size_t i = 0; i < scores.size(); ++i) {
+ DetectedObject desc;
+ desc.confidence = scores[i].second;
+ // The heatmap channel index encodes the class id.
+ desc.labelID = scores[i].first / chSize;
+ desc.label = getLabelName(desc.labelID);
+ desc.x = clamp(boxes[i].left, 0.f, static_cast<float>(imgWidth));
+ desc.y = clamp(boxes[i].top, 0.f, static_cast<float>(imgHeight));
+ desc.width = clamp(boxes[i].getWidth(), 0.f, static_cast<float>(imgWidth));
+ desc.height = clamp(boxes[i].getHeight(), 0.f, static_cast<float>(imgHeight));
+
+ result->objects.push_back(desc);
+ }
+
+ return std::unique_ptr<ResultBase>(result);
+}
diff --git a/python/openvino/runtime/common/models/src/detection_model_faceboxes.cpp b/python/openvino/runtime/common/models/src/detection_model_faceboxes.cpp
new file mode 100644
index 0000000..bb349a6
--- /dev/null
+++ b/python/openvino/runtime/common/models/src/detection_model_faceboxes.cpp
@@ -0,0 +1,261 @@
+/*
+// Copyright (C) 2020-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "models/detection_model_faceboxes.h"
+
+#include <algorithm>
+#include <cmath>
+#include <map>
+#include <stdexcept>
+
+#include <openvino/openvino.hpp>
+
+#include <utils/common.hpp>
+#include <utils/nms.hpp>
+#include <utils/ocv_common.hpp>
+
+#include "models/internal_model_data.h"
+#include "models/results.h"
+
+// FaceBoxes wrapper (single "Face" class). The variance, steps and minSizes
+// constants parameterize the prior-box (anchor) generation in priorBoxes().
+ModelFaceBoxes::ModelFaceBoxes(const std::string& modelFileName,
+ float confidenceThreshold,
+ bool useAutoResize,
+ float boxIOUThreshold,
+ const std::string& layout)
+ : DetectionModel(modelFileName, confidenceThreshold, useAutoResize, {"Face"}, layout),
+ maxProposalsCount(0),
+ boxIOUThreshold(boxIOUThreshold),
+ variance({0.1f, 0.2f}),
+ steps({32, 64, 128}),
+ minSizes({{32, 64, 128}, {256}, {512}}) {}
+
+// Validates topology (1 input, 3 channels, 2 outputs), installs pre/post
+// processing, records tensor names/sizes, then pre-computes the anchor grid
+// for each feature-map level.
+void ModelFaceBoxes::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) {
+ // --------------------------- Configure input & output -------------------------------------------------
+ // --------------------------- Prepare input ------------------------------------------------------
+ if (model->inputs().size() != 1) {
+ throw std::logic_error("FaceBoxes model wrapper expects models that have only 1 input");
+ }
+
+ const ov::Shape& inputShape = model->input().get_shape();
+ const ov::Layout& inputLayout = getInputLayout(model->input());
+
+ if (inputShape[ov::layout::channels_idx(inputLayout)] != 3) {
+ throw std::logic_error("Expected 3-channel input");
+ }
+
+ ov::preprocess::PrePostProcessor ppp(model);
+ inputTransform.setPrecision(ppp, model->input().get_any_name());
+ ppp.input().tensor().set_layout({"NHWC"});
+
+ if (useAutoResize) {
+ // Let the runtime resize arbitrary frames to the model's spatial size.
+ ppp.input().tensor().set_spatial_dynamic_shape();
+
+ ppp.input()
+ .preprocess()
+ .convert_element_type(ov::element::f32)
+ .resize(ov::preprocess::ResizeAlgorithm::RESIZE_LINEAR);
+ }
+
+ ppp.input().model().set_layout(inputLayout);
+
+ // --------------------------- Reading image input parameters -------------------------------------------
+ inputsNames.push_back(model->input().get_any_name());
+ netInputWidth = inputShape[ov::layout::width_idx(inputLayout)];
+ netInputHeight = inputShape[ov::layout::height_idx(inputLayout)];
+
+ // --------------------------- Prepare output -----------------------------------------------------
+ if (model->outputs().size() != 2) {
+ throw std::logic_error("FaceBoxes model wrapper expects models that have 2 outputs");
+ }
+
+ const ov::Layout outputLayout{"CHW"};
+ // The proposal count is the H dimension of the first output.
+ maxProposalsCount = model->outputs().front().get_shape()[ov::layout::height_idx(outputLayout)];
+ for (const auto& output : model->outputs()) {
+ const auto outTensorName = output.get_any_name();
+ outputsNames.push_back(outTensorName);
+ ppp.output(outTensorName).tensor().set_element_type(ov::element::f32).set_layout(outputLayout);
+ }
+ // Sorted order is relied upon by postprocess(): [0] boxes, [1] scores.
+ std::sort(outputsNames.begin(), outputsNames.end());
+ model = ppp.build();
+
+ // --------------------------- Calculating anchors ----------------------------------------------------
+ // One feature map per stride; its cell grid is the input size / stride.
+ std::vector<std::pair<size_t, size_t>> featureMaps;
+ for (auto s : steps) {
+ featureMaps.push_back({netInputHeight / s, netInputWidth / s});
+ }
+
+ priorBoxes(featureMaps);
+}
+
+// Appends one square anchor of side `minSize` for every (cx, cy) center in
+// the cross product of vx x vy, where centers are given in feature-map cells
+// and scaled by `step` into input-image pixels.
+void calculateAnchors(std::vector<Anchor>& anchors,
+ const std::vector<float>& vx,
+ const std::vector<float>& vy,
+ const int minSize,
+ const int step) {
+ float skx = static_cast<float>(minSize);
+ float sky = static_cast<float>(minSize);
+
+ std::vector<float> dense_cx, dense_cy;
+
+ for (auto x : vx) {
+ dense_cx.push_back(x * step);
+ }
+
+ for (auto y : vy) {
+ dense_cy.push_back(y * step);
+ }
+
+ for (auto cy : dense_cy) {
+ for (auto cx : dense_cx) {
+ anchors.push_back(
+ {cx - 0.5f * skx, cy - 0.5f * sky, cx + 0.5f * skx, cy + 0.5f * sky}); // left top right bottom
+ }
+ }
+}
+
+// Level-0 anchor generation for cell (fx, fy): smaller anchor sizes get
+// denser sub-cell center offsets (4x4 for size 32, 2x2 for size 64, a single
+// centered anchor otherwise), matching the FaceBoxes prior-box scheme.
+void calculateAnchorsZeroLevel(std::vector<Anchor>& anchors,
+ const int fx,
+ const int fy,
+ const std::vector<int>& minSizes,
+ const int step) {
+ for (auto s : minSizes) {
+ std::vector<float> vx, vy;
+ if (s == 32) {
+ // Quarter-cell offsets: 16 centers per cell.
+ vx.push_back(static_cast<float>(fx));
+ vx.push_back(fx + 0.25f);
+ vx.push_back(fx + 0.5f);
+ vx.push_back(fx + 0.75f);
+
+ vy.push_back(static_cast<float>(fy));
+ vy.push_back(fy + 0.25f);
+ vy.push_back(fy + 0.5f);
+ vy.push_back(fy + 0.75f);
+ } else if (s == 64) {
+ // Half-cell offsets: 4 centers per cell.
+ vx.push_back(static_cast<float>(fx));
+ vx.push_back(fx + 0.5f);
+
+ vy.push_back(static_cast<float>(fy));
+ vy.push_back(fy + 0.5f);
+ } else {
+ // Single centered anchor.
+ vx.push_back(fx + 0.5f);
+ vy.push_back(fy + 0.5f);
+ }
+ calculateAnchors(anchors, vx, vy, s, step);
+ }
+}
+
+// Pre-computes the prior (anchor) boxes for every cell of every feature-map
+// level and appends them to the `anchors` member. Level 0 uses the dense
+// multi-size pattern (calculateAnchorsZeroLevel); deeper levels get a single
+// centered anchor of their one configured size per cell.
+void ModelFaceBoxes::priorBoxes(const std::vector<std::pair<size_t, size_t>>& featureMaps) {
+ anchors.reserve(maxProposalsCount);
+
+ for (size_t k = 0; k < featureMaps.size(); ++k) {
+ for (size_t i = 0; i < featureMaps[k].first; ++i) {
+ for (size_t j = 0; j < featureMaps[k].second; ++j) {
+ if (k == 0) {
+ calculateAnchorsZeroLevel(anchors, j, i, minSizes[k], steps[k]);
+ } else {
+ calculateAnchors(anchors, {j + 0.5f}, {i + 0.5f}, minSizes[k][0], steps[k]);
+ }
+ }
+ }
+ }
+}
+
+// Scans the interleaved [background, face] score pairs (odd offsets hold the
+// face score) and returns the anchor indices and scores that pass the
+// confidence threshold.
+std::pair<std::vector<size_t>, std::vector<float>> filterScores(const ov::Tensor& scoresTensor,
+ const float confidenceThreshold) {
+ auto shape = scoresTensor.get_shape();
+ const float* scoresPtr = scoresTensor.data<float>();
+
+ std::vector<size_t> indices;
+ std::vector<float> scores;
+ scores.reserve(ModelFaceBoxes::INIT_VECTOR_SIZE);
+ indices.reserve(ModelFaceBoxes::INIT_VECTOR_SIZE);
+ // Start at 1 and stride by 2: only the face-class score of each pair.
+ for (size_t i = 1; i < shape[1] * shape[2]; i = i + 2) {
+ if (scoresPtr[i] > confidenceThreshold) {
+ indices.push_back(i / 2);
+ scores.push_back(scoresPtr[i]);
+ }
+ }
+
+ return {indices, scores};
+}
+
+// Decodes SSD-style box regressions (dx, dy, dw, dh) against the pre-computed
+// anchors for the indices that survived score filtering. `variance` scales
+// the center offsets ([0]) and the log-size deltas ([1]).
+std::vector<Anchor> filterBoxes(const ov::Tensor& boxesTensor,
+ const std::vector<Anchor>& anchors,
+ const std::vector<size_t>& validIndices,
+ const std::vector<float>& variance) {
+ auto shape = boxesTensor.get_shape();
+ const float* boxesPtr = boxesTensor.data<float>();
+
+ std::vector<Anchor> boxes;
+ boxes.reserve(ModelFaceBoxes::INIT_VECTOR_SIZE);
+ for (auto i : validIndices) {
+ // Each proposal row holds shape[2] values: dx, dy, dw, dh.
+ auto objStart = shape[2] * i;
+
+ auto dx = boxesPtr[objStart];
+ auto dy = boxesPtr[objStart + 1];
+ auto dw = boxesPtr[objStart + 2];
+ auto dh = boxesPtr[objStart + 3];
+
+ auto predCtrX = dx * variance[0] * anchors[i].getWidth() + anchors[i].getXCenter();
+ auto predCtrY = dy * variance[0] * anchors[i].getHeight() + anchors[i].getYCenter();
+ auto predW = exp(dw * variance[1]) * anchors[i].getWidth();
+ auto predH = exp(dh * variance[1]) * anchors[i].getHeight();
+
+ // Convert center/size back to corner format.
+ boxes.push_back({static_cast<float>(predCtrX - 0.5f * predW),
+ static_cast<float>(predCtrY - 0.5f * predH),
+ static_cast<float>(predCtrX + 0.5f * predW),
+ static_cast<float>(predCtrY + 0.5f * predH)});
+ }
+
+ return boxes;
+}
+
+// FaceBoxes decode: threshold scores, decode surviving boxes against the
+// anchors, run IoU-based NMS, then scale the kept boxes from network-input
+// coordinates back to original-image coordinates.
+std::unique_ptr<ResultBase> ModelFaceBoxes::postprocess(InferenceResult& infResult) {
+ // Filter scores and get valid indices for bounding boxes
+ // outputsNames is sorted in prepareInputsOutputs(): [0] boxes, [1] scores.
+ const auto scoresTensor = infResult.outputsData[outputsNames[1]];
+ const auto scores = filterScores(scoresTensor, confidenceThreshold);
+
+ // Filter bounding boxes on indices
+ auto boxesTensor = infResult.outputsData[outputsNames[0]];
+ std::vector<Anchor> boxes = filterBoxes(boxesTensor, anchors, scores.first, variance);
+
+ // Apply Non-maximum Suppression
+ const std::vector<int> keep = nms(boxes, scores.second, boxIOUThreshold);
+
+ // Create detection result objects
+ DetectionResult* result = new DetectionResult(infResult.frameId, infResult.metaData);
+ const auto imgWidth = infResult.internalModelData->asRef<InternalImageModelData>().inputImgWidth;
+ const auto imgHeight = infResult.internalModelData->asRef<InternalImageModelData>().inputImgHeight;
+ // Boxes are in network-input pixels; divide by these to get image pixels.
+ const float scaleX = static_cast<float>(netInputWidth) / imgWidth;
+ const float scaleY = static_cast<float>(netInputHeight) / imgHeight;
+
+ result->objects.reserve(keep.size());
+ for (auto i : keep) {
+ DetectedObject desc;
+ desc.confidence = scores.second[i];
+ desc.x = clamp(boxes[i].left / scaleX, 0.f, static_cast<float>(imgWidth));
+ desc.y = clamp(boxes[i].top / scaleY, 0.f, static_cast<float>(imgHeight));
+ desc.width = clamp(boxes[i].getWidth() / scaleX, 0.f, static_cast<float>(imgWidth));
+ desc.height = clamp(boxes[i].getHeight() / scaleY, 0.f, static_cast<float>(imgHeight));
+ // Single-class model: everything is label 0 ("Face").
+ desc.labelID = 0;
+ desc.label = labels[0];
+
+ result->objects.push_back(desc);
+ }
+
+ return std::unique_ptr<ResultBase>(result);
+}
diff --git a/python/openvino/runtime/common/models/src/detection_model_retinaface.cpp b/python/openvino/runtime/common/models/src/detection_model_retinaface.cpp
new file mode 100644
index 0000000..8835725
--- /dev/null
+++ b/python/openvino/runtime/common/models/src/detection_model_retinaface.cpp
@@ -0,0 +1,394 @@
+/*
+// Copyright (C) 2020-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "models/detection_model_retinaface.h"
+
+#include <stddef.h>
+
+#include <algorithm>
+#include <cmath>
+#include <stdexcept>
+
+#include <opencv2/core.hpp>
+#include <openvino/openvino.hpp>
+
+#include <utils/common.hpp>
+#include <utils/nms.hpp>
+
+#include "models/internal_model_data.h"
+#include "models/results.h"
+
+// RetinaFace wrapper. Landmark/mask detection flags are switched on later in
+// prepareInputsOutputs() based on which output tensors the model exposes.
+// anchorCfg lists {stride, scales, baseSize, ratios} per FPN level; the
+// per-stride base anchors are generated once here.
+ModelRetinaFace::ModelRetinaFace(const std::string& modelFileName,
+ float confidenceThreshold,
+ bool useAutoResize,
+ float boxIOUThreshold,
+ const std::string& layout)
+ : DetectionModel(modelFileName, confidenceThreshold, useAutoResize, {"Face"}, layout), // Default label is "Face"
+ shouldDetectMasks(false),
+ shouldDetectLandmarks(false),
+ boxIOUThreshold(boxIOUThreshold),
+ maskThreshold(0.8f),
+ landmarkStd(1.0f),
+ anchorCfg({{32, {32, 16}, 16, {1}}, {16, {8, 4}, 16, {1}}, {8, {2, 1}, 16, {1}}}) {
+ generateAnchorsFpn();
+}
+
+// Validates topology, installs pre/post processing, classifies the model's
+// outputs (boxes / scores / landmarks / mask scores) by tensor-name
+// substring, and pre-computes the dense anchor grid for every FPN level.
+void ModelRetinaFace::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) {
+ // --------------------------- Configure input & output -------------------------------------------------
+ // --------------------------- Prepare input ------------------------------------------------------
+ if (model->inputs().size() != 1) {
+ throw std::logic_error("RetinaFace model wrapper expects models that have only 1 input");
+ }
+ const ov::Shape& inputShape = model->input().get_shape();
+ const ov::Layout& inputLayout = getInputLayout(model->input());
+
+ if (inputShape[ov::layout::channels_idx(inputLayout)] != 3) {
+ throw std::logic_error("Expected 3-channel input");
+ }
+
+ ov::preprocess::PrePostProcessor ppp(model);
+ ppp.input().tensor().set_element_type(ov::element::u8).set_layout({"NHWC"});
+
+ if (useAutoResize) {
+ ppp.input().tensor().set_spatial_dynamic_shape();
+
+ ppp.input()
+ .preprocess()
+ .convert_element_type(ov::element::f32)
+ .resize(ov::preprocess::ResizeAlgorithm::RESIZE_LINEAR);
+ }
+
+ ppp.input().model().set_layout(inputLayout);
+
+ // --------------------------- Reading image input parameters -------------------------------------------
+ inputsNames.push_back(model->input().get_any_name());
+ netInputWidth = inputShape[ov::layout::width_idx(inputLayout)];
+ netInputHeight = inputShape[ov::layout::height_idx(inputLayout)];
+
+ // --------------------------- Prepare output -----------------------------------------------------
+
+ // 6 = boxes+scores, 9 = +landmarks, 12 = +mask scores (3 FPN levels each).
+ const ov::OutputVector& outputs = model->outputs();
+ if (outputs.size() != 6 && outputs.size() != 9 && outputs.size() != 12) {
+ throw std::logic_error("RetinaFace model wrapper expects models that have 6, 9 or 12 outputs");
+ }
+
+ const ov::Layout outputLayout{"NCHW"};
+ std::vector<size_t> outputsSizes[OUT_MAX];
+ for (const auto& output : model->outputs()) {
+ auto outTensorName = output.get_any_name();
+ outputsNames.push_back(outTensorName);
+ ppp.output(outTensorName).tensor().set_element_type(ov::element::f32).set_layout(outputLayout);
+
+ // Classify the output by conventional name substrings; unknown names
+ // are ignored.
+ OutputType type = OUT_MAX;
+ if (outTensorName.find("box") != std::string::npos) {
+ type = OUT_BOXES;
+ } else if (outTensorName.find("cls") != std::string::npos) {
+ type = OUT_SCORES;
+ } else if (outTensorName.find("landmark") != std::string::npos) {
+ type = OUT_LANDMARKS;
+ shouldDetectLandmarks = true;
+ } else if (outTensorName.find("type") != std::string::npos) {
+ // Mask-aware variant: replace the label set and tighten landmarkStd.
+ type = OUT_MASKSCORES;
+ labels.clear();
+ labels.push_back("No Mask");
+ labels.push_back("Mask");
+ shouldDetectMasks = true;
+ landmarkStd = 0.2f;
+ } else {
+ continue;
+ }
+
+ // Insertion sort by feature-map height so each category's names are
+ // ordered small-to-large, keeping levels aligned across categories.
+ size_t num = output.get_shape()[ov::layout::height_idx(outputLayout)];
+ size_t i = 0;
+ for (; i < outputsSizes[type].size(); ++i) {
+ if (num < outputsSizes[type][i]) {
+ break;
+ }
+ }
+ separateOutputsNames[type].insert(separateOutputsNames[type].begin() + i, outTensorName);
+ outputsSizes[type].insert(outputsSizes[type].begin() + i, num);
+ }
+ model = ppp.build();
+
+ for (size_t idx = 0; idx < outputsSizes[OUT_BOXES].size(); ++idx) {
+ // NOTE(review): width and height are both taken from the same entry —
+ // this assumes square feature maps; confirm for non-square inputs.
+ size_t width = outputsSizes[OUT_BOXES][idx];
+ size_t height = outputsSizes[OUT_BOXES][idx];
+ auto s = anchorCfg[idx].stride;
+ auto anchorNum = anchorsFpn[s].size();
+
+ // Shift each per-stride base anchor to every grid cell of this level.
+ anchors.push_back(std::vector<Anchor>(height * width * anchorNum));
+ for (size_t iw = 0; iw < width; ++iw) {
+ size_t sw = iw * s;
+ for (size_t ih = 0; ih < height; ++ih) {
+ size_t sh = ih * s;
+ for (size_t k = 0; k < anchorNum; ++k) {
+ Anchor& anc = anchors[idx][(ih * width + iw) * anchorNum + k];
+ anc.left = anchorsFpn[s][k].left + sw;
+ anc.top = anchorsFpn[s][k].top + sh;
+ anc.right = anchorsFpn[s][k].right + sw;
+ anc.bottom = anchorsFpn[s][k].bottom + sh;
+ }
+ }
+ }
+ }
+}
+
+// Produces one anchor per aspect ratio, each with (roughly) the same area as
+// the input anchor and the same center.
+std::vector<Anchor> ratioEnum(const Anchor& anchor, const std::vector<int>& ratios) {
+ std::vector<Anchor> retVal;
+ const auto w = anchor.getWidth();
+ const auto h = anchor.getHeight();
+ const auto xCtr = anchor.getXCenter();
+ const auto yCtr = anchor.getYCenter();
+
+ for (const auto ratio : ratios) {
+ const auto size = w * h;
+ // Keep area constant: ws * hs == size with hs / ws == ratio.
+ const auto sizeRatio = static_cast<float>(size) / ratio;
+ const auto ws = sqrt(sizeRatio);
+ const auto hs = ws * ratio;
+ retVal.push_back({static_cast<float>(xCtr - 0.5f * (ws - 1.0f)),
+ static_cast<float>(yCtr - 0.5f * (hs - 1.0f)),
+ static_cast<float>(xCtr + 0.5f * (ws - 1.0f)),
+ static_cast<float>(yCtr + 0.5f * (hs - 1.0f))});
+ }
+ return retVal;
+}
+
+// Produces one anchor per scale factor, multiplying the input anchor's width
+// and height while keeping its center fixed.
+std::vector<Anchor> scaleEnum(const Anchor& anchor, const std::vector<int>& scales) {
+ std::vector<Anchor> retVal;
+ const auto w = anchor.getWidth();
+ const auto h = anchor.getHeight();
+ const auto xCtr = anchor.getXCenter();
+ const auto yCtr = anchor.getYCenter();
+
+ for (auto scale : scales) {
+ const auto ws = w * scale;
+ const auto hs = h * scale;
+ retVal.push_back({static_cast<float>(xCtr - 0.5f * (ws - 1.0f)),
+ static_cast<float>(yCtr - 0.5f * (hs - 1.0f)),
+ static_cast<float>(xCtr + 0.5f * (ws - 1.0f)),
+ static_cast<float>(yCtr + 0.5f * (hs - 1.0f))});
+ }
+ return retVal;
+}
+
+// Standard RPN-style anchor generation: start from a baseSize x baseSize box
+// at the origin, enumerate aspect ratios, then enumerate scales of each.
+std::vector<Anchor> generateAnchors(const int baseSize,
+ const std::vector<int>& ratios,
+ const std::vector<int>& scales) {
+ Anchor baseAnchor{0.0f, 0.0f, baseSize - 1.0f, baseSize - 1.0f};
+ auto ratioAnchors = ratioEnum(baseAnchor, ratios);
+ std::vector<Anchor> retVal;
+
+ for (const auto& ra : ratioAnchors) {
+ auto addon = scaleEnum(ra, scales);
+ retVal.insert(retVal.end(), addon.begin(), addon.end());
+ }
+ return retVal;
+}
+
+// Builds the stride -> base-anchors map for every FPN level, iterating the
+// config sorted by descending stride.
+void ModelRetinaFace::generateAnchorsFpn() {
+ auto cfg = anchorCfg;
+ std::sort(cfg.begin(), cfg.end(), [](const AnchorCfgLine& x, const AnchorCfgLine& y) {
+ return x.stride > y.stride;
+ });
+
+ for (const auto& cfgLine : cfg) {
+ anchorsFpn.emplace(cfgLine.stride, generateAnchors(cfgLine.baseSize, cfgLine.ratios, cfgLine.scales));
+ }
+}
+
+// Scans the foreground half of the score tensor (channels >= anchorNum; the
+// first anchorNum channels hold background scores) and returns, for each
+// score above the threshold, a flat index encoded as
+// (spatial_position * anchors_per_cell + anchor), the layout the other
+// filter* helpers decode.
+std::vector<size_t> thresholding(const ov::Tensor& scoresTensor, const int anchorNum, const float confidenceThreshold) {
+ std::vector<size_t> indices;
+ indices.reserve(ModelRetinaFace::INIT_VECTOR_SIZE);
+ auto shape = scoresTensor.get_shape();
+ size_t restAnchors = shape[1] - anchorNum;
+ const float* scoresPtr = scoresTensor.data<float>();
+
+ for (size_t x = anchorNum; x < shape[1]; ++x) {
+ for (size_t y = 0; y < shape[2]; ++y) {
+ for (size_t z = 0; z < shape[3]; ++z) {
+ auto idx = (x * shape[2] + y) * shape[3] + z;
+ auto score = scoresPtr[idx];
+ if (score >= confidenceThreshold) {
+ indices.push_back((y * shape[3] + z) * restAnchors + (x - anchorNum));
+ }
+ }
+ }
+ }
+
+ return indices;
+}
+
+// Appends, for each kept index, the matching foreground score. `start` skips
+// the first anchorNum background channels; the offset arithmetic inverts the
+// index encoding produced by thresholding().
+void filterScores(std::vector<float>& scores,
+ const std::vector<size_t>& indices,
+ const ov::Tensor& scoresTensor,
+ const int anchorNum) {
+ const auto& shape = scoresTensor.get_shape();
+ const float* scoresPtr = scoresTensor.data<float>();
+ const auto start = shape[2] * shape[3] * anchorNum;
+
+ for (auto i : indices) {
+ auto offset = (i % anchorNum) * shape[2] * shape[3] + i / anchorNum;
+ scores.push_back(scoresPtr[start + offset]);
+ }
+}
+
+// Decodes the (dx, dy, dw, dh) regression stored channel-planar (one plane of
+// blockWidth = H*W values per component) against the dense anchors, appending
+// corner-format boxes for the kept indices.
+void filterBoxes(std::vector<Anchor>& boxes,
+ const std::vector<size_t>& indices,
+ const ov::Tensor& boxesTensor,
+ int anchorNum,
+ const std::vector<Anchor>& anchors) {
+ const auto& shape = boxesTensor.get_shape();
+ const float* boxesPtr = boxesTensor.data<float>();
+ // Number of regression components per anchor (4 here: dx, dy, dw, dh).
+ const auto boxPredLen = shape[1] / anchorNum;
+ const auto blockWidth = shape[2] * shape[3];
+
+ for (auto i : indices) {
+ auto offset = blockWidth * boxPredLen * (i % anchorNum) + (i / anchorNum);
+
+ const auto dx = boxesPtr[offset];
+ const auto dy = boxesPtr[offset + blockWidth];
+ const auto dw = boxesPtr[offset + blockWidth * 2];
+ const auto dh = boxesPtr[offset + blockWidth * 3];
+
+ const auto predCtrX = dx * anchors[i].getWidth() + anchors[i].getXCenter();
+ const auto predCtrY = dy * anchors[i].getHeight() + anchors[i].getYCenter();
+ const auto predW = exp(dw) * anchors[i].getWidth();
+ const auto predH = exp(dh) * anchors[i].getHeight();
+
+ boxes.push_back({static_cast<float>(predCtrX - 0.5f * (predW - 1.0f)),
+ static_cast<float>(predCtrY - 0.5f * (predH - 1.0f)),
+ static_cast<float>(predCtrX + 0.5f * (predW - 1.0f)),
+ static_cast<float>(predCtrY + 0.5f * (predH - 1.0f))});
+ }
+}
+
+void filterLandmarks(std::vector<cv::Point2f>& landmarks,
+ const std::vector<size_t>& indices,
+ const ov::Tensor& landmarksTensor,
+ int anchorNum,
+ const std::vector<Anchor>& anchors,
+ const float landmarkStd) {
+ const auto& shape = landmarksTensor.get_shape();
+ const float* landmarksPtr = landmarksTensor.data<float>();
+ const auto landmarkPredLen = shape[1] / anchorNum;
+ const auto blockWidth = shape[2] * shape[3];
+
+ for (auto i : indices) {
+ for (int j = 0; j < ModelRetinaFace::LANDMARKS_NUM; ++j) {
+ auto offset = (i % anchorNum) * landmarkPredLen * shape[2] * shape[3] + i / anchorNum;
+ auto deltaX = landmarksPtr[offset + j * 2 * blockWidth] * landmarkStd;
+ auto deltaY = landmarksPtr[offset + (j * 2 + 1) * blockWidth] * landmarkStd;
+ landmarks.push_back({deltaX * anchors[i].getWidth() + anchors[i].getXCenter(),
+ deltaY * anchors[i].getHeight() + anchors[i].getYCenter()});
+ }
+ }
+}
+
+void filterMasksScores(std::vector<float>& masks,
+ const std::vector<size_t>& indices,
+ const ov::Tensor& maskScoresTensor,
+ const int anchorNum) {
+ auto shape = maskScoresTensor.get_shape();
+ const float* maskScoresPtr = maskScoresTensor.data<float>();
+ auto start = shape[2] * shape[3] * anchorNum * 2;
+
+ for (auto i : indices) {
+ auto offset = (i % anchorNum) * shape[2] * shape[3] + i / anchorNum;
+ masks.push_back(maskScoresPtr[start + offset]);
+ }
+}
+
// Post-processes a RetinaFace inference: gathers raw outputs from every FPN
// level, filters them by confidence, applies NMS, then scales survivors back
// to original-image coordinates and packages them into a
// RetinaFaceDetectionResult.
std::unique_ptr<ResultBase> ModelRetinaFace::postprocess(InferenceResult& infResult) {
    std::vector<float> scores;
    scores.reserve(INIT_VECTOR_SIZE);
    std::vector<Anchor> boxes;
    boxes.reserve(INIT_VECTOR_SIZE);
    std::vector<cv::Point2f> landmarks;
    std::vector<float> masks;

    if (shouldDetectLandmarks) {
        landmarks.reserve(INIT_VECTOR_SIZE);
    }
    if (shouldDetectMasks) {
        masks.reserve(INIT_VECTOR_SIZE);
    }

    // --------------------------- Gather & Filter output from all levels
    // ----------------------------------------------------------
    for (size_t idx = 0; idx < anchorCfg.size(); ++idx) {
        const auto boxRaw = infResult.outputsData[separateOutputsNames[OUT_BOXES][idx]];
        const auto scoresRaw = infResult.outputsData[separateOutputsNames[OUT_SCORES][idx]];
        auto s = anchorCfg[idx].stride;
        auto anchorNum = anchorsFpn[s].size();

        // validIndices encode (cell, anchor) pairs passing the confidence
        // threshold; all filter* helpers share this index encoding.
        auto validIndices = thresholding(scoresRaw, anchorNum, confidenceThreshold);
        filterScores(scores, validIndices, scoresRaw, anchorNum);
        filterBoxes(boxes, validIndices, boxRaw, anchorNum, anchors[idx]);
        if (shouldDetectLandmarks) {
            const auto landmarksRaw = infResult.outputsData[separateOutputsNames[OUT_LANDMARKS][idx]];
            filterLandmarks(landmarks, validIndices, landmarksRaw, anchorNum, anchors[idx], landmarkStd);
        }
        if (shouldDetectMasks) {
            const auto masksRaw = infResult.outputsData[separateOutputsNames[OUT_MASKSCORES][idx]];
            filterMasksScores(masks, validIndices, masksRaw, anchorNum);
        }
    }
    // --------------------------- Apply Non-maximum Suppression
    // ---------------------------------------------------------- !shouldDetectLandmarks determines nms behavior, if
    // true - boundaries are included in areas calculation
    const auto keep = nms(boxes, scores, boxIOUThreshold, !shouldDetectLandmarks);

    // --------------------------- Create detection result objects
    // --------------------------------------------------------
    RetinaFaceDetectionResult* result = new RetinaFaceDetectionResult(infResult.frameId, infResult.metaData);

    const auto imgWidth = infResult.internalModelData->asRef<InternalImageModelData>().inputImgWidth;
    const auto imgHeight = infResult.internalModelData->asRef<InternalImageModelData>().inputImgHeight;
    // Network-to-original-image scale factors (divide network coords by these).
    const auto scaleX = static_cast<float>(netInputWidth) / imgWidth;
    const auto scaleY = static_cast<float>(netInputHeight) / imgHeight;

    result->objects.reserve(keep.size());
    result->landmarks.reserve(keep.size() * ModelRetinaFace::LANDMARKS_NUM);
    for (auto i : keep) {
        DetectedObject desc;
        desc.confidence = scores[i];
        //--- Scaling coordinates (in place, in the boxes vector)
        boxes[i].left /= scaleX;
        boxes[i].top /= scaleY;
        boxes[i].right /= scaleX;
        boxes[i].bottom /= scaleY;

        desc.x = clamp(boxes[i].left, 0.f, static_cast<float>(imgWidth));
        desc.y = clamp(boxes[i].top, 0.f, static_cast<float>(imgHeight));
        desc.width = clamp(boxes[i].getWidth(), 0.f, static_cast<float>(imgWidth));
        desc.height = clamp(boxes[i].getHeight(), 0.f, static_cast<float>(imgHeight));
        //--- Default label 0 - Face. If detecting masks then labels would be 0 - No Mask, 1 - Mask
        desc.labelID = shouldDetectMasks ? (masks[i] > maskThreshold) : 0;
        desc.label = labels[desc.labelID];
        result->objects.push_back(desc);

        //--- Scaling landmarks coordinates (loop body runs only when landmarks are enabled)
        for (size_t l = 0; l < ModelRetinaFace::LANDMARKS_NUM && shouldDetectLandmarks; ++l) {
            landmarks[i * ModelRetinaFace::LANDMARKS_NUM + l].x =
                clamp(landmarks[i * ModelRetinaFace::LANDMARKS_NUM + l].x / scaleX, 0.f, static_cast<float>(imgWidth));
            landmarks[i * ModelRetinaFace::LANDMARKS_NUM + l].y =
                clamp(landmarks[i * ModelRetinaFace::LANDMARKS_NUM + l].y / scaleY, 0.f, static_cast<float>(imgHeight));
            result->landmarks.push_back(landmarks[i * ModelRetinaFace::LANDMARKS_NUM + l]);
        }
    }

    return std::unique_ptr<ResultBase>(result);
}
diff --git a/python/openvino/runtime/common/models/src/detection_model_retinaface_pt.cpp b/python/openvino/runtime/common/models/src/detection_model_retinaface_pt.cpp
new file mode 100644
index 0000000..8322c3c
--- /dev/null
+++ b/python/openvino/runtime/common/models/src/detection_model_retinaface_pt.cpp
@@ -0,0 +1,277 @@
+/*
+// Copyright (C) 2021-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "models/detection_model_retinaface_pt.h"
+
+#include <stdint.h>
+
+#include <algorithm>
+#include <cmath>
+#include <map>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include <openvino/openvino.hpp>
+
+#include <utils/common.hpp>
+#include <utils/nms.hpp>
+#include <utils/ocv_common.hpp>
+
+#include "models/internal_model_data.h"
+#include "models/results.h"
+
// Constructs the PyTorch-flavored RetinaFace wrapper.
// The label set is fixed to a single "Face" class; landmarksNum stays 0 until
// prepareInputsOutputs() discovers a landmarks output.
ModelRetinaFacePT::ModelRetinaFacePT(const std::string& modelFileName,
                                     float confidenceThreshold,
                                     bool useAutoResize,
                                     float boxIOUThreshold,
                                     const std::string& layout)
    : DetectionModel(modelFileName, confidenceThreshold, useAutoResize, {"Face"}, layout),  // Default label is "Face"
      landmarksNum(0),
      boxIOUThreshold(boxIOUThreshold) {}
+
// Configures model I/O: single NHWC image input (optionally auto-resized by
// the plugin), f32 outputs, and resolves output tensor names by substring
// ("bbox" / "cls" / optional "landmark"). Also builds the prior boxes, which
// depend on the network input size read here.
void ModelRetinaFacePT::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) {
    // --------------------------- Configure input & output -------------------------------------------------
    // --------------------------- Prepare input ------------------------------------------------------
    if (model->inputs().size() != 1) {
        throw std::logic_error("RetinaFacePT model wrapper expects models that have only 1 input");
    }

    const ov::Shape& inputShape = model->input().get_shape();
    const ov::Layout& inputLayout = getInputLayout(model->input());

    if (inputShape[ov::layout::channels_idx(inputLayout)] != 3) {
        throw std::logic_error("Expected 3-channel input");
    }

    ov::preprocess::PrePostProcessor ppp(model);
    inputTransform.setPrecision(ppp, model->input().get_any_name());
    ppp.input().tensor().set_layout({"NHWC"});

    if (useAutoResize) {
        // Let the plugin accept arbitrary spatial sizes and resize on device.
        ppp.input().tensor().set_spatial_dynamic_shape();

        ppp.input()
            .preprocess()
            .convert_element_type(ov::element::f32)
            .resize(ov::preprocess::ResizeAlgorithm::RESIZE_LINEAR);
    }

    ppp.input().model().set_layout(inputLayout);

    // --------------------------- Reading image input parameters -------------------------------------------
    inputsNames.push_back(model->input().get_any_name());
    netInputWidth = inputShape[ov::layout::width_idx(inputLayout)];
    netInputHeight = inputShape[ov::layout::height_idx(inputLayout)];

    // --------------------------- Prepare output -----------------------------------------------------
    // NOTE(review): the comment below treats landmarks as optional, yet exactly
    // 3 outputs are required here -- confirm whether 2-output models should pass.
    if (model->outputs().size() != 3) {
        throw std::logic_error("RetinaFace model wrapper expects models that have 3 outputs");
    }

    landmarksNum = 0;

    outputsNames.resize(2);
    std::vector<uint32_t> outputsSizes[OUT_MAX];  // NOTE(review): never written below -- appears unused
    const ov::Layout chw("CHW");
    const ov::Layout nchw("NCHW");
    for (auto& output : model->outputs()) {
        auto outTensorName = output.get_any_name();
        // NOTE(review): each name is appended here AND assigned to an indexed
        // slot below, so outputsNames ends up holding duplicates -- verify intended.
        outputsNames.push_back(outTensorName);
        ppp.output(outTensorName)
            .tensor()
            .set_element_type(ov::element::f32)
            .set_layout(output.get_shape().size() == 4 ? nchw : chw);

        if (outTensorName.find("bbox") != std::string::npos) {
            outputsNames[OUT_BOXES] = outTensorName;
        } else if (outTensorName.find("cls") != std::string::npos) {
            outputsNames[OUT_SCORES] = outTensorName;
        } else if (outTensorName.find("landmark") != std::string::npos) {
            // Landmarks might be optional, if it is present, resize names array to fit landmarks output name to the
            // last item of array Considering that other outputs names are already filled in or will be filled later
            outputsNames.resize(std::max(outputsNames.size(), (size_t)OUT_LANDMARKS + 1));
            outputsNames[OUT_LANDMARKS] = outTensorName;
            landmarksNum =
                output.get_shape()[ov::layout::width_idx(chw)] / 2;  // Each landmark consist of 2 variables (x and y)
        } else {
            // Output name matched no known role; leave it for the generic f32 setup above.
            continue;
        }
    }

    if (outputsNames[OUT_BOXES] == "" || outputsNames[OUT_SCORES] == "") {
        throw std::logic_error("Bbox or cls layers are not found");
    }

    model = ppp.build();
    priors = generatePriorData();  // uses netInputWidth/netInputHeight set above
}
+
+std::vector<size_t> ModelRetinaFacePT::filterByScore(const ov::Tensor& scoresTensor, const float confidenceThreshold) {
+ std::vector<size_t> indicies;
+ const auto& shape = scoresTensor.get_shape();
+ const float* scoresPtr = scoresTensor.data<float>();
+
+ for (size_t x = 0; x < shape[1]; ++x) {
+ const auto idx = (x * shape[2] + 1);
+ const auto score = scoresPtr[idx];
+ if (score >= confidenceThreshold) {
+ indicies.push_back(x);
+ }
+ }
+
+ return indicies;
+}
+
+std::vector<float> ModelRetinaFacePT::getFilteredScores(const ov::Tensor& scoresTensor,
+ const std::vector<size_t>& indicies) {
+ const auto& shape = scoresTensor.get_shape();
+ const float* scoresPtr = scoresTensor.data<float>();
+
+ std::vector<float> scores;
+ scores.reserve(indicies.size());
+
+ for (auto i : indicies) {
+ scores.push_back(scoresPtr[i * shape[2] + 1]);
+ }
+ return scores;
+}
+
+std::vector<cv::Point2f> ModelRetinaFacePT::getFilteredLandmarks(const ov::Tensor& landmarksTensor,
+ const std::vector<size_t>& indicies,
+ int imgWidth,
+ int imgHeight) {
+ const auto& shape = landmarksTensor.get_shape();
+ const float* landmarksPtr = landmarksTensor.data<float>();
+
+ std::vector<cv::Point2f> landmarks(landmarksNum * indicies.size());
+
+ for (size_t i = 0; i < indicies.size(); i++) {
+ const size_t idx = indicies[i];
+ const auto& prior = priors[idx];
+ for (size_t j = 0; j < landmarksNum; j++) {
+ landmarks[i * landmarksNum + j].x =
+ clamp(prior.cX + landmarksPtr[idx * shape[2] + j * 2] * variance[0] * prior.width, 0.f, 1.f) * imgWidth;
+ landmarks[i * landmarksNum + j].y =
+ clamp(prior.cY + landmarksPtr[idx * shape[2] + j * 2 + 1] * variance[0] * prior.height, 0.f, 1.f) *
+ imgHeight;
+ }
+ }
+ return landmarks;
+}
+
+std::vector<ModelRetinaFacePT::Box> ModelRetinaFacePT::generatePriorData() {
+ const float globalMinSizes[][2] = {{16, 32}, {64, 128}, {256, 512}};
+ const float steps[] = {8., 16., 32.};
+ std::vector<ModelRetinaFacePT::Box> anchors;
+ for (size_t stepNum = 0; stepNum < arraySize(steps); stepNum++) {
+ const int featureW = static_cast<int>(std::round(netInputWidth / steps[stepNum]));
+ const int featureH = static_cast<int>(std::round(netInputHeight / steps[stepNum]));
+
+ const auto& minSizes = globalMinSizes[stepNum];
+ for (int i = 0; i < featureH; i++) {
+ for (int j = 0; j < featureW; j++) {
+ for (auto minSize : minSizes) {
+ const float sKX = minSize / netInputWidth;
+ const float sKY = minSize / netInputHeight;
+ const float denseCY = (i + 0.5f) * steps[stepNum] / netInputHeight;
+ const float denseCX = (j + 0.5f) * steps[stepNum] / netInputWidth;
+ anchors.push_back(ModelRetinaFacePT::Box{denseCX, denseCY, sKX, sKY});
+ }
+ }
+ }
+ }
+ return anchors;
+}
+
+std::vector<Anchor> ModelRetinaFacePT::getFilteredProposals(const ov::Tensor& boxesTensor,
+ const std::vector<size_t>& indicies,
+ int imgWidth,
+ int imgHeight) {
+ std::vector<Anchor> rects;
+ rects.reserve(indicies.size());
+
+ const auto& shape = boxesTensor.get_shape();
+ const float* boxesPtr = boxesTensor.data<float>();
+
+ if (shape[1] != priors.size()) {
+ throw std::logic_error("rawBoxes size is not equal to priors size");
+ }
+
+ for (auto i : indicies) {
+ const auto pRawBox = reinterpret_cast<const Box*>(boxesPtr + i * shape[2]);
+ const auto& prior = priors[i];
+ const float cX = priors[i].cX + pRawBox->cX * variance[0] * prior.width;
+ const float cY = priors[i].cY + pRawBox->cY * variance[0] * prior.height;
+ const float width = prior.width * exp(pRawBox->width * variance[1]);
+ const float height = prior.height * exp(pRawBox->height * variance[1]);
+ rects.push_back(Anchor{clamp(cX - width / 2, 0.f, 1.f) * imgWidth,
+ clamp(cY - height / 2, 0.f, 1.f) * imgHeight,
+ clamp(cX + width / 2, 0.f, 1.f) * imgWidth,
+ clamp(cY + height / 2, 0.f, 1.f) * imgHeight});
+ }
+
+ return rects;
+}
+
// Decodes scores/boxes (and landmarks when present), runs NMS, and builds the
// final RetinaFaceDetectionResult in original-image coordinates.
std::unique_ptr<ResultBase> ModelRetinaFacePT::postprocess(InferenceResult& infResult) {
    // (raw_output, scale_x, scale_y, face_prob_threshold, image_size):
    const auto boxesTensor = infResult.outputsData[outputsNames[OUT_BOXES]];
    const auto scoresTensor = infResult.outputsData[outputsNames[OUT_SCORES]];

    // Indices of priors passing the confidence threshold; shared by all
    // getFiltered* helpers below.
    const auto& validIndicies = filterByScore(scoresTensor, confidenceThreshold);
    const auto& scores = getFilteredScores(scoresTensor, validIndicies);

    const auto& internalData = infResult.internalModelData->asRef<InternalImageModelData>();
    // landmarksNum == 0 means no landmarks output was found at load time.
    const auto& landmarks = landmarksNum ? getFilteredLandmarks(infResult.outputsData[outputsNames[OUT_LANDMARKS]],
                                                                validIndicies,
                                                                internalData.inputImgWidth,
                                                                internalData.inputImgHeight)
                                         : std::vector<cv::Point2f>();

    const auto& proposals =
        getFilteredProposals(boxesTensor, validIndicies, internalData.inputImgWidth, internalData.inputImgHeight);

    // With landmarks disabled, NMS includes box boundaries in the area calculation.
    const auto& keptIndicies = nms(proposals, scores, boxIOUThreshold, !landmarksNum);

    // --------------------------- Create detection result objects
    // --------------------------------------------------------
    RetinaFaceDetectionResult* result = new RetinaFaceDetectionResult(infResult.frameId, infResult.metaData);

    result->objects.reserve(keptIndicies.size());
    result->landmarks.reserve(keptIndicies.size() * landmarksNum);
    for (auto i : keptIndicies) {
        DetectedObject desc;
        desc.confidence = scores[i];

        //--- Scaling coordinates (already in image space from getFilteredProposals)
        desc.x = proposals[i].left;
        desc.y = proposals[i].top;
        desc.width = proposals[i].getWidth();
        desc.height = proposals[i].getHeight();

        desc.labelID = 0;  // single-class model: always "Face"
        desc.label = labels[desc.labelID];
        result->objects.push_back(desc);

        //--- Filtering landmarks coordinates
        for (uint32_t l = 0; l < landmarksNum; ++l) {
            result->landmarks.emplace_back(landmarks[i * landmarksNum + l].x, landmarks[i * landmarksNum + l].y);
        }
    }

    return std::unique_ptr<ResultBase>(result);
}
diff --git a/python/openvino/runtime/common/models/src/detection_model_ssd.cpp b/python/openvino/runtime/common/models/src/detection_model_ssd.cpp
new file mode 100644
index 0000000..ef741ee
--- /dev/null
+++ b/python/openvino/runtime/common/models/src/detection_model_ssd.cpp
@@ -0,0 +1,281 @@
+/*
+// Copyright (C) 2020-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "models/detection_model_ssd.h"
+
+#include <algorithm>
+#include <map>
+#include <stdexcept>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include <openvino/openvino.hpp>
+
+#include <utils/common.hpp>
+#include <utils/ocv_common.hpp>
+
+#include "models/internal_model_data.h"
+#include "models/results.h"
+
+struct InputData;
+
// Constructs the SSD detection wrapper; I/O discovery happens later in
// prepareInputsOutputs(), result decoding in postprocess().
ModelSSD::ModelSSD(const std::string& modelFileName,
                   float confidenceThreshold,
                   bool useAutoResize,
                   const std::vector<std::string>& labels,
                   const std::string& layout)
    : DetectionModel(modelFileName, confidenceThreshold, useAutoResize, labels, layout) {}
+
+std::shared_ptr<InternalModelData> ModelSSD::preprocess(const InputData& inputData, ov::InferRequest& request) {
+ if (inputsNames.size() > 1) {
+ const auto& imageInfoTensor = request.get_tensor(inputsNames[1]);
+ const auto info = imageInfoTensor.data<float>();
+ info[0] = static_cast<float>(netInputHeight);
+ info[1] = static_cast<float>(netInputWidth);
+ info[2] = 1;
+ request.set_tensor(inputsNames[1], imageInfoTensor);
+ }
+
+ return DetectionModel::preprocess(inputData, request);
+}
+
+std::unique_ptr<ResultBase> ModelSSD::postprocess(InferenceResult& infResult) {
+ return outputsNames.size() > 1 ? postprocessMultipleOutputs(infResult) : postprocessSingleOutput(infResult);
+}
+
+std::unique_ptr<ResultBase> ModelSSD::postprocessSingleOutput(InferenceResult& infResult) {
+ const ov::Tensor& detectionsTensor = infResult.getFirstOutputTensor();
+ size_t detectionsNum = detectionsTensor.get_shape()[detectionsNumId];
+ const float* detections = detectionsTensor.data<float>();
+
+ DetectionResult* result = new DetectionResult(infResult.frameId, infResult.metaData);
+ auto retVal = std::unique_ptr<ResultBase>(result);
+
+ const auto& internalData = infResult.internalModelData->asRef<InternalImageModelData>();
+
+ for (size_t i = 0; i < detectionsNum; i++) {
+ float image_id = detections[i * objectSize + 0];
+ if (image_id < 0) {
+ break;
+ }
+
+ float confidence = detections[i * objectSize + 2];
+
+ /** Filtering out objects with confidence < confidence_threshold probability **/
+ if (confidence > confidenceThreshold) {
+ DetectedObject desc;
+
+ desc.confidence = confidence;
+ desc.labelID = static_cast<int>(detections[i * objectSize + 1]);
+ desc.label = getLabelName(desc.labelID);
+
+ desc.x = clamp(detections[i * objectSize + 3] * internalData.inputImgWidth,
+ 0.f,
+ static_cast<float>(internalData.inputImgWidth));
+ desc.y = clamp(detections[i * objectSize + 4] * internalData.inputImgHeight,
+ 0.f,
+ static_cast<float>(internalData.inputImgHeight));
+ desc.width = clamp(detections[i * objectSize + 5] * internalData.inputImgWidth,
+ 0.f,
+ static_cast<float>(internalData.inputImgWidth)) -
+ desc.x;
+ desc.height = clamp(detections[i * objectSize + 6] * internalData.inputImgHeight,
+ 0.f,
+ static_cast<float>(internalData.inputImgHeight)) -
+ desc.y;
+
+ result->objects.push_back(desc);
+ }
+ }
+
+ return retVal;
+}
+
// Decodes the multi-output SSD format: a boxes tensor, a labels tensor, and an
// optional separate scores tensor (when absent, confidence is the 5th box field).
std::unique_ptr<ResultBase> ModelSSD::postprocessMultipleOutputs(InferenceResult& infResult) {
    // outputsNames is sorted alphabetically: [0]=boxes, [1]=labels, [2]=scores.
    const float* boxes = infResult.outputsData[outputsNames[0]].data<float>();
    size_t detectionsNum = infResult.outputsData[outputsNames[0]].get_shape()[detectionsNumId];
    const float* labels = infResult.outputsData[outputsNames[1]].data<float>();
    const float* scores = outputsNames.size() > 2 ? infResult.outputsData[outputsNames[2]].data<float>() : nullptr;

    DetectionResult* result = new DetectionResult(infResult.frameId, infResult.metaData);
    auto retVal = std::unique_ptr<ResultBase>(result);

    const auto& internalData = infResult.internalModelData->asRef<InternalImageModelData>();

    // In models where scores are stored in a separate output, coordinates are normalized to [0,1];
    // in other multiple-output models coordinates are normalized to [0,netInputWidth] and [0,netInputHeight]
    float widthScale = static_cast<float>(internalData.inputImgWidth) / (scores ? 1 : netInputWidth);
    float heightScale = static_cast<float>(internalData.inputImgHeight) / (scores ? 1 : netInputHeight);

    for (size_t i = 0; i < detectionsNum; i++) {
        float confidence = scores ? scores[i] : boxes[i * objectSize + 4];

        /** Filtering out objects with confidence < confidence_threshold probability **/
        if (confidence > confidenceThreshold) {
            DetectedObject desc;

            desc.confidence = confidence;
            desc.labelID = static_cast<int>(labels[i]);
            desc.label = getLabelName(desc.labelID);

            // Boxes are [xmin, ymin, xmax, ymax]; convert to x/y/width/height with clamping.
            desc.x = clamp(boxes[i * objectSize] * widthScale, 0.f, static_cast<float>(internalData.inputImgWidth));
            desc.y =
                clamp(boxes[i * objectSize + 1] * heightScale, 0.f, static_cast<float>(internalData.inputImgHeight));
            desc.width =
                clamp(boxes[i * objectSize + 2] * widthScale, 0.f, static_cast<float>(internalData.inputImgWidth)) -
                desc.x;
            desc.height =
                clamp(boxes[i * objectSize + 3] * heightScale, 0.f, static_cast<float>(internalData.inputImgHeight)) -
                desc.y;

            result->objects.push_back(desc);
        }
    }

    return retVal;
}
+
// Configures inputs (4D image input, optional 2D image-info input) and then
// delegates output setup to prepareSingleOutput()/prepareMultipleOutputs().
void ModelSSD::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) {
    // --------------------------- Configure input & output -------------------------------------------------
    // --------------------------- Prepare input ------------------------------------------------------
    ov::preprocess::PrePostProcessor ppp(model);
    for (const auto& input : model->inputs()) {
        auto inputTensorName = input.get_any_name();
        const ov::Shape& shape = input.get_shape();
        ov::Layout inputLayout = getInputLayout(input);

        if (shape.size() == 4) {  // 1st input contains images
            if (inputsNames.empty()) {
                inputsNames.push_back(inputTensorName);
            } else {
                inputsNames[0] = inputTensorName;  // image input always occupies slot 0
            }

            inputTransform.setPrecision(ppp, inputTensorName);
            ppp.input(inputTensorName).tensor().set_layout({"NHWC"});

            if (useAutoResize) {
                // Let the plugin accept arbitrary spatial sizes and resize on device.
                ppp.input(inputTensorName).tensor().set_spatial_dynamic_shape();

                ppp.input(inputTensorName)
                    .preprocess()
                    .convert_element_type(ov::element::f32)
                    .resize(ov::preprocess::ResizeAlgorithm::RESIZE_LINEAR);
            }

            ppp.input(inputTensorName).model().set_layout(inputLayout);

            netInputWidth = shape[ov::layout::width_idx(inputLayout)];
            netInputHeight = shape[ov::layout::height_idx(inputLayout)];
        } else if (shape.size() == 2) {  // 2nd input contains image info
            inputsNames.resize(2);
            inputsNames[1] = inputTensorName;
            ppp.input(inputTensorName).tensor().set_element_type(ov::element::f32);
        } else {
            throw std::logic_error("Unsupported " + std::to_string(input.get_shape().size()) +
                                   "D "
                                   "input layer '" +
                                   input.get_any_name() +
                                   "'. "
                                   "Only 2D and 4D input layers are supported");
        }
    }
    model = ppp.build();

    // --------------------------- Prepare output -----------------------------------------------------
    if (model->outputs().size() == 1) {
        prepareSingleOutput(model);
    } else {
        prepareMultipleOutputs(model);
    }
}
+
+void ModelSSD::prepareSingleOutput(std::shared_ptr<ov::Model>& model) {
+ const auto& output = model->output();
+ outputsNames.push_back(output.get_any_name());
+
+ const ov::Shape& shape = output.get_shape();
+ const ov::Layout& layout("NCHW");
+ if (shape.size() != 4) {
+ throw std::logic_error("SSD single output must have 4 dimensions, but had " + std::to_string(shape.size()));
+ }
+ detectionsNumId = ov::layout::height_idx(layout);
+ objectSize = shape[ov::layout::width_idx(layout)];
+ if (objectSize != 7) {
+ throw std::logic_error("SSD single output must have 7 as a last dimension, but had " +
+ std::to_string(objectSize));
+ }
+ ov::preprocess::PrePostProcessor ppp(model);
+ ppp.output().tensor().set_element_type(ov::element::f32).set_layout(layout);
+ model = ppp.build();
+}
+
// Resolves the boxes/labels(/scores) output names and the boxes layout.
// After the sort below, outputsNames is in lexicographic order
// "boxes" < "labels" < "scores", which postprocessMultipleOutputs relies on.
void ModelSSD::prepareMultipleOutputs(std::shared_ptr<ov::Model>& model) {
    const ov::OutputVector& outputs = model->outputs();
    for (auto& output : outputs) {
        const auto& tensorNames = output.get_names();
        // Take the first tensor name that matches one of the known roles.
        for (const auto& name : tensorNames) {
            if (name.find("boxes") != std::string::npos) {
                outputsNames.push_back(name);
                break;
            } else if (name.find("labels") != std::string::npos) {
                outputsNames.push_back(name);
                break;
            } else if (name.find("scores") != std::string::npos) {
                outputsNames.push_back(name);
                break;
            }
        }
    }
    if (outputsNames.size() != 2 && outputsNames.size() != 3) {
        throw std::logic_error("SSD model wrapper must have 2 or 3 outputs, but had " +
                               std::to_string(outputsNames.size()));
    }
    std::sort(outputsNames.begin(), outputsNames.end());

    ov::preprocess::PrePostProcessor ppp(model);
    const auto& boxesShape = model->output(outputsNames[0]).get_partial_shape().get_max_shape();

    // 2D boxes => [n][5] rows (conf is the 5th field); 3D => [b][n][4] rows
    // with a separate scores output.
    ov::Layout boxesLayout;
    if (boxesShape.size() == 2) {
        boxesLayout = "NC";
        detectionsNumId = ov::layout::batch_idx(boxesLayout);
        objectSize = boxesShape[ov::layout::channels_idx(boxesLayout)];

        if (objectSize != 5) {
            throw std::logic_error("Incorrect 'boxes' output shape, [n][5] shape is required");
        }
    } else if (boxesShape.size() == 3) {
        boxesLayout = "CHW";
        detectionsNumId = ov::layout::height_idx(boxesLayout);
        objectSize = boxesShape[ov::layout::width_idx(boxesLayout)];

        if (objectSize != 4) {
            throw std::logic_error("Incorrect 'boxes' output shape, [b][n][4] shape is required");
        }
    } else {
        throw std::logic_error("Incorrect number of 'boxes' output dimensions, expected 2 or 3, but had " +
                               std::to_string(boxesShape.size()));
    }

    ppp.output(outputsNames[0]).tensor().set_layout(boxesLayout);

    // All resolved outputs are read back as f32 in postprocessing.
    for (const auto& outName : outputsNames) {
        ppp.output(outName).tensor().set_element_type(ov::element::f32);
    }
    model = ppp.build();
}
diff --git a/python/openvino/runtime/common/models/src/detection_model_yolo.cpp b/python/openvino/runtime/common/models/src/detection_model_yolo.cpp
new file mode 100644
index 0000000..2c4fb1d
--- /dev/null
+++ b/python/openvino/runtime/common/models/src/detection_model_yolo.cpp
@@ -0,0 +1,481 @@
+/*
+// Copyright (C) 2020-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "models/detection_model_yolo.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <stdexcept>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include <openvino/openvino.hpp>
+
+#include <utils/common.hpp>
+#include <utils/slog.hpp>
+
+#include "models/internal_model_data.h"
+#include "models/results.h"
+
// Built-in anchor sets (width/height pairs), indexed by YoloVersion.
std::vector<float> defaultAnchors[] = {
    // YOLOv1v2
    {0.57273f, 0.677385f, 1.87446f, 2.06253f, 3.33843f, 5.47434f, 7.88282f, 3.52778f, 9.77052f, 9.16828f},
    // YOLOv3
    {10.0f, 13.0f, 16.0f, 30.0f, 33.0f, 23.0f, 30.0f, 61.0f, 62.0f, 45.0f,
     59.0f, 119.0f, 116.0f, 90.0f, 156.0f, 198.0f, 373.0f, 326.0f},
    // YOLOv4
    {12.0f, 16.0f, 19.0f, 36.0f, 40.0f, 28.0f, 36.0f, 75.0f, 76.0f, 55.0f,
     72.0f, 146.0f, 142.0f, 110.0f, 192.0f, 243.0f, 459.0f, 401.0f},
    // YOLOv4_Tiny
    {10.0f, 14.0f, 23.0f, 27.0f, 37.0f, 58.0f, 81.0f, 82.0f, 135.0f, 169.0f, 344.0f, 319.0f},
    // YOLOF
    {16.0f, 16.0f, 32.0f, 32.0f, 64.0f, 64.0f, 128.0f, 128.0f, 256.0f, 256.0f, 512.0f, 512.0f}};
+
// Built-in per-version output masks (indices into the anchor list), indexed by
// YoloVersion; empty for versions whose regions come from RegionYolo ops.
const std::vector<int64_t> defaultMasks[] = {
    {},                           // YOLOv1v2
    {},                           // YOLOv3
    {0, 1, 2, 3, 4, 5, 6, 7, 8},  // YOLOv4
    {1, 2, 3, 3, 4, 5},           // YOLOv4_Tiny
    {0, 1, 2, 3, 4, 5}};          // YOLOF
+
// Logistic activation: maps any real input into (0, 1).
static inline float sigmoid(float x) {
    const float e = exp(-x);
    return 1.f / (1.f + e);
}
+
// Identity activation: passes the value through unchanged.
static inline float linear(float value) {
    return value;
}
+
// Constructs the YOLO detection wrapper.
// yoloVersion is provisionally YOLO_V3; prepareInputsOutputs() refines it by
// inspecting the model graph and output count. Empty `anchors`/`masks` mean
// "use the built-in defaults for the detected version".
ModelYolo::ModelYolo(const std::string& modelFileName,
                     float confidenceThreshold,
                     bool useAutoResize,
                     bool useAdvancedPostprocessing,
                     float boxIOUThreshold,
                     const std::vector<std::string>& labels,
                     const std::vector<float>& anchors,
                     const std::vector<int64_t>& masks,
                     const std::string& layout)
    : DetectionModel(modelFileName, confidenceThreshold, useAutoResize, labels, layout),
      boxIOUThreshold(boxIOUThreshold),
      useAdvancedPostprocessing(useAdvancedPostprocessing),
      yoloVersion(YOLO_V3),
      presetAnchors(anchors),
      presetMasks(masks) {}
+
+void ModelYolo::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) {
+ // --------------------------- Configure input & output -------------------------------------------------
+ // --------------------------- Prepare input ------------------------------------------------------
+ if (model->inputs().size() != 1) {
+ throw std::logic_error("YOLO model wrapper accepts models that have only 1 input");
+ }
+
+ const auto& input = model->input();
+ const ov::Shape& inputShape = model->input().get_shape();
+ ov::Layout inputLayout = getInputLayout(input);
+
+ if (inputShape[ov::layout::channels_idx(inputLayout)] != 3) {
+ throw std::logic_error("Expected 3-channel input");
+ }
+
+ ov::preprocess::PrePostProcessor ppp(model);
+ ppp.input().tensor().set_element_type(ov::element::u8).set_layout({"NHWC"});
+
+ if (useAutoResize) {
+ ppp.input().tensor().set_spatial_dynamic_shape();
+
+ ppp.input()
+ .preprocess()
+ .convert_element_type(ov::element::f32)
+ .resize(ov::preprocess::ResizeAlgorithm::RESIZE_LINEAR);
+ }
+
+ ppp.input().model().set_layout(inputLayout);
+
+ //--- Reading image input parameters
+ inputsNames.push_back(model->input().get_any_name());
+ netInputWidth = inputShape[ov::layout::width_idx(inputLayout)];
+ netInputHeight = inputShape[ov::layout::height_idx(inputLayout)];
+
+ // --------------------------- Prepare output -----------------------------------------------------
+ const ov::OutputVector& outputs = model->outputs();
+ std::map<std::string, ov::Shape> outShapes;
+ for (auto& out : outputs) {
+ ppp.output(out.get_any_name()).tensor().set_element_type(ov::element::f32);
+ if (out.get_shape().size() == 4) {
+ if (out.get_shape()[ov::layout::height_idx("NCHW")] != out.get_shape()[ov::layout::width_idx("NCHW")] &&
+ out.get_shape()[ov::layout::height_idx("NHWC")] == out.get_shape()[ov::layout::width_idx("NHWC")]) {
+ ppp.output(out.get_any_name()).model().set_layout("NHWC");
+ // outShapes are saved before ppp.build() thus set yoloRegionLayout as it is in model before ppp.build()
+ yoloRegionLayout = "NHWC";
+ }
+ // yolo-v1-tiny-tf out shape is [1, 21125] thus set layout only for 4 dim tensors
+ ppp.output(out.get_any_name()).tensor().set_layout("NCHW");
+ }
+ outputsNames.push_back(out.get_any_name());
+ outShapes[out.get_any_name()] = out.get_shape();
+ }
+ model = ppp.build();
+
+ yoloVersion = YOLO_V3;
+ bool isRegionFound = false;
+ for (const auto& op : model->get_ordered_ops()) {
+ if (std::string("RegionYolo") == op->get_type_name()) {
+ auto regionYolo = std::dynamic_pointer_cast<ov::op::v0::RegionYolo>(op);
+
+ if (regionYolo) {
+ if (!regionYolo->get_mask().size()) {
+ yoloVersion = YOLO_V1V2;
+ }
+
+ const auto& opName = op->get_friendly_name();
+ for (const auto& out : outputs) {
+ if (out.get_node()->get_friendly_name() == opName ||
+ out.get_node()->get_input_node_ptr(0)->get_friendly_name() == opName) {
+ isRegionFound = true;
+ regions.emplace(out.get_any_name(), Region(regionYolo));
+ }
+ }
+ }
+ }
+ }
+
+ if (!isRegionFound) {
+ switch (outputsNames.size()) {
+ case 1:
+ yoloVersion = YOLOF;
+ break;
+ case 2:
+ yoloVersion = YOLO_V4_TINY;
+ break;
+ case 3:
+ yoloVersion = YOLO_V4;
+ break;
+ }
+
+ int num = yoloVersion == YOLOF ? 6 : 3;
+ isObjConf = yoloVersion == YOLOF ? 0 : 1;
+ int i = 0;
+
+ auto chosenMasks = presetMasks.size() ? presetMasks : defaultMasks[yoloVersion];
+ if (chosenMasks.size() != num * outputs.size()) {
+ throw std::runtime_error(std::string("Invalid size of masks array, got ") +
+ std::to_string(presetMasks.size()) + ", should be " +
+ std::to_string(num * outputs.size()));
+ }
+
+ std::sort(outputsNames.begin(),
+ outputsNames.end(),
+ [&outShapes, this](const std::string& x, const std::string& y) {
+ return outShapes[x][ov::layout::height_idx(yoloRegionLayout)] >
+ outShapes[y][ov::layout::height_idx(yoloRegionLayout)];
+ });
+
+ for (const auto& name : outputsNames) {
+ const auto& shape = outShapes[name];
+ if (shape[ov::layout::channels_idx(yoloRegionLayout)] % num != 0) {
+ throw std::logic_error(std::string("Output tensor ") + name + " has wrong channel dimension");
+ }
+ regions.emplace(
+ name,
+ Region(shape[ov::layout::channels_idx(yoloRegionLayout)] / num - 4 - (isObjConf ? 1 : 0),
+ 4,
+ presetAnchors.size() ? presetAnchors : defaultAnchors[yoloVersion],
+ std::vector<int64_t>(chosenMasks.begin() + i * num, chosenMasks.begin() + (i + 1) * num),
+ shape[ov::layout::width_idx(yoloRegionLayout)],
+ shape[ov::layout::height_idx(yoloRegionLayout)]));
+ i++;
+ }
+ } else {
+ // Currently externally set anchors and masks are supported only for YoloV4
+ if (presetAnchors.size() || presetMasks.size()) {
+ slog::warn << "Preset anchors and mask can be set for YoloV4 model only. "
+ "This model is not YoloV4, so these options will be ignored."
+ << slog::endl;
+ }
+ }
+}
+
+// Decodes every raw output tensor into candidate boxes, then removes duplicates.
+// Advanced mode drops a box if any other box with the SAME label and higher
+// confidence overlaps it by >= boxIOUThreshold; classic mode sorts by confidence
+// and zeroes out lower-confidence boxes overlapping a kept one (class-agnostic).
+// The returned unique_ptr owns the freshly allocated DetectionResult.
+std::unique_ptr<ResultBase> ModelYolo::postprocess(InferenceResult& infResult) {
+    DetectionResult* result = new DetectionResult(infResult.frameId, infResult.metaData);
+    std::vector<DetectedObject> objects;
+
+    // Parsing outputs
+    // Original image dimensions are needed to scale boxes back from network resolution.
+    const auto& internalData = infResult.internalModelData->asRef<InternalImageModelData>();
+
+    for (auto& output : infResult.outputsData) {
+        this->parseYOLOOutput(output.first,
+                              output.second,
+                              netInputHeight,
+                              netInputWidth,
+                              internalData.inputImgHeight,
+                              internalData.inputImgWidth,
+                              objects);
+    }
+
+    if (useAdvancedPostprocessing) {
+        // Advanced postprocessing
+        // Checking IOU threshold conformance
+        // For every i-th object we're finding all objects it intersects with, and comparing confidence
+        // If i-th object has greater confidence than all others, we include it into result
+        for (const auto& obj1 : objects) {
+            bool isGoodResult = true;
+            for (const auto& obj2 : objects) {
+                if (obj1.labelID == obj2.labelID && obj1.confidence < obj2.confidence &&
+                    intersectionOverUnion(obj1, obj2) >= boxIOUThreshold) {  // if obj1 is the same as obj2, condition
+                                                                            // expression will evaluate to false anyway
+                    isGoodResult = false;
+                    break;
+                }
+            }
+            if (isGoodResult) {
+                result->objects.push_back(obj1);
+            }
+        }
+    } else {
+        // Classic postprocessing
+        // Suppressed boxes are marked by setting confidence to 0 rather than erased,
+        // so indices stay stable while iterating.
+        std::sort(objects.begin(), objects.end(), [](const DetectedObject& x, const DetectedObject& y) {
+            return x.confidence > y.confidence;
+        });
+        for (size_t i = 0; i < objects.size(); ++i) {
+            if (objects[i].confidence == 0)
+                continue;
+            for (size_t j = i + 1; j < objects.size(); ++j)
+                if (intersectionOverUnion(objects[i], objects[j]) >= boxIOUThreshold)
+                    objects[j].confidence = 0;
+            result->objects.push_back(objects[i]);
+        }
+    }
+
+    return std::unique_ptr<ResultBase>(result);
+}
+
+// Decodes one YOLO region output tensor into DetectedObject boxes appended to
+// `objects` (the vector is not cleared here).
+// resized_im_* is the network input resolution, original_im_* the source image
+// size used to scale coordinates back. Candidates are kept only if the
+// objectness score (when present) and objectness * class probability both reach
+// confidenceThreshold. Throws std::runtime_error for an unknown output name.
+void ModelYolo::parseYOLOOutput(const std::string& output_name,
+                                const ov::Tensor& tensor,
+                                const unsigned long resized_im_h,
+                                const unsigned long resized_im_w,
+                                const unsigned long original_im_h,
+                                const unsigned long original_im_w,
+                                std::vector<DetectedObject>& objects) {
+    // --------------------------- Extracting layer parameters -------------------------------------
+    auto it = regions.find(output_name);
+    if (it == regions.end()) {
+        throw std::runtime_error(std::string("Can't find output layer with name ") + output_name);
+    }
+    auto& region = it->second;
+
+    // Grid size and the resolution boxes are expressed in depend on the YOLO flavor.
+    int sideW = 0;
+    int sideH = 0;
+    unsigned long scaleH;
+    unsigned long scaleW;
+    switch (yoloVersion) {
+        case YOLO_V1V2:
+            sideH = region.outputHeight;
+            sideW = region.outputWidth;
+            scaleW = region.outputWidth;
+            scaleH = region.outputHeight;
+            break;
+        case YOLO_V3:
+        case YOLO_V4:
+        case YOLO_V4_TINY:
+        case YOLOF:
+            sideH = static_cast<int>(tensor.get_shape()[ov::layout::height_idx("NCHW")]);
+            sideW = static_cast<int>(tensor.get_shape()[ov::layout::width_idx("NCHW")]);
+            scaleW = resized_im_w;
+            scaleH = resized_im_h;
+            break;
+    }
+
+    auto entriesNum = sideW * sideH;
+    const float* outData = tensor.data<float>();
+
+    // V4-family and YOLOF outputs need a sigmoid applied; other versions are used as-is.
+    auto postprocessRawData =
+        (yoloVersion == YOLO_V4 || yoloVersion == YOLO_V4_TINY || yoloVersion == YOLOF) ? sigmoid : linear;
+
+    // --------------------------- Parsing YOLO Region output -------------------------------------
+    // i enumerates grid cells (row-major), n enumerates anchors within a cell.
+    for (int i = 0; i < entriesNum; ++i) {
+        int row = i / sideW;
+        int col = i % sideW;
+        for (int n = 0; n < region.num; ++n) {
+            //--- Getting region data
+            int obj_index = calculateEntryIndex(entriesNum,
+                                                region.coords,
+                                                region.classes + isObjConf,
+                                                n * entriesNum + i,
+                                                region.coords);
+            int box_index =
+                calculateEntryIndex(entriesNum, region.coords, region.classes + isObjConf, n * entriesNum + i, 0);
+            float scale = isObjConf ? postprocessRawData(outData[obj_index]) : 1;
+
+            //--- Preliminary check for confidence threshold conformance
+            if (scale >= confidenceThreshold) {
+                //--- Calculating scaled region's coordinates
+                float x, y;
+                if (yoloVersion == YOLOF) {
+                    // YOLOF predicts offsets scaled by per-anchor size rather than a cell offset.
+                    x = (static_cast<float>(col) / sideW +
+                         outData[box_index + 0 * entriesNum] * region.anchors[2 * n] / scaleW) *
+                        original_im_w;
+                    y = (static_cast<float>(row) / sideH +
+                         outData[box_index + 1 * entriesNum] * region.anchors[2 * n + 1] / scaleH) *
+                        original_im_h;
+                } else {
+                    x = static_cast<float>((col + postprocessRawData(outData[box_index + 0 * entriesNum])) / sideW *
+                                           original_im_w);
+                    y = static_cast<float>((row + postprocessRawData(outData[box_index + 1 * entriesNum])) / sideH *
+                                           original_im_h);
+                }
+                // Width/height are exponentials of the raw prediction scaled by the anchor size.
+                float height = static_cast<float>(std::exp(outData[box_index + 3 * entriesNum]) *
+                                                  region.anchors[2 * n + 1] * original_im_h / scaleH);
+                float width = static_cast<float>(std::exp(outData[box_index + 2 * entriesNum]) * region.anchors[2 * n] *
+                                                 original_im_w / scaleW);
+
+                DetectedObject obj;
+                obj.x = clamp(x - width / 2, 0.f, static_cast<float>(original_im_w));
+                obj.y = clamp(y - height / 2, 0.f, static_cast<float>(original_im_h));
+                obj.width = clamp(width, 0.f, static_cast<float>(original_im_w - obj.x));
+                obj.height = clamp(height, 0.f, static_cast<float>(original_im_h - obj.y));
+
+                // One DetectedObject is emitted per class that passes the threshold.
+                for (size_t j = 0; j < region.classes; ++j) {
+                    int class_index = calculateEntryIndex(entriesNum,
+                                                          region.coords,
+                                                          region.classes + isObjConf,
+                                                          n * entriesNum + i,
+                                                          region.coords + isObjConf + j);
+                    float prob = scale * postprocessRawData(outData[class_index]);
+
+                    //--- Checking confidence threshold conformance and adding region to the list
+                    if (prob >= confidenceThreshold) {
+                        obj.confidence = prob;
+                        obj.labelID = j;
+                        obj.label = getLabelName(obj.labelID);
+                        objects.push_back(obj);
+                    }
+                }
+            }
+        }
+    }
+}
+
+// Maps (anchor, entry, cell) to a flat offset inside a region output tensor
+// laid out as [anchors][lcoords + lclasses entries][totalCells cells].
+// `location` packs anchor and cell as anchor * totalCells + cell.
+int ModelYolo::calculateEntryIndex(int totalCells, int lcoords, size_t lclasses, int location, int entry) {
+    const int anchorIdx = location / totalCells;
+    const int cellIdx = location % totalCells;
+    const int entriesPerAnchor = lcoords + static_cast<int>(lclasses);
+    return (anchorIdx * entriesPerAnchor + entry) * totalCells + cellIdx;
+}
+
+// Computes intersection-over-union of two axis-aligned boxes given as
+// (x, y, width, height). Returns 0 when the boxes do not overlap.
+double ModelYolo::intersectionOverUnion(const DetectedObject& o1, const DetectedObject& o2) {
+    // Overlap extents come out negative when the boxes are disjoint.
+    double overlappingWidth = fmin(o1.x + o1.width, o2.x + o2.width) - fmax(o1.x, o2.x);
+    double overlappingHeight = fmin(o1.y + o1.height, o2.y + o2.height) - fmax(o1.y, o2.y);
+    double intersectionArea =
+        (overlappingWidth < 0 || overlappingHeight < 0) ? 0 : overlappingHeight * overlappingWidth;
+    double unionArea = o1.width * o1.height + o2.width * o2.height - intersectionArea;
+    // Guard against 0/0 -> NaN when both boxes have zero area; a NaN here would
+    // poison the >= threshold comparisons in postprocess().
+    if (unionArea == 0) {
+        return 0;
+    }
+    return intersectionArea / unionArea;
+}
+
+// Builds region parameters from a RegionYolo op found in the model graph.
+// A non-empty mask selects per-output anchors (YoloV3 style); an empty mask
+// means a V1/V2-style region where all anchor pairs belong to one output.
+ModelYolo::Region::Region(const std::shared_ptr<ov::op::v0::RegionYolo>& regionYolo) {
+    coords = regionYolo->get_num_coords();
+    classes = regionYolo->get_num_classes();
+    auto mask = regionYolo->get_mask();
+    num = mask.size();
+
+    // Spatial grid size is taken from the op's input feature map (NCHW assumed:
+    // index 2 = height, index 3 = width).
+    auto shape = regionYolo->get_input_shape(0);
+    outputWidth = shape[3];
+    outputHeight = shape[2];
+
+    if (num) {
+        // Parsing YoloV3 parameters
+        // Pick only the anchor pairs referenced by the mask.
+        anchors.resize(num * 2);
+
+        for (int i = 0; i < num; ++i) {
+            anchors[i * 2] = regionYolo->get_anchors()[mask[i] * 2];
+            anchors[i * 2 + 1] = regionYolo->get_anchors()[mask[i] * 2 + 1];
+        }
+    } else {
+        // Parsing YoloV2 parameters
+        num = regionYolo->get_num_regions();
+        anchors = regionYolo->get_anchors();
+        // Fall back to the built-in V1/V2 anchor set when the op carries none.
+        if (anchors.empty()) {
+            anchors = defaultAnchors[YOLO_V1V2];
+            num = 5;
+        }
+    }
+}
+
+// Builds region parameters explicitly (used when no RegionYolo op exists in the
+// graph, e.g. V4/V4-tiny/YOLOF). A non-empty `masks` selects anchor pairs from
+// `anchors`; an empty `masks` takes all anchor pairs as-is.
+// Throws std::runtime_error when `anchors` is empty or has odd length.
+// NOTE(review): assumes every mask index is < anchors.size() / 2 — confirm
+// callers validate mask contents against the anchor list.
+ModelYolo::Region::Region(size_t classes,
+                          int coords,
+                          const std::vector<float>& anchors,
+                          const std::vector<int64_t>& masks,
+                          size_t outputWidth,
+                          size_t outputHeight)
+    : classes(classes),
+      coords(coords),
+      outputWidth(outputWidth),
+      outputHeight(outputHeight) {
+    num = masks.size();
+
+    if (anchors.size() == 0 || anchors.size() % 2 != 0) {
+        throw std::runtime_error("Explicitly initialized region should have non-empty even-sized regions vector");
+    }
+
+    if (num) {
+        // Keep only the anchor pairs referenced by the masks.
+        this->anchors.resize(num * 2);
+
+        for (int i = 0; i < num; ++i) {
+            this->anchors[i * 2] = anchors[masks[i] * 2];
+            this->anchors[i * 2 + 1] = anchors[masks[i] * 2 + 1];
+        }
+    } else {
+        this->anchors = anchors;
+        num = anchors.size() / 2;
+    }
+}
diff --git a/python/openvino/runtime/common/models/src/detection_model_yolov3_onnx.cpp b/python/openvino/runtime/common/models/src/detection_model_yolov3_onnx.cpp
new file mode 100644
index 0000000..132eb9e
--- /dev/null
+++ b/python/openvino/runtime/common/models/src/detection_model_yolov3_onnx.cpp
@@ -0,0 +1,188 @@
+/*
+// Copyright (C) 2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "models/detection_model_yolov3_onnx.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <stdexcept>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include <openvino/openvino.hpp>
+
+#include <utils/common.hpp>
+#include <utils/slog.hpp>
+
+#include "models/input_data.h"
+#include "models/internal_model_data.h"
+#include "models/results.h"
+#include "utils/image_utils.h"
+
+// Constructs the YoloV3-ONNX wrapper; frames are letterboxed (aspect ratio
+// preserved, padded) and resized with cubic interpolation. Auto-resize is
+// disabled because this model takes an explicit image-info input.
+ModelYoloV3ONNX::ModelYoloV3ONNX(const std::string& modelFileName,
+                                 float confidenceThreshold,
+                                 const std::vector<std::string>& labels,
+                                 const std::string& layout)
+    : DetectionModel(modelFileName, confidenceThreshold, false, labels, layout) {
+    resizeMode = RESIZE_KEEP_ASPECT_LETTERBOX;
+    interpolationMode = cv::INTER_CUBIC;
+}
+
+
+// Identifies the image input (4D) and the image-info input (2D) by rank, wires
+// tensor element types through PrePostProcessor, and records the three output
+// names (boxes / scores / indices) by their shapes.
+// Throws std::logic_error when the model does not match the expected topology.
+void ModelYoloV3ONNX::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) {
+    // --------------------------- Configure input & output -------------------------------------------------
+    // --------------------------- Prepare inputs ------------------------------------------------------
+    const ov::OutputVector& inputs = model->inputs();
+    if (inputs.size() != 2) {
+        throw std::logic_error("YoloV3ONNX model wrapper expects models that have 2 inputs");
+    }
+
+    ov::preprocess::PrePostProcessor ppp(model);
+    // Inputs are assigned by position below (image -> [0], info -> [1]), so the
+    // vector must actually hold two elements. The original code used reserve(),
+    // which leaves size() == 0 and makes inputsNames[0]/[1] undefined behavior.
+    inputsNames.resize(inputs.size());
+    for (auto& input : inputs) {
+        const ov::Shape& currentShape = input.get_shape();
+        std::string currentName = input.get_any_name();
+        const ov::Layout& currentLayout = getInputLayout(input);
+
+        if (currentShape.size() == 4) {
+            // 4D input: the image itself, accepted as u8 NHWC from the application.
+            if (currentShape[ov::layout::channels_idx(currentLayout)] != 3) {
+                throw std::logic_error("Expected 4D image input with 3 channels");
+            }
+            inputsNames[0] = currentName;
+            netInputWidth = currentShape[ov::layout::width_idx(currentLayout)];
+            netInputHeight = currentShape[ov::layout::height_idx(currentLayout)];
+            ppp.input(currentName).tensor().set_element_type(ov::element::u8).set_layout({"NHWC"});
+        } else if (currentShape.size() == 2) {
+            // 2D input: image info (original height and width), fed as i32 in preprocess().
+            if (currentShape[ov::layout::channels_idx(currentLayout)] != 2) {
+                throw std::logic_error("Expected 2D image info input with 2 channels");
+            }
+            inputsNames[1] = currentName;
+            ppp.input(currentName).tensor().set_element_type(ov::element::i32);
+        }
+        ppp.input(currentName).model().set_layout(currentLayout);
+    }
+
+    // --------------------------- Prepare outputs -----------------------------------------------------
+    const ov::OutputVector& outputs = model->outputs();
+    if (outputs.size() != 3) {
+        throw std::logic_error("YoloV3ONNX model wrapper expects models that have 3 outputs");
+    }
+
+    // Outputs are classified by shape: trailing dim 3 -> selected indices,
+    // dim[2] == 4 -> box coordinates, dim[1] == numberOfClasses -> class scores.
+    for (auto& output : outputs) {
+        const ov::Shape& currentShape = output.get_partial_shape().get_max_shape();
+        std::string currentName = output.get_any_name();
+        if (currentShape.back() == 3) {
+            indicesOutputName = currentName;
+            ppp.output(currentName).tensor().set_element_type(ov::element::i32);
+        } else if (currentShape[2] == 4) {
+            boxesOutputName = currentName;
+            ppp.output(currentName).tensor().set_element_type(ov::element::f32);
+        } else if (currentShape[1] == numberOfClasses) {
+            scoresOutputName = currentName;
+            ppp.output(currentName).tensor().set_element_type(ov::element::f32);
+        } else {
+            throw std::logic_error("Expected shapes [:,:,4], [:,"
+                                   + std::to_string(numberOfClasses) + ",:] and [:,3] for outputs");
+        }
+        outputsNames.push_back(currentName);
+    }
+    model = ppp.build();
+}
+
+// Feeds the auxiliary image-info input (original rows, cols as i32) before
+// delegating the image input itself to the base ImageModel::preprocess().
+std::shared_ptr<InternalModelData> ModelYoloV3ONNX::preprocess(const InputData& inputData,
+                                                               ov::InferRequest& request) {
+    const auto& origImg = inputData.asRef<ImageInputData>().inputImage;
+
+    cv::Mat info(cv::Size(1, 2), CV_32SC1);
+    info.at<int>(0, 0) = origImg.rows;
+    info.at<int>(0, 1) = origImg.cols;
+    // SharedTensorAllocator presumably exposes the cv::Mat buffer to the tensor
+    // without a copy and keeps it alive — confirm against shared_tensor_allocator.hpp.
+    auto allocator = std::make_shared<SharedTensorAllocator>(info);
+    ov::Tensor infoInput = ov::Tensor(ov::element::i32, ov::Shape({1, 2}), ov::Allocator(allocator));
+
+    request.set_tensor(inputsNames[1], infoInput);
+
+    return ImageModel::preprocess(inputData, request);
+}
+
+namespace {
+// Looks up the score of box `boxInd` for class `classInd` in a scores tensor
+// whose last dimension (shape[2]) is the per-class box count.
+float getScore(const ov::Tensor& scoresTensor, size_t classInd, size_t boxInd) {
+    const auto& tensorShape = scoresTensor.get_shape();
+    const int boxesPerClass = tensorShape[2];
+    const float* scores = scoresTensor.data<float>();
+    return scores[classInd * boxesPerClass + boxInd];
+}
+}  // namespace
+
+// Converts the three YoloV3-ONNX outputs into detections. `indices` holds
+// [batch, class, box] triples already selected by the model; `boxes` stores
+// coordinates in [y1, x1, y2, x2] order, hence the x/y swap below.
+// The returned unique_ptr owns the freshly allocated DetectionResult.
+std::unique_ptr<ResultBase> ModelYoloV3ONNX::postprocess(InferenceResult& infResult) {
+    // Get info about input image
+    const auto imgWidth = infResult.internalModelData->asRef<InternalImageModelData>().inputImgWidth;
+    const auto imgHeight = infResult.internalModelData->asRef<InternalImageModelData>().inputImgHeight;
+
+    // Get outputs tensors
+    const ov::Tensor& boxes = infResult.outputsData[boxesOutputName];
+    const float* boxesPtr = boxes.data<float>();
+
+    const ov::Tensor& scores = infResult.outputsData[scoresOutputName];
+    const ov::Tensor& indices = infResult.outputsData[indicesOutputName];
+
+    const int* indicesData = indices.data<int>();
+    const auto indicesShape = indices.get_shape();
+    const auto boxShape = boxes.get_shape();
+
+    // Generate detection results
+    // The indices tensor may or may not carry a leading batch dimension.
+    DetectionResult* result = new DetectionResult(infResult.frameId, infResult.metaData);
+    size_t numberOfBoxes = indicesShape.size() == 3 ? indicesShape[1] : indicesShape[0];
+    int indicesStride = indicesShape.size() == 3 ? indicesShape[2] : indicesShape[1];
+
+    for (size_t i = 0; i < numberOfBoxes; ++i) {
+        int batchInd = indicesData[i * indicesStride];
+        int classInd = indicesData[i * indicesStride + 1];
+        int boxInd = indicesData[i * indicesStride + 2];
+
+        // A batch index of -1 is treated as the end-of-list sentinel.
+        if (batchInd == -1) {
+            break;
+        }
+
+        float score = getScore(scores, classInd, boxInd);
+
+        if (score > confidenceThreshold) {
+            DetectedObject obj;
+            size_t startPos = boxShape[2] * boxInd;
+
+            // Boxes are [y1, x1, y2, x2]; convert to x/y/width/height.
+            auto x = boxesPtr[startPos + 1];
+            auto y = boxesPtr[startPos];
+            auto width = boxesPtr[startPos + 3] - x;
+            auto height = boxesPtr[startPos + 2] - y;
+
+            // Create new detected box
+            obj.x = clamp(x, 0.f, static_cast<float>(imgWidth));
+            obj.y = clamp(y, 0.f, static_cast<float>(imgHeight));
+            obj.height = clamp(height, 0.f, static_cast<float>(imgHeight));
+            obj.width = clamp(width, 0.f, static_cast<float>(imgWidth));
+            obj.confidence = score;
+            obj.labelID = classInd;
+            obj.label = getLabelName(classInd);
+
+            result->objects.push_back(obj);
+
+        }
+    }
+
+    return std::unique_ptr<ResultBase>(result);
+}
diff --git a/python/openvino/runtime/common/models/src/detection_model_yolox.cpp b/python/openvino/runtime/common/models/src/detection_model_yolox.cpp
new file mode 100644
index 0000000..1e434ff
--- /dev/null
+++ b/python/openvino/runtime/common/models/src/detection_model_yolox.cpp
@@ -0,0 +1,194 @@
+/*
+// Copyright (C) 2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "models/detection_model_yolox.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <stdexcept>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include <openvino/openvino.hpp>
+
+#include <utils/common.hpp>
+#include <utils/slog.hpp>
+
+#include "models/input_data.h"
+#include "models/internal_model_data.h"
+#include "models/results.h"
+#include "utils/image_utils.h"
+#include "utils/nms.hpp"
+
+// Constructs the YOLOX wrapper. Auto-resize is disabled; preprocess() performs
+// an aspect-ratio-preserving resize with explicit padding instead.
+ModelYoloX::ModelYoloX(const std::string& modelFileName,
+                       float confidenceThreshold,
+                       float boxIOUThreshold,
+                       const std::vector<std::string>& labels,
+                       const std::string& layout)
+    : DetectionModel(modelFileName, confidenceThreshold, false, labels, layout),
+      boxIOUThreshold(boxIOUThreshold) {
+    resizeMode = RESIZE_KEEP_ASPECT;
+}
+
+// Validates the single image input (must be 4D with 3 channels), configures u8
+// NHWC input tensors, precomputes the stride grids, and checks that the single
+// output is a 3D tensor. Throws std::logic_error on any topology mismatch.
+void ModelYoloX::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) {
+    // --------------------------- Configure input & output -------------------------------------------------
+    // --------------------------- Prepare input ------------------------------------------------------
+    const ov::OutputVector& inputs = model->inputs();
+    if (inputs.size() != 1) {
+        throw std::logic_error("YOLOX model wrapper accepts models that have only 1 input");
+    }
+
+    //--- Check image input
+    const auto& input = model->input();
+    const ov::Shape& inputShape = model->input().get_shape();
+    ov::Layout inputLayout = getInputLayout(input);
+
+    // Reject inputs that are not 4D with 3 channels. The original `&&` only
+    // threw when BOTH checks failed, silently accepting e.g. a 4D input with a
+    // channel count other than 3; `||` (short-circuit also protects the channel
+    // index on non-4D shapes) enforces the stated contract.
+    if (inputShape.size() != 4 || inputShape[ov::layout::channels_idx(inputLayout)] != 3) {
+        throw std::logic_error("Expected 4D image input with 3 channels");
+    }
+
+    ov::preprocess::PrePostProcessor ppp(model);
+    ppp.input().tensor().set_element_type(ov::element::u8).set_layout({"NHWC"});
+
+    ppp.input().model().set_layout(inputLayout);
+
+    //--- Reading image input parameters
+    inputsNames.push_back(input.get_any_name());
+    netInputWidth = inputShape[ov::layout::width_idx(inputLayout)];
+    netInputHeight = inputShape[ov::layout::height_idx(inputLayout)];
+    setStridesGrids();
+
+    // --------------------------- Prepare output -----------------------------------------------------
+    if (model->outputs().size() != 1) {
+        throw std::logic_error("YoloX model wrapper expects models that have only 1 output");
+    }
+    const auto& output = model->output();
+    outputsNames.push_back(output.get_any_name());
+    const ov::Shape& shape = output.get_shape();
+
+    if (shape.size() != 3) {
+        throw std::logic_error("YOLOX single output must have 3 dimensions, but had " + std::to_string(shape.size()));
+    }
+    ppp.output().tensor().set_element_type(ov::element::f32);
+
+    model = ppp.build();
+}
+
+// Precomputes, for every output cell at strides 8, 16 and 32, its (x, y) grid
+// coordinate and the stride it belongs to. Cells are appended stride by stride
+// in row-major order, matching the layout postprocess() indexes with box_index.
+void ModelYoloX::setStridesGrids() {
+    const std::vector<size_t> strides = {8, 16, 32};
+
+    for (size_t stride : strides) {
+        const size_t gridHeight = netInputHeight / stride;
+        const size_t gridWidth = netInputWidth / stride;
+        for (size_t y = 0; y < gridHeight; ++y) {
+            for (size_t x = 0; x < gridWidth; ++x) {
+                grids.emplace_back(x, y);
+                expandedStrides.push_back(stride);
+            }
+        }
+    }
+}
+
+// Resizes the frame preserving aspect ratio (padding with gray value 114) and
+// records the single uniform scale factor so postprocess() can map boxes back
+// to source-image pixels.
+std::shared_ptr<InternalModelData> ModelYoloX::preprocess(const InputData& inputData,
+                                                          ov::InferRequest& request) {
+    const auto& origImg = inputData.asRef<ImageInputData>().inputImage;
+    // One scale for both axes: the tighter of the two fit ratios.
+    double scale = std::min(static_cast<double>(netInputWidth) / origImg.cols,
+                            static_cast<double>(netInputHeight) / origImg.rows);
+
+    cv::Mat resizedImage = resizeImageExt(origImg, netInputWidth, netInputHeight, resizeMode,
+                                          interpolationMode, nullptr, cv::Scalar(114, 114, 114));
+
+    request.set_input_tensor(wrapMat2Tensor(resizedImage));
+    return std::make_shared<InternalScaleData>(origImg.cols, origImg.rows, scale, scale);
+}
+
+// Decodes the single YOLOX output into detections: box centers/sizes are first
+// rewritten in network-input pixels using the precomputed grids and strides
+// (NOTE: the output tensor is mutated in place), then candidates are filtered
+// by objectness * best-class score, rescaled to source pixels, and deduplicated
+// with class-agnostic NMS. The returned unique_ptr owns the DetectionResult.
+std::unique_ptr<ResultBase> ModelYoloX::postprocess(InferenceResult& infResult) {
+    // Get metadata about input image shape and scale
+    const auto& scale = infResult.internalModelData->asRef<InternalScaleData>();
+
+    // Get output tensor
+    const ov::Tensor& output = infResult.outputsData[outputsNames[0]];
+    const auto& outputShape = output.get_shape();
+    float* outputPtr = output.data<float>();
+
+    // Generate detection results
+    DetectionResult* result = new DetectionResult(infResult.frameId, infResult.metaData);
+
+    // Update coordinates according to strides
+    // Each row is [cx, cy, w, h, objectness, class scores...]; cx/cy are offsets
+    // within a grid cell and w/h are log-scale, hence the exp().
+    for (size_t box_index = 0; box_index < expandedStrides.size(); ++box_index) {
+        size_t startPos = outputShape[2] * box_index;
+        outputPtr[startPos] = (outputPtr[startPos] + grids[box_index].first) * expandedStrides[box_index];
+        outputPtr[startPos + 1] = (outputPtr[startPos + 1] + grids[box_index].second) * expandedStrides[box_index];
+        outputPtr[startPos + 2] = std::exp(outputPtr[startPos + 2]) * expandedStrides[box_index];
+        outputPtr[startPos + 3] = std::exp(outputPtr[startPos + 3]) * expandedStrides[box_index];
+    }
+
+    // Filter predictions
+    std::vector<Anchor> validBoxes;
+    std::vector<float> scores;
+    std::vector<size_t> classes;
+    for (size_t box_index = 0; box_index < expandedStrides.size(); ++box_index) {
+        size_t startPos = outputShape[2] * box_index;
+        float score = outputPtr[startPos + 4];
+        // Early reject on objectness alone before scanning class scores.
+        if (score < confidenceThreshold)
+            continue;
+        float maxClassScore = -1;
+        size_t mainClass = 0;
+        for (size_t class_index = 0; class_index < numberOfClasses; ++class_index) {
+            if (outputPtr[startPos + 5 + class_index] > maxClassScore) {
+                maxClassScore = outputPtr[startPos + 5 + class_index];
+                mainClass = class_index;
+            }
+        }
+
+        // Filter by score
+        score *= maxClassScore;
+        if (score < confidenceThreshold)
+            continue;
+
+        // Add successful boxes
+        // Convert center/size to corners, then divide by the preprocess scale to
+        // get source-image coordinates.
+        scores.push_back(score);
+        classes.push_back(mainClass);
+        Anchor trueBox = {outputPtr[startPos + 0] - outputPtr[startPos + 2] / 2, outputPtr[startPos + 1] - outputPtr[startPos + 3] / 2,
+                          outputPtr[startPos + 0] + outputPtr[startPos + 2] / 2, outputPtr[startPos + 1] + outputPtr[startPos + 3] / 2};
+        validBoxes.push_back(Anchor({trueBox.left / scale.scaleX, trueBox.top / scale.scaleY,
+                                     trueBox.right / scale.scaleX, trueBox.bottom / scale.scaleY}));
+    }
+
+    // NMS for valid boxes
+    std::vector<int> keep = nms(validBoxes, scores, boxIOUThreshold, true);
+    for (auto& index: keep) {
+        // Create new detected box
+        DetectedObject obj;
+        obj.x = clamp(validBoxes[index].left, 0.f, static_cast<float>(scale.inputImgWidth));
+        obj.y = clamp(validBoxes[index].top, 0.f, static_cast<float>(scale.inputImgHeight));
+        obj.height = clamp(validBoxes[index].bottom - validBoxes[index].top, 0.f, static_cast<float>(scale.inputImgHeight));
+        obj.width = clamp(validBoxes[index].right - validBoxes[index].left, 0.f, static_cast<float>(scale.inputImgWidth));
+        obj.confidence = scores[index];
+        obj.labelID = classes[index];
+        obj.label = getLabelName(classes[index]);
+        result->objects.push_back(obj);
+    }
+
+    return std::unique_ptr<ResultBase>(result);
+}
diff --git a/python/openvino/runtime/common/models/src/hpe_model_associative_embedding.cpp b/python/openvino/runtime/common/models/src/hpe_model_associative_embedding.cpp
new file mode 100644
index 0000000..33a3604
--- /dev/null
+++ b/python/openvino/runtime/common/models/src/hpe_model_associative_embedding.cpp
@@ -0,0 +1,264 @@
+/*
+// Copyright (C) 2021-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "models/hpe_model_associative_embedding.h"
+
+#include <stddef.h>
+
+#include <algorithm>
+#include <cmath>
+#include <map>
+#include <stdexcept>
+#include <string>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include <openvino/openvino.hpp>
+
+#include <utils/image_utils.h>
+#include <utils/ocv_common.hpp>
+#include <utils/slog.hpp>
+
+#include "models/associative_embedding_decoder.h"
+#include "models/input_data.h"
+#include "models/internal_model_data.h"
+#include "models/results.h"
+
+// Decoder constants shared by all HPE associative-embedding instances.
+const cv::Vec3f HpeAssociativeEmbedding::meanPixel = cv::Vec3f::all(128);
+// Minimum heatmap value passed to findPeaks() for a keypoint candidate.
+const float HpeAssociativeEmbedding::detectionThreshold = 0.1f;
+// Threshold passed to matchByTag() when grouping keypoints into poses.
+const float HpeAssociativeEmbedding::tagThreshold = 1.0f;
+
+// Constructs the associative-embedding HPE wrapper and applies the requested
+// resize mode with cubic interpolation.
+HpeAssociativeEmbedding::HpeAssociativeEmbedding(const std::string& modelFileName,
+                                                 double aspectRatio,
+                                                 int targetSize,
+                                                 float confidenceThreshold,
+                                                 const std::string& layout,
+                                                 float delta,
+                                                 RESIZE_MODE resizeMode)
+    : ImageModel(modelFileName, false, layout),
+      aspectRatio(aspectRatio),
+      targetSize(targetSize),
+      confidenceThreshold(confidenceThreshold),
+      delta(delta) {
+    // The parameter shadows the inherited member: the original statement
+    // `resizeMode = resizeMode;` assigned the parameter to itself, so the
+    // caller-requested resize mode never reached the member. `this->`
+    // disambiguates and stores it.
+    this->resizeMode = resizeMode;
+    interpolationMode = cv::INTER_CUBIC;
+}
+
+// Validates the single 3-channel 4D input, forces f32 outputs, resolves the
+// embeddings / heatmaps / (optional) nms_heatmaps output names by prefix, and
+// finally reshapes the input via changeInputSize().
+// Throws std::logic_error on topology mismatch.
+void HpeAssociativeEmbedding::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) {
+    // --------------------------- Configure input & output -------------------------------------------------
+    // --------------------------- Prepare input Tensors ------------------------------------------------------
+    if (model->inputs().size() != 1) {
+        throw std::logic_error("HPE AE model wrapper supports topologies with only 1 input.");
+    }
+    inputsNames.push_back(model->input().get_any_name());
+
+    const ov::Shape& inputShape = model->input().get_shape();
+    const ov::Layout& inputLayout = getInputLayout(model->input());
+
+    if (inputShape.size() != 4 || inputShape[ov::layout::batch_idx(inputLayout)] != 1 ||
+        inputShape[ov::layout::channels_idx(inputLayout)] != 3) {
+        throw std::logic_error("3-channel 4-dimensional model's input is expected");
+    }
+
+    ov::preprocess::PrePostProcessor ppp(model);
+    ppp.input().tensor().set_element_type(ov::element::u8).set_layout({"NHWC"});
+
+    ppp.input().model().set_layout(inputLayout);
+
+    // --------------------------- Prepare output Tensors -----------------------------------------------------
+    const ov::OutputVector& outputs = model->outputs();
+    if (outputs.size() != 2 && outputs.size() != 3) {
+        throw std::logic_error("HPE AE model model wrapper supports topologies only with 2 or 3 outputs");
+    }
+
+    for (const auto& output : model->outputs()) {
+        const auto& outTensorName = output.get_any_name();
+        ppp.output(outTensorName).tensor().set_element_type(ov::element::f32);
+
+        // An output may expose several names; record them all so the prefix
+        // lookup below can match any of them.
+        for (const auto& name : output.get_names()) {
+            outputsNames.push_back(name);
+        }
+
+        const ov::Shape& outputShape = output.get_shape();
+        if (outputShape.size() != 4 && outputShape.size() != 5) {
+            throw std::logic_error("output tensors are expected to be 4-dimensional or 5-dimensional");
+        }
+        if (outputShape[ov::layout::batch_idx("NC...")] != 1 || outputShape[ov::layout::channels_idx("NC...")] != 17) {
+            throw std::logic_error("output tensors are expected to have 1 batch size and 17 channels");
+        }
+    }
+    model = ppp.build();
+
+    embeddingsTensorName = findTensorByName("embeddings", outputsNames);
+    heatmapsTensorName = findTensorByName("heatmaps", outputsNames);
+    // nms_heatmaps is optional: fall back to the plain heatmaps when absent.
+    try {
+        nmsHeatmapsTensorName = findTensorByName("nms_heatmaps", outputsNames);
+    } catch (const std::runtime_error&) { nmsHeatmapsTensorName = heatmapsTensorName; }
+
+    changeInputSize(model);
+}
+
+// Reshapes the model input so the target size matches the requested aspect
+// ratio, rounding both dimensions up to a multiple of `stride`, and records the
+// final size in inputLayerSize. targetSize == 0 defaults to the smaller of the
+// model's current height/width.
+// NOTE(review): this reads the layout via ov::layout::get_layout() while the
+// rest of the class uses getInputLayout(); confirm both agree for models that
+// carry no explicit layout information.
+void HpeAssociativeEmbedding::changeInputSize(std::shared_ptr<ov::Model>& model) {
+    ov::Shape inputShape = model->input().get_shape();
+    const ov::Layout& layout = ov::layout::get_layout(model->input());
+    const auto batchId = ov::layout::batch_idx(layout);
+    const auto heightId = ov::layout::height_idx(layout);
+    const auto widthId = ov::layout::width_idx(layout);
+
+    if (!targetSize) {
+        targetSize = static_cast<int>(std::min(inputShape[heightId], inputShape[widthId]));
+    }
+    int inputHeight = aspectRatio >= 1.0 ? targetSize : static_cast<int>(std::round(targetSize / aspectRatio));
+    int inputWidth = aspectRatio >= 1.0 ? static_cast<int>(std::round(targetSize * aspectRatio)) : targetSize;
+    // Round up to the network stride so the feature maps divide evenly.
+    int height = static_cast<int>((inputHeight + stride - 1) / stride) * stride;
+    int width = static_cast<int>((inputWidth + stride - 1) / stride) * stride;
+    inputShape[batchId] = 1;
+    inputShape[heightId] = height;
+    inputShape[widthId] = width;
+    inputLayerSize = cv::Size(width, height);
+
+    model->reshape(inputShape);
+}
+
+// Resizes/pads the frame to the reshaped network input and records per-axis
+// scale factors (original size / resized ROI size) for mapping keypoints back.
+std::shared_ptr<InternalModelData> HpeAssociativeEmbedding::preprocess(const InputData& inputData,
+                                                                       ov::InferRequest& request) {
+    auto& image = inputData.asRef<ImageInputData>().inputImage;
+    cv::Rect roi;
+    auto paddedImage = resizeImageExt(image, inputLayerSize.width, inputLayerSize.height, resizeMode, interpolationMode, &roi);
+    // More than one stride of padding on either axis means the configured
+    // aspect ratio is far from the actual image's.
+    if (inputLayerSize.height - stride >= roi.height || inputLayerSize.width - stride >= roi.width) {
+        slog::warn << "\tChosen model aspect ratio doesn't match image aspect ratio" << slog::endl;
+    }
+    request.set_input_tensor(wrapMat2Tensor(paddedImage));
+
+    return std::make_shared<InternalScaleData>(paddedImage.cols,
+                                               paddedImage.rows,
+                                               image.size().width / static_cast<float>(roi.width),
+                                               image.size().height / static_cast<float>(roi.height));
+}
+
+// Splits the embedding/heatmap tensors into per-joint planes, decodes poses via
+// extractPoses(), then rescales keypoints from heatmap space back to the
+// original image (letterbox mode additionally compensates the padding offset).
+// Keypoints equal to (-1, -1) mark missing joints and are left untouched.
+// The returned unique_ptr owns the freshly allocated HumanPoseResult.
+std::unique_ptr<ResultBase> HpeAssociativeEmbedding::postprocess(InferenceResult& infResult) {
+    HumanPoseResult* result = new HumanPoseResult(infResult.frameId, infResult.metaData);
+
+    const auto& aembds = infResult.outputsData[embeddingsTensorName];
+    const ov::Shape& aembdsShape = aembds.get_shape();
+    float* const aembdsMapped = aembds.data<float>();
+    std::vector<cv::Mat> aembdsMaps = split(aembdsMapped, aembdsShape);
+
+    const auto& heats = infResult.outputsData[heatmapsTensorName];
+    const ov::Shape& heatMapsShape = heats.get_shape();
+    float* const heatMapsMapped = heats.data<float>();
+    std::vector<cv::Mat> heatMaps = split(heatMapsMapped, heatMapsShape);
+
+    // Use the dedicated nms_heatmaps output when the model provides one;
+    // otherwise peak-finding runs on the plain heatmaps.
+    std::vector<cv::Mat> nmsHeatMaps = heatMaps;
+    if (nmsHeatmapsTensorName != heatmapsTensorName) {
+        const auto& nmsHeats = infResult.outputsData[nmsHeatmapsTensorName];
+        const ov::Shape& nmsHeatMapsShape = nmsHeats.get_shape();
+        float* const nmsHeatMapsMapped = nmsHeats.data<float>();
+        nmsHeatMaps = split(nmsHeatMapsMapped, nmsHeatMapsShape);
+    }
+    std::vector<HumanPose> poses = extractPoses(heatMaps, aembdsMaps, nmsHeatMaps);
+
+    // Rescale poses to the original image
+    const auto& scale = infResult.internalModelData->asRef<InternalScaleData>();
+    // Ratio between network input width and heatmap width.
+    const float outputScale = inputLayerSize.width / static_cast<float>(heatMapsShape[3]);
+    float shiftX = 0.0, shiftY = 0.0;
+    float scaleX = 1.0, scaleY = 1.0;
+
+    if (resizeMode == RESIZE_KEEP_ASPECT_LETTERBOX) {
+        scaleX = scaleY = std::min(scale.scaleX, scale.scaleY);
+        if (aspectRatio >= 1.0)
+            shiftX = static_cast<float>((targetSize * scaleX * aspectRatio - scale.inputImgWidth * scaleX) / 2);
+        else
+            shiftY = static_cast<float>((targetSize * scaleY / aspectRatio - scale.inputImgHeight * scaleY) / 2);
+        scaleX = scaleY *= outputScale;
+    } else {
+        scaleX = scale.scaleX * outputScale;
+        scaleY = scale.scaleY * outputScale;
+    }
+
+    for (auto& pose : poses) {
+        for (auto& keypoint : pose.keypoints) {
+            if (keypoint != cv::Point2f(-1, -1)) {
+                keypoint.x = keypoint.x * scaleX + shiftX;
+                keypoint.y = keypoint.y * scaleY + shiftY;
+            }
+        }
+        result->poses.push_back(pose);
+    }
+
+    return std::unique_ptr<ResultBase>(result);
+}
+
+// Returns the unique output name that starts with `tensorName`.
+// Throws std::runtime_error when no output, or more than one, matches.
+std::string HpeAssociativeEmbedding::findTensorByName(const std::string& tensorName,
+                                                      const std::vector<std::string>& outputsNames) {
+    std::vector<std::string> matches;
+    for (const auto& candidate : outputsNames) {
+        // rfind(prefix, 0) == 0  <=>  candidate starts with tensorName
+        if (candidate.rfind(tensorName, 0) == 0) {
+            matches.push_back(candidate);
+        }
+    }
+    if (matches.empty()) {
+        throw std::runtime_error("Suitable tensor for " + tensorName + " output is not found");
+    }
+    if (matches.size() > 1) {
+        throw std::runtime_error("More than 1 tensor matched to " + tensorName + " output");
+    }
+    return matches.front();
+}
+
+// Wraps an NCHW float buffer as shape[1] single-channel cv::Mat views of size
+// shape[2] x shape[3]. The Mats reference `data` directly (no copy), so the
+// buffer must outlive the returned vector.
+std::vector<cv::Mat> HpeAssociativeEmbedding::split(float* data, const ov::Shape& shape) {
+    const size_t channels = shape[1];
+    const size_t rows = shape[2];
+    const size_t cols = shape[3];
+    const size_t planeSize = rows * cols;
+
+    std::vector<cv::Mat> planes;
+    planes.reserve(channels);
+    for (size_t c = 0; c < channels; ++c) {
+        planes.emplace_back(rows, cols, CV_32FC1, data + c * planeSize);
+    }
+    return planes;
+}
+
+// Decodes poses from heatmaps: finds per-joint peaks, groups them into poses by
+// embedding tag, swaps peak x/y into image order, drops poses whose mean score
+// is below confidenceThreshold, and refines the survivors.
+// NOTE: `heatMaps` is modified in place (cv::abs) during refinement.
+std::vector<HumanPose> HpeAssociativeEmbedding::extractPoses(std::vector<cv::Mat>& heatMaps,
+                                                             const std::vector<cv::Mat>& aembdsMaps,
+                                                             const std::vector<cv::Mat>& nmsHeatMaps) const {
+    std::vector<std::vector<Peak>> allPeaks(numJoints);
+    for (int i = 0; i < numJoints; i++) {
+        findPeaks(nmsHeatMaps, aembdsMaps, allPeaks, i, maxNumPeople, detectionThreshold);
+    }
+    std::vector<Pose> allPoses = matchByTag(allPeaks, maxNumPeople, numJoints, tagThreshold);
+    // swap for all poses
+    // Peaks come out in (row, col) order; swap to (x, y).
+    for (auto& pose : allPoses) {
+        for (size_t j = 0; j < numJoints; j++) {
+            Peak& peak = pose.getPeak(j);
+            std::swap(peak.keypoint.x, peak.keypoint.y);
+        }
+    }
+    std::vector<HumanPose> poses;
+    for (size_t i = 0; i < allPoses.size(); i++) {
+        Pose& pose = allPoses[i];
+        // Filtering poses with low mean scores
+        if (pose.getMeanScore() <= confidenceThreshold) {
+            continue;
+        }
+        // adjustAndRefine works on absolute heatmap values; taken once per kept pose.
+        for (size_t j = 0; j < heatMaps.size(); j++) {
+            heatMaps[j] = cv::abs(heatMaps[j]);
+        }
+        adjustAndRefine(allPoses, heatMaps, aembdsMaps, i, delta);
+        std::vector<cv::Point2f> keypoints;
+        for (size_t j = 0; j < numJoints; j++) {
+            Peak& peak = pose.getPeak(j);
+            keypoints.push_back(peak.keypoint);
+        }
+        poses.push_back({keypoints, pose.getMeanScore()});
+    }
+    return poses;
+}
diff --git a/python/openvino/runtime/common/models/src/hpe_model_openpose.cpp b/python/openvino/runtime/common/models/src/hpe_model_openpose.cpp
new file mode 100644
index 0000000..d8b4cb6
--- /dev/null
+++ b/python/openvino/runtime/common/models/src/hpe_model_openpose.cpp
@@ -0,0 +1,256 @@
+/*
+// Copyright (C) 2020-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "models/hpe_model_openpose.h"
+
+#include <algorithm>
+#include <cmath>
+#include <map>
+#include <stdexcept>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include <opencv2/imgproc.hpp>
+#include <openvino/openvino.hpp>
+
+#include <utils/image_utils.h>
+#include <utils/ocv_common.hpp>
+#include <utils/slog.hpp>
+
+#include "models/input_data.h"
+#include "models/internal_model_data.h"
+#include "models/openpose_decoder.h"
+#include "models/results.h"
+
+const cv::Vec3f HPEOpenPose::meanPixel = cv::Vec3f::all(128);
+const float HPEOpenPose::minPeaksDistance = 3.0f;
+const float HPEOpenPose::midPointsScoreThreshold = 0.05f;
+const float HPEOpenPose::foundMidPointsRatioThreshold = 0.8f;
+const float HPEOpenPose::minSubsetScore = 0.2f;
+
// Constructs the OpenPose HPE wrapper.
//   aspectRatio         — expected frame width/height ratio, used to derive the
//                         network input width in changeInputSize();
//   targetSize          — desired network input height (0 = keep the model's own);
//   confidenceThreshold — minimal heat-map score for a keypoint peak.
// Auto-resize is disabled (second ImageModel argument): preprocess() resizes
// on the host with aspect-ratio-preserving padding and bicubic interpolation.
HPEOpenPose::HPEOpenPose(const std::string& modelFileName,
                         double aspectRatio,
                         int targetSize,
                         float confidenceThreshold,
                         const std::string& layout)
    : ImageModel(modelFileName, false, layout),
      aspectRatio(aspectRatio),
      targetSize(targetSize),
      confidenceThreshold(confidenceThreshold) {
    resizeMode = RESIZE_KEEP_ASPECT;
    interpolationMode = cv::INTER_CUBIC;
    }
+
// Validates and configures the model's I/O:
//  * input: exactly one, 4-D, batch 1, 3 channels; exposed to the app as u8 NHWC;
//  * outputs: exactly two f32 NCHW tensors — keypoint heat maps and part
//    affinity fields (PAFs). outputsNames is ordered so [0] = heat maps
//    (fewer channels) and [1] = PAFs, which postprocess() relies on.
// Finally reshapes the input to the requested target size.
void HPEOpenPose::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) {
    // --------------------------- Configure input & output -------------------------------------------------
    // --------------------------- Prepare input ------------------------------------------------------
    if (model->inputs().size() != 1) {
        throw std::logic_error("HPE OpenPose model wrapper supports topologies with only 1 input");
    }
    inputsNames.push_back(model->input().get_any_name());
    const ov::Shape& inputShape = model->input().get_shape();
    const ov::Layout& inputLayout = getInputLayout(model->input());

    if (inputShape.size() != 4 || inputShape[ov::layout::batch_idx(inputLayout)] != 1 ||
        inputShape[ov::layout::channels_idx(inputLayout)] != 3)
        throw std::logic_error("3-channel 4-dimensional model's input is expected");

    // Application feeds u8 NHWC frames; the plugin converts to the model layout.
    ov::preprocess::PrePostProcessor ppp(model);
    ppp.input().tensor().set_element_type(ov::element::u8).set_layout({"NHWC"});

    ppp.input().model().set_layout(inputLayout);

    // --------------------------- Prepare output -----------------------------------------------------
    const ov::OutputVector& outputs = model->outputs();
    if (outputs.size() != 2) {
        throw std::runtime_error("HPE OpenPose supports topologies with only 2 outputs");
    }

    const ov::Layout outputLayout("NCHW");
    for (const auto& output : model->outputs()) {
        const auto& outTensorName = output.get_any_name();
        ppp.output(outTensorName).tensor().set_element_type(ov::element::f32).set_layout(outputLayout);
        outputsNames.push_back(outTensorName);
    }
    model = ppp.build();

    const size_t batchId = ov::layout::batch_idx(outputLayout);
    const size_t channelsId = ov::layout::channels_idx(outputLayout);
    const size_t widthId = ov::layout::width_idx(outputLayout);
    const size_t heightId = ov::layout::height_idx(outputLayout);

    // Heat maps have keypointsNumber+1 channels, PAFs twice that; if the model
    // lists them in the opposite order, swap so outputsNames[0] = heat maps.
    ov::Shape heatmapsOutputShape = model->outputs().front().get_shape();
    ov::Shape pafsOutputShape = model->outputs().back().get_shape();
    if (heatmapsOutputShape[channelsId] > pafsOutputShape[channelsId]) {
        std::swap(heatmapsOutputShape, pafsOutputShape);
        std::swap(outputsNames[0], outputsNames[1]);
    }

    if (heatmapsOutputShape.size() != 4 || heatmapsOutputShape[batchId] != 1 ||
        heatmapsOutputShape[ov::layout::channels_idx(outputLayout)] != keypointsNumber + 1) {
        throw std::logic_error("1x" + std::to_string(keypointsNumber + 1) +
                               "xHFMxWFM dimension of model's heatmap is expected");
    }
    if (pafsOutputShape.size() != 4 || pafsOutputShape[batchId] != 1 ||
        pafsOutputShape[channelsId] != 2 * (keypointsNumber + 1)) {
        throw std::logic_error("1x" + std::to_string(2 * (keypointsNumber + 1)) +
                               "xHFMxWFM dimension of model's output is expected");
    }
    // postprocess() builds PAF views using the heat maps' H and W, so they must match.
    if (pafsOutputShape[heightId] != heatmapsOutputShape[heightId] ||
        pafsOutputShape[widthId] != heatmapsOutputShape[widthId]) {
        throw std::logic_error("output and heatmap are expected to have matching last two dimensions");
    }

    changeInputSize(model);
}
+
+void HPEOpenPose::changeInputSize(std::shared_ptr<ov::Model>& model) {
+ ov::Shape inputShape = model->input().get_shape();
+ const ov::Layout& layout = ov::layout::get_layout(model->inputs().front());
+ const auto batchId = ov::layout::batch_idx(layout);
+ const auto heightId = ov::layout::height_idx(layout);
+ const auto widthId = ov::layout::width_idx(layout);
+
+ if (!targetSize) {
+ targetSize = inputShape[heightId];
+ }
+ int height = static_cast<int>((targetSize + stride - 1) / stride) * stride;
+ int inputWidth = static_cast<int>(std::round(targetSize * aspectRatio));
+ int width = static_cast<int>((inputWidth + stride - 1) / stride) * stride;
+ inputShape[batchId] = 1;
+ inputShape[heightId] = height;
+ inputShape[widthId] = width;
+ inputLayerSize = cv::Size(width, height);
+ model->reshape(inputShape);
+}
+
// Resizes the frame to the network input (keeping aspect ratio, padding the
// remainder) and binds it as the input tensor. Returns the padded size plus
// the X/Y scale factors needed to map detections back to the original image.
// Throws if the resized image is wider than the current model input.
std::shared_ptr<InternalModelData> HPEOpenPose::preprocess(const InputData& inputData, ov::InferRequest& request) {
    auto& image = inputData.asRef<ImageInputData>().inputImage;
    cv::Rect roi;
    auto paddedImage =
        resizeImageExt(image, inputLayerSize.width, inputLayerSize.height, resizeMode, interpolationMode, &roi);
    if (inputLayerSize.width < roi.width)
        throw std::runtime_error("The image aspect ratio doesn't fit current model shape");

    // More than one stride of horizontal padding means the configured aspect
    // ratio is noticeably off; warn but continue.
    if (inputLayerSize.width - stride >= roi.width) {
        slog::warn << "\tChosen model aspect ratio doesn't match image aspect ratio" << slog::endl;
    }

    request.set_input_tensor(wrapMat2Tensor(paddedImage));
    return std::make_shared<InternalScaleData>(paddedImage.cols,
                                               paddedImage.rows,
                                               image.cols / static_cast<float>(roi.width),
                                               image.rows / static_cast<float>(roi.height));
}
+
+std::unique_ptr<ResultBase> HPEOpenPose::postprocess(InferenceResult& infResult) {
+ HumanPoseResult* result = new HumanPoseResult(infResult.frameId, infResult.metaData);
+
+ const auto& heatMapsMapped = infResult.outputsData[outputsNames[0]];
+ const auto& outputMapped = infResult.outputsData[outputsNames[1]];
+
+ const ov::Shape& outputShape = outputMapped.get_shape();
+ const ov::Shape& heatMapShape = heatMapsMapped.get_shape();
+
+ float* const predictions = outputMapped.data<float>();
+ float* const heats = heatMapsMapped.data<float>();
+
+ std::vector<cv::Mat> heatMaps(keypointsNumber);
+ for (size_t i = 0; i < heatMaps.size(); i++) {
+ heatMaps[i] =
+ cv::Mat(heatMapShape[2], heatMapShape[3], CV_32FC1, heats + i * heatMapShape[2] * heatMapShape[3]);
+ }
+ resizeFeatureMaps(heatMaps);
+
+ std::vector<cv::Mat> pafs(outputShape[1]);
+ for (size_t i = 0; i < pafs.size(); i++) {
+ pafs[i] =
+ cv::Mat(heatMapShape[2], heatMapShape[3], CV_32FC1, predictions + i * heatMapShape[2] * heatMapShape[3]);
+ }
+ resizeFeatureMaps(pafs);
+
+ std::vector<HumanPose> poses = extractPoses(heatMaps, pafs);
+
+ const auto& scale = infResult.internalModelData->asRef<InternalScaleData>();
+ float scaleX = stride / upsampleRatio * scale.scaleX;
+ float scaleY = stride / upsampleRatio * scale.scaleY;
+ for (auto& pose : poses) {
+ for (auto& keypoint : pose.keypoints) {
+ if (keypoint != cv::Point2f(-1, -1)) {
+ keypoint.x *= scaleX;
+ keypoint.y *= scaleY;
+ }
+ }
+ }
+ for (size_t i = 0; i < poses.size(); ++i) {
+ result->poses.push_back(poses[i]);
+ }
+
+ return std::unique_ptr<ResultBase>(result);
+}
+
+void HPEOpenPose::resizeFeatureMaps(std::vector<cv::Mat>& featureMaps) const {
+ for (auto& featureMap : featureMaps) {
+ cv::resize(featureMap, featureMap, cv::Size(), upsampleRatio, upsampleRatio, cv::INTER_CUBIC);
+ }
+}
+
+class FindPeaksBody : public cv::ParallelLoopBody {
+public:
+ FindPeaksBody(const std::vector<cv::Mat>& heatMaps,
+ float minPeaksDistance,
+ std::vector<std::vector<Peak>>& peaksFromHeatMap,
+ float confidenceThreshold)
+ : heatMaps(heatMaps),
+ minPeaksDistance(minPeaksDistance),
+ peaksFromHeatMap(peaksFromHeatMap),
+ confidenceThreshold(confidenceThreshold) {}
+
+ void operator()(const cv::Range& range) const override {
+ for (int i = range.start; i < range.end; i++) {
+ findPeaks(heatMaps, minPeaksDistance, peaksFromHeatMap, i, confidenceThreshold);
+ }
+ }
+
+private:
+ const std::vector<cv::Mat>& heatMaps;
+ float minPeaksDistance;
+ std::vector<std::vector<Peak>>& peaksFromHeatMap;
+ float confidenceThreshold;
+};
+
// Groups per-joint peaks into whole-body poses using the part affinity fields.
// Peak detection runs in parallel (one heat map per task); afterwards peak ids
// are re-based to be unique across all heat maps, as groupPeaksToPoses()
// indexes a flattened candidate list by id.
std::vector<HumanPose> HPEOpenPose::extractPoses(const std::vector<cv::Mat>& heatMaps,
                                                 const std::vector<cv::Mat>& pafs) const {
    std::vector<std::vector<Peak>> peaksFromHeatMap(heatMaps.size());
    FindPeaksBody findPeaksBody(heatMaps, minPeaksDistance, peaksFromHeatMap, confidenceThreshold);
    cv::parallel_for_(cv::Range(0, static_cast<int>(heatMaps.size())), findPeaksBody);
    // Offset each map's per-map ids by the cumulative count of peaks in all
    // preceding maps to make them globally unique.
    int peaksBefore = 0;
    for (size_t heatmapId = 1; heatmapId < heatMaps.size(); heatmapId++) {
        peaksBefore += static_cast<int>(peaksFromHeatMap[heatmapId - 1].size());
        for (auto& peak : peaksFromHeatMap[heatmapId]) {
            peak.id += peaksBefore;
        }
    }
    std::vector<HumanPose> poses = groupPeaksToPoses(peaksFromHeatMap,
                                                     pafs,
                                                     keypointsNumber,
                                                     midPointsScoreThreshold,
                                                     foundMidPointsRatioThreshold,
                                                     minJointsNumber,
                                                     minSubsetScore);
    return poses;
}
diff --git a/python/openvino/runtime/common/models/src/image_model.cpp b/python/openvino/runtime/common/models/src/image_model.cpp
new file mode 100644
index 0000000..511faf3
--- /dev/null
+++ b/python/openvino/runtime/common/models/src/image_model.cpp
@@ -0,0 +1,57 @@
+/*
+// Copyright (C) 2021-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "models/image_model.h"
+
+#include <stdexcept>
+#include <vector>
+
+#include <opencv2/core.hpp>
+#include <openvino/openvino.hpp>
+
+#include <utils/image_utils.h>
+#include <utils/ocv_common.hpp>
+
+#include "models/input_data.h"
+#include "models/internal_model_data.h"
+
// Base for models consuming a single image. `useAutoResize` selects whether
// resizing is delegated to the inference plugin or done on the host in
// preprocess().
ImageModel::ImageModel(const std::string& modelFileName, bool useAutoResize, const std::string& layout)
    : ModelBase(modelFileName, layout),
      useAutoResize(useAutoResize) {}
+
// Applies the optional input transform and hands the image to the request.
// Without auto-resize, the image is resized on the host to the input tensor's
// spatial size first; channel count must already match the model (1 or 3).
// Returns the original image dimensions for coordinate back-mapping.
std::shared_ptr<InternalModelData> ImageModel::preprocess(const InputData& inputData, ov::InferRequest& request) {
    const auto& origImg = inputData.asRef<ImageInputData>().inputImage;
    auto img = inputTransform(origImg);

    if (!useAutoResize) {
        // /* Resize and copy data from the image to the input tensor */
        const ov::Tensor& frameTensor = request.get_tensor(inputsNames[0]);  // first input should be image
        const ov::Shape& tensorShape = frameTensor.get_shape();
        // NOTE(review): the input tensor is assumed to be NHWC here (subclasses
        // configure it that way via PrePostProcessor) — confirm for any model
        // whose input tensor keeps NCHW.
        const ov::Layout layout("NHWC");
        const size_t width = tensorShape[ov::layout::width_idx(layout)];
        const size_t height = tensorShape[ov::layout::height_idx(layout)];
        const size_t channels = tensorShape[ov::layout::channels_idx(layout)];
        if (static_cast<size_t>(img.channels()) != channels) {
            throw std::runtime_error("The number of channels for model input and image must match");
        }
        if (channels != 1 && channels != 3) {
            throw std::runtime_error("Unsupported number of channels");
        }
        img = resizeImageExt(img, width, height, resizeMode, interpolationMode);
    }
    request.set_tensor(inputsNames[0], wrapMat2Tensor(img));
    return std::make_shared<InternalImageModelData>(origImg.cols, origImg.rows);
}
diff --git a/python/openvino/runtime/common/models/src/jpeg_restoration_model.cpp b/python/openvino/runtime/common/models/src/jpeg_restoration_model.cpp
new file mode 100644
index 0000000..8eb3ae1
--- /dev/null
+++ b/python/openvino/runtime/common/models/src/jpeg_restoration_model.cpp
@@ -0,0 +1,167 @@
+/*
+// Copyright (C) 2021-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include <stddef.h>
+
+#include <algorithm>
+#include <memory>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include <opencv2/core.hpp>
+#include <opencv2/imgcodecs.hpp>
+#include <opencv2/imgproc.hpp>
+#include <openvino/openvino.hpp>
+
+#include <utils/ocv_common.hpp>
+#include <utils/slog.hpp>
+
+#include "models/image_model.h"
+#include "models/input_data.h"
+#include "models/internal_model_data.h"
+#include "models/jpeg_restoration_model.h"
+#include "models/results.h"
+
// Constructs the JPEG restoration wrapper.
//   inputImgSize     — source image size; changeInputSize() later rounds it up
//                      to a multiple of `stride` to form the network input;
//   _jpegCompression — when true, preprocess() re-encodes the input as a
//                      low-quality JPEG first to synthesize compression artifacts.
JPEGRestorationModel::JPEGRestorationModel(const std::string& modelFileName,
                                           const cv::Size& inputImgSize,
                                           bool _jpegCompression,
                                           const std::string& layout)
    : ImageModel(modelFileName, false, layout) {
    netInputHeight = inputImgSize.height;
    netInputWidth = inputImgSize.width;
    jpegCompression = _jpegCompression;
}
+
// Validates and configures the model's I/O: exactly one 4-D, batch-1,
// 3-channel input (exposed to the app as u8 NHWC) and exactly one 4-D,
// batch-1, 3-channel f32 output. Then reshapes the input to the requested
// (stride-aligned) image size.
void JPEGRestorationModel::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) {
    // --------------------------- Configure input & output -------------------------------------------------
    // --------------------------- Prepare input ------------------------------------------------------
    if (model->inputs().size() != 1) {
        throw std::logic_error("The JPEG Restoration model wrapper supports topologies with only 1 input");
    }
    inputsNames.push_back(model->input().get_any_name());

    const ov::Shape& inputShape = model->input().get_shape();
    const ov::Layout& inputLayout = getInputLayout(model->input());

    if (inputShape.size() != 4 || inputShape[ov::layout::batch_idx(inputLayout)] != 1 ||
        inputShape[ov::layout::channels_idx(inputLayout)] != 3) {
        throw std::logic_error("3-channel 4-dimensional model's input is expected");
    }

    // Application feeds u8 NHWC frames; the plugin converts to the model layout.
    ov::preprocess::PrePostProcessor ppp(model);
    ppp.input().tensor().set_element_type(ov::element::u8).set_layout("NHWC");

    ppp.input().model().set_layout(inputLayout);

    // --------------------------- Prepare output -----------------------------------------------------
    const ov::OutputVector& outputs = model->outputs();
    if (outputs.size() != 1) {
        throw std::logic_error("The JPEG Restoration model wrapper supports topologies with only 1 output");
    }
    // postprocess() indexes the output shape as NCHW (dims 2/3 = H/W).
    const ov::Shape& outputShape = model->output().get_shape();
    const ov::Layout outputLayout{"NCHW"};
    if (outputShape.size() != 4 || outputShape[ov::layout::batch_idx(outputLayout)] != 1 ||
        outputShape[ov::layout::channels_idx(outputLayout)] != 3) {
        throw std::logic_error("3-channel 4-dimensional model's output is expected");
    }

    outputsNames.push_back(model->output().get_any_name());
    ppp.output().tensor().set_element_type(ov::element::f32);
    model = ppp.build();

    changeInputSize(model);
}
+
+void JPEGRestorationModel::changeInputSize(std::shared_ptr<ov::Model>& model) {
+ ov::Shape inputShape = model->input().get_shape();
+ const ov::Layout& layout = ov::layout::get_layout(model->input());
+
+ const auto batchId = ov::layout::batch_idx(layout);
+ const auto heightId = ov::layout::height_idx(layout);
+ const auto widthId = ov::layout::width_idx(layout);
+
+ if (inputShape[heightId] % stride || inputShape[widthId] % stride) {
+ throw std::logic_error("The shape of the model input must be divisible by stride");
+ }
+
+ netInputHeight = static_cast<int>((netInputHeight + stride - 1) / stride) * stride;
+ netInputWidth = static_cast<int>((netInputWidth + stride - 1) / stride) * stride;
+
+ inputShape[batchId] = 1;
+ inputShape[heightId] = netInputHeight;
+ inputShape[widthId] = netInputWidth;
+
+ model->reshape(inputShape);
+}
+
// Prepares the input frame: optionally re-encodes it as a quality-40 JPEG to
// synthesize compression artifacts, then either zero-pads it up to the network
// size (when it already fits within one stride) or warns and stretches it.
// Returns the (possibly re-encoded) image dimensions.
std::shared_ptr<InternalModelData> JPEGRestorationModel::preprocess(const InputData& inputData,
                                                                    ov::InferRequest& request) {
    cv::Mat image = inputData.asRef<ImageInputData>().inputImage;
    const size_t h = image.rows;
    const size_t w = image.cols;
    cv::Mat resizedImage;
    if (jpegCompression) {
        // Round-trip through a low-quality JPEG so the network has artifacts to remove.
        std::vector<uchar> encimg;
        std::vector<int> params{cv::IMWRITE_JPEG_QUALITY, 40};
        cv::imencode(".jpg", image, encimg, params);
        // NOTE(review): the literal flag 3 is IMREAD_COLOR|IMREAD_ANYDEPTH;
        // cv::IMREAD_COLOR (1) was probably intended — confirm against OpenCV docs.
        image = cv::imdecode(cv::Mat(encimg), 3);
    }

    // Pad-only path: the image is at most one stride smaller than the network
    // input in both dimensions, so zero padding preserves pixel fidelity.
    if (netInputHeight - stride < h && h <= netInputHeight && netInputWidth - stride < w && w <= netInputWidth) {
        int bottom = netInputHeight - h;
        int right = netInputWidth - w;
        cv::copyMakeBorder(image, resizedImage, 0, bottom, 0, right, cv::BORDER_CONSTANT, 0);
    } else {
        slog::warn << "\tChosen model aspect ratio doesn't match image aspect ratio" << slog::endl;
        cv::resize(image, resizedImage, cv::Size(netInputWidth, netInputHeight));
    }
    request.set_input_tensor(wrapMat2Tensor(resizedImage));

    return std::make_shared<InternalImageModelData>(image.cols, image.rows);
}
+
+std::unique_ptr<ResultBase> JPEGRestorationModel::postprocess(InferenceResult& infResult) {
+ ImageResult* result = new ImageResult;
+ *static_cast<ResultBase*>(result) = static_cast<ResultBase&>(infResult);
+
+ const auto& inputImgSize = infResult.internalModelData->asRef<InternalImageModelData>();
+ const auto outputData = infResult.getFirstOutputTensor().data<float>();
+
+ std::vector<cv::Mat> imgPlanes;
+ const ov::Shape& outputShape = infResult.getFirstOutputTensor().get_shape();
+ const size_t outHeight = static_cast<int>(outputShape[2]);
+ const size_t outWidth = static_cast<int>(outputShape[3]);
+ const size_t numOfPixels = outWidth * outHeight;
+ imgPlanes = std::vector<cv::Mat>{cv::Mat(outHeight, outWidth, CV_32FC1, &(outputData[0])),
+ cv::Mat(outHeight, outWidth, CV_32FC1, &(outputData[numOfPixels])),
+ cv::Mat(outHeight, outWidth, CV_32FC1, &(outputData[numOfPixels * 2]))};
+ cv::Mat resultImg;
+ cv::merge(imgPlanes, resultImg);
+
+ if (netInputHeight - stride < static_cast<size_t>(inputImgSize.inputImgHeight) &&
+ static_cast<size_t>(inputImgSize.inputImgHeight) <= netInputHeight &&
+ netInputWidth - stride < static_cast<size_t>(inputImgSize.inputImgWidth) &&
+ static_cast<size_t>(inputImgSize.inputImgWidth) <= netInputWidth) {
+ result->resultImage = resultImg(cv::Rect(0, 0, inputImgSize.inputImgWidth, inputImgSize.inputImgHeight));
+ } else {
+ cv::resize(resultImg, result->resultImage, cv::Size(inputImgSize.inputImgWidth, inputImgSize.inputImgHeight));
+ }
+
+ result->resultImage.convertTo(result->resultImage, CV_8UC3, 255);
+
+ return std::unique_ptr<ResultBase>(result);
+}
diff --git a/python/openvino/runtime/common/models/src/model_base.cpp b/python/openvino/runtime/common/models/src/model_base.cpp
new file mode 100644
index 0000000..c2ebd1b
--- /dev/null
+++ b/python/openvino/runtime/common/models/src/model_base.cpp
@@ -0,0 +1,67 @@
+/*
+// Copyright (C) 2021-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "models/model_base.h"
+
+#include <utility>
+
+#include <openvino/openvino.hpp>
+
+#include <utils/common.hpp>
+#include <utils/config_factory.h>
+#include <utils/ocv_common.hpp>
+#include <utils/slog.hpp>
+
+std::shared_ptr<ov::Model> ModelBase::prepareModel(ov::Core& core) {
+ // --------------------------- Read IR Generated by ModelOptimizer (.xml and .bin files) ------------
+ /** Read model **/
+ slog::info << "Reading model " << modelFileName << slog::endl;
+ std::shared_ptr<ov::Model> model = core.read_model(modelFileName);
+ logBasicModelInfo(model);
+ // -------------------------- Reading all outputs names and customizing I/O tensors (in inherited classes)
+ prepareInputsOutputs(model);
+
+ /** Set batch size to 1 **/
+ ov::set_batch(model, 1);
+
+ return model;
+}
+
// Stores the runtime configuration, runs the full prepare pipeline
// (read IR -> subclass I/O setup -> batch=1), compiles for the configured
// device, and logs basic info about the compiled model.
ov::CompiledModel ModelBase::compileModel(const ModelConfig& config, ov::Core& core) {
    this->config = config;  // keep a copy for later use by the wrapper
    auto model = prepareModel(core);
    compiledModel = core.compile_model(model, config.deviceName, config.compiledModelConfig);
    logCompiledModelInfo(compiledModel, modelFileName, config.deviceName);
    return compiledModel;
}
+
// Resolves the layout for a model input. Priority:
//   1) layout already attached to the input in the IR;
//   2) user-supplied layout(s): a single entry applies to every input,
//      otherwise the entry is looked up by tensor name;
//   3) heuristic guess from the shape (logged with a warning).
ov::Layout ModelBase::getInputLayout(const ov::Output<ov::Node>& input) {
    const ov::Shape& inputShape = input.get_shape();
    ov::Layout layout = ov::layout::get_layout(input);
    if (layout.empty()) {
        if (inputsLayouts.empty()) {
            layout = getLayoutFromShape(inputShape);
            slog::warn << "Automatically detected layout '" << layout.to_string() << "' for input '"
                       << input.get_any_name() << "' will be used." << slog::endl;
        } else if (inputsLayouts.size() == 1) {
            layout = inputsLayouts.begin()->second;
        } else {
            // NOTE(review): if inputsLayouts is a std::map, operator[] silently
            // default-inserts an empty layout for an unknown tensor name —
            // confirm callers always provide every input's layout here.
            layout = inputsLayouts[input.get_any_name()];
        }
    }

    return layout;
}
diff --git a/python/openvino/runtime/common/models/src/openpose_decoder.cpp b/python/openvino/runtime/common/models/src/openpose_decoder.cpp
new file mode 100644
index 0000000..6d51607
--- /dev/null
+++ b/python/openvino/runtime/common/models/src/openpose_decoder.cpp
@@ -0,0 +1,345 @@
+/*
+// Copyright (C) 2020-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "models/openpose_decoder.h"
+
+#include <algorithm>
+#include <cmath>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include <utils/common.hpp>
+
+#include "models/results.h"
+
+Peak::Peak(const int id, const cv::Point2f& pos, const float score) : id(id), pos(pos), score(score) {}
+
// Partially assembled pose: for each of the keypointsNumber joints, the id of
// the chosen peak (-1 = not assigned yet), plus the number of joints found so
// far and the accumulated score.
HumanPoseByPeaksIndices::HumanPoseByPeaksIndices(const int keypointsNumber)
    : peaksIndices(std::vector<int>(keypointsNumber, -1)),
      nJoints(0),
      score(0.0f) {}
+
// Scored limb hypothesis linking two joint candidates (local indices or global
// peak ids, depending on the usage site in groupPeaksToPoses()).
TwoJointsConnection::TwoJointsConnection(const int firstJointIdx, const int secondJointIdx, const float score)
    : firstJointIdx(firstJointIdx),
      secondJointIdx(secondJointIdx),
      score(score) {}
+
// Finds local maxima of heat map `heatMapId` that exceed `confidenceThreshold`,
// suppresses maxima closer than `minPeaksDistance` to an earlier (smaller-x)
// surviving peak, and appends the survivors with sequential ids to
// allPeaks[heatMapId].
void findPeaks(const std::vector<cv::Mat>& heatMaps,
               const float minPeaksDistance,
               std::vector<std::vector<Peak>>& allPeaks,
               int heatMapId,
               float confidenceThreshold) {
    std::vector<cv::Point> peaks;
    const cv::Mat& heatMap = heatMaps[heatMapId];
    const float* heatMapData = heatMap.ptr<float>();
    size_t heatMapStep = heatMap.step1();
    // Scan a one-pixel padded range so border pixels are compared against
    // implicit zeros outside the map.
    for (int y = -1; y < heatMap.rows + 1; y++) {
        for (int x = -1; x < heatMap.cols + 1; x++) {
            // Center value; sub-threshold values are clamped to zero.
            float val = 0;
            if (x >= 0 && y >= 0 && x < heatMap.cols && y < heatMap.rows) {
                val = heatMapData[y * heatMapStep + x];
                val = val >= confidenceThreshold ? val : 0;
            }

            // NOTE(review): left_val/right_val names are swapped relative to
            // the offsets actually read (x+1 vs x-1); harmless because the
            // 4-neighbour comparison below is symmetric.
            float left_val = 0;
            if (y >= 0 && x < (heatMap.cols - 1) && y < heatMap.rows) {
                left_val = heatMapData[y * heatMapStep + x + 1];
                left_val = left_val >= confidenceThreshold ? left_val : 0;
            }

            float right_val = 0;
            if (x > 0 && y >= 0 && y < heatMap.rows) {
                right_val = heatMapData[y * heatMapStep + x - 1];
                right_val = right_val >= confidenceThreshold ? right_val : 0;
            }

            float top_val = 0;
            if (x >= 0 && x < heatMap.cols && y < (heatMap.rows - 1)) {
                top_val = heatMapData[(y + 1) * heatMapStep + x];
                top_val = top_val >= confidenceThreshold ? top_val : 0;
            }

            float bottom_val = 0;
            if (x >= 0 && y > 0 && x < heatMap.cols) {
                bottom_val = heatMapData[(y - 1) * heatMapStep + x];
                bottom_val = bottom_val >= confidenceThreshold ? bottom_val : 0;
            }

            // Strict 4-neighbour maximum (zeros outside the map / below threshold).
            if ((val > left_val) && (val > right_val) && (val > top_val) && (val > bottom_val)) {
                peaks.push_back(cv::Point(x, y));
            }
        }
    }
    // Order by x so suppression below is deterministic.
    std::sort(peaks.begin(), peaks.end(), [](const cv::Point& a, const cv::Point& b) {
        return a.x < b.x;
    });
    std::vector<bool> isActualPeak(peaks.size(), true);
    int peakCounter = 0;
    std::vector<Peak>& peaksWithScoreAndID = allPeaks[heatMapId];
    // Greedy NMS: each kept peak suppresses later peaks within minPeaksDistance.
    for (size_t i = 0; i < peaks.size(); i++) {
        if (isActualPeak[i]) {
            for (size_t j = i + 1; j < peaks.size(); j++) {
                if (sqrt((peaks[i].x - peaks[j].x) * (peaks[i].x - peaks[j].x) +
                         (peaks[i].y - peaks[j].y) * (peaks[i].y - peaks[j].y)) < minPeaksDistance) {
                    isActualPeak[j] = false;
                }
            }
            peaksWithScoreAndID.push_back(Peak(peakCounter++, peaks[i], heatMap.at<float>(peaks[i])));
        }
    }
}
+
+std::vector<HumanPose> groupPeaksToPoses(const std::vector<std::vector<Peak>>& allPeaks,
+ const std::vector<cv::Mat>& pafs,
+ const size_t keypointsNumber,
+ const float midPointsScoreThreshold,
+ const float foundMidPointsRatioThreshold,
+ const int minJointsNumber,
+ const float minSubsetScore) {
+ static const std::pair<int, int> limbIdsHeatmap[] = {{2, 3},
+ {2, 6},
+ {3, 4},
+ {4, 5},
+ {6, 7},
+ {7, 8},
+ {2, 9},
+ {9, 10},
+ {10, 11},
+ {2, 12},
+ {12, 13},
+ {13, 14},
+ {2, 1},
+ {1, 15},
+ {15, 17},
+ {1, 16},
+ {16, 18},
+ {3, 17},
+ {6, 18}};
+ static const std::pair<int, int> limbIdsPaf[] = {{31, 32},
+ {39, 40},
+ {33, 34},
+ {35, 36},
+ {41, 42},
+ {43, 44},
+ {19, 20},
+ {21, 22},
+ {23, 24},
+ {25, 26},
+ {27, 28},
+ {29, 30},
+ {47, 48},
+ {49, 50},
+ {53, 54},
+ {51, 52},
+ {55, 56},
+ {37, 38},
+ {45, 46}};
+
+ std::vector<Peak> candidates;
+ for (const auto& peaks : allPeaks) {
+ candidates.insert(candidates.end(), peaks.begin(), peaks.end());
+ }
+ std::vector<HumanPoseByPeaksIndices> subset(0, HumanPoseByPeaksIndices(keypointsNumber));
+ for (size_t k = 0; k < arraySize(limbIdsPaf); k++) {
+ std::vector<TwoJointsConnection> connections;
+ const int mapIdxOffset = keypointsNumber + 1;
+ std::pair<cv::Mat, cv::Mat> scoreMid = {pafs[limbIdsPaf[k].first - mapIdxOffset],
+ pafs[limbIdsPaf[k].second - mapIdxOffset]};
+ const int idxJointA = limbIdsHeatmap[k].first - 1;
+ const int idxJointB = limbIdsHeatmap[k].second - 1;
+ const std::vector<Peak>& candA = allPeaks[idxJointA];
+ const std::vector<Peak>& candB = allPeaks[idxJointB];
+ const size_t nJointsA = candA.size();
+ const size_t nJointsB = candB.size();
+ if (nJointsA == 0 && nJointsB == 0) {
+ continue;
+ } else if (nJointsA == 0) {
+ for (size_t i = 0; i < nJointsB; i++) {
+ int num = 0;
+ for (size_t j = 0; j < subset.size(); j++) {
+ if (subset[j].peaksIndices[idxJointB] == candB[i].id) {
+ num++;
+ continue;
+ }
+ }
+ if (num == 0) {
+ HumanPoseByPeaksIndices personKeypoints(keypointsNumber);
+ personKeypoints.peaksIndices[idxJointB] = candB[i].id;
+ personKeypoints.nJoints = 1;
+ personKeypoints.score = candB[i].score;
+ subset.push_back(personKeypoints);
+ }
+ }
+ continue;
+ } else if (nJointsB == 0) {
+ for (size_t i = 0; i < nJointsA; i++) {
+ int num = 0;
+ for (size_t j = 0; j < subset.size(); j++) {
+ if (subset[j].peaksIndices[idxJointA] == candA[i].id) {
+ num++;
+ continue;
+ }
+ }
+ if (num == 0) {
+ HumanPoseByPeaksIndices personKeypoints(keypointsNumber);
+ personKeypoints.peaksIndices[idxJointA] = candA[i].id;
+ personKeypoints.nJoints = 1;
+ personKeypoints.score = candA[i].score;
+ subset.push_back(personKeypoints);
+ }
+ }
+ continue;
+ }
+
+ std::vector<TwoJointsConnection> tempJointConnections;
+ for (size_t i = 0; i < nJointsA; i++) {
+ for (size_t j = 0; j < nJointsB; j++) {
+ cv::Point2f pt = candA[i].pos * 0.5 + candB[j].pos * 0.5;
+ cv::Point mid = cv::Point(cvRound(pt.x), cvRound(pt.y));
+ cv::Point2f vec = candB[j].pos - candA[i].pos;
+ double norm_vec = cv::norm(vec);
+ if (norm_vec == 0) {
+ continue;
+ }
+ vec /= norm_vec;
+ float score = vec.x * scoreMid.first.at<float>(mid) + vec.y * scoreMid.second.at<float>(mid);
+ int height_n = pafs[0].rows / 2;
+ float suc_ratio = 0.0f;
+ float mid_score = 0.0f;
+ const int mid_num = 10;
+ const float scoreThreshold = -100.0f;
+ if (score > scoreThreshold) {
+ float p_sum = 0;
+ int p_count = 0;
+ cv::Size2f step((candB[j].pos.x - candA[i].pos.x) / (mid_num - 1),
+ (candB[j].pos.y - candA[i].pos.y) / (mid_num - 1));
+ for (int n = 0; n < mid_num; n++) {
+ cv::Point midPoint(cvRound(candA[i].pos.x + n * step.width),
+ cvRound(candA[i].pos.y + n * step.height));
+ cv::Point2f pred(scoreMid.first.at<float>(midPoint), scoreMid.second.at<float>(midPoint));
+ score = vec.x * pred.x + vec.y * pred.y;
+ if (score > midPointsScoreThreshold) {
+ p_sum += score;
+ p_count++;
+ }
+ }
+ suc_ratio = static_cast<float>(p_count / mid_num);
+ float ratio = p_count > 0 ? p_sum / p_count : 0.0f;
+ mid_score = ratio + static_cast<float>(std::min(height_n / norm_vec - 1, 0.0));
+ }
+ if (mid_score > 0 && suc_ratio > foundMidPointsRatioThreshold) {
+ tempJointConnections.push_back(TwoJointsConnection(i, j, mid_score));
+ }
+ }
+ }
+ if (!tempJointConnections.empty()) {
+ std::sort(tempJointConnections.begin(),
+ tempJointConnections.end(),
+ [](const TwoJointsConnection& a, const TwoJointsConnection& b) {
+ return (a.score > b.score);
+ });
+ }
+ size_t num_limbs = std::min(nJointsA, nJointsB);
+ size_t cnt = 0;
+ std::vector<int> occurA(nJointsA, 0);
+ std::vector<int> occurB(nJointsB, 0);
+ for (size_t row = 0; row < tempJointConnections.size(); row++) {
+ if (cnt == num_limbs) {
+ break;
+ }
+ const int& indexA = tempJointConnections[row].firstJointIdx;
+ const int& indexB = tempJointConnections[row].secondJointIdx;
+ const float& score = tempJointConnections[row].score;
+ if (occurA[indexA] == 0 && occurB[indexB] == 0) {
+ connections.push_back(TwoJointsConnection(candA[indexA].id, candB[indexB].id, score));
+ cnt++;
+ occurA[indexA] = 1;
+ occurB[indexB] = 1;
+ }
+ }
+ if (connections.empty()) {
+ continue;
+ }
+
+ bool extraJointConnections = (k == 17 || k == 18);
+ if (k == 0) {
+ subset = std::vector<HumanPoseByPeaksIndices>(connections.size(), HumanPoseByPeaksIndices(keypointsNumber));
+ for (size_t i = 0; i < connections.size(); i++) {
+ const int& indexA = connections[i].firstJointIdx;
+ const int& indexB = connections[i].secondJointIdx;
+ subset[i].peaksIndices[idxJointA] = indexA;
+ subset[i].peaksIndices[idxJointB] = indexB;
+ subset[i].nJoints = 2;
+ subset[i].score = candidates[indexA].score + candidates[indexB].score + connections[i].score;
+ }
+ } else if (extraJointConnections) {
+ for (size_t i = 0; i < connections.size(); i++) {
+ const int& indexA = connections[i].firstJointIdx;
+ const int& indexB = connections[i].secondJointIdx;
+ for (size_t j = 0; j < subset.size(); j++) {
+ if (subset[j].peaksIndices[idxJointA] == indexA && subset[j].peaksIndices[idxJointB] == -1) {
+ subset[j].peaksIndices[idxJointB] = indexB;
+ } else if (subset[j].peaksIndices[idxJointB] == indexB && subset[j].peaksIndices[idxJointA] == -1) {
+ subset[j].peaksIndices[idxJointA] = indexA;
+ }
+ }
+ }
+ continue;
+ } else {
+ for (size_t i = 0; i < connections.size(); i++) {
+ const int& indexA = connections[i].firstJointIdx;
+ const int& indexB = connections[i].secondJointIdx;
+ bool num = false;
+ for (size_t j = 0; j < subset.size(); j++) {
+ if (subset[j].peaksIndices[idxJointA] == indexA) {
+ subset[j].peaksIndices[idxJointB] = indexB;
+ subset[j].nJoints++;
+ subset[j].score += candidates[indexB].score + connections[i].score;
+ num = true;
+ }
+ }
+ if (!num) {
+ HumanPoseByPeaksIndices hpWithScore(keypointsNumber);
+ hpWithScore.peaksIndices[idxJointA] = indexA;
+ hpWithScore.peaksIndices[idxJointB] = indexB;
+ hpWithScore.nJoints = 2;
+ hpWithScore.score = candidates[indexA].score + candidates[indexB].score + connections[i].score;
+ subset.push_back(hpWithScore);
+ }
+ }
+ }
+ }
+ std::vector<HumanPose> poses;
+ for (const auto& subsetI : subset) {
+ if (subsetI.nJoints < minJointsNumber || subsetI.score / subsetI.nJoints < minSubsetScore) {
+ continue;
+ }
+ int position = -1;
+ HumanPose pose{std::vector<cv::Point2f>(keypointsNumber, cv::Point2f(-1.0f, -1.0f)),
+ subsetI.score * std::max(0, subsetI.nJoints - 1)};
+ for (const auto& peakIdx : subsetI.peaksIndices) {
+ position++;
+ if (peakIdx >= 0) {
+ pose.keypoints[position] = candidates[peakIdx].pos;
+ pose.keypoints[position].x += 0.5;
+ pose.keypoints[position].y += 0.5;
+ }
+ }
+ poses.push_back(pose);
+ }
+ return poses;
+}
diff --git a/python/openvino/runtime/common/models/src/segmentation_model.cpp b/python/openvino/runtime/common/models/src/segmentation_model.cpp
new file mode 100644
index 0000000..82a153b
--- /dev/null
+++ b/python/openvino/runtime/common/models/src/segmentation_model.cpp
@@ -0,0 +1,157 @@
+/*
+// Copyright (C) 2020-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "models/segmentation_model.h"
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <fstream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include <opencv2/core.hpp>
+#include <opencv2/imgproc.hpp>
+#include <openvino/openvino.hpp>
+
+#include "models/internal_model_data.h"
+#include "models/results.h"
+
+// Constructs a segmentation wrapper; simply forwards the model path, the
+// auto-resize flag and the user-provided layout string to the ImageModel base.
+SegmentationModel::SegmentationModel(const std::string& modelFileName, bool useAutoResize, const std::string& layout)
+    : ImageModel(modelFileName, useAutoResize, layout) {}
+
+// Reads one label per line from labelFilename. An empty file name yields an
+// empty list. Throws std::runtime_error when the file cannot be opened and
+// std::logic_error when it opens but contains no lines.
+std::vector<std::string> SegmentationModel::loadLabels(const std::string& labelFilename) {
+    std::vector<std::string> labelsList;
+
+    if (labelFilename.empty()) {
+        return labelsList;
+    }
+
+    std::ifstream inputFile(labelFilename);
+    if (!inputFile.is_open())
+        throw std::runtime_error("Can't open the labels file: " + labelFilename);
+    for (std::string label; std::getline(inputFile, label);) {
+        labelsList.push_back(label);
+    }
+    if (labelsList.empty())
+        throw std::logic_error("File is empty: " + labelFilename);
+
+    return labelsList;
+}
+
+// Validates the topology (exactly one 3-channel 4D input and one output),
+// installs u8/NHWC pre-processing (optionally with on-device resize when
+// useAutoResize is set) and records the output geometry
+// (outChannels/outHeight/outWidth) consumed later by postprocess().
+// Throws std::logic_error for any unsupported topology.
+void SegmentationModel::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) {
+    // --------------------------- Configure input & output ---------------------------------------------
+    // --------------------------- Prepare input -----------------------------------------------------
+    if (model->inputs().size() != 1) {
+        throw std::logic_error("Segmentation model wrapper supports topologies with only 1 input");
+    }
+    const auto& input = model->input();
+    inputsNames.push_back(input.get_any_name());
+
+    const ov::Layout& inputLayout = getInputLayout(input);
+    const ov::Shape& inputShape = input.get_shape();
+    if (inputShape.size() != 4 || inputShape[ov::layout::channels_idx(inputLayout)] != 3) {
+        throw std::logic_error("3-channel 4-dimensional model's input is expected");
+    }
+
+    ov::preprocess::PrePostProcessor ppp(model);
+    ppp.input().tensor().set_element_type(ov::element::u8).set_layout({"NHWC"});
+
+    if (useAutoResize) {
+        // Accept frames of any spatial size and let the runtime resize them
+        ppp.input().tensor().set_spatial_dynamic_shape();
+
+        ppp.input()
+            .preprocess()
+            .convert_element_type(ov::element::f32)
+            .resize(ov::preprocess::ResizeAlgorithm::RESIZE_LINEAR);
+    }
+
+    ppp.input().model().set_layout(inputLayout);
+    model = ppp.build();
+    // --------------------------- Prepare output -----------------------------------------------------
+    if (model->outputs().size() != 1) {
+        throw std::logic_error("Segmentation model wrapper supports topologies with only 1 output");
+    }
+
+    const auto& output = model->output();
+    outputsNames.push_back(output.get_any_name());
+
+    const ov::Shape& outputShape = output.get_shape();
+    ov::Layout outputLayout("");
+    switch (outputShape.size()) {
+    case 3:
+        // 3D output: a single class-index map without a channel dimension
+        outputLayout = "CHW";
+        outChannels = 1;
+        outHeight = static_cast<int>(outputShape[ov::layout::height_idx(outputLayout)]);
+        outWidth = static_cast<int>(outputShape[ov::layout::width_idx(outputLayout)]);
+        break;
+    case 4:
+        // 4D output: per-class probability planes in NCHW order
+        outputLayout = "NCHW";
+        outChannels = static_cast<int>(outputShape[ov::layout::channels_idx(outputLayout)]);
+        outHeight = static_cast<int>(outputShape[ov::layout::height_idx(outputLayout)]);
+        outWidth = static_cast<int>(outputShape[ov::layout::width_idx(outputLayout)]);
+        break;
+    default:
+        throw std::logic_error("Unexpected output tensor shape. Only 4D and 3D outputs are supported.");
+    }
+}
+
+// Converts the network output into a single-channel class-index image and
+// rescales it back to the original input resolution.
+// Supported output precisions: i32 / i64 single-channel maps (argmax already
+// done by the model) and f32 multi-channel probability planes (argmax computed
+// here). Any other precision now throws std::runtime_error instead of silently
+// returning an uninitialized image, which the previous if/else-if chain did.
+std::unique_ptr<ResultBase> SegmentationModel::postprocess(InferenceResult& infResult) {
+    // Own the result from the start so a throw below cannot leak it
+    auto result = std::unique_ptr<ImageResult>(new ImageResult(infResult.frameId, infResult.metaData));
+    const auto& inputImgSize = infResult.internalModelData->asRef<InternalImageModelData>();
+    const auto& outTensor = infResult.getFirstOutputTensor();
+
+    result->resultImage = cv::Mat(outHeight, outWidth, CV_8UC1);
+
+    if (outChannels == 1 && outTensor.get_element_type() == ov::element::i32) {
+        cv::Mat predictions(outHeight, outWidth, CV_32SC1, outTensor.data<int32_t>());
+        predictions.convertTo(result->resultImage, CV_8UC1);
+    } else if (outChannels == 1 && outTensor.get_element_type() == ov::element::i64) {
+        // OpenCV has no 64-bit integer Mat type; downcast element-wise first
+        cv::Mat predictions(outHeight, outWidth, CV_32SC1);
+        const auto data = outTensor.data<int64_t>();
+        for (size_t i = 0; i < predictions.total(); ++i) {
+            reinterpret_cast<int32_t*>(predictions.data)[i] = int32_t(data[i]);
+        }
+        predictions.convertTo(result->resultImage, CV_8UC1);
+    } else if (outTensor.get_element_type() == ov::element::f32) {
+        // Per-pixel argmax over the class probability planes (CHW layout)
+        const float* data = outTensor.data<float>();
+        for (int rowId = 0; rowId < outHeight; ++rowId) {
+            for (int colId = 0; colId < outWidth; ++colId) {
+                int classId = 0;
+                float maxProb = -1.0f;
+                for (int chId = 0; chId < outChannels; ++chId) {
+                    float prob = data[chId * outHeight * outWidth + rowId * outWidth + colId];
+                    if (prob > maxProb) {
+                        classId = chId;
+                        maxProb = prob;
+                    }
+                }  // nChannels
+
+                result->resultImage.at<uint8_t>(rowId, colId) = classId;
+            }  // width
+        }  // height
+    } else {
+        throw std::runtime_error("Unexpected output tensor precision in segmentation postprocessing");
+    }
+
+    // Nearest-neighbour keeps class indices intact (no blending between labels)
+    cv::resize(result->resultImage,
+               result->resultImage,
+               cv::Size(inputImgSize.inputImgWidth, inputImgSize.inputImgHeight),
+               0,
+               0,
+               cv::INTER_NEAREST);
+
+    return result;
+}
diff --git a/python/openvino/runtime/common/models/src/style_transfer_model.cpp b/python/openvino/runtime/common/models/src/style_transfer_model.cpp
new file mode 100644
index 0000000..53e8561
--- /dev/null
+++ b/python/openvino/runtime/common/models/src/style_transfer_model.cpp
@@ -0,0 +1,107 @@
+/*
+// Copyright (C) 2021-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "models/style_transfer_model.h"
+
+#include <stddef.h>
+
+#include <memory>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include <opencv2/core.hpp>
+#include <opencv2/imgproc.hpp>
+#include <openvino/openvino.hpp>
+
+#include <utils/image_utils.h>
+#include <utils/ocv_common.hpp>
+
+#include "models/input_data.h"
+#include "models/internal_model_data.h"
+#include "models/results.h"
+
+// Constructs a style-transfer wrapper. Auto-resize is always disabled (false)
+// because preprocessing is handled explicitly by the PrePostProcessor setup.
+StyleTransferModel::StyleTransferModel(const std::string& modelFileName, const std::string& layout)
+    : ImageModel(modelFileName, false, layout) {}
+
+// Validates the topology (one 3-channel, batch-1, 4D input and one matching
+// output), records the network's input geometry, and installs u8/NHWC input
+// plus f32 output pre/post-processing. Throws std::logic_error on mismatch.
+void StyleTransferModel::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) {
+    // --------------------------- Configure input & output ---------------------------------------------
+    // --------------------------- Prepare input --------------------------------------------------
+    if (model->inputs().size() != 1) {
+        throw std::logic_error("Style transfer model wrapper supports topologies with only 1 input");
+    }
+
+    inputsNames.push_back(model->input().get_any_name());
+
+    const ov::Shape& inputShape = model->input().get_shape();
+    ov::Layout inputLayout = getInputLayout(model->input());
+
+    if (inputShape.size() != 4 || inputShape[ov::layout::batch_idx(inputLayout)] != 1 ||
+        inputShape[ov::layout::channels_idx(inputLayout)] != 3) {
+        throw std::logic_error("3-channel 4-dimensional model's input is expected");
+    }
+
+    netInputWidth = inputShape[ov::layout::width_idx(inputLayout)];
+    netInputHeight = inputShape[ov::layout::height_idx(inputLayout)];
+
+    ov::preprocess::PrePostProcessor ppp(model);
+    ppp.input().preprocess().convert_element_type(ov::element::f32);
+    ppp.input().tensor().set_element_type(ov::element::u8).set_layout("NHWC");
+
+    ppp.input().model().set_layout(inputLayout);
+
+    // --------------------------- Prepare output -----------------------------------------------------
+    const ov::OutputVector& outputs = model->outputs();
+    if (outputs.size() != 1) {
+        throw std::logic_error("Style transfer model wrapper supports topologies with only 1 output");
+    }
+    outputsNames.push_back(model->output().get_any_name());
+
+    const ov::Shape& outputShape = model->output().get_shape();
+    // NOTE(review): the output layout is hard-coded to NCHW rather than queried
+    // from the model — confirm this holds for all supported style models
+    ov::Layout outputLayout{"NCHW"};
+    if (outputShape.size() != 4 || outputShape[ov::layout::batch_idx(outputLayout)] != 1 ||
+        outputShape[ov::layout::channels_idx(outputLayout)] != 3) {
+        throw std::logic_error("3-channel 4-dimensional model's output is expected");
+    }
+
+    ppp.output().tensor().set_element_type(ov::element::f32);
+    model = ppp.build();
+}
+
+// Wraps the f32 NCHW network output into an OpenCV image: the three planes are
+// gathered in reverse order (plane 2, 1, 0) before cv::merge — presumably to
+// turn the model's RGB output into OpenCV's BGR order (confirm with the model
+// spec) — then resized to the original frame size and converted to 8-bit.
+std::unique_ptr<ResultBase> StyleTransferModel::postprocess(InferenceResult& infResult) {
+    ImageResult* result = new ImageResult;
+    // Copy the shared fields (frameId, metaData) from the inference result
+    *static_cast<ResultBase*>(result) = static_cast<ResultBase&>(infResult);
+
+    const auto& inputImgSize = infResult.internalModelData->asRef<InternalImageModelData>();
+    const auto outputData = infResult.getFirstOutputTensor().data<float>();
+
+    const ov::Shape& outputShape = infResult.getFirstOutputTensor().get_shape();
+    // Indices 2/3 of the NCHW shape are height/width
+    size_t outHeight = static_cast<int>(outputShape[2]);
+    size_t outWidth = static_cast<int>(outputShape[3]);
+    size_t numOfPixels = outWidth * outHeight;
+
+    // Each cv::Mat aliases one CHW plane of the tensor (no copy)
+    std::vector<cv::Mat> imgPlanes;
+    imgPlanes = std::vector<cv::Mat>{cv::Mat(outHeight, outWidth, CV_32FC1, &(outputData[numOfPixels * 2])),
+                                     cv::Mat(outHeight, outWidth, CV_32FC1, &(outputData[numOfPixels])),
+                                     cv::Mat(outHeight, outWidth, CV_32FC1, &(outputData[0]))};
+    cv::Mat resultImg;
+    cv::merge(imgPlanes, resultImg);
+    cv::resize(resultImg, result->resultImage, cv::Size(inputImgSize.inputImgWidth, inputImgSize.inputImgHeight));
+
+    result->resultImage.convertTo(result->resultImage, CV_8UC3);
+
+    return std::unique_ptr<ResultBase>(result);
+}
diff --git a/python/openvino/runtime/common/models/src/super_resolution_model.cpp b/python/openvino/runtime/common/models/src/super_resolution_model.cpp
new file mode 100644
index 0000000..164991a
--- /dev/null
+++ b/python/openvino/runtime/common/models/src/super_resolution_model.cpp
@@ -0,0 +1,207 @@
+/*
+// Copyright (C) 2021-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "models/super_resolution_model.h"
+
+#include <stddef.h>
+
+#include <map>
+#include <stdexcept>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include <opencv2/imgproc.hpp>
+#include <openvino/openvino.hpp>
+
+#include <utils/image_utils.h>
+#include <utils/ocv_common.hpp>
+#include <utils/slog.hpp>
+
+#include "models/input_data.h"
+#include "models/internal_model_data.h"
+#include "models/results.h"
+
+// Constructs a super-resolution wrapper. The source frame size is stored so
+// prepareInputsOutputs()/changeInputSize() can reshape the network to match it.
+SuperResolutionModel::SuperResolutionModel(const std::string& modelFileName,
+                                           const cv::Size& inputImgSize,
+                                           const std::string& layout)
+    : ImageModel(modelFileName, false, layout) {
+    netInputHeight = inputImgSize.height;
+    netInputWidth = inputImgSize.width;
+}
+
+// Validates a 1- or 2-input topology (the optional second input is a bicubic
+// upscale of the frame), orders inputsNames so the low-resolution input comes
+// first, installs u8/NHWC input and f32 output processing, then reshapes the
+// network to the stored frame size via changeInputSize().
+void SuperResolutionModel::prepareInputsOutputs(std::shared_ptr<ov::Model>& model) {
+    // --------------------------- Configure input & output ---------------------------------------------
+    // --------------------------- Prepare input --------------------------------------------------
+    const ov::OutputVector& inputs = model->inputs();
+    if (inputs.size() != 1 && inputs.size() != 2) {
+        throw std::logic_error("Super resolution model wrapper supports topologies with 1 or 2 inputs only");
+    }
+    std::string lrInputTensorName = inputs.begin()->get_any_name();
+    inputsNames.push_back(lrInputTensorName);
+    ov::Shape lrShape = inputs.begin()->get_shape();
+    if (lrShape.size() != 4) {
+        throw std::logic_error("Number of dimensions for an input must be 4");
+    }
+    // in case of 2 inputs they have the same layouts
+    ov::Layout inputLayout = getInputLayout(model->inputs().front());
+
+    auto channelsId = ov::layout::channels_idx(inputLayout);
+    auto heightId = ov::layout::height_idx(inputLayout);
+    auto widthId = ov::layout::width_idx(inputLayout);
+
+    if (lrShape[channelsId] != 1 && lrShape[channelsId] != 3) {
+        throw std::logic_error("Input layer is expected to have 1 or 3 channels");
+    }
+
+    // A model like single-image-super-resolution-???? may take bicubic interpolation of the input image as the
+    // second input
+    std::string bicInputTensorName;
+    if (inputs.size() == 2) {
+        bicInputTensorName = (++inputs.begin())->get_any_name();
+        inputsNames.push_back(bicInputTensorName);
+        ov::Shape bicShape = (++inputs.begin())->get_shape();
+        if (bicShape.size() != 4) {
+            throw std::logic_error("Number of dimensions for both inputs must be 4");
+        }
+        // The smaller (low-resolution) input must end up first in inputsNames
+        if (lrShape[widthId] >= bicShape[widthId] && lrShape[heightId] >= bicShape[heightId]) {
+            std::swap(bicShape, lrShape);
+            inputsNames[0].swap(inputsNames[1]);
+        } else if (!(lrShape[widthId] <= bicShape[widthId] && lrShape[heightId] <= bicShape[heightId])) {
+            throw std::logic_error("Each spatial dimension of one input must surpass or be equal to a spatial"
+                                   "dimension of another input");
+        }
+    }
+
+    ov::preprocess::PrePostProcessor ppp(model);
+    for (const auto& input : inputs) {
+        ppp.input(input.get_any_name()).tensor().set_element_type(ov::element::u8).set_layout("NHWC");
+
+        ppp.input(input.get_any_name()).model().set_layout(inputLayout);
+    }
+
+    // --------------------------- Prepare output -----------------------------------------------------
+    const ov::OutputVector& outputs = model->outputs();
+    if (outputs.size() != 1) {
+        throw std::logic_error("Super resolution model wrapper supports topologies with only 1 output");
+    }
+
+    outputsNames.push_back(outputs.begin()->get_any_name());
+    ppp.output().tensor().set_element_type(ov::element::f32);
+    model = ppp.build();
+
+    const ov::Shape& outShape = model->output().get_shape();
+
+    // Derive the upscale coefficient from output width / input width
+    const ov::Layout outputLayout("NCHW");
+    const auto outWidth = outShape[ov::layout::width_idx(outputLayout)];
+    // NOTE(review): lrShape is indexed with the NCHW width index here even
+    // though the input layout may differ — confirm this is intentional
+    const auto inWidth = lrShape[ov::layout::width_idx(outputLayout)];
+    changeInputSize(model, static_cast<int>(outWidth / inWidth));
+}
+
+// Reshapes the network to batch 1 and the stored frame size: the
+// low-resolution input gets netInputHeight x netInputWidth, and the optional
+// bicubic input gets the same size multiplied by the upscale coefficient.
+void SuperResolutionModel::changeInputSize(std::shared_ptr<ov::Model>& model, int coeff) {
+    std::map<std::string, ov::PartialShape> shapes;
+    const ov::Layout& layout = ov::layout::get_layout(model->inputs().front());
+    const auto batchId = ov::layout::batch_idx(layout);
+    const auto heightId = ov::layout::height_idx(layout);
+    const auto widthId = ov::layout::width_idx(layout);
+
+    const ov::OutputVector& inputs = model->inputs();
+    std::string lrInputTensorName = inputs.begin()->get_any_name();
+    ov::Shape lrShape = inputs.begin()->get_shape();
+
+    if (inputs.size() == 2) {
+        std::string bicInputTensorName = (++inputs.begin())->get_any_name();
+        ov::Shape bicShape = (++inputs.begin())->get_shape();
+        // Make sure lrShape/lrInputTensorName refer to the smaller input
+        if (lrShape[heightId] >= bicShape[heightId] && lrShape[widthId] >= bicShape[widthId]) {
+            std::swap(bicShape, lrShape);
+            std::swap(bicInputTensorName, lrInputTensorName);
+        }
+        bicShape[batchId] = 1;
+        bicShape[heightId] = coeff * netInputHeight;
+        bicShape[widthId] = coeff * netInputWidth;
+        shapes[bicInputTensorName] = ov::PartialShape(bicShape);
+    }
+
+    lrShape[batchId] = 1;
+    lrShape[heightId] = netInputHeight;
+    lrShape[widthId] = netInputWidth;
+    shapes[lrInputTensorName] = ov::PartialShape(lrShape);
+
+    model->reshape(shapes);
+}
+
+// Fills the request's input tensor(s) from the frame: converts to grayscale
+// when the network expects a single channel, resizes the frame to the network
+// input size, and — for two-input models — additionally supplies a bicubic
+// upscale of the frame. Returns the processed frame's dimensions.
+std::shared_ptr<InternalModelData> SuperResolutionModel::preprocess(const InputData& inputData,
+                                                                    ov::InferRequest& request) {
+    auto imgData = inputData.asRef<ImageInputData>();
+    auto& img = imgData.inputImage;
+
+    const ov::Tensor lrInputTensor = request.get_tensor(inputsNames[0]);
+    const ov::Layout layout("NHWC");
+
+    // Single-channel networks take a grayscale frame
+    if (img.channels() != static_cast<int>(lrInputTensor.get_shape()[ov::layout::channels_idx(layout)])) {
+        cv::cvtColor(img, img, cv::COLOR_BGR2GRAY);
+    }
+
+    if (static_cast<size_t>(img.cols) != netInputWidth || static_cast<size_t>(img.rows) != netInputHeight) {
+        slog::warn << "\tChosen model aspect ratio doesn't match image aspect ratio" << slog::endl;
+    }
+    const size_t height = lrInputTensor.get_shape()[ov::layout::height_idx(layout)];
+    const size_t width = lrInputTensor.get_shape()[ov::layout::width_idx(layout)];
+    img = resizeImageExt(img, width, height);
+    request.set_tensor(inputsNames[0], wrapMat2Tensor(img));
+
+    if (inputsNames.size() == 2) {
+        // Second input: bicubic upscale of the frame to the network's size
+        const ov::Tensor bicInputTensor = request.get_tensor(inputsNames[1]);
+        const int h = static_cast<int>(bicInputTensor.get_shape()[ov::layout::height_idx(layout)]);
+        const int w = static_cast<int>(bicInputTensor.get_shape()[ov::layout::width_idx(layout)]);
+        cv::Mat resized;
+        cv::resize(img, resized, cv::Size(w, h), 0, 0, cv::INTER_CUBIC);
+        request.set_tensor(inputsNames[1], wrapMat2Tensor(resized));
+    }
+
+    return std::make_shared<InternalImageModelData>(img.cols, img.rows);
+}
+
+// Assembles the f32 NCHW network output into an 8-bit OpenCV image.
+// 3-channel outputs are merged plane by plane in tensor order; single-channel
+// outputs (text-image-super-resolution models) are binarized at 0.5 first.
+// Values are scaled from [0, 1] to [0, 255].
+std::unique_ptr<ResultBase> SuperResolutionModel::postprocess(InferenceResult& infResult) {
+    ImageResult* result = new ImageResult;
+    // Copy the shared fields (frameId, metaData) from the inference result
+    *static_cast<ResultBase*>(result) = static_cast<ResultBase&>(infResult);
+    const auto outputData = infResult.getFirstOutputTensor().data<float>();
+
+    std::vector<cv::Mat> imgPlanes;
+    const ov::Shape& outShape = infResult.getFirstOutputTensor().get_shape();
+    // NCHW shape: index 1 = channels, 2 = height, 3 = width. The previous code
+    // narrowed each size_t through static_cast<int> before widening it back,
+    // which was misleading (and lossy for huge dims); keep the values as-is.
+    const size_t outChannels = outShape[1];
+    const size_t outHeight = outShape[2];
+    const size_t outWidth = outShape[3];
+    const size_t numOfPixels = outWidth * outHeight;
+    if (outChannels == 3) {
+        // Each cv::Mat aliases one CHW plane of the tensor (no copy)
+        imgPlanes = std::vector<cv::Mat>{cv::Mat(outHeight, outWidth, CV_32FC1, &(outputData[0])),
+                                         cv::Mat(outHeight, outWidth, CV_32FC1, &(outputData[numOfPixels])),
+                                         cv::Mat(outHeight, outWidth, CV_32FC1, &(outputData[numOfPixels * 2]))};
+    } else {
+        imgPlanes = std::vector<cv::Mat>{cv::Mat(outHeight, outWidth, CV_32FC1, &(outputData[0]))};
+        // Post-processing for text-image-super-resolution models
+        cv::threshold(imgPlanes[0], imgPlanes[0], 0.5f, 1.0f, cv::THRESH_BINARY);
+    }
+
+    for (auto& img : imgPlanes) {
+        img.convertTo(img, CV_8UC1, 255);
+    }
+    cv::Mat resultImg;
+    cv::merge(imgPlanes, resultImg);
+    result->resultImage = resultImg;
+
+    return std::unique_ptr<ResultBase>(result);
+}
diff --git a/python/openvino/runtime/common/monitors/CMakeLists.txt b/python/openvino/runtime/common/monitors/CMakeLists.txt
new file mode 100644
index 0000000..1bfe0b9
--- /dev/null
+++ b/python/openvino/runtime/common/monitors/CMakeLists.txt
@@ -0,0 +1,38 @@
+# Copyright (C) 2018-2019 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+
+# OpenCV core/imgproc are required by the presenter's graph rendering
+find_package(OpenCV REQUIRED COMPONENTS core imgproc)
+
+set(SOURCES
+    src/cpu_monitor.cpp
+    src/memory_monitor.cpp
+    src/presenter.cpp)
+
+set(HEADERS
+    include/monitors/cpu_monitor.h
+    include/monitors/memory_monitor.h
+    include/monitors/presenter.h)
+
+# The PDH-based query wrapper is only needed (and only compiles) on Windows
+if(WIN32)
+    list(APPEND SOURCES src/query_wrapper.cpp)
+    list(APPEND HEADERS include/monitors/query_wrapper.h)
+endif()
+# Create named folders for the sources within the .vcproj
+# Empty name lists them directly under the .vcproj
+source_group("src" FILES ${SOURCES})
+source_group("include" FILES ${HEADERS})
+
+add_library(monitors STATIC ${SOURCES} ${HEADERS})
+target_include_directories(monitors PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include")
+target_link_libraries(monitors PRIVATE opencv_core opencv_imgproc)
+if(WIN32)
+    # Performance Data Helper library backs the CPU/memory counters on Windows
+    target_link_libraries(monitors PRIVATE pdh)
+
+    target_compile_definitions(monitors PRIVATE
+        # Prevents Windows.h from adding unnecessary includes
+        WIN32_LEAN_AND_MEAN
+        # Prevents Windows.h from defining min/max as macros
+        NOMINMAX
+    )
+endif()
diff --git a/python/openvino/runtime/common/monitors/include/monitors/cpu_monitor.h b/python/openvino/runtime/common/monitors/include/monitors/cpu_monitor.h
new file mode 100644
index 0000000..38d2845
--- /dev/null
+++ b/python/openvino/runtime/common/monitors/include/monitors/cpu_monitor.h
@@ -0,0 +1,28 @@
+// Copyright (C) 2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <deque>
+#include <memory>
+#include <vector>
+
+// Tracks per-core CPU utilization over a sliding window of samples.
+// Collection is enabled with setHistorySize(n > 0) and disabled with
+// setHistorySize(0); each collectData() call appends one sample.
+class CpuMonitor {
+public:
+    CpuMonitor();
+    ~CpuMonitor();
+    void setHistorySize(std::size_t size);
+    std::size_t getHistorySize() const;
+    void collectData();
+    std::deque<std::vector<double>> getLastHistory() const;
+    std::vector<double> getMeanCpuLoad() const;
+
+private:
+    unsigned samplesNumber;  // samples accumulated into cpuLoadSum
+    unsigned historySize;
+    std::vector<double> cpuLoadSum;  // per-core running sum of load values
+    std::deque<std::vector<double>> cpuLoadHistory;
+    // Platform-specific sampler, forward-declared and held behind unique_ptr
+    class PerformanceCounter;
+    std::unique_ptr<PerformanceCounter> performanceCounter;
+};
diff --git a/python/openvino/runtime/common/monitors/include/monitors/memory_monitor.h b/python/openvino/runtime/common/monitors/include/monitors/memory_monitor.h
new file mode 100644
index 0000000..9eda10f
--- /dev/null
+++ b/python/openvino/runtime/common/monitors/include/monitors/memory_monitor.h
@@ -0,0 +1,34 @@
+// Copyright (C) 2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <deque>
+#include <memory>
+
+// Tracks memory and swap usage over a sliding window of samples;
+// values are reported in GiB (see getMeanMem()).
+class MemoryMonitor {
+public:
+    MemoryMonitor();
+    ~MemoryMonitor();
+    void setHistorySize(std::size_t size);
+    std::size_t getHistorySize() const;
+    void collectData();
+    std::deque<std::pair<double, double>> getLastHistory() const;
+    double getMeanMem() const; // in GiB
+    double getMeanSwap() const;
+    double getMaxMem() const;
+    double getMaxSwap() const;
+    double getMemTotal() const;
+    double getMaxMemTotal() const; // a system may have hotpluggable memory
+private:
+    unsigned samplesNumber;  // samples accumulated into memSum/swapSum
+    std::size_t historySize;
+    double memSum, swapSum;  // running sums for mean computation
+    double maxMem, maxSwap;  // peak values observed so far
+    double memTotal;
+    double maxMemTotal;
+    std::deque<std::pair<double, double>> memSwapUsageHistory;  // (mem, swap)
+    // Platform-specific sampler, forward-declared and held behind unique_ptr
+    class PerformanceCounter;
+    std::unique_ptr<PerformanceCounter> performanceCounter;
+};
diff --git a/python/openvino/runtime/common/monitors/include/monitors/presenter.h b/python/openvino/runtime/common/monitors/include/monitors/presenter.h
new file mode 100644
index 0000000..c6587a0
--- /dev/null
+++ b/python/openvino/runtime/common/monitors/include/monitors/presenter.h
@@ -0,0 +1,44 @@
+// Copyright (C) 2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <chrono>
+#include <map>
+#include <ostream>
+#include <set>
+
+#include <opencv2/imgproc.hpp>
+
+#include "cpu_monitor.h"
+#include "memory_monitor.h"
+
+enum class MonitorType{CpuAverage, DistributionCpu, Memory};
+
+// Draws CPU/memory usage graphs onto frames and reports mean values.
+// Monitors are toggled through addRemoveMonitor() or handleKey().
+// NOTE(review): this header uses std::string, std::vector and
+// std::ostringstream but includes none of <string>, <vector>, <sstream> —
+// it currently relies on transitive includes; confirm and add them.
+class Presenter {
+public:
+    explicit Presenter(std::set<MonitorType> enabledMonitors = {},
+                       int yPos = 20,
+                       cv::Size graphSize = {150, 60},
+                       std::size_t historySize = 20);
+    explicit Presenter(const std::string& keys,
+                       int yPos = 20,
+                       cv::Size graphSize = {150, 60},
+                       std::size_t historySize = 20);
+    void addRemoveMonitor(MonitorType monitor);
+    void handleKey(int key); // handles C, D, M, H keys
+    void drawGraphs(cv::Mat& frame);
+    std::vector<std::string> reportMeans() const;
+
+    const int yPos;
+    const cv::Size graphSize;
+    const int graphPadding;
+private:
+    std::chrono::steady_clock::time_point prevTimeStamp;  // throttles graph updates
+    std::size_t historySize;
+    CpuMonitor cpuMonitor;
+    bool distributionCpuEnabled;  // per-core view vs. average view
+    MemoryMonitor memoryMonitor;
+    std::ostringstream strStream;
+};
diff --git a/python/openvino/runtime/common/monitors/include/monitors/query_wrapper.h b/python/openvino/runtime/common/monitors/include/monitors/query_wrapper.h
new file mode 100644
index 0000000..d69f548
--- /dev/null
+++ b/python/openvino/runtime/common/monitors/include/monitors/query_wrapper.h
@@ -0,0 +1,17 @@
+// Copyright (C) 2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <Pdh.h>
+// Non-copyable owner of a PDH query handle; presumably opens the query in the
+// constructor and closes it in the destructor (defined in query_wrapper.cpp —
+// confirm). Implicit conversion lets it be passed straight to Pdh* calls.
+class QueryWrapper {
+public:
+    QueryWrapper();
+    ~QueryWrapper();
+    QueryWrapper(const QueryWrapper&) = delete;
+    QueryWrapper& operator=(const QueryWrapper&) = delete;
+    operator PDH_HQUERY() const;
+private:
+    PDH_HQUERY query;
+};
diff --git a/python/openvino/runtime/common/monitors/src/cpu_monitor.cpp b/python/openvino/runtime/common/monitors/src/cpu_monitor.cpp
new file mode 100644
index 0000000..e5172a2
--- /dev/null
+++ b/python/openvino/runtime/common/monitors/src/cpu_monitor.cpp
@@ -0,0 +1,206 @@
+// Copyright (C) 2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "monitors/cpu_monitor.h"
+
+#include <algorithm>
+#ifdef _WIN32
+#include "monitors/query_wrapper.h"
+#include <string>
+#include <system_error>
+#include <PdhMsg.h>
+#include <Windows.h>
+
+namespace {
+// Logical processor count reported by the OS, queried once at startup
+const std::size_t nCores = []() {
+    SYSTEM_INFO sysinfo;
+    GetSystemInfo(&sysinfo);
+    return sysinfo.dwNumberOfProcessors;
+    }();
+}
+
+// Windows sampler: one PDH "% Processor Time" counter per core. The
+// constructor performs the initial PdhCollectQueryData() call because PDH
+// rate counters need two collections before a value can be formatted.
+class CpuMonitor::PerformanceCounter {
+public:
+    PerformanceCounter() : coreTimeCounters(nCores) {
+        PDH_STATUS status;
+        for (std::size_t i = 0; i < nCores; ++i) {
+            std::wstring fullCounterPath{L"\\Processor(" + std::to_wstring(i) + L")\\% Processor Time"};
+            status = PdhAddCounterW(query, fullCounterPath.c_str(), 0, &coreTimeCounters[i]);
+            if (ERROR_SUCCESS != status) {
+                throw std::system_error(status, std::system_category(), "PdhAddCounterW() failed");
+            }
+            status = PdhSetCounterScaleFactor(coreTimeCounters[i], -2); // scale counter to [0, 1]
+            if (ERROR_SUCCESS != status) {
+                throw std::system_error(status, std::system_category(), "PdhSetCounterScaleFactor() failed");
+            }
+        }
+        status = PdhCollectQueryData(query);
+        if (ERROR_SUCCESS != status) {
+            throw std::system_error(status, std::system_category(), "PdhCollectQueryData() failed");
+        }
+    }
+
+    // Returns per-core load in [0, 1]; an empty vector means "no data yet"
+    std::vector<double> getCpuLoad() {
+        PDH_STATUS status;
+        status = PdhCollectQueryData(query);
+        if (ERROR_SUCCESS != status) {
+            throw std::system_error(status, std::system_category(), "PdhCollectQueryData() failed");
+        }
+
+        PDH_FMT_COUNTERVALUE displayValue;
+        std::vector<double> cpuLoad(coreTimeCounters.size());
+        for (std::size_t i = 0; i < coreTimeCounters.size(); ++i) {
+            status = PdhGetFormattedCounterValue(coreTimeCounters[i], PDH_FMT_DOUBLE, NULL,
+                &displayValue);
+            switch (status) {
+                case ERROR_SUCCESS: break;
+                // PdhGetFormattedCounterValue() can sometimes return PDH_CALC_NEGATIVE_DENOMINATOR for some reason
+                case PDH_CALC_NEGATIVE_DENOMINATOR: return {};
+                default:
+                    throw std::system_error(status, std::system_category(), "PdhGetFormattedCounterValue() failed");
+            }
+            if (PDH_CSTATUS_VALID_DATA != displayValue.CStatus && PDH_CSTATUS_NEW_DATA != displayValue.CStatus) {
+                throw std::runtime_error("Error in counter data");
+            }
+
+            cpuLoad[i] = displayValue.doubleValue;
+        }
+        return cpuLoad;
+    }
+
+private:
+    QueryWrapper query;
+    std::vector<PDH_HCOUNTER> coreTimeCounters;
+};
+
+#elif __linux__
+#include <chrono>
+#include <regex>
+#include <utility>
+#include <fstream>
+#include <unistd.h>
+
+namespace {
+// Jiffies per second, used to convert /proc/stat tick counts into seconds
+const long clockTicks = sysconf(_SC_CLK_TCK);
+
+const std::size_t nCores = sysconf(_SC_NPROCESSORS_CONF);
+
+// Parses /proc/stat and returns, for each core, the sum of the idle and
+// iowait jiffy counters. Throws if a core id >= nCores appears.
+std::vector<unsigned long> getIdleCpuStat() {
+    std::vector<unsigned long> idleCpuStat(nCores);
+    std::ifstream procStat("/proc/stat");
+    std::string line;
+    std::smatch match;
+    std::regex coreJiffies("^cpu(\\d+)\\s+"
+            "(\\d+)\\s+"
+            "(\\d+)\\s+"
+            "(\\d+)\\s+"
+            "(\\d+)\\s+" // idle
+            "(\\d+)"); // iowait
+
+    while (std::getline(procStat, line)) {
+        if (std::regex_search(line, match, coreJiffies)) {
+            // it doesn't handle overflow of sum and overflows of /proc/stat values
+            unsigned long idleInfo = stoul(match[5]) + stoul(match[6]),
+                coreId = stoul(match[1]);
+            if (nCores <= coreId) {
+                throw std::runtime_error("The number of cores has changed");
+            }
+            idleCpuStat[coreId] = idleInfo;
+        }
+    }
+    return idleCpuStat;
+}
+}
+
+// Linux sampler: derives per-core load from the change in idle+iowait jiffies
+// between two /proc/stat snapshots, divided by elapsed wall-clock time.
+class CpuMonitor::PerformanceCounter {
+public:
+    PerformanceCounter() : prevIdleCpuStat{getIdleCpuStat()}, prevTimePoint{std::chrono::steady_clock::now()} {}
+
+    // Returns per-core load in [0, 1]; empty vector when called again within
+    // 100 ms of the previous sample (too-short intervals give negative values)
+    std::vector<double> getCpuLoad() {
+        std::vector<unsigned long> idleCpuStat = getIdleCpuStat();
+        auto timePoint = std::chrono::steady_clock::now();
+        // don't update data too frequently which may result in negative values for cpuLoad.
+        // It may happen when collectData() is called just after setHistorySize().
+        if (timePoint - prevTimePoint > std::chrono::milliseconds{100}) {
+            std::vector<double> cpuLoad(nCores);
+            for (std::size_t i = 0; i < idleCpuStat.size(); ++i) {
+                double idleDiff = idleCpuStat[i] - prevIdleCpuStat[i];
+                typedef std::chrono::duration<double, std::chrono::seconds::period> Sec;
+                cpuLoad[i] = 1.0
+                    - idleDiff / clockTicks / std::chrono::duration_cast<Sec>(timePoint - prevTimePoint).count();
+            }
+            prevIdleCpuStat = std::move(idleCpuStat);
+            prevTimePoint = timePoint;
+            return cpuLoad;
+        }
+        return {};
+    }
+private:
+    std::vector<unsigned long> prevIdleCpuStat;
+    std::chrono::steady_clock::time_point prevTimePoint;
+};
+
+#else
+// not implemented
+namespace {
+const std::size_t nCores{0};
+}
+
+// Stub for unsupported platforms: always reports an empty sample
+class CpuMonitor::PerformanceCounter {
+public:
+    std::vector<double> getCpuLoad() {return {};};
+};
+#endif
+
+// Monitoring starts disabled: no samples, zero history, zeroed per-core sums
+CpuMonitor::CpuMonitor() :
+        samplesNumber{0},
+        historySize{0},
+        cpuLoadSum(nCores, 0) {}
+
+// PerformanceCounter is incomplete in header and destructor can't be defined implicitly
+CpuMonitor::~CpuMonitor() = default;
+
+// Resizes the sample window. Transitioning from 0 to non-zero creates the
+// platform sampler; transitioning back to 0 destroys it. Any existing history
+// beyond the new size is dropped, keeping the most recent samples.
+void CpuMonitor::setHistorySize(std::size_t size) {
+    const bool wasEnabled = (0 != historySize);
+    const bool nowEnabled = (0 != size);
+    if (nowEnabled && !wasEnabled) {
+        performanceCounter.reset(new PerformanceCounter);
+    } else if (wasEnabled && !nowEnabled) {
+        performanceCounter.reset();
+    }
+    historySize = size;
+    const auto keepCount = static_cast<std::ptrdiff_t>(std::min(size, cpuLoadHistory.size()));
+    cpuLoadHistory.erase(cpuLoadHistory.begin(), cpuLoadHistory.end() - keepCount);
+}
+
+// Takes one CPU load sample: adds it to the per-core running sums and appends
+// it to the bounded history deque. An empty sample (counter not ready) is
+// silently skipped.
+void CpuMonitor::collectData() {
+    // The sampler only exists while historySize > 0 (see setHistorySize);
+    // calling collectData() in the disabled state used to dereference a null
+    // pointer — treat it as a no-op instead.
+    if (!performanceCounter) {
+        return;
+    }
+    std::vector<double> cpuLoad = performanceCounter->getCpuLoad();
+
+    if (!cpuLoad.empty()) {
+        for (std::size_t i = 0; i < cpuLoad.size(); ++i) {
+            cpuLoadSum[i] += cpuLoad[i];
+        }
+        ++samplesNumber;
+
+        cpuLoadHistory.push_back(std::move(cpuLoad));
+        if (cpuLoadHistory.size() > historySize) {
+            cpuLoadHistory.pop_front();
+        }
+    }
+}
+
+// Current size of the sample window.
+std::size_t CpuMonitor::getHistorySize() const {
+    return historySize;
+}
+
+// Copy of the retained per-core load samples, oldest first.
+std::deque<std::vector<double>> CpuMonitor::getLastHistory() const {
+    return cpuLoadHistory;
+}
+
+// Per-core mean load over all collected samples; all zeros before the first
+// sample has been taken.
+std::vector<double> CpuMonitor::getMeanCpuLoad() const {
+    std::vector<double> meanLoad(cpuLoadSum.size(), 0.0);
+    if (samplesNumber != 0) {
+        std::transform(cpuLoadSum.begin(), cpuLoadSum.end(), meanLoad.begin(),
+                       [this](double coreSum) { return coreSum / samplesNumber; });
+    }
+    return meanLoad;
+}
diff --git a/python/openvino/runtime/common/monitors/src/memory_monitor.cpp b/python/openvino/runtime/common/monitors/src/memory_monitor.cpp
new file mode 100644
index 0000000..70879d6
--- /dev/null
+++ b/python/openvino/runtime/common/monitors/src/memory_monitor.cpp
@@ -0,0 +1,213 @@
+// Copyright (C) 2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "monitors/memory_monitor.h"
+
+// One snapshot of system memory usage. All values are in GiB.
+struct MemState {
+    double memTotal, usedMem, usedSwap;
+};
+
+#ifdef _WIN32
+#include "monitors/query_wrapper.h"
+#include <algorithm>
+#define PSAPI_VERSION 2
+#include <system_error>
+#include <Windows.h>
+#include <PdhMsg.h>
+#include <Psapi.h>
+
+namespace {
+// Total physical memory in GiB, queried through PSAPI.
+double getMemTotal() {
+    PERFORMANCE_INFORMATION performanceInformation;
+    if (!GetPerformanceInfo(&performanceInformation, sizeof(performanceInformation))) {
+        throw std::runtime_error("GetPerformanceInfo() failed");
+    }
+    return static_cast<double>(performanceInformation.PhysicalTotal * performanceInformation.PageSize)
+        / (1024 * 1024 * 1024);
+}
+}
+
+class MemoryMonitor::PerformanceCounter {
+public:
+    // Registers a PDH counter tracking page file usage. PdhSetCounterScaleFactor
+    // with -2 multiplies the reported percentage by 10^-2, i.e. into [0, 1].
+    PerformanceCounter() {
+        PDH_STATUS status = PdhAddCounterW(query, L"\\Paging File(_Total)\\% Usage", 0, &pagingFileUsageCounter);
+        if (ERROR_SUCCESS != status) {
+            throw std::system_error(status, std::system_category(), "PdhAddCounterW() failed");
+        }
+        status = PdhSetCounterScaleFactor(pagingFileUsageCounter, -2); // scale counter to [0, 1]
+        if (ERROR_SUCCESS != status) {
+            throw std::system_error(status, std::system_category(), "PdhSetCounterScaleFactor() failed");
+        }
+    }
+
+    // Samples total/used RAM via PSAPI and used swap via the PDH counter.
+    // All returned values are converted to GiB.
+    MemState getMemState() {
+        PERFORMANCE_INFORMATION performanceInformation;
+        if (!GetPerformanceInfo(&performanceInformation, sizeof(performanceInformation))) {
+            throw std::runtime_error("GetPerformanceInfo() failed");
+        }
+
+        PDH_STATUS status;
+        status = PdhCollectQueryData(query);
+        if (ERROR_SUCCESS != status) {
+            throw std::system_error(status, std::system_category(), "PdhCollectQueryData() failed");
+        }
+        PDH_FMT_COUNTERVALUE displayValue;
+        status = PdhGetFormattedCounterValue(pagingFileUsageCounter, PDH_FMT_DOUBLE, NULL, &displayValue);
+        if (ERROR_SUCCESS != status) {
+            throw std::system_error(status, std::system_category(), "PdhGetFormattedCounterValue() failed");
+        }
+        if (PDH_CSTATUS_VALID_DATA != displayValue.CStatus && PDH_CSTATUS_NEW_DATA != displayValue.CStatus) {
+            throw std::runtime_error("Error in counter data");
+        }
+
+        // Combined size of the paging files: commit limit minus physical RAM.
+        double pagingFilesSize = static_cast<double>(
+            (performanceInformation.CommitLimit - performanceInformation.PhysicalTotal)
+            * performanceInformation.PageSize) / (1024 * 1024 * 1024);
+        // {total RAM, RAM in use, swap in use}, all in GiB.
+        return {static_cast<double>(performanceInformation.PhysicalTotal * performanceInformation.PageSize)
+                / (1024 * 1024 * 1024),
+            static_cast<double>(
+                (performanceInformation.PhysicalTotal - performanceInformation.PhysicalAvailable)
+                * performanceInformation.PageSize) / (1024 * 1024 * 1024),
+            pagingFilesSize * displayValue.doubleValue};
+    }
+private:
+    QueryWrapper query;
+    PDH_HCOUNTER pagingFileUsageCounter;
+};
+
+#elif __linux__
+#include <fstream>
+#include <utility>
+#include <vector>
+#include <regex>
+
+namespace {
+// Parses /proc/meminfo. Returns {{MemAvailable, SwapFree}, {MemTotal, SwapTotal}}
+// with every value converted from kB to GiB. Throws if MemTotal is missing.
+std::pair<std::pair<double, double>, std::pair<double, double>> getAvailableMemSwapTotalMemSwap() {
+    double memAvailable = 0, swapFree = 0, memTotal = 0, swapTotal = 0;
+    std::regex memRegex("^(.+):\\s+(\\d+) kB$");
+    std::string line;
+    std::smatch match;
+    std::ifstream meminfo("/proc/meminfo");
+    while (std::getline(meminfo, line)) {
+        if (std::regex_match(line, match, memRegex)) {
+            if ("MemAvailable" == match[1]) {
+                memAvailable = stod(match[2]) / (1024 * 1024);
+            } else if ("SwapFree" == match[1]) {
+                swapFree = stod(match[2]) / (1024 * 1024);
+            } else if ("MemTotal" == match[1]) {
+                memTotal = stod(match[2]) / (1024 * 1024);
+            } else if ("SwapTotal" == match[1]) {
+                swapTotal = stod(match[2]) / (1024 * 1024);
+            }
+        }
+    }
+    // A zero MemTotal means the file could not be read or parsed.
+    if (0 == memTotal) {
+        throw std::runtime_error("Can't get MemTotal");
+    }
+    return {{memAvailable, swapFree}, {memTotal, swapTotal}};
+}
+
+// Total physical memory in GiB.
+double getMemTotal() {
+    return getAvailableMemSwapTotalMemSwap().second.first;
+}
+}
+
+class MemoryMonitor::PerformanceCounter {
+public:
+    // Derives used memory/swap as total minus available/free; values in GiB.
+    MemState getMemState() {
+        std::pair<std::pair<double, double>, std::pair<double, double>> availableMemSwapTotalMemSwap
+            = getAvailableMemSwapTotalMemSwap();
+        double memTotal = availableMemSwapTotalMemSwap.second.first;
+        double swapTotal = availableMemSwapTotalMemSwap.second.second;
+        return {memTotal, memTotal - availableMemSwapTotalMemSwap.first.first, swapTotal - availableMemSwapTotalMemSwap.first.second};
+    }
+};
+
+#else
+// not implemented
+// Fallback for unsupported platforms: every reading is zero.
+namespace {
+double getMemTotal() {return 0.0;}
+}
+
+class MemoryMonitor::PerformanceCounter {
+public:
+    MemState getMemState() {return {0.0, 0.0, 0.0};}
+};
+#endif
+
+// Starts with monitoring disabled (historySize == 0); enable via setHistorySize().
+MemoryMonitor::MemoryMonitor() :
+    samplesNumber{0},
+    historySize{0},
+    memSum{0.0},
+    swapSum{0.0},
+    maxMem{0.0},
+    maxSwap{0.0},
+    memTotal{0.0},
+    maxMemTotal{0.0} {}
+
+// PerformanceCounter is incomplete in header and destructor can't be defined implicitly
+MemoryMonitor::~MemoryMonitor() = default;
+
+// Enables/disables monitoring and bounds the stored history; when shrinking,
+// the most recent `size` samples are kept.
+void MemoryMonitor::setHistorySize(std::size_t size) {
+    if (0 == historySize && 0 != size) {
+        performanceCounter.reset(new MemoryMonitor::PerformanceCounter);
+        // memTotal is not initialized in constructor because for linux its initialization involves constructing
+        // std::regex which is unimplemented and throws an exception for gcc 4.8.5 (default for CentOS 7.4).
+        // Delaying initialization triggers the error only when the monitor is used
+        // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53631
+        memTotal = ::getMemTotal();
+    } else if (0 != historySize && 0 == size) {
+        performanceCounter.reset();
+    }
+    historySize = size;
+    std::size_t newSize = std::min(size, memSwapUsageHistory.size());
+    memSwapUsageHistory.erase(memSwapUsageHistory.begin(), memSwapUsageHistory.end() - newSize);
+}
+
+// Takes one sample, updates running sums/maxima, and appends
+// {usedMem, usedSwap} to the bounded history.
+// NOTE(review): assumes setHistorySize() was called with a non-zero value
+// first, otherwise performanceCounter is null -- confirm callers uphold this.
+void MemoryMonitor::collectData() {
+    MemState memState = performanceCounter->getMemState();
+    maxMemTotal = std::max(maxMemTotal, memState.memTotal);
+    memSum += memState.usedMem;
+    swapSum += memState.usedSwap;
+    ++samplesNumber;
+    maxMem = std::max(maxMem, memState.usedMem);
+    maxSwap = std::max(maxSwap, memState.usedSwap);
+
+    memSwapUsageHistory.emplace_back(memState.usedMem, memState.usedSwap);
+    if (memSwapUsageHistory.size() > historySize) {
+        memSwapUsageHistory.pop_front();
+    }
+}
+
+// Number of samples the history deque is allowed to hold (0 == disabled).
+std::size_t MemoryMonitor::getHistorySize() const {
+    return historySize;
+}
+
+// Copy of the recorded {usedMem, usedSwap} samples (GiB), oldest first.
+std::deque<std::pair<double, double>> MemoryMonitor::getLastHistory() const {
+    return memSwapUsageHistory;
+}
+
+// Mean used memory in GiB over all samples; 0 while no samples were taken.
+double MemoryMonitor::getMeanMem() const {
+    return samplesNumber ? memSum / samplesNumber : 0;
+}
+
+// Mean used swap in GiB over all samples; 0 while no samples were taken.
+double MemoryMonitor::getMeanSwap() const {
+    return samplesNumber ? swapSum / samplesNumber : 0;
+}
+
+// Peak used memory (GiB) observed so far.
+double MemoryMonitor::getMaxMem() const {
+    return maxMem;
+}
+
+// Peak used swap (GiB) observed so far.
+double MemoryMonitor::getMaxSwap() const {
+    return maxSwap;
+}
+
+// Total physical memory (GiB) captured when monitoring was first enabled.
+double MemoryMonitor::getMemTotal() const {
+    return memTotal;
+}
+
+// Largest total-memory reading observed across samples.
+double MemoryMonitor::getMaxMemTotal() const {
+    return maxMemTotal;
+}
diff --git a/python/openvino/runtime/common/monitors/src/presenter.cpp b/python/openvino/runtime/common/monitors/src/presenter.cpp
new file mode 100644
index 0000000..61f5e15
--- /dev/null
+++ b/python/openvino/runtime/common/monitors/src/presenter.cpp
@@ -0,0 +1,330 @@
+// Copyright (C) 2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <cctype>
+#include <chrono>
+#include <iomanip>
+#include <numeric>
+#include <string>
+#include <vector>
+
+#include "monitors/presenter.h"
+
+namespace {
+// Keyboard toggles: 'C' average CPU, 'D' per-core CPU, 'M' memory.
+const std::map<int, MonitorType> keyToMonitorType{
+    {'C', MonitorType::CpuAverage},
+    {'D', MonitorType::DistributionCpu},
+    {'M', MonitorType::Memory}};
+
+// Translates a command-line key string into the set of monitors to enable.
+// "h" alone means "show nothing"; 'h' mixed with other keys is contradictory
+// and rejected. Keys are case-insensitive; unknown keys throw.
+std::set<MonitorType> strKeysToMonitorSet(const std::string& keys) {
+    std::set<MonitorType> enabledMonitors;
+    if (keys == "h") {
+        return enabledMonitors;
+    }
+    for (unsigned char key: keys) {
+        if (key == 'h') {
+            throw std::runtime_error("Unacceptable combination of monitor types-can't show and hide info at the same time");
+        }
+        auto iter = keyToMonitorType.find(std::toupper(key));
+        if (keyToMonitorType.end() == iter) {
+            throw std::runtime_error("Unknown monitor type");
+        } else {
+            enabledMonitors.insert(iter->second);
+        }
+    }
+    return enabledMonitors;
+}
+}
+
+// Enables each requested monitor. Graphs are drawn yPos pixels from the frame
+// top, separated by a padding of 5% of the graph width (at least 1 px).
+Presenter::Presenter(std::set<MonitorType> enabledMonitors,
+                     int yPos,
+                     cv::Size graphSize,
+                     std::size_t historySize) :
+    yPos{yPos},
+    graphSize{graphSize},
+    graphPadding{std::max(1, static_cast<int>(graphSize.width * 0.05))},
+    historySize{historySize},
+    distributionCpuEnabled{false},
+    strStream{std::ios_base::app} {
+    for (MonitorType monitor : enabledMonitors) {
+        addRemoveMonitor(monitor);
+    }
+}
+
+// Convenience overload: parses the key string (see strKeysToMonitorSet).
+Presenter::Presenter(const std::string& keys, int yPos, cv::Size graphSize, std::size_t historySize) :
+    Presenter{strKeysToMonitorSet(keys), yPos, graphSize, historySize} {}
+
+// Toggles the given monitor on/off. The monitor's history is sized so that
+// consecutive samples, spaced sampleStep pixels apart, span the graph width.
+// CpuAverage and DistributionCpu share cpuMonitor: a history size of 1 means
+// "collecting for the distribution view only".
+void Presenter::addRemoveMonitor(MonitorType monitor) {
+    unsigned updatedHistorySize = 1;
+    if (historySize > 1) {
+        int sampleStep = std::max(1, static_cast<int>(graphSize.width / (historySize - 1)));
+        // +1 to plot graphSize.width/sampleStep segments
+        // add round up to and an extra element if don't reach graph edge
+        updatedHistorySize = (graphSize.width + sampleStep - 1) / sampleStep + 1;
+    }
+    switch(monitor) {
+        case MonitorType::CpuAverage: {
+            if (cpuMonitor.getHistorySize() > 1 && distributionCpuEnabled) {
+                cpuMonitor.setHistorySize(1);
+            } else if (cpuMonitor.getHistorySize() > 1 && !distributionCpuEnabled) {
+                cpuMonitor.setHistorySize(0);
+            } else { // cpuMonitor.getHistorySize() <= 1
+                cpuMonitor.setHistorySize(updatedHistorySize);
+            }
+            break;
+        }
+        case MonitorType::DistributionCpu: {
+            if (distributionCpuEnabled) {
+                distributionCpuEnabled = false;
+                if (1 == cpuMonitor.getHistorySize()) { // cpuMonitor was used only for DistributionCpu => disable it
+                    cpuMonitor.setHistorySize(0);
+                }
+            } else {
+                distributionCpuEnabled = true;
+                cpuMonitor.setHistorySize(std::max(std::size_t{1}, cpuMonitor.getHistorySize()));
+            }
+            break;
+        }
+        case MonitorType::Memory: {
+            if (memoryMonitor.getHistorySize() > 1) {
+                memoryMonitor.setHistorySize(0);
+            } else {
+                memoryMonitor.setHistorySize(updatedHistorySize);
+            }
+            break;
+        }
+    }
+}
+
+// Processes a keyboard event: 'H'/'h' toggles all monitors at once ("show all"
+// when everything is off, "hide all" otherwise); 'C'/'D'/'M' (any case) toggle
+// individual monitors; any other key is ignored.
+void Presenter::handleKey(int key) {
+    key = std::toupper(key);
+    if ('H' == key) {
+        if (0 == cpuMonitor.getHistorySize() && memoryMonitor.getHistorySize() <= 1) {
+            addRemoveMonitor(MonitorType::CpuAverage);
+            addRemoveMonitor(MonitorType::DistributionCpu);
+            addRemoveMonitor(MonitorType::Memory);
+        } else {
+            cpuMonitor.setHistorySize(0);
+            distributionCpuEnabled = false;
+            memoryMonitor.setHistorySize(0);
+        }
+    } else {
+        auto iter = keyToMonitorType.find(key);
+        if (keyToMonitorType.end() != iter) {
+            addRemoveMonitor(iter->second);
+        }
+    }
+}
+
+// Samples the enabled monitors at most once per second and renders up to three
+// side-by-side graphs (average CPU, per-core CPU, memory+swap) onto `frame`,
+// horizontally centered, starting yPos pixels from the top.
+void Presenter::drawGraphs(cv::Mat& frame) {
+    const std::chrono::steady_clock::time_point curTimeStamp = std::chrono::steady_clock::now();
+    // Collect new samples at a fixed 1 s period regardless of the frame rate.
+    if (curTimeStamp - prevTimeStamp >= std::chrono::milliseconds{1000}) {
+        prevTimeStamp = curTimeStamp;
+        if (0 != cpuMonitor.getHistorySize()) {
+            cpuMonitor.collectData();
+        }
+        if (memoryMonitor.getHistorySize() > 1) {
+            memoryMonitor.collectData();
+        }
+    }
+
+    int numberOfEnabledMonitors = (cpuMonitor.getHistorySize() > 1) + distributionCpuEnabled
+        + (memoryMonitor.getHistorySize() > 1);
+    int panelWidth = graphSize.width * numberOfEnabledMonitors
+        + std::max(0, numberOfEnabledMonitors - 1) * graphPadding;
+    // Drop graphs from the right until the panel fits into the frame.
+    while (panelWidth > frame.cols) {
+        panelWidth = std::max(0, panelWidth - graphSize.width - graphPadding);
+        --numberOfEnabledMonitors; // can't draw all monitors
+    }
+    int graphPos = std::max(0, (frame.cols - 1 - panelWidth) / 2);
+    // The top fifth of each graph is reserved for the text label.
+    int textGraphSplittingLine = graphSize.height / 5;
+    int graphRectHeight = graphSize.height - textGraphSplittingLine;
+    int sampleStep = 1;
+    unsigned possibleHistorySize = 1;
+    if (historySize > 1) {
+        sampleStep = std::max(1, static_cast<int>(graphSize.width / (historySize - 1)));
+        possibleHistorySize = (graphSize.width + sampleStep - 1) / sampleStep + 1;
+    }
+
+    // --- Average CPU load graph ---
+    if (cpuMonitor.getHistorySize() > 1 && possibleHistorySize > 1 && --numberOfEnabledMonitors >= 0) {
+        std::deque<std::vector<double>> lastHistory = cpuMonitor.getLastHistory();
+        cv::Rect intersection = cv::Rect{cv::Point(graphPos, yPos), graphSize} & cv::Rect{0, 0, frame.cols, frame.rows};
+        if (!intersection.area()) {
+            return;
+        }
+        cv::Mat graph = frame(intersection);
+        // Lighten the background so the plot stays readable over the video.
+        graph = graph / 2 + cv::Scalar{127, 127, 127};
+
+        int lineXPos = graph.cols - 1;
+        std::vector<cv::Point> averageLoad(lastHistory.size());
+
+        // Newest sample at the right edge, stepping left per sample.
+        for (int i = lastHistory.size() - 1; i >= 0; --i) {
+            double mean = std::accumulate(lastHistory[i].begin(), lastHistory[i].end(), 0.0) / lastHistory[i].size();
+            averageLoad[i] = {lineXPos, graphSize.height - static_cast<int>(mean * graphRectHeight)};
+            lineXPos -= sampleStep;
+        }
+
+        cv::polylines(graph, averageLoad, false, {255, 0, 0}, 2);
+        cv::rectangle(frame, cv::Rect{
+                cv::Point{graphPos, yPos + textGraphSplittingLine},
+                cv::Size{graphSize.width, graphSize.height - textGraphSplittingLine}
+            }, {0, 0, 0});
+        strStream.str("CPU");
+        if (!lastHistory.empty()) {
+            strStream << ": " << std::fixed << std::setprecision(1)
+                << std::accumulate(lastHistory.back().begin(), lastHistory.back().end(), 0.0)
+                    / lastHistory.back().size() * 100 << '%';
+        }
+        int baseline;
+        int textWidth = cv::getTextSize(strStream.str(),
+                                        cv::FONT_HERSHEY_SIMPLEX,
+                                        textGraphSplittingLine * 0.04,
+                                        1,
+                                        &baseline).width;
+        cv::putText(graph,
+                    strStream.str(),
+                    cv::Point{(graphSize.width - textWidth) / 2, textGraphSplittingLine - 1},
+                    cv::FONT_HERSHEY_SIMPLEX,
+                    textGraphSplittingLine * 0.04,
+                    {70, 0, 0},
+                    1);
+        graphPos += graphSize.width + graphPadding;
+    }
+
+    // --- Per-core CPU load graph (one bar per core + mean line) ---
+    if (distributionCpuEnabled && --numberOfEnabledMonitors >= 0) {
+        std::deque<std::vector<double>> lastHistory = cpuMonitor.getLastHistory();
+        cv::Rect intersection = cv::Rect{cv::Point(graphPos, yPos), graphSize} & cv::Rect{0, 0, frame.cols, frame.rows};
+        if (!intersection.area()) {
+            return;
+        }
+        cv::Mat graph = frame(intersection);
+        graph = graph / 2 + cv::Scalar{127, 127, 127};
+
+        if (!lastHistory.empty()) {
+            int rectXPos = 0;
+            int step = (graph.cols + lastHistory.back().size() - 1) / lastHistory.back().size(); // round up
+            double sum = 0;
+            for (double coreLoad : lastHistory.back()) {
+                sum += coreLoad;
+                int height = static_cast<int>(graphRectHeight * coreLoad);
+                cv::Rect pillar{cv::Point{rectXPos, graph.rows - height}, cv::Size{step, height}};
+                cv::rectangle(graph, pillar, {255, 0, 0}, cv::FILLED);
+                cv::rectangle(graph, pillar, {0, 0, 0});
+                rectXPos += step;
+            }
+            // Green horizontal line marks the mean load across cores.
+            sum /= lastHistory.back().size();
+            int yLine = graph.rows - static_cast<int>(graphRectHeight * sum);
+            cv::line(graph, cv::Point{0, yLine}, cv::Point{graph.cols, yLine}, {0, 255, 0}, 2);
+        }
+        cv::Rect border{cv::Point{graphPos, yPos + textGraphSplittingLine},
+                        cv::Size{graphSize.width, graphSize.height - textGraphSplittingLine}};
+        cv::rectangle(frame, border, {0, 0, 0});
+        strStream.str("Core load");
+        if (!lastHistory.empty()) {
+            strStream << ": " << std::fixed << std::setprecision(1)
+                << std::accumulate(lastHistory.back().begin(), lastHistory.back().end(), 0.0)
+                    / lastHistory.back().size() * 100 << '%';
+        }
+        int baseline;
+        int textWidth = cv::getTextSize(strStream.str(),
+                                        cv::FONT_HERSHEY_SIMPLEX,
+                                        textGraphSplittingLine * 0.04,
+                                        1,
+                                        &baseline).width;
+        cv::putText(graph,
+                    strStream.str(),
+                    cv::Point{(graphSize.width - textWidth) / 2, textGraphSplittingLine - 1},
+                    cv::FONT_HERSHEY_SIMPLEX,
+                    textGraphSplittingLine * 0.04,
+                    {0, 70, 0});
+        graphPos += graphSize.width + graphPadding;
+    }
+
+    // --- Memory + swap usage graph ---
+    if (memoryMonitor.getHistorySize() > 1 && possibleHistorySize > 1 && --numberOfEnabledMonitors >= 0) {
+        std::deque<std::pair<double, double>> lastHistory = memoryMonitor.getLastHistory();
+        cv::Rect intersection = cv::Rect{cv::Point(graphPos, yPos), graphSize} & cv::Rect{0, 0, frame.cols, frame.rows};
+        if (!intersection.area()) {
+            return;
+        }
+        cv::Mat graph = frame(intersection);
+        graph = graph / 2 + cv::Scalar{127, 127, 127};
+        int histxPos = graph.cols - 1;
+        // Vertical scale: capped at total RAM + swap, but at most 120% of the
+        // peak observed usage so the curve stays visible.
+        double range = std::min(memoryMonitor.getMaxMemTotal() + memoryMonitor.getMaxSwap(),
+                                (memoryMonitor.getMaxMem() + memoryMonitor.getMaxSwap()) * 1.2);
+        if (lastHistory.size() > 1) {
+            for (auto memUsageIt = lastHistory.rbegin(); memUsageIt != lastHistory.rend() - 1; ++memUsageIt) {
+                constexpr double SWAP_THRESHOLD = 10.0 / 1024; // 10 MiB
+                // Red only when RAM is >=95% used AND swap is in real use.
+                cv::Vec3b color =
+                    (memoryMonitor.getMemTotal() * 0.95 > memUsageIt->first) || (memUsageIt->second < SWAP_THRESHOLD) ?
+                        cv::Vec3b{0, 255, 255} :
+                        cv::Vec3b{0, 0, 255};
+                cv::Point right{histxPos,
+                    graph.rows - static_cast<int>(graphRectHeight * (memUsageIt->first + memUsageIt->second) / range)};
+                cv::Point left{histxPos - sampleStep,
+                    graph.rows - static_cast<int>(
+                        graphRectHeight * ((memUsageIt + 1)->first + (memUsageIt + 1)->second) / range)};
+                cv::line(graph, right, left, color, 2);
+                histxPos -= sampleStep;
+            }
+        }
+
+        cv::Rect border{cv::Point{graphPos, yPos + textGraphSplittingLine},
+                        cv::Size{graphSize.width, graphSize.height - textGraphSplittingLine}};
+        cv::rectangle(frame, {border}, {0, 0, 0});
+        if (lastHistory.empty()) {
+            strStream.str("Memory");
+        } else {
+            strStream.str("");
+            strStream << std::fixed << std::setprecision(1) << lastHistory.back().first << " + "
+                << lastHistory.back().second << " GiB";
+        }
+        int baseline;
+        int textWidth = cv::getTextSize(strStream.str(),
+                                        cv::FONT_HERSHEY_SIMPLEX,
+                                        textGraphSplittingLine * 0.04,
+                                        1,
+                                        &baseline).width;
+        cv::putText(graph,
+                    strStream.str(),
+                    cv::Point{(graphSize.width - textWidth) / 2, textGraphSplittingLine - 1},
+                    cv::FONT_HERSHEY_SIMPLEX,
+                    textGraphSplittingLine * 0.04,
+                    {0, 35, 35});
+    }
+}
+
+// Builds human-readable summary lines (mean per-core/overall CPU load, mean
+// memory and swap usage) for every monitor that was enabled during the run.
+std::vector<std::string> Presenter::reportMeans() const {
+    std::vector<std::string> collectedData;
+    if (cpuMonitor.getHistorySize() > 1 || distributionCpuEnabled || memoryMonitor.getHistorySize() > 1) {
+        collectedData.push_back("Resources usage:");
+    }
+    if (cpuMonitor.getHistorySize() > 1) {
+        std::ostringstream collectedDataStream;
+        collectedDataStream << std::fixed << std::setprecision(1);
+        collectedDataStream << "\tMean core utilization: ";
+        for (double mean : cpuMonitor.getMeanCpuLoad()) {
+            collectedDataStream << mean * 100 << "% ";
+        }
+        collectedData.push_back(collectedDataStream.str());
+    }
+    if (distributionCpuEnabled) {
+        std::ostringstream collectedDataStream;
+        collectedDataStream << std::fixed << std::setprecision(1);
+        std::vector<double> meanCpuLoad = cpuMonitor.getMeanCpuLoad();
+        double mean = std::accumulate(meanCpuLoad.begin(), meanCpuLoad.end(), 0.0) / meanCpuLoad.size();
+        collectedDataStream << "\tMean CPU utilization: " << mean * 100 << "%";
+        collectedData.push_back(collectedDataStream.str());
+    }
+    if (memoryMonitor.getHistorySize() > 1) {
+        std::ostringstream collectedDataStream;
+        collectedDataStream << std::fixed << std::setprecision(1);
+        collectedDataStream << "\tMemory mean usage: " << memoryMonitor.getMeanMem() << " GiB";
+        collectedData.push_back(collectedDataStream.str());
+        collectedDataStream.str("");
+        collectedDataStream << "\tMean swap usage: " << memoryMonitor.getMeanSwap() << " GiB";
+        collectedData.push_back(collectedDataStream.str());
+    }
+
+    return collectedData;
+}
diff --git a/python/openvino/runtime/common/monitors/src/query_wrapper.cpp b/python/openvino/runtime/common/monitors/src/query_wrapper.cpp
new file mode 100644
index 0000000..5c238d1
--- /dev/null
+++ b/python/openvino/runtime/common/monitors/src/query_wrapper.cpp
@@ -0,0 +1,22 @@
+// Copyright (C) 2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "monitors/query_wrapper.h"
+
+#include <Windows.h>
+#include <system_error>
+
+// RAII owner of a PDH query handle: opened on construction, closed in the
+// destructor; implicitly convertible to PDH_HQUERY for use in PDH calls.
+QueryWrapper::QueryWrapper() {
+    PDH_STATUS status = PdhOpenQuery(NULL, NULL, &query);
+    if (ERROR_SUCCESS != status) {
+        throw std::system_error(status, std::system_category(), "PdhOpenQuery() failed");
+    }
+}
+QueryWrapper::~QueryWrapper() {
+    PdhCloseQuery(query);
+}
+
+QueryWrapper::operator PDH_HQUERY() const {
+    return query;
+}
diff --git a/python/openvino/runtime/common/pipelines/CMakeLists.txt b/python/openvino/runtime/common/pipelines/CMakeLists.txt
new file mode 100644
index 0000000..b8b128a
--- /dev/null
+++ b/python/openvino/runtime/common/pipelines/CMakeLists.txt
@@ -0,0 +1,15 @@
+# Copyright (C) 2018-2019 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+
+file(GLOB SOURCES ./src/*.cpp)
+file(GLOB HEADERS ./include/pipelines/*.h)
+
+# Create named folders for the sources within the .vcproj
+# Empty name lists them directly under the .vcproj
+source_group("src" FILES ${SOURCES})
+source_group("include" FILES ${HEADERS})
+
+# Static helper library with the async pipeline implementation shared by demos.
+add_library(pipelines STATIC ${SOURCES} ${HEADERS})
+target_include_directories(pipelines PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include")
+target_link_libraries(pipelines PRIVATE openvino::runtime models utils opencv_core opencv_imgproc)
diff --git a/python/openvino/runtime/common/pipelines/include/pipelines/async_pipeline.h b/python/openvino/runtime/common/pipelines/include/pipelines/async_pipeline.h
new file mode 100644
index 0000000..6661c00
--- /dev/null
+++ b/python/openvino/runtime/common/pipelines/include/pipelines/async_pipeline.h
@@ -0,0 +1,121 @@
+/*
+// Copyright (C) 2020-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+#include <stdint.h>
+
+#include <condition_variable>
+#include <exception>
+#include <memory>
+#include <mutex>
+#include <unordered_map>
+
+#include <openvino/openvino.hpp>
+
+#include <models/results.h>
+#include <utils/performance_metrics.hpp>
+
+#include "pipelines/requests_pool.h"
+
+class ModelBase;
+struct InputData;
+struct MetaData;
+struct ModelConfig;
+
+/// This is base class for asynchronous pipeline
+/// Derived classes should add functions for data submission and output processing
+class AsyncPipeline {
+public:
+    /// Loads model and performs required initialization
+    /// @param modelInstance pointer to model object. Object it points to should not be destroyed manually after passing
+    /// pointer to this function.
+    /// @param config - fine tuning configuration for model
+    /// @param core - reference to ov::Core instance to use.
+    /// If it is omitted, new instance of ov::Core will be created inside.
+    AsyncPipeline(std::unique_ptr<ModelBase>&& modelInstance, const ModelConfig& config, ov::Core& core);
+    virtual ~AsyncPipeline();
+
+    /// Waits until either output data becomes available or pipeline allows to submit more input data.
+    /// @param shouldKeepOrder if true, function will treat results as ready only if next sequential result (frame) is
+    /// ready (so results can be extracted in the same order as they were submitted). Otherwise, function will return if
+    /// any result is ready.
+    void waitForData(bool shouldKeepOrder = true);
+
+    /// @returns true if there's available infer requests in the pool
+    /// and next frame can be submitted for processing, false otherwise.
+    bool isReadyToProcess() {
+        return requestsPool->isIdleRequestAvailable();
+    }
+
+    /// Waits for all currently submitted requests to be completed.
+    ///
+    void waitForTotalCompletion() {
+        if (requestsPool)
+            requestsPool->waitForTotalCompletion();
+    }
+
+    /// Submits data to the model for inference
+    /// @param inputData - input data to be submitted
+    /// @param metaData - shared pointer to metadata container.
+    /// Might be null. This pointer will be passed through pipeline and put to the final result structure.
+    /// @returns -1 if image cannot be scheduled for processing (there's no free InferRequest available).
+    /// Otherwise returns unique sequential frame ID for this particular request. Same frame ID will be written in the
+    /// result structure.
+    virtual int64_t submitData(const InputData& inputData, const std::shared_ptr<MetaData>& metaData);
+
+    /// Gets available data from the queue
+    /// @param shouldKeepOrder if true, function will treat results as ready only if next sequential result (frame) is
+    /// ready (so results can be extracted in the same order as they were submitted). Otherwise, function will return if
+    /// any result is ready.
+    virtual std::unique_ptr<ResultBase> getResult(bool shouldKeepOrder = true);
+
+    /// @returns accumulated metrics of the inference stage.
+    PerformanceMetrics getInferenceMetrics() {
+        return inferenceMetrics;
+    }
+    /// Deprecated misspelled alias of getInferenceMetrics(), kept so existing
+    /// callers keep compiling. Prefer getInferenceMetrics().
+    PerformanceMetrics getInferenceMetircs() {
+        return getInferenceMetrics();
+    }
+    /// @returns accumulated metrics of the preprocessing stage.
+    PerformanceMetrics getPreprocessMetrics() {
+        return preprocessMetrics;
+    }
+    /// @returns accumulated metrics of the postprocessing stage.
+    PerformanceMetrics getPostprocessMetrics() {
+        return postprocessMetrics;
+    }
+
+protected:
+    /// Returns processed result, if available
+    /// @param shouldKeepOrder if true, function will return processed data sequentially,
+    /// keeping original frames order (as they were submitted). Otherwise, function will return processed data in random
+    /// order.
+    /// @returns InferenceResult with processed information or empty InferenceResult (with negative frameID) if there's
+    /// no any results yet.
+    virtual InferenceResult getInferenceResult(bool shouldKeepOrder);
+
+    std::unique_ptr<RequestsPool> requestsPool;
+    std::unordered_map<int64_t, InferenceResult> completedInferenceResults;
+
+    ov::CompiledModel compiledModel;
+
+    // Guard condVar waits and the completed-results map.
+    std::mutex mtx;
+    std::condition_variable condVar;
+
+    // Monotonic frame ids: next id to assign / next id expected by consumers.
+    int64_t inputFrameId = 0;
+    int64_t outputFrameId = 0;
+
+    // First exception raised inside an inference callback, rethrown to the caller.
+    std::exception_ptr callbackException = nullptr;
+
+    std::unique_ptr<ModelBase> model;
+    PerformanceMetrics inferenceMetrics;
+    PerformanceMetrics preprocessMetrics;
+    PerformanceMetrics postprocessMetrics;
+};
diff --git a/python/openvino/runtime/common/pipelines/include/pipelines/metadata.h b/python/openvino/runtime/common/pipelines/include/pipelines/metadata.h
new file mode 100644
index 0000000..aca18ee
--- /dev/null
+++ b/python/openvino/runtime/common/pipelines/include/pipelines/metadata.h
@@ -0,0 +1,51 @@
+/*
+// Copyright (C) 2018-2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+#include <utils/ocv_common.hpp>
+
+// Base class for metadata passed alongside a frame through the pipeline.
+struct MetaData {
+    virtual ~MetaData() {}
+
+    // Downcasts to a concrete metadata type.
+    // Throws std::bad_cast if the dynamic type does not match T.
+    template <class T>
+    T& asRef() {
+        return dynamic_cast<T&>(*this);
+    }
+
+    template <class T>
+    const T& asRef() const {
+        return dynamic_cast<const T&>(*this);
+    }
+};
+
+// Metadata carrying the original frame and its capture timestamp.
+struct ImageMetaData : public MetaData {
+    cv::Mat img;
+    std::chrono::steady_clock::time_point timeStamp;
+
+    ImageMetaData() {}
+
+    ImageMetaData(cv::Mat img, std::chrono::steady_clock::time_point timeStamp) : img(img), timeStamp(timeStamp) {}
+};
+
+// Image metadata extended with the expected class id, used for accuracy checks.
+struct ClassificationImageMetaData : public ImageMetaData {
+    unsigned int groundTruthId;
+
+    ClassificationImageMetaData(cv::Mat img,
+                                std::chrono::steady_clock::time_point timeStamp,
+                                unsigned int groundTruthId)
+        : ImageMetaData(img, timeStamp),
+          groundTruthId(groundTruthId) {}
+};
diff --git a/python/openvino/runtime/common/pipelines/include/pipelines/requests_pool.h b/python/openvino/runtime/common/pipelines/include/pipelines/requests_pool.h
new file mode 100644
index 0000000..d9b220e
--- /dev/null
+++ b/python/openvino/runtime/common/pipelines/include/pipelines/requests_pool.h
@@ -0,0 +1,67 @@
+/*
+// Copyright (C) 2020-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+
+#include <stddef.h>
+
+#include <mutex>
+#include <utility>
+#include <vector>
+
+#include <openvino/openvino.hpp>
+
+/// This is class storing requests pool for asynchronous pipeline
+///
+class RequestsPool {
+public:
+    RequestsPool(ov::CompiledModel& compiledModel, unsigned int size);
+    ~RequestsPool();
+
+    /// Returns idle request from the pool. Returned request is automatically marked as In Use (this status will be
+    /// reset after request processing completion) This function is thread safe as long as request is used only until
+    /// setRequestIdle call
+    /// @returns pointer to request with idle state or nullptr if all requests are in use.
+    ov::InferRequest getIdleRequest();
+
+    /// Sets particular request to Idle state
+    /// This function is thread safe as long as request provided is not used after call to this function
+    /// @param request - request to be returned to idle state
+    void setRequestIdle(const ov::InferRequest& request);
+
+    /// Returns number of requests in use. This function is thread safe.
+    /// @returns number of requests in use
+    size_t getInUseRequestsCount();
+
+    /// Checks whether at least one idle request is available. This function is thread safe.
+    /// @returns true if an idle request is available, false otherwise
+    bool isIdleRequestAvailable();
+
+    /// Waits for completion of every non-idle request in the pool.
+    /// getIdleRequest should not be called together with this function or after it to avoid race condition or invalid
+    /// state
+    void waitForTotalCompletion();
+
+    /// Returns list of all infer requests in the pool.
+    /// @returns list of all infer requests in the pool.
+    std::vector<ov::InferRequest> getInferRequestsList();
+
+private:
+    // Each entry pairs an infer request with its "in use" flag.
+    std::vector<std::pair<ov::InferRequest, bool>> requests;
+    size_t numRequestsInUse;
+    std::mutex mtx;
+};
diff --git a/python/openvino/runtime/common/pipelines/src/async_pipeline.cpp b/python/openvino/runtime/common/pipelines/src/async_pipeline.cpp
new file mode 100644
index 0000000..3259280
--- /dev/null
+++ b/python/openvino/runtime/common/pipelines/src/async_pipeline.cpp
@@ -0,0 +1,166 @@
+/*
+// Copyright (C) 2020-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "pipelines/async_pipeline.h"
+
+#include <chrono>
+#include <cstdint>
+#include <map>
+#include <memory>
+#include <stdexcept>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include <openvino/openvino.hpp>
+
+#include <models/model_base.h>
+#include <models/results.h>
+#include <utils/config_factory.h>
+#include <utils/performance_metrics.hpp>
+#include <utils/slog.hpp>
+
+struct InputData;
+struct MetaData;
+
+AsyncPipeline::AsyncPipeline(std::unique_ptr<ModelBase>&& modelInstance, const ModelConfig& config, ov::Core& core)
+ : model(std::move(modelInstance)) {
+ compiledModel = model->compileModel(config, core);
+ // --------------------------- Create infer requests ------------------------------------------------
+ unsigned int nireq = config.maxAsyncRequests;
+ if (nireq == 0) {
+ try {
+ nireq = compiledModel.get_property(ov::optimal_number_of_infer_requests);
+ } catch (const ov::Exception& ex) {
+ throw std::runtime_error(
+ std::string("Every device used with the demo should support compiled model's property "
+ "'OPTIMAL_NUMBER_OF_INFER_REQUESTS'. Failed to query the property with error: ") +
+ ex.what());
+ }
+ }
+ slog::info << "\tNumber of inference requests: " << nireq << slog::endl;
+ requestsPool.reset(new RequestsPool(compiledModel, nireq));
+ // --------------------------- Call onLoadCompleted to complete initialization of model -------------
+ model->onLoadCompleted(requestsPool->getInferRequestsList());
+}
+
+AsyncPipeline::~AsyncPipeline() {
+ waitForTotalCompletion();
+}
+
+void AsyncPipeline::waitForData(bool shouldKeepOrder) {
+ std::unique_lock<std::mutex> lock(mtx);
+
+ condVar.wait(lock, [&]() {
+ return callbackException != nullptr || requestsPool->isIdleRequestAvailable() ||
+ (shouldKeepOrder ? completedInferenceResults.find(outputFrameId) != completedInferenceResults.end()
+ : !completedInferenceResults.empty());
+ });
+
+ if (callbackException) {
+ std::rethrow_exception(callbackException);
+ }
+}
+
+int64_t AsyncPipeline::submitData(const InputData& inputData, const std::shared_ptr<MetaData>& metaData) {
+ auto frameID = inputFrameId;
+
+ auto request = requestsPool->getIdleRequest();
+ if (!request) {
+ return -1;
+ }
+
+ auto startTime = std::chrono::steady_clock::now();
+ auto internalModelData = model->preprocess(inputData, request);
+ preprocessMetrics.update(startTime);
+
+ request.set_callback(
+ [this, request, frameID, internalModelData, metaData, startTime](std::exception_ptr ex) mutable {
+ {
+ const std::lock_guard<std::mutex> lock(mtx);
+ inferenceMetrics.update(startTime);
+ try {
+ if (ex) {
+ std::rethrow_exception(ex);
+ }
+ InferenceResult result;
+
+ result.frameId = frameID;
+ result.metaData = std::move(metaData);
+ result.internalModelData = std::move(internalModelData);
+
+ for (const auto& outName : model->getOutputsNames()) {
+ auto tensor = request.get_tensor(outName);
+ result.outputsData.emplace(outName, tensor);
+ }
+
+ completedInferenceResults.emplace(frameID, result);
+ requestsPool->setRequestIdle(request);
+ } catch (...) {
+ if (!callbackException) {
+ callbackException = std::current_exception();
+ }
+ }
+ }
+ condVar.notify_one();
+ });
+
+ inputFrameId++;
+ if (inputFrameId < 0)
+ inputFrameId = 0;
+
+ request.start_async();
+
+ return frameID;
+}
+
+std::unique_ptr<ResultBase> AsyncPipeline::getResult(bool shouldKeepOrder) {
+ auto infResult = AsyncPipeline::getInferenceResult(shouldKeepOrder);
+ if (infResult.IsEmpty()) {
+ return std::unique_ptr<ResultBase>();
+ }
+ auto startTime = std::chrono::steady_clock::now();
+ auto result = model->postprocess(infResult);
+ postprocessMetrics.update(startTime);
+
+ *result = static_cast<ResultBase&>(infResult);
+ return result;
+}
+
+InferenceResult AsyncPipeline::getInferenceResult(bool shouldKeepOrder) {
+ InferenceResult retVal;
+ {
+ const std::lock_guard<std::mutex> lock(mtx);
+
+ const auto& it =
+ shouldKeepOrder ? completedInferenceResults.find(outputFrameId) : completedInferenceResults.begin();
+
+ if (it != completedInferenceResults.end()) {
+ retVal = std::move(it->second);
+ completedInferenceResults.erase(it);
+ }
+ }
+
+ if (!retVal.IsEmpty()) {
+ outputFrameId = retVal.frameId;
+ outputFrameId++;
+ if (outputFrameId < 0) {
+ outputFrameId = 0;
+ }
+ }
+
+ return retVal;
+}
diff --git a/python/openvino/runtime/common/pipelines/src/requests_pool.cpp b/python/openvino/runtime/common/pipelines/src/requests_pool.cpp
new file mode 100644
index 0000000..93230c9
--- /dev/null
+++ b/python/openvino/runtime/common/pipelines/src/requests_pool.cpp
@@ -0,0 +1,94 @@
+/*
+// Copyright (C) 2020-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "pipelines/requests_pool.h"
+
+#include <algorithm>
+#include <exception>
+#include <vector>
+
+#include <openvino/openvino.hpp>
+
+RequestsPool::RequestsPool(ov::CompiledModel& compiledModel, unsigned int size) : numRequestsInUse(0) {
+ for (unsigned int infReqId = 0; infReqId < size; ++infReqId) {
+ requests.emplace_back(compiledModel.create_infer_request(), false);
+ }
+}
+
+RequestsPool::~RequestsPool() {
+ // Setting empty callback to free resources allocated for previously assigned lambdas
+ for (auto& pair : requests) {
+ pair.first.set_callback([](std::exception_ptr) {});
+ }
+}
+
+ov::InferRequest RequestsPool::getIdleRequest() {
+ std::lock_guard<std::mutex> lock(mtx);
+
+ const auto& it = std::find_if(requests.begin(), requests.end(), [](const std::pair<ov::InferRequest, bool>& x) {
+ return !x.second;
+ });
+ if (it == requests.end()) {
+ return ov::InferRequest();
+ } else {
+ it->second = true;
+ numRequestsInUse++;
+ return it->first;
+ }
+}
+
+void RequestsPool::setRequestIdle(const ov::InferRequest& request) {
+ std::lock_guard<std::mutex> lock(mtx);
+ const auto& it = std::find_if(this->requests.begin(),
+ this->requests.end(),
+ [&request](const std::pair<ov::InferRequest, bool>& x) {
+ return x.first == request;
+ });
+ it->second = false;
+ numRequestsInUse--;
+}
+
+size_t RequestsPool::getInUseRequestsCount() {
+ std::lock_guard<std::mutex> lock(mtx);
+ return numRequestsInUse;
+}
+
+bool RequestsPool::isIdleRequestAvailable() {
+ std::lock_guard<std::mutex> lock(mtx);
+ return numRequestsInUse < requests.size();
+}
+
+void RequestsPool::waitForTotalCompletion() {
+    // Do not lock the mutex here to avoid a deadlock (unlike the other member functions):
+    // the request status is switched back to idle inside the completion callback of the
+    // request we are waiting for, and that path performs the synchronization itself.
+ for (auto pair : requests) {
+ if (pair.second) {
+ pair.first.wait();
+ }
+ }
+}
+
+std::vector<ov::InferRequest> RequestsPool::getInferRequestsList() {
+ std::lock_guard<std::mutex> lock(mtx);
+ std::vector<ov::InferRequest> retVal;
+ retVal.reserve(requests.size());
+ for (auto& pair : requests) {
+ retVal.push_back(pair.first);
+ }
+
+ return retVal;
+}
diff --git a/python/openvino/runtime/common/utils/CMakeLists.txt b/python/openvino/runtime/common/utils/CMakeLists.txt
new file mode 100644
index 0000000..e1e7293
--- /dev/null
+++ b/python/openvino/runtime/common/utils/CMakeLists.txt
@@ -0,0 +1,61 @@
+# Copyright (C) 2018-2022 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+set(TARGET_NAME "ie_samples_utils")
+
+file(GLOB_RECURSE SOURCES "*.cpp" "*.hpp" "*.h")
+source_group(TREE ${CMAKE_CURRENT_SOURCE_DIR} FILES ${SOURCES})
+
+add_library(${TARGET_NAME} STATIC EXCLUDE_FROM_ALL ${SOURCES})
+set_target_properties(${TARGET_NAME} PROPERTIES FOLDER "src")
+
+target_include_directories(${TARGET_NAME}
+ PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include")
+
+find_package(OpenVINO REQUIRED COMPONENTS Runtime)
+
+if(TARGET gflags)
+ set(GFLAGS_TARGET gflags)
+else()
+ if(EXISTS /etc/debian_version)
+ set(gflags_component nothreads_static)
+ else()
+ find_package(gflags QUIET OPTIONAL_COMPONENTS nothreads_static)
+ if(NOT gflags_FOUND)
+ set(gflags_component shared)
+ else()
+ set(gflags_component nothreads_static)
+ endif()
+ endif()
+ find_package(gflags QUIET OPTIONAL_COMPONENTS ${gflags_component})
+ if(gflags_FOUND)
+ if(TARGET ${GFLAGS_TARGET})
+ # nothing
+ elseif(TARGET gflags_nothreads-static)
+ # Debian 9: gflag_component is ignored
+ set(GFLAGS_TARGET gflags_nothreads-static)
+ elseif(TARGET gflags-shared)
+ # gflags shared case for CentOS / RHEL / Fedora
+ set(GFLAGS_TARGET gflags-shared)
+ else()
+ message(FATAL_ERROR "Internal error: failed to find imported target 'gflags' using '${gflags_component}' component")
+ endif()
+
+ message(STATUS "gflags (${gflags_VERSION}) is found at ${gflags_DIR} using '${gflags_component}' component")
+ endif()
+
+ if(NOT gflags_FOUND)
+ if(EXISTS "$ENV{INTEL_OPENVINO_DIR}/samples/cpp/thirdparty/gflags")
+ add_subdirectory("$ENV{INTEL_OPENVINO_DIR}/samples/cpp/thirdparty/gflags" "${CMAKE_CURRENT_BINARY_DIR}/gflag")
+ set(GFLAGS_TARGET gflags_nothreads_static)
+ else()
+ message(FATAL_ERROR "Failed to find 'gflags' library using '${gflags_component}' component")
+ endif()
+ endif()
+endif()
+
+target_link_libraries(${TARGET_NAME} PUBLIC openvino::runtime ${GFLAGS_TARGET})
+
+if(COMMAND add_clang_format_target)
+ add_clang_format_target(${TARGET_NAME}_clang FOR_TARGETS ${TARGET_NAME})
+endif()
diff --git a/python/openvino/runtime/common/utils/include/samples/args_helper.hpp b/python/openvino/runtime/common/utils/include/samples/args_helper.hpp
new file mode 100644
index 0000000..6626140
--- /dev/null
+++ b/python/openvino/runtime/common/utils/include/samples/args_helper.hpp
@@ -0,0 +1,112 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+/**
+ * @brief a header file with common samples functionality
+ * @file args_helper.hpp
+ */
+
+#pragma once
+
+// clang-format off
+#include <string>
+#include <vector>
+
+#include "openvino/openvino.hpp"
+
+#include "samples/slog.hpp"
+// clang-format on
+
+/**
+ * @brief Checks an input argument and appends files verified to exist to the given vector
+ * @param files vector of verified input files, updated in place
+ * @param arg path to a file (or a folder of files) to be checked for existence
+ */
+void readInputFilesArguments(std::vector<std::string>& files, const std::string& arg);
+
+/**
+ * @brief Finds the -i/--images key in the input args
+ * It's necessary to process multiple values for a single key
+ * @param files vector of verified input files, updated in place
+ */
+void parseInputFilesArguments(std::vector<std::string>& files);
+std::map<std::string, std::string> parseArgMap(std::string argMap);
+
+void printInputAndOutputsInfo(const ov::Model& network);
+
+void configurePrePostProcessing(std::shared_ptr<ov::Model>& function,
+ const std::string& ip,
+ const std::string& op,
+ const std::string& iop,
+ const std::string& il,
+ const std::string& ol,
+ const std::string& iol,
+ const std::string& iml,
+ const std::string& oml,
+ const std::string& ioml);
+
+void printInputAndOutputsInfo(const ov::Model& network);
+ov::element::Type getPrecision2(const std::string& value);
+
+template <class T>
+void printInputAndOutputsInfoShort(const T& network) {
+ slog::info << "Network inputs:" << slog::endl;
+ for (auto&& input : network.inputs()) {
+ std::string in_name;
+ std::string node_name;
+
+ // Workaround for "tensor has no name" issue
+ try {
+ for (const auto& name : input.get_names()) {
+ in_name += name + " , ";
+ }
+ in_name = in_name.substr(0, in_name.size() - 3);
+ } catch (const ov::Exception&) {
+ }
+
+ try {
+ node_name = input.get_node()->get_friendly_name();
+ } catch (const ov::Exception&) {
+ }
+
+ if (in_name == "") {
+ in_name = "***NO_NAME***";
+ }
+ if (node_name == "") {
+ node_name = "***NO_NAME***";
+ }
+
+ slog::info << " " << in_name << " (node: " << node_name << ") : " << input.get_element_type() << " / "
+ << ov::layout::get_layout(input).to_string() << " / " << input.get_partial_shape() << slog::endl;
+ }
+
+ slog::info << "Network outputs:" << slog::endl;
+ for (auto&& output : network.outputs()) {
+ std::string out_name;
+ std::string node_name;
+
+ // Workaround for "tensor has no name" issue
+ try {
+ for (const auto& name : output.get_names()) {
+ out_name += name + " , ";
+ }
+ out_name = out_name.substr(0, out_name.size() - 3);
+ } catch (const ov::Exception&) {
+ }
+ try {
+ node_name = output.get_node()->get_input_node_ptr(0)->get_friendly_name();
+ } catch (const ov::Exception&) {
+ }
+
+ if (out_name == "") {
+ out_name = "***NO_NAME***";
+ }
+ if (node_name == "") {
+ node_name = "***NO_NAME***";
+ }
+
+ slog::info << " " << out_name << " (node: " << node_name << ") : " << output.get_element_type() << " / "
+ << ov::layout::get_layout(output).to_string() << " / " << output.get_partial_shape() << slog::endl;
+ }
+}
diff --git a/python/openvino/runtime/common/utils/include/samples/classification_results.h b/python/openvino/runtime/common/utils/include/samples/classification_results.h
new file mode 100644
index 0000000..e1bc20f
--- /dev/null
+++ b/python/openvino/runtime/common/utils/include/samples/classification_results.h
@@ -0,0 +1,205 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+/**
+ * @brief a header file with output classification results
+ * @file classification_results.h
+ */
+#pragma once
+
+#include <algorithm>
+#include <iomanip>
+#include <iostream>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "openvino/openvino.hpp"
+
+/**
+ * @class ClassificationResult
+ * @brief A ClassificationResult creates an output table with results
+ */
+class ClassificationResult {
+private:
+ const std::string _classidStr = "classid";
+ const std::string _probabilityStr = "probability";
+ const std::string _labelStr = "label";
+ size_t _nTop;
+ ov::Tensor _outTensor;
+ const std::vector<std::string> _labels;
+ const std::vector<std::string> _imageNames;
+ const size_t _batchSize;
+ std::vector<unsigned> _results;
+
+ void printHeader() {
+ std::cout << _classidStr << " " << _probabilityStr;
+ if (!_labels.empty())
+ std::cout << " " << _labelStr;
+ std::string classidColumn(_classidStr.length(), '-');
+ std::string probabilityColumn(_probabilityStr.length(), '-');
+ std::string labelColumn(_labelStr.length(), '-');
+ std::cout << std::endl << classidColumn << " " << probabilityColumn;
+ if (!_labels.empty())
+ std::cout << " " << labelColumn;
+ std::cout << std::endl;
+ }
+
+ /**
+ * @brief Gets the top n results from a tensor
+ *
+ * @param n Top n count
+ * @param input 1D tensor that contains probabilities
+ * @param output Vector of indexes for the top n places
+ */
+ template <class T>
+ void topResults(unsigned int n, const ov::Tensor& input, std::vector<unsigned>& output) {
+ ov::Shape shape = input.get_shape();
+ size_t input_rank = shape.size();
+ OPENVINO_ASSERT(input_rank != 0 && shape[0] != 0, "Input tensor has incorrect dimensions!");
+ size_t batchSize = shape[0];
+ std::vector<unsigned> indexes(input.get_size() / batchSize);
+
+ n = static_cast<unsigned>(std::min<size_t>((size_t)n, input.get_size()));
+ output.resize(n * batchSize);
+
+ for (size_t i = 0; i < batchSize; i++) {
+ const size_t offset = i * (input.get_size() / batchSize);
+ const T* batchData = input.data<const T>();
+ batchData += offset;
+
+ std::iota(std::begin(indexes), std::end(indexes), 0);
+ std::partial_sort(std::begin(indexes),
+ std::begin(indexes) + n,
+ std::end(indexes),
+ [&batchData](unsigned l, unsigned r) {
+ return batchData[l] > batchData[r];
+ });
+ for (unsigned j = 0; j < n; j++) {
+ output.at(i * n + j) = indexes.at(j);
+ }
+ }
+ }
+
+ /**
+ * @brief Gets the top n results from a blob
+ *
+ * @param n Top n count
+ * @param input 1D blob that contains probabilities
+ * @param output Vector of indexes for the top n places
+ */
+ void topResults(unsigned int n, const ov::Tensor& input, std::vector<unsigned>& output) {
+#define TENSOR_TOP_RESULT(elem_type) \
+ case ov::element::Type_t::elem_type: { \
+ using tensor_type = ov::fundamental_type_for<ov::element::Type_t::elem_type>; \
+ topResults<tensor_type>(n, input, output); \
+ break; \
+ }
+
+ switch (input.get_element_type()) {
+ TENSOR_TOP_RESULT(f32);
+ TENSOR_TOP_RESULT(f64);
+ TENSOR_TOP_RESULT(f16);
+ TENSOR_TOP_RESULT(i16);
+ TENSOR_TOP_RESULT(u8);
+ TENSOR_TOP_RESULT(i8);
+ TENSOR_TOP_RESULT(u16);
+ TENSOR_TOP_RESULT(i32);
+ TENSOR_TOP_RESULT(u32);
+ TENSOR_TOP_RESULT(i64);
+ TENSOR_TOP_RESULT(u64);
+ default:
+ OPENVINO_ASSERT(false, "cannot locate tensor with element type: ", input.get_element_type());
+ }
+
+#undef TENSOR_TOP_RESULT
+ }
+
+public:
+ explicit ClassificationResult(const ov::Tensor& output_tensor,
+ const std::vector<std::string>& image_names = {},
+ size_t batch_size = 1,
+ size_t num_of_top = 10,
+ const std::vector<std::string>& labels = {})
+ : _nTop(num_of_top),
+ _outTensor(output_tensor),
+ _labels(labels),
+ _imageNames(image_names),
+ _batchSize(batch_size),
+ _results() {
+ OPENVINO_ASSERT(_imageNames.size() == _batchSize, "Batch size should be equal to the number of images.");
+
+ topResults(_nTop, _outTensor, _results);
+ }
+
+ /**
+ * @brief prints formatted classification results
+ */
+ void show() {
+ /** Print the result iterating over each batch **/
+ std::ios::fmtflags fmt(std::cout.flags());
+ std::cout << std::endl << "Top " << _nTop << " results:" << std::endl << std::endl;
+ for (size_t image_id = 0; image_id < _batchSize; ++image_id) {
+ std::string out(_imageNames[image_id].begin(), _imageNames[image_id].end());
+ std::cout << "Image " << out;
+ std::cout.flush();
+ std::cout.clear();
+ std::cout << std::endl << std::endl;
+ printHeader();
+
+ for (size_t id = image_id * _nTop, cnt = 0; id < (image_id + 1) * _nTop; ++cnt, ++id) {
+ std::cout.precision(7);
+ // Getting probability for resulting class
+ const auto index = _results.at(id) + image_id * (_outTensor.get_size() / _batchSize);
+ const auto result = _outTensor.data<const float>()[index];
+
+ std::cout << std::setw(static_cast<int>(_classidStr.length())) << std::left << _results.at(id) << " ";
+ std::cout << std::left << std::setw(static_cast<int>(_probabilityStr.length())) << std::fixed << result;
+
+ if (!_labels.empty()) {
+ std::cout << " " + _labels[_results.at(id)];
+ }
+ std::cout << std::endl;
+ }
+ std::cout << std::endl;
+ }
+ std::cout.flags(fmt);
+ }
+
+ void print() {
+ /** Print the result iterating over each batch **/
+ std::ios::fmtflags fmt(std::cout.flags());
+ std::cout << std::endl << "Top " << _nTop << " results:" << std::endl << std::endl;
+ for (size_t image_id = 0; image_id < _batchSize; ++image_id) {
+ std::string out(_imageNames[image_id].begin(), _imageNames[image_id].end());
+ std::cout << "Image " << out;
+ std::cout.flush();
+ std::cout.clear();
+ std::cout << std::endl << std::endl;
+ printHeader();
+
+ for (size_t id = image_id * _nTop, cnt = 0; id < (image_id + 1) * _nTop; ++cnt, ++id) {
+ std::cout.precision(7);
+ // Getting probability for resulting class
+ const auto result = _outTensor.data<float>();
+ std::cout << std::setw(static_cast<int>(_classidStr.length())) << std::left << _results.at(id) << " ";
+ std::cout << std::left << std::setw(static_cast<int>(_probabilityStr.length())) << std::fixed << result;
+
+ if (!_labels.empty()) {
+ std::cout << " " + _labels[_results.at(id)];
+ }
+ std::cout << std::endl;
+ }
+ std::cout << std::endl;
+ }
+ std::cout.flags(fmt);
+ }
+
+ /**
+ * @brief returns the classification results in a vector
+ */
+ std::vector<unsigned> getResults() {
+ return _results;
+ }
+};
diff --git a/python/openvino/runtime/common/utils/include/samples/common.hpp b/python/openvino/runtime/common/utils/include/samples/common.hpp
new file mode 100644
index 0000000..448fd96
--- /dev/null
+++ b/python/openvino/runtime/common/utils/include/samples/common.hpp
@@ -0,0 +1,1429 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+/**
+ * @brief a header file with common samples functionality
+ * @file common.hpp
+ */
+
+#pragma once
+
+#include <algorithm>
+#include <cctype>
+#include <fstream>
+#include <functional>
+#include <iomanip>
+#include <iostream>
+#include <limits>
+#include <list>
+#include <map>
+#include <random>
+#include <string>
+#include <utility>
+#include <vector>
+
+using std::setprecision;
+
+// clang-format off
+#include <inference_engine.hpp>
+#include "openvino/openvino.hpp"
+#include "slog.hpp"
+// clang-format on
+
+// @brief performance counters sort
+static constexpr char pcSort[] = "sort";
+static constexpr char pcNoSort[] = "no_sort";
+static constexpr char pcSimpleSort[] = "simple_sort";
+
+#ifndef UNUSED
+# if defined(_MSC_VER) && !defined(__clang__)
+# define UNUSED
+# else
+# define UNUSED __attribute__((unused))
+# endif
+#endif
+
+/**
+ * @brief Unicode string wrappers
+ */
+#if defined(ENABLE_UNICODE_PATH_SUPPORT) && defined(_WIN32)
+# define tchar wchar_t
+# define tstring std::wstring
+# define tmain wmain
+# define TSTRING2STRING(tstr) wstring2string(tstr)
+#else
+# define tchar char
+# define tstring std::string
+# define tmain main
+# define TSTRING2STRING(tstr) tstr
+#endif
+
+#if defined(ENABLE_UNICODE_PATH_SUPPORT) && defined(_WIN32)
+
+/**
+ * @brief Convert wstring to string
+ * @param ref on wstring
+ * @return string
+ */
+inline std::string wstring2string(const std::wstring& wstr) {
+ std::string str;
+ for (auto&& wc : wstr)
+ str += static_cast<char>(wc);
+ return str;
+}
+#endif
+
+/**
+ * @brief trim from start (in place)
+ * @param s - string to trim
+ */
+inline void ltrim(std::string& s) {
+ s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](int c) {
+ return !std::isspace(c);
+ }));
+}
+
+/**
+ * @brief trim from end (in place)
+ * @param s - string to trim
+ */
+inline void rtrim(std::string& s) {
+ s.erase(std::find_if(s.rbegin(),
+ s.rend(),
+ [](int c) {
+ return !std::isspace(c);
+ })
+ .base(),
+ s.end());
+}
+
+/**
+ * @brief trim from both ends (in place)
+ * @param s - string to trim
+ */
+inline std::string& trim(std::string& s) {
+ ltrim(s);
+ rtrim(s);
+ return s;
+}
+/**
+ * @brief Gets filename without extension
+ * @param filepath - full file name
+ * @return filename without extension
+ */
+inline std::string fileNameNoExt(const std::string& filepath) {
+ auto pos = filepath.rfind('.');
+ if (pos == std::string::npos)
+ return filepath;
+ return filepath.substr(0, pos);
+}
+
+/**
+ * @brief Get extension from filename
+ * @param filename - name of the file which extension should be extracted
+ * @return string with extracted file extension
+ */
+inline std::string fileExt(const std::string& filename) {
+ auto pos = filename.rfind('.');
+ if (pos == std::string::npos)
+ return "";
+ return filename.substr(pos + 1);
+}
+
+inline slog::LogStream& operator<<(slog::LogStream& os, const ov::Version& version) {
+ os << "Build ................................. ";
+ os << version.buildNumber << slog::endl;
+
+ return os;
+}
+
+inline slog::LogStream& operator<<(slog::LogStream& os, const std::map<std::string, ov::Version>& versions) {
+ for (auto&& version : versions) {
+ os << version.first << slog::endl;
+ os << version.second << slog::endl;
+ }
+
+ return os;
+}
+
+/**
+ * @class Color
+ * @brief A Color class stores channels of a given color
+ */
+class Color {
+private:
+ unsigned char _r;
+ unsigned char _g;
+ unsigned char _b;
+
+public:
+ /**
+ * A default constructor.
+ * @param r - value for red channel
+ * @param g - value for green channel
+ * @param b - value for blue channel
+ */
+ Color(unsigned char r, unsigned char g, unsigned char b) : _r(r), _g(g), _b(b) {}
+
+ inline unsigned char red() {
+ return _r;
+ }
+
+ inline unsigned char blue() {
+ return _b;
+ }
+
+ inline unsigned char green() {
+ return _g;
+ }
+};
+
+// TODO : keep only one version of writeOutputBMP
+
+/**
+ * @brief Writes output data as a BMP image to a stream
+ * @param data - per-pixel class indices (one vector per image row)
+ * @param classesNum - the number of classes (extra colors are generated when it exceeds the palette)
+ * @param outFile - output stream the BMP bytes are written to
+ * The function returns nothing; stream errors are left for the caller to check.
+ */
+static UNUSED void writeOutputBmp(std::vector<std::vector<size_t>> data, size_t classesNum, std::ostream& outFile) {
+ unsigned int seed = (unsigned int)time(NULL);
+ // Known colors for training classes from Cityscape dataset
+ static std::vector<Color> colors = {
+ {128, 64, 128}, {232, 35, 244}, {70, 70, 70}, {156, 102, 102}, {153, 153, 190}, {153, 153, 153},
+ {30, 170, 250}, {0, 220, 220}, {35, 142, 107}, {152, 251, 152}, {180, 130, 70}, {60, 20, 220},
+ {0, 0, 255}, {142, 0, 0}, {70, 0, 0}, {100, 60, 0}, {90, 0, 0}, {230, 0, 0},
+ {32, 11, 119}, {0, 74, 111}, {81, 0, 81}};
+
+ while (classesNum > colors.size()) {
+ static std::mt19937 rng(seed);
+ std::uniform_int_distribution<int> dist(0, 255);
+ Color color(dist(rng), dist(rng), dist(rng));
+ colors.push_back(color);
+ }
+
+ unsigned char file[14] = {
+ 'B',
+ 'M', // magic
+ 0,
+ 0,
+ 0,
+ 0, // size in bytes
+ 0,
+ 0, // app data
+ 0,
+ 0, // app data
+ 40 + 14,
+ 0,
+ 0,
+ 0 // start of data offset
+ };
+ unsigned char info[40] = {
+ 40, 0, 0, 0, // info hd size
+ 0, 0, 0, 0, // width
+ 0, 0, 0, 0, // height
+ 1, 0, // number color planes
+ 24, 0, // bits per pixel
+ 0, 0, 0, 0, // compression is none
+ 0, 0, 0, 0, // image bits size
+ 0x13, 0x0B, 0, 0, // horz resolution in pixel / m
+ 0x13, 0x0B, 0, 0, // vert resolution (0x03C3 = 96 dpi, 0x0B13 = 72 dpi)
+ 0, 0, 0, 0, // #colors in palette
+ 0, 0, 0, 0, // #important colors
+ };
+
+ auto height = data.size();
+ auto width = data.at(0).size();
+
+ OPENVINO_ASSERT(
+ height < (size_t)std::numeric_limits<int32_t>::max && width < (size_t)std::numeric_limits<int32_t>::max,
+ "File size is too big: ",
+ height,
+ " X ",
+ width);
+
+ int padSize = static_cast<int>(4 - (width * 3) % 4) % 4;
+ int sizeData = static_cast<int>(width * height * 3 + height * padSize);
+ int sizeAll = sizeData + sizeof(file) + sizeof(info);
+
+ file[2] = (unsigned char)(sizeAll);
+ file[3] = (unsigned char)(sizeAll >> 8);
+ file[4] = (unsigned char)(sizeAll >> 16);
+ file[5] = (unsigned char)(sizeAll >> 24);
+
+ info[4] = (unsigned char)(width);
+ info[5] = (unsigned char)(width >> 8);
+ info[6] = (unsigned char)(width >> 16);
+ info[7] = (unsigned char)(width >> 24);
+
+ int32_t negativeHeight = -(int32_t)height;
+ info[8] = (unsigned char)(negativeHeight);
+ info[9] = (unsigned char)(negativeHeight >> 8);
+ info[10] = (unsigned char)(negativeHeight >> 16);
+ info[11] = (unsigned char)(negativeHeight >> 24);
+
+ info[20] = (unsigned char)(sizeData);
+ info[21] = (unsigned char)(sizeData >> 8);
+ info[22] = (unsigned char)(sizeData >> 16);
+ info[23] = (unsigned char)(sizeData >> 24);
+
+ outFile.write(reinterpret_cast<char*>(file), sizeof(file));
+ outFile.write(reinterpret_cast<char*>(info), sizeof(info));
+
+ unsigned char pad[3] = {0, 0, 0};
+
+ for (size_t y = 0; y < height; y++) {
+ for (size_t x = 0; x < width; x++) {
+ unsigned char pixel[3];
+ size_t index = data.at(y).at(x);
+ pixel[0] = colors.at(index).red();
+ pixel[1] = colors.at(index).green();
+ pixel[2] = colors.at(index).blue();
+ outFile.write(reinterpret_cast<char*>(pixel), 3);
+ }
+ outFile.write(reinterpret_cast<char*>(pad), padSize);
+ }
+}
+
+/**
+ * @brief Writes output data to BMP image
+ * @param name - image name
+ * @param data - output data
+ * @param height - height of the target image
+ * @param width - width of the target image
+ * @return false if error else true
+ */
+static UNUSED bool writeOutputBmp(std::string name, unsigned char* data, size_t height, size_t width) {
+ std::ofstream outFile;
+ outFile.open(name, std::ofstream::binary);
+ if (!outFile.is_open()) {
+ return false;
+ }
+
+ unsigned char file[14] = {
+ 'B',
+ 'M', // magic
+ 0,
+ 0,
+ 0,
+ 0, // size in bytes
+ 0,
+ 0, // app data
+ 0,
+ 0, // app data
+ 40 + 14,
+ 0,
+ 0,
+ 0 // start of data offset
+ };
+ unsigned char info[40] = {
+ 40, 0, 0, 0, // info hd size
+ 0, 0, 0, 0, // width
+ 0, 0, 0, 0, // height
+ 1, 0, // number color planes
+ 24, 0, // bits per pixel
+ 0, 0, 0, 0, // compression is none
+ 0, 0, 0, 0, // image bits size
+ 0x13, 0x0B, 0, 0, // horz resolution in pixel / m
+ 0x13, 0x0B, 0, 0, // vert resolution (0x03C3 = 96 dpi, 0x0B13 = 72 dpi)
+ 0, 0, 0, 0, // #colors in palette
+ 0, 0, 0, 0, // #important colors
+ };
+
+ OPENVINO_ASSERT(
+ height < (size_t)std::numeric_limits<int32_t>::max && width < (size_t)std::numeric_limits<int32_t>::max,
+ "File size is too big: ",
+ height,
+ " X ",
+ width);
+
+ int padSize = static_cast<int>(4 - (width * 3) % 4) % 4;
+ int sizeData = static_cast<int>(width * height * 3 + height * padSize);
+ int sizeAll = sizeData + sizeof(file) + sizeof(info);
+
+ file[2] = (unsigned char)(sizeAll);
+ file[3] = (unsigned char)(sizeAll >> 8);
+ file[4] = (unsigned char)(sizeAll >> 16);
+ file[5] = (unsigned char)(sizeAll >> 24);
+
+ info[4] = (unsigned char)(width);
+ info[5] = (unsigned char)(width >> 8);
+ info[6] = (unsigned char)(width >> 16);
+ info[7] = (unsigned char)(width >> 24);
+
+ int32_t negativeHeight = -(int32_t)height;
+ info[8] = (unsigned char)(negativeHeight);
+ info[9] = (unsigned char)(negativeHeight >> 8);
+ info[10] = (unsigned char)(negativeHeight >> 16);
+ info[11] = (unsigned char)(negativeHeight >> 24);
+
+ info[20] = (unsigned char)(sizeData);
+ info[21] = (unsigned char)(sizeData >> 8);
+ info[22] = (unsigned char)(sizeData >> 16);
+ info[23] = (unsigned char)(sizeData >> 24);
+
+ outFile.write(reinterpret_cast<char*>(file), sizeof(file));
+ outFile.write(reinterpret_cast<char*>(info), sizeof(info));
+
+ unsigned char pad[3] = {0, 0, 0};
+
+ for (size_t y = 0; y < height; y++) {
+ for (size_t x = 0; x < width; x++) {
+ unsigned char pixel[3];
+ pixel[0] = data[y * width * 3 + x * 3];
+ pixel[1] = data[y * width * 3 + x * 3 + 1];
+ pixel[2] = data[y * width * 3 + x * 3 + 2];
+
+ outFile.write(reinterpret_cast<char*>(pixel), 3);
+ }
+ outFile.write(reinterpret_cast<char*>(pad), padSize);
+ }
+ return true;
+}
+
+/**
+ * @brief Draws colored axis-aligned rectangles into a raw 3-byte-per-pixel image buffer
+ * @param data - image pixel buffer the rectangles are drawn into (row-major, 3 bytes per pixel)
+ * @param height - height of the image in pixels
+ * @param width - width of the image in pixels
+ * @param rectangles - flat vector of (x, y, w, h) values; must hold exactly 4 entries per class
+ * @param classes - vector of class ids, one per rectangle; selects the box color
+ * @param thickness - thickness of a line (in pixels) to be used for bounding boxes
+ */
+static UNUSED void addRectangles(unsigned char* data,
+                                 size_t height,
+                                 size_t width,
+                                 std::vector<int> rectangles,
+                                 std::vector<int> classes,
+                                 int thickness = 1) {
+    std::vector<Color> colors = {// colors to be used for bounding boxes
+                                 {128, 64, 128}, {232, 35, 244}, {70, 70, 70}, {156, 102, 102}, {153, 153, 190},
+                                 {153, 153, 153}, {30, 170, 250}, {0, 220, 220}, {35, 142, 107}, {152, 251, 152},
+                                 {180, 130, 70}, {60, 20, 220}, {0, 0, 255}, {142, 0, 0}, {70, 0, 0},
+                                 {100, 60, 0}, {90, 0, 0}, {230, 0, 0}, {32, 11, 119}, {0, 74, 111},
+                                 {81, 0, 81}};
+
+    // Malformed input: rectangles must come in (x, y, w, h) quadruples, one per class.
+    if (rectangles.size() % 4 != 0 || rectangles.size() / 4 != classes.size()) {
+        return;
+    }
+
+    for (size_t i = 0; i < classes.size(); i++) {
+        int x = rectangles.at(i * 4);
+        int y = rectangles.at(i * 4 + 1);
+        int w = rectangles.at(i * 4 + 2);
+        int h = rectangles.at(i * 4 + 3);
+
+        int cls = classes.at(i) % colors.size();  // color of a bounding box line
+
+        // Clamp negative coordinates and sizes to zero.
+        if (x < 0)
+            x = 0;
+        if (y < 0)
+            y = 0;
+        if (w < 0)
+            w = 0;
+        if (h < 0)
+            h = 0;
+
+        // Boxes starting outside the image collapse to a 1-pixel line on the border.
+        if (static_cast<std::size_t>(x) >= width) {
+            x = static_cast<int>(width - 1);
+            w = 0;
+            thickness = 1;
+        }
+        if (static_cast<std::size_t>(y) >= height) {
+            y = static_cast<int>(height - 1);
+            h = 0;
+            thickness = 1;
+        }
+
+        // Shrink boxes that extend past the right/bottom border.
+        if (static_cast<std::size_t>(x + w) >= width) {
+            w = static_cast<int>(width - x - 1);
+        }
+        if (static_cast<std::size_t>(y + h) >= height) {
+            h = static_cast<int>(height - y - 1);
+        }
+
+        // A line can never be thicker than half the box.
+        thickness = std::min(std::min(thickness, w / 2 + 1), h / 2 + 1);
+
+        size_t shift_first;
+        size_t shift_second;
+        // Top and bottom horizontal edges, each drawn `thickness` rows deep.
+        for (int t = 0; t < thickness; t++) {
+            shift_first = (y + t) * width * 3;
+            shift_second = (y + h - t) * width * 3;
+            for (int ii = x; ii < x + w + 1; ii++) {
+                data[shift_first + ii * 3] = colors.at(cls).red();
+                data[shift_first + ii * 3 + 1] = colors.at(cls).green();
+                data[shift_first + ii * 3 + 2] = colors.at(cls).blue();
+                data[shift_second + ii * 3] = colors.at(cls).red();
+                data[shift_second + ii * 3 + 1] = colors.at(cls).green();
+                data[shift_second + ii * 3 + 2] = colors.at(cls).blue();
+            }
+        }
+
+        // Left and right vertical edges, each drawn `thickness` columns wide.
+        for (int t = 0; t < thickness; t++) {
+            shift_first = (x + t) * 3;
+            shift_second = (x + w - t) * 3;
+            for (int ii = y; ii < y + h + 1; ii++) {
+                data[shift_first + ii * width * 3] = colors.at(cls).red();
+                data[shift_first + ii * width * 3 + 1] = colors.at(cls).green();
+                data[shift_first + ii * width * 3 + 2] = colors.at(cls).blue();
+                data[shift_second + ii * width * 3] = colors.at(cls).red();
+                data[shift_second + ii * width * 3 + 1] = colors.at(cls).green();
+                data[shift_second + ii * width * 3 + 2] = colors.at(cls).blue();
+            }
+        }
+    }
+}
+
+// DLA PATCH BEGIN - Re-implement functions needed for dla_benchmark that was removed from OPENVINO 2022.3.0
+/// Returns the width (innermost) dimension of a tensor, validating that its
+/// layout actually carries a width axis.
+inline std::size_t getTensorWidth(const InferenceEngine::TensorDesc& desc) {
+    using InferenceEngine::Layout;
+    const auto& tensorDims = desc.getDims();
+    const Layout tensorLayout = desc.getLayout();
+    // Every layout listed here stores width as the last dimension.
+    const bool layoutHasWidth =
+        tensorLayout == Layout::NCHW || tensorLayout == Layout::NHWC || tensorLayout == Layout::NCDHW ||
+        tensorLayout == Layout::NDHWC || tensorLayout == Layout::OIHW || tensorLayout == Layout::GOIHW ||
+        tensorLayout == Layout::OIDHW || tensorLayout == Layout::GOIDHW || tensorLayout == Layout::CHW ||
+        tensorLayout == Layout::HW;
+    if (!layoutHasWidth || tensorDims.size() < 2) {
+        IE_THROW() << "Tensor does not have width dimension";
+    }
+    return tensorDims.back();
+}
+
+/// Returns the height (second-to-last) dimension of a tensor, validating that
+/// its layout actually carries a height axis.
+inline std::size_t getTensorHeight(const InferenceEngine::TensorDesc& desc) {
+    using InferenceEngine::Layout;
+    const auto& tensorDims = desc.getDims();
+    const Layout tensorLayout = desc.getLayout();
+    // Every layout listed here stores height directly before width.
+    const bool layoutHasHeight =
+        tensorLayout == Layout::NCHW || tensorLayout == Layout::NHWC || tensorLayout == Layout::NCDHW ||
+        tensorLayout == Layout::NDHWC || tensorLayout == Layout::OIHW || tensorLayout == Layout::GOIHW ||
+        tensorLayout == Layout::OIDHW || tensorLayout == Layout::GOIDHW || tensorLayout == Layout::CHW ||
+        tensorLayout == Layout::HW;
+    if (!layoutHasHeight || tensorDims.size() < 2) {
+        IE_THROW() << "Tensor does not have height dimension";
+    }
+    return tensorDims.at(tensorDims.size() - 2);
+}
+
+/// Returns the channel (C) dimension of a tensor, based on the canonical
+/// layout derived from its rank.
+inline std::size_t getTensorChannels(const InferenceEngine::TensorDesc& desc) {
+    using InferenceEngine::Layout;
+    const Layout tensorLayout = desc.getLayout();
+    const bool layoutHasChannels =
+        tensorLayout == Layout::NCHW || tensorLayout == Layout::NHWC || tensorLayout == Layout::NCDHW ||
+        tensorLayout == Layout::NDHWC || tensorLayout == Layout::C || tensorLayout == Layout::CHW ||
+        tensorLayout == Layout::NC || tensorLayout == Layout::CN;
+    if (layoutHasChannels) {
+        // Regardless of layout, dimensions are stored in fixed order, so the
+        // channel axis position follows from the layout implied by the rank.
+        const auto& tensorDims = desc.getDims();
+        switch (desc.getLayoutByDims(tensorDims)) {
+        case Layout::C:
+        case Layout::CHW:
+            return tensorDims.at(0);
+        case Layout::NC:
+        case Layout::NCHW:
+        case Layout::NCDHW:
+            return tensorDims.at(1);
+        case Layout::SCALAR:   // no channel axis
+        case Layout::BLOCKED:  // no channel axis
+        default:
+            break;
+        }
+    }
+    IE_THROW() << "Tensor does not have channels dimension";
+    return 0;
+}
+
+/**
+ * @brief Returns the batch (N) dimension of a tensor, based on its layout
+ * @param desc - tensor descriptor holding layout and dimensions
+ * @throws when the layout does not carry a batch dimension
+ */
+inline std::size_t getTensorBatch(const InferenceEngine::TensorDesc& desc) {
+    const auto& layout = desc.getLayout();
+    if (layout == InferenceEngine::Layout::NCHW || layout == InferenceEngine::Layout::NHWC || layout == InferenceEngine::Layout::NCDHW ||
+        layout == InferenceEngine::Layout::NDHWC || layout == InferenceEngine::Layout::NC || layout == InferenceEngine::Layout::CN) {
+        // Regardless of layout, dimensions are stored in fixed order
+        const auto& dims = desc.getDims();
+        switch (desc.getLayoutByDims(dims)) {
+        case InferenceEngine::Layout::NC:
+        case InferenceEngine::Layout::NCHW:
+        case InferenceEngine::Layout::NCDHW:
+            // Batch is the outermost dimension for all batched layouts.
+            return dims.at(0);
+        case InferenceEngine::Layout::CHW:     // [[fallthrough]]
+        case InferenceEngine::Layout::C:       // [[fallthrough]]
+        case InferenceEngine::Layout::SCALAR:  // [[fallthrough]]
+        case InferenceEngine::Layout::BLOCKED: // [[fallthrough]]
+        default:
+            // Message fixed: this function resolves the batch dimension, not
+            // channels (the old text was copy-pasted from getTensorChannels).
+            IE_THROW() << "Tensor does not have batch dimension";
+        }
+    } else {
+        IE_THROW() << "Tensor does not have batch dimension";
+    }
+    return 0;
+}
+
+// DLA PATCH END
+
+/**
+ * @brief Writes a raw 3-byte-per-pixel image buffer to a stream as a BMP file
+ * @param data - pixel data, row-major, 3 bytes per pixel
+ * @param height - image height in pixels
+ * @param width - image width in pixels
+ * @param outFile - stream that receives the BMP file
+ * @return false if error else true
+ */
+
+static UNUSED bool writeOutputBmp(unsigned char* data, size_t height, size_t width, std::ostream& outFile) {
+    unsigned char file[14] = {
+        'B', 'M',         // magic
+        0, 0, 0, 0,       // size in bytes (filled in below)
+        0, 0,             // app data
+        0, 0,             // app data
+        40 + 14, 0, 0, 0  // start of data offset
+    };
+    unsigned char info[40] = {
+        40,   0,    0, 0,  // info hd size
+        0,    0,    0, 0,  // width
+        0,    0,    0, 0,  // height
+        1,    0,           // number color planes
+        24,   0,           // bits per pixel
+        0,    0,    0, 0,  // compression is none
+        0,    0,    0, 0,  // image bits size
+        0x13, 0x0B, 0, 0,  // horz resolution in pixel / m
+        0x13, 0x0B, 0, 0,  // vert resolution (0x03C3 = 96 dpi, 0x0B13 = 72 dpi)
+        0,    0,    0, 0,  // #colors in palette
+        0,    0,    0, 0,  // #important colors
+    };
+
+    // Note: max() must be called; the previous code compared against the
+    // address of the uncalled std::numeric_limits<int32_t>::max function.
+    OPENVINO_ASSERT(
+        height < (size_t)std::numeric_limits<int32_t>::max() && width < (size_t)std::numeric_limits<int32_t>::max(),
+        "File size is too big: ",
+        height,
+        " X ",
+        width);
+
+    // BMP rows are padded to 4-byte multiples.
+    int padSize = static_cast<int>(4 - (width * 3) % 4) % 4;
+    int sizeData = static_cast<int>(width * height * 3 + height * padSize);
+    int sizeAll = sizeData + sizeof(file) + sizeof(info);
+
+    // Little-endian encode the total file size into the file header.
+    file[2] = (unsigned char)(sizeAll);
+    file[3] = (unsigned char)(sizeAll >> 8);
+    file[4] = (unsigned char)(sizeAll >> 16);
+    file[5] = (unsigned char)(sizeAll >> 24);
+
+    info[4] = (unsigned char)(width);
+    info[5] = (unsigned char)(width >> 8);
+    info[6] = (unsigned char)(width >> 16);
+    info[7] = (unsigned char)(width >> 24);
+
+    // Negative height marks the bitmap as top-down, matching the buffer's row order.
+    int32_t negativeHeight = -(int32_t)height;
+    info[8] = (unsigned char)(negativeHeight);
+    info[9] = (unsigned char)(negativeHeight >> 8);
+    info[10] = (unsigned char)(negativeHeight >> 16);
+    info[11] = (unsigned char)(negativeHeight >> 24);
+
+    info[20] = (unsigned char)(sizeData);
+    info[21] = (unsigned char)(sizeData >> 8);
+    info[22] = (unsigned char)(sizeData >> 16);
+    info[23] = (unsigned char)(sizeData >> 24);
+
+    outFile.write(reinterpret_cast<char*>(file), sizeof(file));
+    outFile.write(reinterpret_cast<char*>(info), sizeof(info));
+
+    unsigned char pad[3] = {0, 0, 0};
+
+    for (size_t y = 0; y < height; y++) {
+        for (size_t x = 0; x < width; x++) {
+            unsigned char pixel[3];
+            pixel[0] = data[y * width * 3 + x * 3];
+            pixel[1] = data[y * width * 3 + x * 3 + 1];
+            pixel[2] = data[y * width * 3 + x * 3 + 2];
+            outFile.write(reinterpret_cast<char*>(pixel), 3);
+        }
+        outFile.write(reinterpret_cast<char*>(pad), padSize);
+    }
+
+    return true;
+}
+
+/**
+ * @brief Prints per-layer profiling info (status, type, timings) followed by
+ *        the accumulated total real time and the full device name.
+ * @param performanceMap - map of layer name to its profiling record
+ * @param stream - destination stream for the per-layer table
+ * @param deviceName - full device name, printed to std::cout after the table
+ * @param bshowHeader - when true, prints a "performance counts:" banner first
+ */
+static UNUSED void printPerformanceCounts(const std::map<std::string, ov::ProfilingInfo>& performanceMap,
+                                          std::ostream& stream,
+                                          std::string deviceName,
+                                          bool bshowHeader = true) {
+    std::chrono::microseconds totalTime = std::chrono::microseconds::zero();
+    // Print performance counts
+    if (bshowHeader) {
+        stream << std::endl << "performance counts:" << std::endl << std::endl;
+    }
+    // NOTE(review): flags are saved/restored for std::cout, but the std::setw /
+    // std::left manipulators below modify `stream` — confirm that callers only
+    // pass std::cout here, otherwise stream's flags leak.
+    std::ios::fmtflags fmt(std::cout.flags());
+
+    for (const auto& it : performanceMap) {
+        std::string toPrint(it.first);
+        const int maxLayerName = 30;
+
+        // Truncate over-long layer names with an ellipsis so columns stay aligned.
+        // (Cast fixes the signed/unsigned comparison between length() and int.)
+        if (it.first.length() >= static_cast<std::size_t>(maxLayerName)) {
+            toPrint = it.first.substr(0, maxLayerName - 4);
+            toPrint += "...";
+        }
+
+        stream << std::setw(maxLayerName) << std::left << toPrint;
+        switch (it.second.status) {
+        case ov::ProfilingInfo::Status::EXECUTED:
+            stream << std::setw(15) << std::left << "EXECUTED";
+            break;
+        case ov::ProfilingInfo::Status::NOT_RUN:
+            stream << std::setw(15) << std::left << "NOT_RUN";
+            break;
+        case ov::ProfilingInfo::Status::OPTIMIZED_OUT:
+            stream << std::setw(15) << std::left << "OPTIMIZED_OUT";
+            break;
+        }
+        stream << std::setw(30) << std::left << "layerType: " + std::string(it.second.node_type) + " ";
+        stream << std::setw(20) << std::left << "realTime: " + std::to_string(it.second.real_time.count());
+        stream << std::setw(20) << std::left << "cpu: " + std::to_string(it.second.cpu_time.count());
+        stream << " execType: " << it.second.exec_type << std::endl;
+        // Only layers that actually consumed time contribute to the total.
+        if (it.second.real_time.count() > 0) {
+            totalTime += it.second.real_time;
+        }
+    }
+    stream << std::setw(20) << std::left << "Total time: " + std::to_string(totalTime.count()) << " microseconds"
+           << std::endl;
+    std::cout << std::endl;
+    std::cout << "Full device name: " << deviceName << std::endl;
+    std::cout << std::endl;
+    std::cout.flags(fmt);
+}
+
+/**
+ * @brief This class represents an object that is found by an object detection net
+ */
+class DetectedObject {
+public:
+    int objectType;
+    float xmin, xmax, ymin, ymax, prob;
+    bool difficult;
+
+    DetectedObject(int _objectType,
+                   float _xmin,
+                   float _ymin,
+                   float _xmax,
+                   float _ymax,
+                   float _prob,
+                   bool _difficult = false)
+        : objectType(_objectType),
+          xmin(_xmin),
+          xmax(_xmax),
+          ymin(_ymin),
+          ymax(_ymax),
+          prob(_prob),
+          difficult(_difficult) {}
+
+    DetectedObject(const DetectedObject& other) = default;
+
+    /// Intersection-over-Union of two boxes. Returns 0 when the object types
+    /// differ or either box is degenerate (max < min on any axis).
+    static float ioU(const DetectedObject& detectedObject1_, const DetectedObject& detectedObject2_) {
+        // Small margin that could be used to eliminate empty squares (disabled).
+        const float epsilon = 0;  // 1e-5f;
+
+        const DetectedObject box1(detectedObject1_.objectType,
+                                  detectedObject1_.xmin - epsilon,
+                                  detectedObject1_.ymin - epsilon,
+                                  detectedObject1_.xmax - epsilon,
+                                  detectedObject1_.ymax - epsilon,
+                                  detectedObject1_.prob);
+        const DetectedObject box2(detectedObject2_.objectType,
+                                  detectedObject2_.xmin + epsilon,
+                                  detectedObject2_.ymin + epsilon,
+                                  detectedObject2_.xmax,
+                                  detectedObject2_.ymax,
+                                  detectedObject2_.prob);
+
+        // Different classes never overlap by definition.
+        if (box1.objectType != box2.objectType)
+            return 0.0f;
+
+        // Degenerate (inverted) boxes contribute no overlap.
+        if (box1.xmax < box1.xmin || box1.ymax < box1.ymin)
+            return 0.0;
+        if (box2.xmax < box2.xmin || box2.ymax < box2.ymin)
+            return 0.0;
+
+        const float xmin = (std::max)(box1.xmin, box2.xmin);
+        const float ymin = (std::max)(box1.ymin, box2.ymin);
+        const float xmax = (std::min)(box1.xmax, box2.xmax);
+        const float ymax = (std::min)(box1.ymax, box2.ymax);
+
+        // Caffe adds 1 to every length if the box isn't normalized. So do we...
+        const float addendum = (xmax > 1 || ymax > 1) ? 1.0f : 0.0f;
+
+        // Intersection area (zero when the boxes do not overlap).
+        float intr = 0.0f;
+        if ((xmax >= xmin) && (ymax >= ymin)) {
+            intr = (addendum + xmax - xmin) * (addendum + ymax - ymin);
+        }
+
+        // Union area.
+        const float square1 = (addendum + box1.xmax - box1.xmin) *
+                              (addendum + box1.ymax - box1.ymin);
+        const float square2 = (addendum + box2.xmax - box2.xmin) *
+                              (addendum + box2.ymax - box2.ymin);
+        const float unn = square1 + square2 - intr;
+
+        return static_cast<float>(intr) / unn;
+    }
+
+    /// Returns a copy of this object with coordinates scaled independently per axis.
+    DetectedObject scale(float scale_x, float scale_y) const {
+        return DetectedObject(objectType,
+                              xmin * scale_x,
+                              ymin * scale_y,
+                              xmax * scale_x,
+                              ymax * scale_y,
+                              prob,
+                              difficult);
+    }
+};
+
+/// A set of detected objects belonging to one image, plus a flag controlling
+/// whether probabilities participate in the matching score.
+class ImageDescription {
+public:
+    const std::list<DetectedObject> alist;
+    const bool check_probs;
+
+    explicit ImageDescription(const std::list<DetectedObject>& _alist, bool _check_probs = false)
+        : alist(_alist),
+          check_probs(_check_probs) {}
+
+    /// Greedy average IoU between two detection sets: each object of the smaller
+    /// set is matched to (and consumes) its best counterpart in the larger set;
+    /// the summed score is normalized by the larger set's size.
+    static float ioUMultiple(const ImageDescription& detectedObjects, const ImageDescription& desiredObjects) {
+        const bool check_probs = desiredObjects.check_probs;
+
+        const bool detectedIsSmaller = detectedObjects.alist.size() < desiredObjects.alist.size();
+        const ImageDescription* smaller = detectedIsSmaller ? &detectedObjects : &desiredObjects;
+        const ImageDescription* bigger = detectedIsSmaller ? &desiredObjects : &detectedObjects;
+
+        std::list<DetectedObject> pending = smaller->alist;
+        std::list<DetectedObject> candidates = bigger->alist;
+
+        float fullScore = 0.0f;
+        while (!pending.empty()) {
+            // Find the best-overlapping, not-yet-consumed candidate for the front object.
+            float score = 0.0f;
+            auto best = candidates.end();
+            for (auto it = candidates.begin(); it != candidates.end(); ++it) {
+                const float curscore = DetectedObject::ioU(pending.front(), *it);
+                if (score < curscore) {
+                    score = curscore;
+                    best = it;
+                }
+            }
+
+            // Optionally down-weight the match by how much the confidences disagree.
+            float coeff = 1.0;
+            if (check_probs && best != candidates.end()) {
+                const float mn = std::min(best->prob, pending.front().prob);
+                const float mx = std::max(best->prob, pending.front().prob);
+                coeff = mn / mx;
+            }
+
+            pending.pop_front();
+            if (best != candidates.end())
+                candidates.erase(best);
+            fullScore += coeff * score;
+        }
+        fullScore /= bigger->alist.size();
+
+        return fullScore;
+    }
+
+    /// Returns a copy with every contained object scaled per axis.
+    ImageDescription scale(float scale_x, float scale_y) const {
+        std::list<DetectedObject> slist;
+        for (const auto& dob : alist) {
+            slist.push_back(dob.scale(scale_x, scale_y));
+        }
+        return ImageDescription(slist, check_probs);
+    }
+};
+
+/// Accumulates detection-vs-ground-truth matches over many images and computes
+/// the per-class 11-point interpolated average precision (PASCAL VOC style).
+struct AveragePrecisionCalculator {
+private:
+    enum MatchKind { TruePositive, FalsePositive };
+
+    /**
+     * Here we count all TP and FP matches for all the classes in all the images.
+     * Keyed by class id; each entry stores (confidence, match kind) pairs.
+     */
+    std::map<int, std::vector<std::pair<double, MatchKind>>> matches;
+
+    // Per-class count of non-difficult ground-truth objects (recall denominator).
+    std::map<int, int> N;
+
+    // Minimum IoU for a detection to count as a true positive.
+    double threshold;
+
+    // Orders detections by decreasing confidence.
+    static bool SortBBoxDescend(const DetectedObject& bbox1, const DetectedObject& bbox2) {
+        return bbox1.prob > bbox2.prob;
+    }
+
+    // Orders (confidence, match-kind) pairs by decreasing confidence.
+    static bool SortPairDescend(const std::pair<double, MatchKind>& p1, const std::pair<double, MatchKind>& p2) {
+        return p1.first > p2.first;
+    }
+
+public:
+    // @param _threshold - IoU threshold separating true from false positives
+    explicit AveragePrecisionCalculator(double _threshold) : threshold(_threshold) {}
+
+    // gt_bboxes -> des
+    // bboxes -> det
+
+    // Classifies every detection of one image as TP or FP against the ground truth.
+    void consumeImage(const ImageDescription& detectedObjects, const ImageDescription& desiredObjects) {
+        // Collecting IoU values
+        std::vector<bool> visited(desiredObjects.alist.size(), false);
+        std::vector<DetectedObject> bboxes{std::begin(detectedObjects.alist), std::end(detectedObjects.alist)};
+        // Process detections in order of decreasing confidence (VOC protocol).
+        std::sort(bboxes.begin(), bboxes.end(), SortBBoxDescend);
+
+        for (auto&& detObj : bboxes) {
+            // Searching for the best match to this detection
+            // Searching for desired object
+            float overlap_max = -1;
+            int jmax = -1;
+            auto desmax = desiredObjects.alist.end();
+
+            int j = 0;
+            for (auto desObj = desiredObjects.alist.begin(); desObj != desiredObjects.alist.end(); desObj++, j++) {
+                double iou = DetectedObject::ioU(detObj, *desObj);
+                if (iou > overlap_max) {
+                    overlap_max = static_cast<float>(iou);
+                    jmax = j;
+                    desmax = desObj;
+                }
+            }
+
+            MatchKind mk;
+            if (overlap_max >= threshold) {
+                // Detections matched to "difficult" ground truth are excluded entirely.
+                if (!desmax->difficult) {
+                    // Only the first sufficiently-overlapping detection of a
+                    // ground-truth object is a TP; duplicates become FPs.
+                    if (!visited[jmax]) {
+                        mk = TruePositive;
+                        visited[jmax] = true;
+                    } else {
+                        mk = FalsePositive;
+                    }
+                    matches[detObj.objectType].push_back(std::make_pair(detObj.prob, mk));
+                }
+            } else {
+                mk = FalsePositive;
+                matches[detObj.objectType].push_back(std::make_pair(detObj.prob, mk));
+            }
+        }
+
+        // Count the non-difficult ground-truth objects per class.
+        for (auto desObj = desiredObjects.alist.begin(); desObj != desiredObjects.alist.end(); desObj++) {
+            if (!desObj->difficult) {
+                N[desObj->objectType]++;
+            }
+        }
+    }
+
+    // Computes the 11-point interpolated AP for every class seen so far.
+    std::map<int, double> calculateAveragePrecisionPerClass() const {
+        /**
+         * Precision-to-TP curve per class (a variation of precision-to-recall curve without
+         * dividing into N)
+         */
+        std::map<int, std::map<int, double>> precisionToTP;
+
+        std::map<int, double> res;
+
+        for (auto m : matches) {
+            // Sorting
+            std::sort(m.second.begin(), m.second.end(), SortPairDescend);
+
+            int clazz = m.first;
+            int TP = 0, FP = 0;
+
+            std::vector<double> prec;
+            std::vector<double> rec;
+
+            // Build the precision/recall curve by sweeping the confidence threshold.
+            for (auto mm : m.second) {
+                // Here we are descending in a probability value
+                MatchKind mk = mm.second;
+                if (mk == TruePositive)
+                    TP++;
+                else if (mk == FalsePositive)
+                    FP++;
+
+                double precision = static_cast<double>(TP) / (TP + FP);
+                double recall = 0;
+                if (N.find(clazz) != N.end()) {
+                    recall = static_cast<double>(TP) / N.at(clazz);
+                }
+
+                prec.push_back(precision);
+                rec.push_back(recall);
+            }
+
+            int num = static_cast<int>(rec.size());
+
+            // 11point from Caffe
+            // For each recall level j/10, take the maximum precision observed at
+            // any recall >= j/10, then average the 11 values.
+            double ap = 0;
+            std::vector<float> max_precs(11, 0.);
+            int start_idx = num - 1;
+            for (int j = 10; j >= 0; --j) {
+                for (int i = start_idx; i >= 0; --i) {
+                    if (rec[i] < j / 10.) {
+                        start_idx = i;
+                        if (j > 0) {
+                            // Carry the running maximum down to the next recall level.
+                            max_precs[j - 1] = max_precs[j];
+                        }
+                        break;
+                    } else {
+                        if (max_precs[j] < prec[i]) {
+                            max_precs[j] = static_cast<float>(prec[i]);
+                        }
+                    }
+                }
+            }
+            for (int j = 10; j >= 0; --j) {
+                ap += max_precs[j] / 11;
+            }
+            res[clazz] = ap;
+        }
+
+        return res;
+    }
+};
+
+/**
+ * @brief Adds colored rectangles to the image
+ * @param data - image pixel buffer (3 bytes per pixel) that rectangles are drawn into
+ * @param height - height of the image in pixels
+ * @param width - width of the image in pixels
+ * @param detectedObjects - vector of detected objects with normalized [0, 1] coordinates
+ */
+static UNUSED void addRectangles(unsigned char* data,
+                                 size_t height,
+                                 size_t width,
+                                 std::vector<DetectedObject> detectedObjects) {
+    std::vector<Color> colors = {{128, 64, 128}, {232, 35, 244}, {70, 70, 70}, {156, 102, 102}, {153, 153, 190},
+                                 {153, 153, 153}, {30, 170, 250}, {0, 220, 220}, {35, 142, 107}, {152, 251, 152},
+                                 {180, 130, 70}, {60, 20, 220}, {0, 0, 255}, {142, 0, 0}, {70, 0, 0},
+                                 {100, 60, 0}, {90, 0, 0}, {230, 0, 0}, {32, 11, 119}, {0, 74, 111},
+                                 {81, 0, 81}};
+
+    for (size_t i = 0; i < detectedObjects.size(); i++) {
+        int cls = detectedObjects[i].objectType % colors.size();
+
+        int xmin = static_cast<int>(detectedObjects[i].xmin * width);
+        int xmax = static_cast<int>(detectedObjects[i].xmax * width);
+        int ymin = static_cast<int>(detectedObjects[i].ymin * height);
+        int ymax = static_cast<int>(detectedObjects[i].ymax * height);
+
+        // Clamp all coordinates into the image so boxes whose coordinates fall
+        // outside [0, 1] cannot index past the pixel buffer (the other
+        // addRectangles overload clamps the same way).
+        xmin = std::min(std::max(xmin, 0), static_cast<int>(width) - 1);
+        xmax = std::min(std::max(xmax, 0), static_cast<int>(width) - 1);
+        ymin = std::min(std::max(ymin, 0), static_cast<int>(height) - 1);
+        ymax = std::min(std::max(ymax, 0), static_cast<int>(height) - 1);
+
+        // Top and bottom horizontal edges.
+        size_t shift_first = ymin * width * 3;
+        size_t shift_second = ymax * width * 3;
+        for (int x = xmin; x < xmax; x++) {
+            data[shift_first + x * 3] = colors.at(cls).red();
+            data[shift_first + x * 3 + 1] = colors.at(cls).green();
+            data[shift_first + x * 3 + 2] = colors.at(cls).blue();
+            data[shift_second + x * 3] = colors.at(cls).red();
+            data[shift_second + x * 3 + 1] = colors.at(cls).green();
+            data[shift_second + x * 3 + 2] = colors.at(cls).blue();
+        }
+
+        // Left and right vertical edges.
+        shift_first = xmin * 3;
+        shift_second = xmax * 3;
+        for (int y = ymin; y < ymax; y++) {
+            data[shift_first + y * width * 3] = colors.at(cls).red();
+            data[shift_first + y * width * 3 + 1] = colors.at(cls).green();
+            data[shift_first + y * width * 3 + 2] = colors.at(cls).blue();
+            data[shift_second + y * width * 3] = colors.at(cls).red();
+            data[shift_second + y * width * 3 + 1] = colors.at(cls).green();
+            data[shift_second + y * width * 3 + 2] = colors.at(cls).blue();
+        }
+    }
+}
+
+/// Queries the OpenVINO Core for all available devices and prints them on one line.
+inline void showAvailableDevices() {
+    ov::Core core;
+
+    std::cout << std::endl;
+    std::cout << "Available target devices:";
+    for (const std::string& deviceName : core.get_available_devices()) {
+        std::cout << " " << deviceName;
+    }
+    std::cout << std::endl;
+}
+
+/**
+ * @brief Parse text config file. The file must have the following format (with space as a delimiter):
+ * CONFIG_NAME1 CONFIG_VALUE1
+ * CONFIG_NAME2 CONFIG_VALUE2
+ *
+ * @param configName - filename for a file with config options
+ * @param comment - lines starting with symbol `comment` are skipped
+ */
+std::map<std::string, std::string> parseConfig(const std::string& configName, char comment = '#');
+
+/// Returns the device's full name via the ov::device::full_name property.
+/// Plugins that do not expose the property yield an empty string instead of
+/// propagating the exception.
+inline std::string getFullDeviceName(ov::Core& core, std::string device) {
+    try {
+        return core.get_property(device, ov::device::full_name);
+    } catch (const ov::Exception&) {
+        return std::string{};
+    }
+}
+
+/**
+ * @brief Prints per-layer profiling records (status, layer type, exec type,
+ *        timings in ms) followed by the total real time and the device name.
+ * @param performanceData - profiling records printed in their given order
+ * @param stream - destination stream for the per-layer table
+ * @param deviceName - full device name, printed to std::cout after the table
+ * @param bshowHeader - when true, prints a "performance counts:" banner first
+ */
+static UNUSED void printPerformanceCounts(std::vector<ov::ProfilingInfo> performanceData,
+                                          std::ostream& stream,
+                                          std::string deviceName,
+                                          bool bshowHeader = true) {
+    std::chrono::microseconds totalTime = std::chrono::microseconds::zero();
+    // Print performance counts
+    if (bshowHeader) {
+        stream << std::endl << "performance counts:" << std::endl << std::endl;
+    }
+    // NOTE(review): flags are saved/restored for std::cout, but the std::setw /
+    // std::left manipulators below modify `stream` — confirm callers pass std::cout.
+    std::ios::fmtflags fmt(std::cout.flags());
+    for (const auto& it : performanceData) {
+        std::string toPrint(it.node_name);
+        const int maxLayerName = 30;
+
+        // Truncate over-long layer names with an ellipsis so columns stay aligned.
+        // (Cast fixes the signed/unsigned comparison between length() and int.)
+        if (it.node_name.length() >= static_cast<std::size_t>(maxLayerName)) {
+            toPrint = it.node_name.substr(0, maxLayerName - 5);
+            toPrint += "...";
+        }
+
+        stream << std::setw(maxLayerName) << std::left << toPrint << " ";
+        switch (it.status) {
+        case ov::ProfilingInfo::Status::EXECUTED:
+            stream << std::setw(15) << std::left << "EXECUTED ";
+            break;
+        case ov::ProfilingInfo::Status::NOT_RUN:
+            stream << std::setw(15) << std::left << "NOT_RUN ";
+            break;
+        case ov::ProfilingInfo::Status::OPTIMIZED_OUT:
+            stream << std::setw(15) << std::left << "OPTIMIZED_OUT ";
+            break;
+        }
+        stream << std::setw(30) << std::left << "layerType: " + std::string(it.node_type) + " ";
+        stream << std::setw(30) << std::left << "execType: " + std::string(it.exec_type) + " ";
+        stream << std::setw(25) << std::left << "realTime (ms): " + std::to_string(it.real_time.count() / 1000.0) + " ";
+        stream << std::setw(25) << std::left << "cpuTime (ms): " + std::to_string(it.cpu_time.count() / 1000.0) + " ";
+        stream << std::endl;
+        // Only layers that actually consumed time contribute to the total.
+        if (it.real_time.count() > 0) {
+            totalTime += it.real_time;
+        }
+    }
+    stream << std::setw(25) << std::left << "Total time: " + std::to_string(totalTime.count() / 1000.0)
+           << " milliseconds" << std::endl;
+    std::cout << std::endl;
+    std::cout << "Full device name: " << deviceName << std::endl;
+    std::cout << std::endl;
+    std::cout.flags(fmt);
+}
+
+/// Convenience overload: fetches the profiling data from an infer request and
+/// delegates to the vector-based printer.
+static UNUSED void printPerformanceCounts(ov::InferRequest request,
+                                          std::ostream& stream,
+                                          std::string deviceName,
+                                          bool bshowHeader = true) {
+    printPerformanceCounts(request.get_profiling_info(), stream, deviceName, bshowHeader);
+}
+
+/// Formats a double with fixed notation and exactly two digits after the
+/// decimal point.
+static inline std::string double_to_string(const double number) {
+    std::ostringstream formatted;
+    formatted << std::fixed << std::setprecision(2) << number;
+    return formatted.str();
+}
+
+// Maps an arithmetic type T to the matching standard distribution:
+// std::uniform_real_distribution for floating-point types,
+// std::uniform_int_distribution for integral types, and void otherwise.
+template <typename T>
+using uniformDistribution = typename std::conditional<
+    std::is_floating_point<T>::value,
+    std::uniform_real_distribution<T>,
+    typename std::conditional<std::is_integral<T>::value, std::uniform_int_distribution<T>, void>::type>::type;
+
+/// Fills a tensor (interpreted as elements of type T) with values drawn from a
+/// uniform distribution over [rand_min, rand_max]. The generator is seeded
+/// with a constant so the output is deterministic across runs.
+template <typename T, typename T2>
+static inline void fill_random(ov::Tensor& tensor,
+                               T rand_min = std::numeric_limits<uint8_t>::min(),
+                               T rand_max = std::numeric_limits<uint8_t>::max()) {
+    std::mt19937 engine(0);  // fixed seed for reproducibility
+    const size_t elementCount = tensor.get_size();
+    if (elementCount == 0) {
+        throw std::runtime_error(
+            "Models with dynamic shapes aren't supported. Input tensors must have specific shapes before inference");
+    }
+    uniformDistribution<T2> distribution(rand_min, rand_max);
+    T* values = tensor.data<T>();
+    for (size_t idx = 0; idx < elementCount; ++idx) {
+        values[idx] = static_cast<T>(distribution(engine));
+    }
+}
+
+/**
+ * @brief Fills a tensor with pseudo-random values selected by its element type.
+ *        Delegates to fill_random, whose generator is seeded with 0, so the
+ *        produced data is deterministic.
+ * @param tensor - tensor to fill; passed by value (NOTE(review): relies on the
+ *        copy sharing its underlying buffer with the caller's tensor — confirm)
+ * @throws ov::Exception when the element type has no supported filler
+ */
+static inline void fill_tensor_random(ov::Tensor tensor) {
+    switch (tensor.get_element_type()) {
+    case ov::element::f32:
+        fill_random<float, float>(tensor);
+        break;
+    case ov::element::f64:
+        fill_random<double, double>(tensor);
+        break;
+    case ov::element::f16:
+        // f16 payload is generated as raw 16-bit values.
+        fill_random<short, short>(tensor);
+        break;
+    case ov::element::i32:
+        fill_random<int32_t, int32_t>(tensor);
+        break;
+    case ov::element::i64:
+        fill_random<int64_t, int64_t>(tensor);
+        break;
+    case ov::element::u8:
+        // uniform_int_distribution<uint8_t> is not allowed in the C++17
+        // standard and vs2017/19
+        fill_random<uint8_t, uint32_t>(tensor);
+        break;
+    case ov::element::i8:
+        // uniform_int_distribution<int8_t> is not allowed in the C++17 standard
+        // and vs2017/19
+        fill_random<int8_t, int32_t>(tensor, std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max());
+        break;
+    case ov::element::u16:
+        fill_random<uint16_t, uint16_t>(tensor);
+        break;
+    case ov::element::i16:
+        fill_random<int16_t, int16_t>(tensor);
+        break;
+    case ov::element::boolean:
+        // Booleans are generated as bytes restricted to {0, 1}.
+        fill_random<uint8_t, uint32_t>(tensor, 0, 1);
+        break;
+    default:
+        throw ov::Exception("Input type is not supported for a tensor");
+    }
+}
+
+/**
+ * @brief Prints per-layer profiling records in their given order, including each
+ *        layer's share of the total measured real time.
+ * @param performanceData - profiling records printed as-is (no sorting)
+ * @param stream - destination stream for the per-layer table
+ * @param deviceName - full device name, printed to std::cout after the table
+ * @param bshowHeader - when true, prints a "performance counts:" banner first
+ */
+static UNUSED void printPerformanceCountsNoSort(std::vector<ov::ProfilingInfo> performanceData,
+                                                std::ostream& stream,
+                                                std::string deviceName,
+                                                bool bshowHeader = true) {
+    std::chrono::microseconds totalTime = std::chrono::microseconds::zero();
+    // Print performance counts
+    if (bshowHeader) {
+        stream << std::endl << "performance counts:" << std::endl << std::endl;
+    }
+    // NOTE(review): flags are saved/restored for std::cout, but the std::setw /
+    // std::left manipulators below modify `stream` — confirm callers pass std::cout.
+    std::ios::fmtflags fmt(std::cout.flags());
+
+    // Accumulate the total real time first so proportions can be computed.
+    for (const auto& it : performanceData) {
+        if (it.real_time.count() > 0) {
+            totalTime += it.real_time;
+        }
+    }
+    // Print the table only when at least one layer actually consumed time.
+    if (totalTime.count() != 0) {
+        for (const auto& it : performanceData) {
+            std::string toPrint(it.node_name);
+            const int maxLayerName = 30;
+
+            // Truncate over-long layer names with an ellipsis to keep columns aligned.
+            if (it.node_name.length() >= maxLayerName) {
+                toPrint = it.node_name.substr(0, maxLayerName - 5);
+                toPrint += "...";
+            }
+
+            stream << std::setw(maxLayerName) << std::left << toPrint << " ";
+            switch (it.status) {
+            case ov::ProfilingInfo::Status::EXECUTED:
+                stream << std::setw(15) << std::left << "EXECUTED ";
+                break;
+            case ov::ProfilingInfo::Status::NOT_RUN:
+                stream << std::setw(15) << std::left << "NOT_RUN ";
+                break;
+            case ov::ProfilingInfo::Status::OPTIMIZED_OUT:
+                stream << std::setw(15) << std::left << "OPTIMIZED_OUT ";
+                break;
+            }
+            stream << std::setw(30) << std::left << "layerType: " + std::string(it.node_type) + " ";
+            stream << std::setw(30) << std::left << "execType: " + std::string(it.exec_type) + " ";
+            stream << std::setw(25) << std::left
+                   << "realTime (ms): " + std::to_string(it.real_time.count() / 1000.0) + " ";
+            stream << std::setw(25) << std::left
+                   << "cpuTime (ms): " + std::to_string(it.cpu_time.count() / 1000.0) + " ";
+
+            // Layer's share of the total real time; values rounding to 0.00% print as "N/A".
+            double opt_proportion = it.real_time.count() * 100.0 / totalTime.count();
+            std::stringstream opt_proportion_ss;
+            opt_proportion_ss << std::fixed << std::setprecision(2) << opt_proportion;
+            std::string opt_proportion_str = opt_proportion_ss.str();
+            if (opt_proportion_str == "0.00") {
+                opt_proportion_str = "N/A";
+            }
+            stream << std::setw(20) << std::left << "proportion: " + opt_proportion_str + "%";
+
+            stream << std::endl;
+        }
+    }
+    stream << std::setw(25) << std::left << "Total time: " + std::to_string(totalTime.count() / 1000.0)
+           << " milliseconds" << std::endl;
+    std::cout << std::endl;
+    std::cout << "Full device name: " << deviceName << std::endl;
+    std::cout << std::endl;
+    std::cout.flags(fmt);
+}
+
+/// Comparator: orders profiling entries by decreasing real (wall-clock) time.
+static UNUSED bool sort_pc_descend(const ov::ProfilingInfo& profiling1, const ov::ProfilingInfo& profiling2) {
+    return profiling2.real_time < profiling1.real_time;
+}
+
+/**
+ * @brief Prints per-layer profiling records sorted by decreasing real time,
+ *        including each layer's share of the total measured real time.
+ * @param performanceData - profiling records; a sorted copy is printed
+ * @param stream - destination stream for the per-layer table
+ * @param deviceName - full device name, printed to std::cout after the table
+ * @param bshowHeader - when true, prints a "performance counts:" banner first
+ */
+static UNUSED void printPerformanceCountsDescendSort(std::vector<ov::ProfilingInfo> performanceData,
+                                                     std::ostream& stream,
+                                                     std::string deviceName,
+                                                     bool bshowHeader = true) {
+    std::chrono::microseconds totalTime = std::chrono::microseconds::zero();
+    // Print performance counts
+    if (bshowHeader) {
+        stream << std::endl << "performance counts:" << std::endl << std::endl;
+    }
+    // NOTE(review): flags are saved/restored for std::cout, but the std::setw /
+    // std::left manipulators below modify `stream` — confirm callers pass std::cout.
+    std::ios::fmtflags fmt(std::cout.flags());
+
+    // Accumulate the total real time first so proportions can be computed.
+    for (const auto& it : performanceData) {
+        if (it.real_time.count() > 0) {
+            totalTime += it.real_time;
+        }
+    }
+    // Print the table only when at least one layer actually consumed time.
+    if (totalTime.count() != 0) {
+        // sort perfcounter
+        std::vector<ov::ProfilingInfo> sortPerfCounts{std::begin(performanceData), std::end(performanceData)};
+        std::sort(sortPerfCounts.begin(), sortPerfCounts.end(), sort_pc_descend);
+
+        for (const auto& it : sortPerfCounts) {
+            std::string toPrint(it.node_name);
+            const int maxLayerName = 30;
+
+            // Truncate over-long layer names with an ellipsis to keep columns aligned.
+            if (it.node_name.length() >= maxLayerName) {
+                toPrint = it.node_name.substr(0, maxLayerName - 5);
+                toPrint += "...";
+            }
+
+            stream << std::setw(maxLayerName) << std::left << toPrint << " ";
+            switch (it.status) {
+            case ov::ProfilingInfo::Status::EXECUTED:
+                stream << std::setw(15) << std::left << "EXECUTED ";
+                break;
+            case ov::ProfilingInfo::Status::NOT_RUN:
+                stream << std::setw(15) << std::left << "NOT_RUN ";
+                break;
+            case ov::ProfilingInfo::Status::OPTIMIZED_OUT:
+                stream << std::setw(15) << std::left << "OPTIMIZED_OUT ";
+                break;
+            }
+            stream << std::setw(30) << std::left << "layerType: " + std::string(it.node_type) + " ";
+            stream << std::setw(30) << std::left << "execType: " + std::string(it.exec_type) + " ";
+            stream << std::setw(25) << std::left
+                   << "realTime (ms): " + std::to_string(it.real_time.count() / 1000.0) + " ";
+            stream << std::setw(25) << std::left
+                   << "cpuTime (ms): " + std::to_string(it.cpu_time.count() / 1000.0) + " ";
+
+            // Layer's share of the total real time; values rounding to 0.00% print as "N/A".
+            double opt_proportion = it.real_time.count() * 100.0 / totalTime.count();
+            std::stringstream opt_proportion_ss;
+            opt_proportion_ss << std::fixed << std::setprecision(2) << opt_proportion;
+            std::string opt_proportion_str = opt_proportion_ss.str();
+            if (opt_proportion_str == "0.00") {
+                opt_proportion_str = "N/A";
+            }
+            stream << std::setw(20) << std::left << "proportion: " + opt_proportion_str + "%";
+
+            stream << std::endl;
+        }
+    }
+    stream << std::setw(25) << std::left << "Total time: " + std::to_string(totalTime.count() / 1000.0)
+           << " milliseconds" << std::endl;
+    std::cout << std::endl;
+    std::cout << "Full device name: " << deviceName << std::endl;
+    std::cout << std::endl;
+    std::cout.flags(fmt);
+}
+
+/**
+ * @brief Prints only the EXECUTED profiling records, sorted by decreasing real
+ *        time, including each layer's share of the total measured real time.
+ * @param performanceData - profiling records; non-EXECUTED entries are skipped
+ * @param stream - destination stream for the per-layer table
+ * @param deviceName - full device name, printed to std::cout after the table
+ * @param bshowHeader - when true, prints a "performance counts:" banner first
+ */
+static UNUSED void printPerformanceCountsSimpleSort(std::vector<ov::ProfilingInfo> performanceData,
+                                                    std::ostream& stream,
+                                                    std::string deviceName,
+                                                    bool bshowHeader = true) {
+    std::chrono::microseconds totalTime = std::chrono::microseconds::zero();
+    // Print performance counts
+    if (bshowHeader) {
+        stream << std::endl << "performance counts:" << std::endl << std::endl;
+    }
+    // NOTE(review): flags are saved/restored for std::cout, but the std::setw /
+    // std::left manipulators below modify `stream` — confirm callers pass std::cout.
+    std::ios::fmtflags fmt(std::cout.flags());
+
+    // Accumulate the total real time first so proportions can be computed.
+    for (const auto& it : performanceData) {
+        if (it.real_time.count() > 0) {
+            totalTime += it.real_time;
+        }
+    }
+    // Print the table only when at least one layer actually consumed time.
+    if (totalTime.count() != 0) {
+        // sort perfcounter
+        std::vector<ov::ProfilingInfo> sortPerfCounts{std::begin(performanceData), std::end(performanceData)};
+        std::sort(sortPerfCounts.begin(), sortPerfCounts.end(), sort_pc_descend);
+
+        for (const auto& it : sortPerfCounts) {
+            // Only layers that actually executed are listed.
+            if (it.status == ov::ProfilingInfo::Status::EXECUTED) {
+                std::string toPrint(it.node_name);
+                const int maxLayerName = 30;
+
+                // Truncate over-long layer names with an ellipsis to keep columns aligned.
+                if (it.node_name.length() >= maxLayerName) {
+                    toPrint = it.node_name.substr(0, maxLayerName - 5);
+                    toPrint += "...";
+                }
+
+                stream << std::setw(maxLayerName) << std::left << toPrint << " ";
+                stream << std::setw(15) << std::left << "EXECUTED ";
+                stream << std::setw(30) << std::left << "layerType: " + std::string(it.node_type) + " ";
+                stream << std::setw(30) << std::left << "execType: " + std::string(it.exec_type) + " ";
+                stream << std::setw(25) << std::left
+                       << "realTime (ms): " + std::to_string(it.real_time.count() / 1000.0) + " ";
+                stream << std::setw(25) << std::left
+                       << "cpuTime (ms): " + std::to_string(it.cpu_time.count() / 1000.0) + " ";
+
+                // Layer's share of the total real time; values rounding to 0.00% print as "N/A".
+                double opt_proportion = it.real_time.count() * 100.0 / totalTime.count();
+                std::stringstream opt_proportion_ss;
+                opt_proportion_ss << std::fixed << std::setprecision(2) << opt_proportion;
+                std::string opt_proportion_str = opt_proportion_ss.str();
+                if (opt_proportion_str == "0.00") {
+                    opt_proportion_str = "N/A";
+                }
+                stream << std::setw(20) << std::left << "proportion: " + opt_proportion_str + "%";
+
+                stream << std::endl;
+            }
+        }
+    }
+    stream << std::setw(25) << std::left << "Total time: " + std::to_string(totalTime.count() / 1000.0)
+           << " milliseconds" << std::endl;
+    std::cout << std::endl;
+    std::cout << "Full device name: " << deviceName << std::endl;
+    std::cout << std::endl;
+    std::cout.flags(fmt);
+}
+
+/// Dispatches to the printer matching the requested sort mode; unrecognized
+/// modes print nothing.
+static UNUSED void printPerformanceCountsSort(std::vector<ov::ProfilingInfo> performanceData,
+                                              std::ostream& stream,
+                                              std::string deviceName,
+                                              std::string sorttype,
+                                              bool bshowHeader = true) {
+    if (sorttype == pcNoSort) {
+        printPerformanceCountsNoSort(performanceData, stream, deviceName, bshowHeader);
+        return;
+    }
+    if (sorttype == pcSort) {
+        printPerformanceCountsDescendSort(performanceData, stream, deviceName, bshowHeader);
+        return;
+    }
+    if (sorttype == pcSimpleSort) {
+        printPerformanceCountsSimpleSort(performanceData, stream, deviceName, bshowHeader);
+    }
+}
diff --git a/python/openvino/runtime/common/utils/include/samples/console_progress.hpp b/python/openvino/runtime/common/utils/include/samples/console_progress.hpp
new file mode 100644
index 0000000..f62aeed
--- /dev/null
+++ b/python/openvino/runtime/common/utils/include/samples/console_progress.hpp
@@ -0,0 +1,107 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <cstdio>
+#include <iomanip>
+#include <sstream>
+
/**
 * @class ConsoleProgress
 * @brief Renders a textual progress bar ("Progress: [....    ]  42% done") on stdout.
 *
 * In carriage-return mode (default) the bar is redrawn in place; in stream
 * mode each update is printed on its own line.
 */
class ConsoleProgress {
    static const size_t DEFAULT_DETALIZATION = 20;
    static const size_t DEFAULT_PERCENT_TO_UPDATE_PROGRESS = 1;

    size_t total_;            // value corresponding to 100%
    size_t current_ = 0;      // progress accumulated so far
    size_t last_shown_ = 0;   // progress value at the last redraw
    bool stream_output_;      // true: one line per update; false: redraw via '\r'
    size_t detalization_;     // number of '.' cells in the bar
    size_t percent_step_;     // minimum percent delta that triggers a redraw

public:
    /**
     * @brief A constructor of ConsoleProgress class
     * @param _total - maximum value that is correspondent to 100%
     * @param _stream_output - print each update on a new line instead of redrawing
     * @param _percent_to_update - minimum percent change between redraws
     * @param _detalization - number of symbols(.) to use to represent progress
     */
    explicit ConsoleProgress(size_t _total,
                             bool _stream_output = false,
                             size_t _percent_to_update = DEFAULT_PERCENT_TO_UPDATE_PROGRESS,
                             size_t _detalization = DEFAULT_DETALIZATION)
        : total_(_total == 0 ? 1 : _total),  // guard against division by zero
          stream_output_(_stream_output),
          detalization_(_detalization),
          percent_step_(_percent_to_update) {}

    /**
     * @brief Shows progress with current data. Progress is shown from the beginning
     * of the current line.
     */
    void showProgress() const {
        std::stringstream out;
        if (!stream_output_) {
            out << '\r';
        }
        out << "Progress: [";
        const size_t filled = detalization_ * current_ / total_;
        for (size_t i = 0; i < detalization_; ++i) {
            out << (i < filled ? "." : " ");
        }
        out << "] " << std::setw(3) << 100 * current_ / total_ << "% done";
        if (stream_output_) {
            out << std::endl;
        }
        std::fputs(out.str().c_str(), stdout);
        std::fflush(stdout);
    }

    /**
     * @brief Redraws the bar when the percent delta since the last redraw reaches
     * the configured threshold (always on first and final update).
     */
    void updateProgress() {
        if (current_ > total_)
            current_ = total_;
        const size_t shown_percent = 100 * last_shown_ / total_;
        const size_t now_percent = 100 * current_ / total_;
        const bool redraw_due =
            last_shown_ == 0 || current_ == total_ || shown_percent + percent_step_ <= now_percent;
        if (redraw_due) {
            showProgress();
            last_shown_ = current_;
        }
    }

    /**
     * @brief Adds value to currently represented and redraw progressbar
     * @param add - value to add (negative values are clamped so progress never goes below zero)
     */
    void addProgress(int add) {
        if (add < 0 && -add > static_cast<int>(current_)) {
            add = -static_cast<int>(current_);
        }
        current_ += add;
        updateProgress();
    }

    /**
     * @brief Output end line.
     */
    void finish() {
        std::fputs("\n", stdout);
        std::fflush(stdout);
    }
};
diff --git a/python/openvino/runtime/common/utils/include/samples/csv_dumper.hpp b/python/openvino/runtime/common/utils/include/samples/csv_dumper.hpp
new file mode 100644
index 0000000..5c80134
--- /dev/null
+++ b/python/openvino/runtime/common/utils/include/samples/csv_dumper.hpp
@@ -0,0 +1,98 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <ctime>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <string>
+
+#include "samples/slog.hpp"
+
/**
 * @class CsvDumper
 * @brief A CsvDumper class provides functionality for dumping the values in CSV files
 */
class CsvDumper {
    std::ofstream file;
    std::string filename;
    bool canDump = true;
    char delimiter = ';';

    // Builds a default name such as "dumpfile-1650000000.csv" from the current
    // epoch time in seconds.
    std::string generateFilename() {
        std::stringstream filename;
        filename << "dumpfile-";
        filename << time(nullptr);
        filename << ".csv";
        return filename.str();
    }

public:
    /**
     * @brief A constructor. Disables dumping in case dump file cannot be created
     * @param enabled - True if dumping is enabled by default.
     * @param name - name of file to dump to. File won't be created if first parameter is false.
     */
    explicit CsvDumper(bool enabled = true, const std::string& name = "") : canDump(enabled) {
        if (!canDump) {
            return;
        }
        filename = (name == "" ? generateFilename() : name);
        file.open(filename, std::ios::out);
        if (!file) {
            slog::warn << "Cannot create dump file! Disabling dump." << slog::endl;
            canDump = false;
        }
    }

    /**
     * @brief Sets a delimiter to use in csv file
     * @param c - Delimiter char
     * @return
     */
    void setDelimiter(char c) {
        delimiter = c;
    }

    /**
     * @brief Overloads operator to organize streaming values to file. Does nothing if dumping is
     * disabled Adds delimiter at the end of value provided
     * @note The delimiter is appended after every value, including the last one of a row.
     * @param add - value to add to dump
     * @return reference to same object
     */
    template <class T>
    CsvDumper& operator<<(const T& add) {
        if (canDump) {
            file << add << delimiter;
        }
        return *this;
    }

    /**
     * @brief Finishes line in dump file. Does nothing if dumping is disabled
     */
    void endLine() {
        if (canDump) {
            file << "\n";
        }
    }

    /**
     * @brief Gets information if dump is enabled.
     * @return true if dump is enabled and file was successfully created
     */
    bool dumpEnabled() {
        return canDump;
    }

    /**
     * @brief Gets name of a dump file
     * @return name of a dump file
     */
    std::string getFilename() const {
        return filename;
    }
};
diff --git a/python/openvino/runtime/common/utils/include/samples/latency_metrics.hpp b/python/openvino/runtime/common/utils/include/samples/latency_metrics.hpp
new file mode 100644
index 0000000..bca39d0
--- /dev/null
+++ b/python/openvino/runtime/common/utils/include/samples/latency_metrics.hpp
@@ -0,0 +1,42 @@
+// Copyright (C) 2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <map>
+#include <string>
+#include <utility>
+#include <vector>
+
+// clang-format off
+#include "samples/common.hpp"
+#include "samples/slog.hpp"
+// clang-format on
+
/// @brief Responsible for calculating different latency metrics
/// (median/percentile, average, min, max) over a set of measured latencies.
class LatencyMetrics {
public:
    LatencyMetrics() {}

    /// @param latencies measured latencies in milliseconds; must be non-empty
    ///        (fill_data throws std::logic_error otherwise)
    /// @param data_shape textual shape tag propagated to the CSV/stream output
    /// @param percentile_boundary percentile to report; 50 means median
    LatencyMetrics(const std::vector<double>& latencies,
                   const std::string& data_shape = "",
                   size_t percentile_boundary = 50)
        : data_shape(data_shape),
          percentile_boundary(percentile_boundary) {
        fill_data(latencies, percentile_boundary);
    }

    // Writes "shape;median;avg;min;max" with 2-digit precision.
    void write_to_stream(std::ostream& stream) const;
    // Writes a human-readable report through slog::info.
    void write_to_slog() const;

    double median_or_percentile = 0;
    double avg = 0;
    double min = 0;
    double max = 0;
    std::string data_shape;

private:
    // Sorts a copy of the latencies and fills the public statistics fields.
    void fill_data(std::vector<double> latencies, size_t percentile_boundary);
    size_t percentile_boundary = 50;
};
diff --git a/python/openvino/runtime/common/utils/include/samples/ocv_common.hpp b/python/openvino/runtime/common/utils/include/samples/ocv_common.hpp
new file mode 100644
index 0000000..94f3b1f
--- /dev/null
+++ b/python/openvino/runtime/common/utils/include/samples/ocv_common.hpp
@@ -0,0 +1,92 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+/**
+ * @brief a header file with common samples functionality using OpenCV
+ * @file ocv_common.hpp
+ */
+
+#pragma once
+
+#include <opencv2/opencv.hpp>
+
+#include "openvino/openvino.hpp"
+#include "samples/common.hpp"
+
/**
 * @brief Sets image data stored in cv::Mat object to a given Blob object.
 * Resizes the image to the blob's spatial size when needed and copies pixels
 * in planar (NCHW) order into the blob at the given batch index.
 * @param orig_image - given cv::Mat object with an image data.
 * @param blob - Blob object which to be filled by an image data.
 * @param batchIndex - batch index of an image inside of the blob.
 * @note assumes the blob dims are NCHW and the image is 8-bit 3-channel
 *       (cv::Vec3b is used for element access) — TODO confirm for
 *       single-channel inputs.
 */
template <typename T>
void matU8ToBlob(const cv::Mat& orig_image, InferenceEngine::Blob::Ptr& blob, int batchIndex = 0) {
    InferenceEngine::SizeVector blobSize = blob->getTensorDesc().getDims();
    const size_t width = blobSize[3];
    const size_t height = blobSize[2];
    const size_t channels = blobSize[1];
    InferenceEngine::MemoryBlob::Ptr mblob = InferenceEngine::as<InferenceEngine::MemoryBlob>(blob);
    OPENVINO_ASSERT(mblob,
                    "We expect blob to be inherited from MemoryBlob in matU8ToBlob, "
                    "but by fact we were not able to cast inputBlob to MemoryBlob");
    // locked memory holder should be alive all time while access to its buffer happens
    auto mblobHolder = mblob->wmap();

    T* blob_data = mblobHolder.as<T*>();

    // Resize only when the source size differs from the blob's spatial size.
    cv::Mat resized_image(orig_image);
    if (static_cast<int>(width) != orig_image.size().width || static_cast<int>(height) != orig_image.size().height) {
        cv::resize(orig_image, resized_image, cv::Size(width, height));
    }

    int batchOffset = batchIndex * width * height * channels;

    // Interleaved (HWC) -> planar (CHW) copy.
    for (size_t c = 0; c < channels; c++) {
        for (size_t h = 0; h < height; h++) {
            for (size_t w = 0; w < width; w++) {
                blob_data[batchOffset + c * width * height + h * width + w] = resized_image.at<cv::Vec3b>(h, w)[c];
            }
        }
    }
}
+
/**
 * @brief Wraps data stored inside of a passed cv::Mat object by new Blob pointer.
 * @note: No memory allocation is happened. The blob just points to already existing
 * cv::Mat data, so the cv::Mat must outlive the returned blob.
 * @param mat - given cv::Mat object with an image data.
 * @return resulting Blob pointer.
 */
static UNUSED InferenceEngine::Blob::Ptr wrapMat2Blob(const cv::Mat& mat) {
    size_t channels = mat.channels();
    size_t height = mat.size().height;
    size_t width = mat.size().width;

    // step.buf[0]/[1] are the row and element strides in bytes.
    size_t strideH = mat.step.buf[0];
    size_t strideW = mat.step.buf[1];

    // Zero-copy wrapping requires tightly packed (dense) pixel data.
    bool is_dense = strideW == channels && strideH == channels * width;

    OPENVINO_ASSERT(is_dense, "Doesn't support conversion from not dense cv::Mat");

    InferenceEngine::TensorDesc tDesc(InferenceEngine::Precision::U8,
                                      {1, channels, height, width},
                                      InferenceEngine::Layout::NHWC);

    return InferenceEngine::make_shared_blob<uint8_t>(tDesc, mat.data);
}
+
// Wraps a dense 8-bit cv::Mat as a zero-copy ov::Tensor with NHWC shape
// {1, height, width, channels}. The cv::Mat must outlive the returned tensor.
static UNUSED ov::Tensor wrapMat2Tensor(const cv::Mat& mat) {
    const size_t channels = mat.channels();
    const size_t height = mat.size().height;
    const size_t width = mat.size().width;

    // step.buf[0]/[1] are the row and element strides in bytes.
    const size_t strideH = mat.step.buf[0];
    const size_t strideW = mat.step.buf[1];

    // Zero-copy wrapping requires tightly packed (dense) pixel data.
    const bool is_dense = strideW == channels && strideH == channels * width;
    OPENVINO_ASSERT(is_dense, "Doesn't support conversion from not dense cv::Mat");

    return ov::Tensor(ov::element::u8, ov::Shape{1, height, width, channels}, mat.data);
}
diff --git a/python/openvino/runtime/common/utils/include/samples/os/windows/w_dirent.h b/python/openvino/runtime/common/utils/include/samples/os/windows/w_dirent.h
new file mode 100644
index 0000000..40d1c5b
--- /dev/null
+++ b/python/openvino/runtime/common/utils/include/samples/os/windows/w_dirent.h
@@ -0,0 +1,176 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#if defined(_WIN32)
+
+# ifndef WIN32_LEAN_AND_MEAN
+# define WIN32_LEAN_AND_MEAN
+# define WIN32_LEAN_AND_MEAN_UNDEF
+# endif
+
+# ifndef NOMINMAX
+# define NOMINMAX
+# define NOMINMAX_UNDEF
+# endif
+
+# if defined(_M_IX86) && !defined(_X86_) && !defined(_AMD64_)
+# define _X86_
+# endif
+
+# if defined(_M_X64) && !defined(_X86_) && !defined(_AMD64_)
+# define _AMD64_
+# endif
+
+# if defined(_M_ARM) && !defined(_ARM_) && !defined(_ARM64_)
+# define _ARM_
+# endif
+
+# if defined(_M_ARM64) && !defined(_ARM_) && !defined(_ARM64_)
+# define _ARM64_
+# endif
+
+// clang-format off
+ #include <string.h>
+ #include <windef.h>
+ #include <fileapi.h>
+ #include <Winbase.h>
+ #include <sys/stat.h>
+// clang-format on
+
+// Copied from linux libc sys/stat.h:
+# define S_ISREG(m) (((m)&S_IFMT) == S_IFREG)
+# define S_ISDIR(m) (((m)&S_IFMT) == S_IFDIR)
+
/// @brief structure to store directory names
/// Windows replacement for POSIX `struct dirent`: converts a wide-char file
/// name to a heap-allocated narrow string owned by this object.
struct dirent {
    char* d_name;

    explicit dirent(const wchar_t* wsFilePath) {
        size_t i;
        auto slen = wcslen(wsFilePath);
        // NOTE(review): slen + 1 bytes assumes every wide char narrows to one
        // byte; multi-byte locales could truncate the converted name — verify.
        d_name = static_cast<char*>(malloc(slen + 1));
        wcstombs_s(&i, d_name, slen + 1, wsFilePath, slen);
    }
    ~dirent() {
        free(d_name);
    }
};
+
+/// @brief class to store directory data (files meta)
+class DIR {
+ WIN32_FIND_DATAA FindFileData;
+ HANDLE hFind;
+ dirent* next;
+
+ static inline bool endsWith(const std::string& src, const char* with) {
+ int wl = static_cast<int>(strlen(with));
+ int so = static_cast<int>(src.length()) - wl;
+ if (so < 0)
+ return false;
+ return 0 == strncmp(with, &src[so], wl);
+ }
+
+public:
+ DIR(const DIR& other) = delete;
+ DIR(DIR&& other) = delete;
+ DIR& operator=(const DIR& other) = delete;
+ DIR& operator=(DIR&& other) = delete;
+
+ explicit DIR(const char* dirPath) : next(nullptr) {
+ std::string ws = dirPath;
+ if (endsWith(ws, "\\"))
+ ws += "*";
+ else
+ ws += "\\*";
+ hFind = FindFirstFileA(ws.c_str(), &FindFileData);
+ FindFileData.dwReserved0 = hFind != INVALID_HANDLE_VALUE;
+ }
+
+ ~DIR() {
+ if (!next)
+ delete next;
+ next = nullptr;
+ FindClose(hFind);
+ }
+
+ /**
+ * @brief Check file handler is valid
+ * @return status True(success) or False(fail)
+ */
+ bool isValid() const {
+ return (hFind != INVALID_HANDLE_VALUE && FindFileData.dwReserved0);
+ }
+
+ /**
+ * @brief Add directory to directory names struct
+ * @return pointer to directory names struct
+ */
+ dirent* nextEnt() {
+ if (next != nullptr)
+ delete next;
+ next = nullptr;
+
+ if (!FindFileData.dwReserved0)
+ return nullptr;
+
+ wchar_t wbuf[4096];
+
+ size_t outSize;
+ mbstowcs_s(&outSize, wbuf, 4094, FindFileData.cFileName, 4094);
+ next = new dirent(wbuf);
+ FindFileData.dwReserved0 = FindNextFileA(hFind, &FindFileData);
+ return next;
+ }
+};
+
/**
 * @brief Create directory data struct element
 * @param dirPath directory path
 * @return pointer to directory data struct element, or nullptr when the
 *         directory cannot be enumerated (caller must closedir() a non-null result)
 */
static DIR* opendir(const char* dirPath) {
    auto dp = new DIR(dirPath);
    if (!dp->isValid()) {
        delete dp;
        return nullptr;
    }
    return dp;
}
+
/**
 * @brief Walk throw directory data struct
 * @param dp pointer to directory data struct
 * @return pointer to the next directory entry, or nullptr when exhausted;
 *         the returned pointer is owned by @p dp and is invalidated by the next call
 */
static struct dirent* readdir(DIR* dp) {
    return dp->nextEnt();
}
+
/**
 * @brief Remove directory data struct
 * @param dp pointer to struct directory data (closes the underlying find handle)
 * @return void
 */
static void closedir(DIR* dp) {
    delete dp;
}
+
+# ifdef WIN32_LEAN_AND_MEAN_UNDEF
+# undef WIN32_LEAN_AND_MEAN
+# undef WIN32_LEAN_AND_MEAN_UNDEF
+# endif
+
+# ifdef NOMINMAX_UNDEF
+# undef NOMINMAX_UNDEF
+# undef NOMINMAX
+# endif
+
+#else
+
+# include <dirent.h>
+# include <sys/types.h>
+
+#endif
diff --git a/python/openvino/runtime/common/utils/include/samples/slog.hpp b/python/openvino/runtime/common/utils/include/samples/slog.hpp
new file mode 100644
index 0000000..3f237e5
--- /dev/null
+++ b/python/openvino/runtime/common/utils/include/samples/slog.hpp
@@ -0,0 +1,102 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+/**
+ * @brief a header file with logging facility for common samples
+ * @file log.hpp
+ */
+
+#pragma once
+
+#include <ostream>
+#include <string>
+#include <vector>
+
+namespace slog {
/**
 * @class LogStreamEndLine
 * @brief The LogStreamEndLine class implements an end line marker for a log stream
 */
class LogStreamEndLine {};

// Tag object streamed as slog::endl; handled by a dedicated operator<< overload.
static constexpr LogStreamEndLine endl;

/**
 * @class LogStreamBoolAlpha
 * @brief The LogStreamBoolAlpha class implements bool printing for a log stream
 */
class LogStreamBoolAlpha {};

// Tag object streamed as slog::boolalpha.
static constexpr LogStreamBoolAlpha boolalpha;

/**
 * @class LogStreamFlush
 * @brief The LogStreamFlush class implements flushing for a log stream
 */
class LogStreamFlush {};

// Tag object streamed as slog::flush.
static constexpr LogStreamFlush flush;
+
/**
 * @class LogStream
 * @brief The LogStream class implements a stream for sample logging.
 * Prepends "[ PREFIX ] " at the start of each logical line; a new line is
 * started by streaming slog::endl.
 */
class LogStream {
    std::string _prefix;
    std::ostream* _log_stream;
    bool _new_line;  // true when the next insertion begins a new logical line

public:
    /**
     * @brief A constructor. Creates an LogStream object
     * @param prefix The prefix to print
     */
    LogStream(const std::string& prefix, std::ostream& log_stream);

    /**
     * @brief A stream output operator to be used within the logger
     * @param arg Object for serialization in the logger message
     */
    template <class T>
    LogStream& operator<<(const T& arg) {
        if (_new_line) {
            (*_log_stream) << "[ " << _prefix << " ] ";
            _new_line = false;
        }

        (*_log_stream) << arg;
        return *this;
    }

    /**
     * @brief Overload output stream operator to print vectors in pretty form
     * [value1, value2, ...]
     * @note This overload writes directly and does not emit the "[ PREFIX ] "
     * line header (it does not consult _new_line).
     */
    template <typename T>
    LogStream& operator<<(const std::vector<T>& v) {
        (*_log_stream) << "[ ";

        for (auto&& value : v)
            (*_log_stream) << value << " ";

        (*_log_stream) << "]";

        return *this;
    }

    // Specializing for LogStreamEndLine to support slog::endl
    LogStream& operator<<(const LogStreamEndLine&);

    // Specializing for LogStreamBoolAlpha to support slog::boolalpha
    LogStream& operator<<(const LogStreamBoolAlpha&);

    // Specializing for LogStreamFlush to support slog::flush
    LogStream& operator<<(const LogStreamFlush&);
};
+
+extern LogStream info;
+extern LogStream warn;
+extern LogStream err;
+
+} // namespace slog
diff --git a/python/openvino/runtime/common/utils/include/samples/vpu/vpu_tools_common.hpp b/python/openvino/runtime/common/utils/include/samples/vpu/vpu_tools_common.hpp
new file mode 100644
index 0000000..ba0665f
--- /dev/null
+++ b/python/openvino/runtime/common/utils/include/samples/vpu/vpu_tools_common.hpp
@@ -0,0 +1,28 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <fstream>
+#include <map>
+#include <string>
+
/**
 * @brief Reads a whitespace-separated key/value configuration file into a map.
 * Tokens are consumed in pairs; a pair whose key starts with @p comment is skipped.
 * @param configName path to the configuration file
 * @param comment character that marks a key as a comment (default '#')
 * @return parsed key/value map; empty when the file cannot be opened
 */
static std::map<std::string, std::string> parseConfig(const std::string& configName, char comment = '#') {
    std::map<std::string, std::string> parsed;

    std::ifstream input(configName);
    if (!input.is_open()) {
        return parsed;
    }

    std::string key;
    std::string value;
    while (input >> key >> value) {
        if (!key.empty() && key[0] != comment) {
            parsed[key] = value;
        }
    }

    return parsed;
}
diff --git a/python/openvino/runtime/common/utils/src/args_helper.cpp b/python/openvino/runtime/common/utils/src/args_helper.cpp
new file mode 100644
index 0000000..ae7fa67
--- /dev/null
+++ b/python/openvino/runtime/common/utils/src/args_helper.cpp
@@ -0,0 +1,390 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+// clang-format off
+#include <sys/stat.h>
+#include <fcntl.h>
+
+#include <iostream>
+
+#ifdef _WIN32
+# include "samples/os/windows/w_dirent.h"
+#else
+# include <dirent.h>
+# include <unistd.h>
+#endif
+
+#include "openvino/openvino.hpp"
+
+#include "gflags/gflags.h"
+#include "samples/args_helper.hpp"
+#include "samples/slog.hpp"
+// clang-format on
+
/**
 * @brief Checks input file argument and add it to files vector.
 * A regular file is appended as-is; for a directory every entry except "." and
 * ".." is appended as "<arg>/<entry>". Unreadable paths only produce a warning.
 * @param files reference to vector to store file names
 * @param arg file or folder name
 * @return none
 */
void readInputFilesArguments(std::vector<std::string>& files, const std::string& arg) {
    struct stat sb;

#if defined(_WIN32)
    // Windows: no open(); stat the path through a FILE handle instead.
    FILE* fd = fopen(arg.c_str(), "r");
    if (!fd) {
        slog::warn << "File " << arg << " cannot be opened!" << slog::endl;
        return;
    }

    if (fstat(fileno(fd), &sb) != 0) {
        fclose(fd);
        slog::warn << "File " << arg << " cannot be opened!" << slog::endl;
        return;
    }
    fclose(fd);
#else
    int fd = open(arg.c_str(), O_RDONLY);
    if (fd == -1) {
        slog::warn << "File " << arg << " cannot be opened!" << slog::endl;
        return;
    }

    if (fstat(fd, &sb) != 0) {
        close(fd);
        slog::warn << "File " << arg << " cannot be opened!" << slog::endl;
        return;
    }
    close(fd);
#endif

    if (S_ISDIR(sb.st_mode)) {
        // RAII wrapper so the DIR handle is closed on every return path.
        struct CloseDir {
            void operator()(DIR* d) const noexcept {
                if (d) {
                    closedir(d);
                }
            }
        };
        using Dir = std::unique_ptr<DIR, CloseDir>;
        Dir dp(opendir(arg.c_str()));
        if (dp == nullptr) {
            slog::warn << "Directory " << arg << " cannot be opened!" << slog::endl;
            return;
        }

        struct dirent* ep;
        while (nullptr != (ep = readdir(dp.get()))) {
            std::string fileName = ep->d_name;
            // Skip the self and parent pseudo-entries.
            if (fileName == "." || fileName == "..")
                continue;
            files.push_back(arg + "/" + ep->d_name);
        }
    } else {
        files.push_back(arg);
    }
}
+
+/**
+ * @brief This function find -i key in input args. It's necessary to process multiple values for
+ * single key
+ * @param files reference to vector
+ * @return none.
+ */
+void parseInputFilesArguments(std::vector<std::string>& files) {
+ std::vector<std::string> args = gflags::GetArgvs();
+ auto args_it = begin(args);
+ const auto is_image_arg = [](const std::string& s) {
+ return s == "-i" || s == "--images";
+ };
+ const auto is_arg = [](const std::string& s) {
+ return s.front() == '-';
+ };
+
+ while (args_it != args.end()) {
+ const auto img_start = std::find_if(args_it, end(args), is_image_arg);
+ if (img_start == end(args)) {
+ break;
+ }
+ const auto img_begin = std::next(img_start);
+ const auto img_end = std::find_if(img_begin, end(args), is_arg);
+ for (auto img = img_begin; img != img_end; ++img) {
+ readInputFilesArguments(files, *img);
+ }
+ args_it = img_end;
+ }
+
+ if (files.empty()) {
+ return;
+ }
+ size_t max_files = 20;
+ if (files.size() < max_files) {
+ slog::info << "Files were added: " << files.size() << slog::endl;
+ for (const auto& filePath : files) {
+ slog::info << " " << filePath << slog::endl;
+ }
+ } else {
+ slog::info << "Files were added: " << files.size() << ". Too many to display each of them." << slog::endl;
+ }
+}
+
/**
 * @brief Splits @p str on @p delim and returns the non-empty tokens in order.
 * @param str input string; an empty input yields an empty vector
 * @param delim delimiter character
 * @return vector of tokens with empty elements dropped
 */
std::vector<std::string> splitStringList(const std::string& str, char delim) {
    std::vector<std::string> tokens;
    if (str.empty())
        return tokens;

    std::istringstream stream(str);
    for (std::string token; std::getline(stream, token, delim);) {
        if (!token.empty()) {
            tokens.emplace_back(std::move(token));
        }
    }

    return tokens;
}
+
+std::map<std::string, std::string> parseArgMap(std::string argMap) {
+ argMap.erase(std::remove_if(argMap.begin(), argMap.end(), ::isspace), argMap.end());
+
+ const auto pairs = splitStringList(argMap, ',');
+
+ std::map<std::string, std::string> parsedMap;
+ for (auto&& pair : pairs) {
+ const auto lastDelimPos = pair.find_last_of(':');
+ auto key = pair.substr(0, lastDelimPos);
+ auto value = pair.substr(lastDelimPos + 1);
+
+ if (lastDelimPos == std::string::npos || key.empty() || value.empty()) {
+ throw std::invalid_argument("Invalid key/value pair " + pair + ". Expected <layer_name>:<value>");
+ }
+
+ parsedMap[std::move(key)] = std::move(value);
+ }
+
+ return parsedMap;
+}
+
+using supported_type_t = std::unordered_map<std::string, ov::element::Type>;
+ov::element::Type getType(std::string value, const supported_type_t& supported_precisions) {
+ std::transform(value.begin(), value.end(), value.begin(), ::toupper);
+
+ const auto precision = supported_precisions.find(value);
+ if (precision == supported_precisions.end()) {
+ throw std::logic_error("\"" + value + "\"" + " is not a valid precision");
+ }
+
+ return precision->second;
+}
+ov::element::Type getType(const std::string& value) {
+ static const supported_type_t supported_types = {
+ {"FP32", ov::element::f32}, {"f32", ov::element::f32}, {"FP16", ov::element::f16},
+ {"f16", ov::element::f16}, {"BF16", ov::element::bf16}, {"bf16", ov::element::bf16},
+ {"U64", ov::element::u64}, {"u64", ov::element::u64}, {"I64", ov::element::i64},
+ {"i64", ov::element::i64}, {"U32", ov::element::u32}, {"u32", ov::element::u32},
+ {"I32", ov::element::i32}, {"i32", ov::element::i32}, {"U16", ov::element::u16},
+ {"u16", ov::element::u16}, {"I16", ov::element::i16}, {"i16", ov::element::i16},
+ {"U8", ov::element::u8}, {"u8", ov::element::u8}, {"I8", ov::element::i8},
+ {"i8", ov::element::i8}, {"BOOL", ov::element::boolean}, {"boolean", ov::element::boolean},
+ };
+
+ return getType(value, supported_types);
+}
+
/**
 * @brief Logs the model name and, for every input and output, its tensor name,
 * element type and shape through slog::info.
 * @note NOTE(review): get_shape() requires a static shape; a model with dynamic
 * shapes would throw here — confirm callers only pass static models.
 */
void printInputAndOutputsInfo(const ov::Model& network) {
    slog::info << "model name: " << network.get_friendly_name() << slog::endl;

    const std::vector<ov::Output<const ov::Node>> inputs = network.inputs();
    for (const ov::Output<const ov::Node> &input : inputs) {
        slog::info << "    inputs" << slog::endl;

        // A tensor may have no names; print a placeholder instead of throwing.
        const std::string name = input.get_names().empty() ? "NONE" : input.get_any_name();
        slog::info << "        input name: " << name << slog::endl;

        const ov::element::Type type = input.get_element_type();
        slog::info << "        input type: " << type << slog::endl;

        const ov::Shape shape = input.get_shape();
        slog::info << "        input shape: " << shape << slog::endl;
    }

    const std::vector<ov::Output<const ov::Node>> outputs = network.outputs();
    for (const ov::Output<const ov::Node> &output : outputs) {
        slog::info << "    outputs" << slog::endl;

        const std::string name = output.get_names().empty() ? "NONE" : output.get_any_name();
        slog::info << "        output name: " << name << slog::endl;

        const ov::element::Type type = output.get_element_type();
        slog::info << "        output type: " << type << slog::endl;

        const ov::Shape shape = output.get_shape();
        slog::info << "        output shape: " << shape << slog::endl;
    }
}
+
+void configurePrePostProcessing(std::shared_ptr<ov::Model>& model,
+ const std::string& ip,
+ const std::string& op,
+ const std::string& iop,
+ const std::string& il,
+ const std::string& ol,
+ const std::string& iol,
+ const std::string& iml,
+ const std::string& oml,
+ const std::string& ioml) {
+ auto preprocessor = ov::preprocess::PrePostProcessor(model);
+ const auto inputs = model->inputs();
+ const auto outputs = model->outputs();
+ if (!ip.empty()) {
+ auto type = getType(ip);
+ for (size_t i = 0; i < inputs.size(); i++) {
+ preprocessor.input(i).tensor().set_element_type(type);
+ }
+ }
+
+ if (!op.empty()) {
+ auto type = getType(op);
+ for (size_t i = 0; i < outputs.size(); i++) {
+ preprocessor.output(i).tensor().set_element_type(type);
+ }
+ }
+
+ if (!iop.empty()) {
+ const auto user_precisions_map = parseArgMap(iop);
+ for (auto&& item : user_precisions_map) {
+ const auto& tensor_name = item.first;
+ const auto type = getType(item.second);
+
+ bool tensorFound = false;
+ for (size_t i = 0; i < inputs.size(); i++) {
+ if (inputs[i].get_names().count(tensor_name)) {
+ preprocessor.input(i).tensor().set_element_type(type);
+ tensorFound = true;
+ break;
+ }
+ }
+ if (!tensorFound) {
+ for (size_t i = 0; i < outputs.size(); i++) {
+ if (outputs[i].get_names().count(tensor_name)) {
+ preprocessor.output(i).tensor().set_element_type(type);
+ tensorFound = true;
+ break;
+ }
+ }
+ }
+ OPENVINO_ASSERT(!tensorFound, "Model doesn't have input/output with tensor name: ", tensor_name);
+ }
+ }
+ if (!il.empty()) {
+ for (size_t i = 0; i < inputs.size(); i++) {
+ preprocessor.input(i).tensor().set_layout(ov::Layout(il));
+ }
+ }
+
+ if (!ol.empty()) {
+ for (size_t i = 0; i < outputs.size(); i++) {
+ preprocessor.output(i).tensor().set_layout(ov::Layout(ol));
+ }
+ }
+
+ if (!iol.empty()) {
+ const auto user_precisions_map = parseArgMap(iol);
+ for (auto&& item : user_precisions_map) {
+ const auto& tensor_name = item.first;
+
+ bool tensorFound = false;
+ for (size_t i = 0; i < inputs.size(); i++) {
+ if (inputs[i].get_names().count(tensor_name)) {
+ preprocessor.input(i).tensor().set_layout(ov::Layout(item.second));
+ tensorFound = true;
+ break;
+ }
+ }
+ if (!tensorFound) {
+ for (size_t i = 0; i < outputs.size(); i++) {
+ if (outputs[i].get_names().count(tensor_name)) {
+ preprocessor.output(i).tensor().set_layout(ov::Layout(item.second));
+ tensorFound = true;
+ break;
+ }
+ }
+ }
+ OPENVINO_ASSERT(!tensorFound, "Model doesn't have input/output with tensor name: ", tensor_name);
+ }
+ }
+
+ if (!iml.empty()) {
+ for (size_t i = 0; i < inputs.size(); i++) {
+ preprocessor.input(i).model().set_layout(ov::Layout(iml));
+ }
+ }
+
+ if (!oml.empty()) {
+ for (size_t i = 0; i < outputs.size(); i++) {
+ preprocessor.output(i).model().set_layout(ov::Layout(oml));
+ }
+ }
+
+ if (!ioml.empty()) {
+ const auto user_precisions_map = parseArgMap(ioml);
+ for (auto&& item : user_precisions_map) {
+ const auto& tensor_name = item.first;
+
+ bool tensorFound = false;
+ for (size_t i = 0; i < inputs.size(); i++) {
+ if (inputs[i].get_names().count(tensor_name)) {
+ preprocessor.input(i).model().set_layout(ov::Layout(item.second));
+ tensorFound = true;
+ break;
+ }
+ }
+ if (!tensorFound) {
+ for (size_t i = 0; i < outputs.size(); i++) {
+ if (outputs[i].get_names().count(tensor_name)) {
+ preprocessor.output(i).model().set_layout(ov::Layout(item.second));
+ tensorFound = true;
+ break;
+ }
+ }
+ }
+ OPENVINO_ASSERT(!tensorFound, "Model doesn't have input/output with tensor name: ", tensor_name);
+ }
+ }
+
+ model = preprocessor.build();
+}
+
// Resolves a precision name to an ov element type, case-insensitively.
// NOTE(review): duplicates the logic of getType(value, map) above — consider
// consolidating. Throws std::logic_error for unknown names.
ov::element::Type getPrecision(std::string value,
                               const std::unordered_map<std::string, ov::element::Type>& supported_precisions) {
    std::transform(value.begin(), value.end(), value.begin(), ::toupper);

    const auto precision = supported_precisions.find(value);
    if (precision == supported_precisions.end()) {
        throw std::logic_error("\"" + value + "\"" + " is not a valid precision");
    }

    return precision->second;
}
+
// Resolves a precision name against the fixed table below (upper-case keys
// only; getPrecision upper-cases the input before the lookup).
// Throws std::logic_error for unknown names.
ov::element::Type getPrecision2(const std::string& value) {
    static const std::unordered_map<std::string, ov::element::Type> supported_precisions = {
        {"FP32", ov::element::f32},
        {"FP16", ov::element::f16},
        {"BF16", ov::element::bf16},
        {"U64", ov::element::u64},
        {"I64", ov::element::i64},
        {"U32", ov::element::u32},
        {"I32", ov::element::i32},
        {"U16", ov::element::u16},
        {"I16", ov::element::i16},
        {"U8", ov::element::u8},
        {"I8", ov::element::i8},
        {"BOOL", ov::element::boolean},
    };

    return getPrecision(value, supported_precisions);
}
diff --git a/python/openvino/runtime/common/utils/src/common.cpp b/python/openvino/runtime/common/utils/src/common.cpp
new file mode 100644
index 0000000..fb238c7
--- /dev/null
+++ b/python/openvino/runtime/common/utils/src/common.cpp
@@ -0,0 +1,24 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "samples/common.hpp"
+
// Reads a whitespace-separated key/value configuration file into a map.
// Returns an empty map when the file cannot be opened.
// NOTE(review): tokens are consumed in pairs regardless of line breaks, so a
// comment line with other than exactly two tokens desyncs the key/value
// pairing for the rest of the file — verify the expected config format.
std::map<std::string, std::string> parseConfig(const std::string& configName, char comment) {
    std::map<std::string, std::string> config = {};

    std::ifstream file(configName);
    if (!file.is_open()) {
        return config;
    }

    std::string key, value;
    while (file >> key >> value) {
        // Skip pairs whose key starts with the comment character.
        if (key.empty() || key[0] == comment) {
            continue;
        }
        config[key] = value;
    }

    return config;
}
diff --git a/python/openvino/runtime/common/utils/src/latency_metrics.cpp b/python/openvino/runtime/common/utils/src/latency_metrics.cpp
new file mode 100644
index 0000000..c6c3d15
--- /dev/null
+++ b/python/openvino/runtime/common/utils/src/latency_metrics.cpp
@@ -0,0 +1,42 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+// clang-format off
+#include <algorithm>
+#include <map>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "samples/latency_metrics.hpp"
+// clang-format on
+
// Writes "shape;median;avg;min;max" with two fractional digits, restoring the
// stream's format flags afterwards.
void LatencyMetrics::write_to_stream(std::ostream& stream) const {
    std::ios::fmtflags fmt(stream.flags());
    stream << data_shape << ";" << std::fixed << std::setprecision(2) << median_or_percentile << ";" << avg << ";"
           << min << ";" << max;
    stream.flags(fmt);
}
+
// Logs the latency summary through slog::info; the first line is labelled
// "Median" for the default 50th percentile and "<N> percentile" otherwise.
void LatencyMetrics::write_to_slog() const {
    std::string percentileStr = (percentile_boundary == 50)
                                    ? "   Median:     "
                                    : "   " + std::to_string(percentile_boundary) + " percentile:     ";

    slog::info << percentileStr << double_to_string(median_or_percentile) << " ms" << slog::endl;
    slog::info << "   Average:    " << double_to_string(avg) << " ms" << slog::endl;
    slog::info << "   Min:        " << double_to_string(min) << " ms" << slog::endl;
    slog::info << "   Max:        " << double_to_string(max) << " ms" << slog::endl;
}
+
+void LatencyMetrics::fill_data(std::vector<double> latencies, size_t percentile_boundary) {
+ if (latencies.empty()) {
+ throw std::logic_error("Latency metrics class expects non-empty vector of latencies at consturction.");
+ }
+ std::sort(latencies.begin(), latencies.end());
+ min = latencies[0];
+ avg = std::accumulate(latencies.begin(), latencies.end(), 0.0) / latencies.size();
+ median_or_percentile = latencies[size_t(latencies.size() / 100.0 * percentile_boundary)];
+ max = latencies.back();
+};
diff --git a/python/openvino/runtime/common/utils/src/slog.cpp b/python/openvino/runtime/common/utils/src/slog.cpp
new file mode 100644
index 0000000..df484ec
--- /dev/null
+++ b/python/openvino/runtime/common/utils/src/slog.cpp
@@ -0,0 +1,43 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+// clang-format off
+#include <iostream>
+
+#include "samples/slog.hpp"
+// clang-format on
+
+namespace slog {
+
+LogStream info("INFO", std::cout);
+LogStream warn("WARNING", std::cout);
+LogStream err("ERROR", std::cerr);
+
+LogStream::LogStream(const std::string& prefix, std::ostream& log_stream) : _prefix(prefix), _new_line(true) {
+ _log_stream = &log_stream;
+}
+
+// Specializing for LogStreamEndLine to support slog::endl
+LogStream& LogStream::operator<<(const LogStreamEndLine& /*arg*/) {
+ if (_new_line)
+ (*_log_stream) << "[ " << _prefix << " ] ";
+ _new_line = true;
+
+ (*_log_stream) << std::endl;
+ return *this;
+}
+
+// Specializing for LogStreamBoolAlpha to support slog::boolalpha
+LogStream& LogStream::operator<<(const LogStreamBoolAlpha& /*arg*/) {
+ (*_log_stream) << std::boolalpha;
+ return *this;
+}
+
+// Specializing for LogStreamFlush to support slog::flush
+LogStream& LogStream::operator<<(const LogStreamFlush& /*arg*/) {
+ (*_log_stream) << std::flush;
+ return *this;
+}
+
+} // namespace slog
diff --git a/python/openvino/runtime/coredla_device/inc/batch_job.h b/python/openvino/runtime/coredla_device/inc/batch_job.h
new file mode 100644
index 0000000..76fd968
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/inc/batch_job.h
@@ -0,0 +1,31 @@
+// Copyright 2020-2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
#ifndef BATCH_JOB_H
#define BATCH_JOB_H

// BatchJob is the abstract interface for one batch of inference work:
// implementations copy input features into device DDR, start the DLA, and
// copy the resulting output features back out.
class BatchJob {
 public:
  // @param inputArray - ptr to CPU array containing input data to be copied to DDR
  // blocking function
  virtual void LoadInputFeatureToDDR(void* inputArray) = 0;
  // @param outputArray - ptr to CPU array where the output data in DDR is copied into
  // outputArray must be allocated by the caller (size >= output_size_ddr)
  // blocking function
  virtual void ReadOutputFeatureFromDDR(void* outputArray) const = 0;
  // Schedule this batch's input feature for processing (exact semantics are
  // defined by the concrete implementation).
  virtual void ScheduleInputFeature() const = 0;
  // Kick off DLA execution for this batch.
  virtual void StartDla() = 0;
  // Virtual destructor (= default) so implementations are destroyed correctly
  // when deleted through a BatchJob pointer.
  virtual ~BatchJob() = default;
};

#endif
diff --git a/python/openvino/runtime/coredla_device/inc/coredla_batch_job.h b/python/openvino/runtime/coredla_device/inc/coredla_batch_job.h
new file mode 100644
index 0000000..7d91f0e
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/inc/coredla_batch_job.h
@@ -0,0 +1,88 @@
+// Copyright 2020-2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#pragma once
+
+#include "batch_job.h" // BatchJob
+#include "mmd_wrapper.h" // MmdWrapper
+
+// TODO:integrate with dla compiler later
+// #include "dla_types.h"
+// #include "compiled_result_runtime_required_elements.h"
+
+#include <cstdint> // uint64_t
+#include <memory> // std::unique_ptr
+
+class StreamControllerComms;
+
// CoreDlaBatchJob represents one batch execution on real hardware.
// Contains input/output address and size in DDR for one batch.
// Contains functions to write feature data to DDR, start DLA and read output data from DDR.
// Member functions are defined in the corresponding .cpp file.
class CoreDlaBatchJob : public BatchJob {
 private:
  // MMD access is required to handshake with CSR and transfer data between host/device memory
  MmdWrapper* mmdWrapper_;
  // which DLA IP instance on the board this job targets
  int instance_;
  // size and address of graph config data allocated in DDR
  uint64_t totalConfigWords_;
  uint64_t configBaseAddrDDR_;
  // size and address of input and output data allocated in DDR for 1 batch
  uint64_t inputAddrDDR_;
  uint64_t outputAddrDDR_;
  uint64_t inputSizeDDR_;
  uint64_t outputSizeDDR_;
  // whether input/output streaming paths are enabled (fixed at construction)
  const bool enableIstream_;
  const bool enableOstream_;
  // bookkeeping for the most recent job queue slot used — see .cpp for exact use
  uint64_t lastJobQueueNumber_;

  // optional stream-controller interface; may be null when streaming is not used
  std::shared_ptr<StreamControllerComms> spStreamControllerComms_;

  // Private: instances are created through MakeUnique() only.
  CoreDlaBatchJob(MmdWrapper* mmdWrapper,
                  uint64_t totalConfigWords,
                  uint64_t configBaseAddrDDR,
                  uint64_t inputAddrDDR,
                  uint64_t outputAddrDDR,
                  uint64_t inputSizeDDR,
                  uint64_t outputSizeDDR,
                  const bool enableIstream,
                  const bool enableOstream,
                  int instance,
                  std::shared_ptr<StreamControllerComms> spStreamControllerComms);

 public:
  // Non-copyable. NOTE(review): the non-const overload below is redundant —
  // the const overload already suppresses all copies.
  CoreDlaBatchJob(const CoreDlaBatchJob&) = delete;
  CoreDlaBatchJob(CoreDlaBatchJob&) = delete;
  CoreDlaBatchJob& operator=(const CoreDlaBatchJob&) = delete;
  // Factory: builds a batch job bound to the given DDR regions and instance.
  static std::unique_ptr<BatchJob> MakeUnique(MmdWrapper* mmdWrapper,
                                              uint64_t totalConfigWords,
                                              uint64_t configBaseAddrDDR,
                                              uint64_t inputAddrDDR,
                                              uint64_t outputAddrDDR,
                                              uint64_t inputSizeDDR,
                                              uint64_t outputSizeDDR,
                                              const bool enableIstream,
                                              const bool enableOstream,
                                              int instance,
                                              std::shared_ptr<StreamControllerComms> spStreamControllerComms);
  // @param inputArray - ptr to CPU array containing input data to be copied to DDR
  // blocking function
  void LoadInputFeatureToDDR(void* inputArray) override;
  void ScheduleInputFeature() const override;

  // Starts DLA by writing the DDR addresses of graph config and input data to the CSR in DLA DMA
  void StartDla() override;
  // @param outputArray - ptr to CPU array where the output data in DDR is copied into
  // outputArray must be allocated by the caller (size >= output_size_ddr)
  // blocking function
  void ReadOutputFeatureFromDDR(void* outputArray) const override;
};
diff --git a/python/openvino/runtime/coredla_device/inc/coredla_device.h b/python/openvino/runtime/coredla_device/inc/coredla_device.h
new file mode 100644
index 0000000..2a04fa8
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/inc/coredla_device.h
@@ -0,0 +1,144 @@
+// Copyright 2020-2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#pragma once
+
+#include "compiled_result.h" //dla::CompiledResult
+#include "device.h" //Device
+#include "device_memory_allocator.h" //DeviceMemoryAllocator
+#include "graph_job.h" //GraphJob
+#include "mmd_wrapper.h" //MmdWrapper
+
+#include <condition_variable> //std::condition_variable
+#include <cstdint> //uint64_t
+#include <map> //std::map
+#include <memory> //std::unique_ptr
+#include <mutex> //std::mutex
+#include <vector> //std::vector
+
+class StreamControllerComms;
+
// The interface of the interrupt service routine dictates that all the data the ISR needs must be passed in through
// one pointer of type void *. Package it up here. WaitForDla() uses jobsWaited and jobsFinished to determine if a job
// has already finished or it still needs to wait. The ISR only updates jobsFinished, so jobsWaited is only a member of
// CoreDlaDevice. The mutex and condition variable are used to synchronize between InterruptServiceRoutine() and
// WaitForDla(). All of these are replicated per CoreDLA IP instance, hence the use of vector.
// base_multiplier and prevCount are used to handle the jobsFinished wrap-around that could happen in the hardware CSR
// as the CSR is only 32-bit wide but the jobsFinished is 64-bit wide
struct InterruptServiceRoutineData {
  MmdWrapper* mmdWrapper;                           // non-owning handle for CSR access from the ISR
  std::vector<uint64_t> jobsFinished;               // per-instance completed-job count (written by the ISR)
  std::vector<uint32_t> base_multiplier;            // wrap-around bookkeeping for the 32-bit hardware counter
  std::vector<uint32_t> prevCount;                  // previous raw 32-bit counter value, to detect wrap
  std::vector<uint32_t> desc_queue_diag;            // descriptor-queue diagnostic word — semantics defined in the ISR (see .cpp)
  std::vector<std::mutex> isrMutex;                 // per-instance lock pairing with isrCondVar
  std::vector<std::condition_variable> isrCondVar;  // signalled by the ISR, waited on by WaitForDla()
};
+
/*! CoreDlaDevice class represents a DLA device mapped using the MMD + OPAE SW stack
 * On construction, dynamically loads MMD library at runtime and initializes the state of MMD
 * Implements functions that wrap various MMD calls to read/write to DDR/CSR and process HW interrupts
 */
class CoreDlaDevice : public Device {
 public:
  // Loads a compiled graph onto the device; returns a non-owning handle
  // (the device keeps ownership in allGraphJobs_).
  GraphJob* CreateGraphJob(const dla::CompiledResult* compiledResult,
#ifndef USE_OLD_COREDLA_DEVICE
                           size_t numPipelines,
#else
                           uint64_t numPipelines,
#endif
                           int instance,
                           std::string AES_key,
                           std::string IV_key,
                           bool encryption_enabled,
                           // This param is unused for HW runtime! So why include it? CoreDLA utilizes base pointers
                           // for both HW and SW emulator runtime. The software emulator has output file where as currently the
                           // HW runtime does not.
                           const std::string export_dir,
                           const std::string parameter_rom_export_dir);
  // Return number of DLA jobs completed till now
  // Used for debugging
  // NOTE(review): narrows the uint64_t job count to int — fine until 2^31 jobs.
  int GetNumInferencesCompleted(int instance) const override { return isrData_.jobsFinished.at(instance); }
  // Must be called when there are no active jobs on DLA
  // Returns the total time taken by DLA jobs on hardware (in milliseconds)
  double GetActiveHWTimeMs(int instance) const override;
  // Must be called when there are no active jobs on DLA
  // Returns the average of time taken per job (in milliseconds)
  // Avg Time per job < Active Time
  double GetAvgHWTimePerJobMs(uint64_t num_jobs, int instance) const override;
  // Must be called when there are no active jobs on DLA
  // Returns the number of memory reads made by the input feature reader
  uint64_t GetNumInputFeatureMemoryReads(int instance) const override;
  // Must be called when there are no active jobs on DLA
  // Returns the number of memory reads made by the filter reader
  uint64_t GetNumFilterMemoryReads(int instance) const override;
  // Must be called when there are no active jobs on DLA
  // Returns the number of memory writes made by the output feature writer
  uint64_t GetNumOutputFeatureMemoryWrites(int instance) const override;

 private:
  // Read one 32-bit value from the debug network, return value indicates whether read was successful. A read can fail
  // if the module number and address have not been implemented. The debug network is fault tolerant to both read
  // requests never being accepted as well as read responses never being produced.
  bool ReadDebugCsr(uint32_t moduleNum, uint32_t address, int instance, uint32_t& readData, bool verbose = false) const;

#ifndef USE_OLD_COREDLA_DEVICE
  // Must be called when there are no active jobs on DLA
  // Returns total number of clocks by DLA jobs on hardware.
  uint64_t GetClocksActive(int instance) const;

  // Must be called when there are no active jobs on DLA
  // Returns the clocks of all jobs
  uint64_t GetClocksAllJobs(int instance) const;
#endif

  // Lifetime totals read from hardware; the public GetNum* accessors subtract
  // the corresponding startNum* snapshots (see .cpp) — confirm in .cpp.
  uint64_t GetNumInputFeatureMemoryReadsTotal(int instance) const;

  uint64_t GetNumFilterMemoryReadsTotal(int instance) const;

  uint64_t GetNumOutputFeatureMemoryWritesTotal(int instance) const;

 public:
  // Modules attached to the debug network have a ROM to specify the offset and description of the registers. Traverse
  // this ROM, then return a map of key/value pairs, where the key is a human readable string describing what kind of
  // information the debug register contains, and the value is the data of the debug register.
  DebugNetworkData ReadDebugNetwork(int instance) const override;

  // NOTE(review): single-argument constructor — consider marking explicit.
  CoreDlaDevice(uint32_t waitForDlaTimeoutSeconds);
  ~CoreDlaDevice();
  int GetSizeCsrDescriptorQueue() const override;
  double GetCoreDlaClockFreq() const override;
  int GetNumInstances() const override { return numInstances_; }
  void WaitForDla(int instance, size_t threadId = 0, std::function<bool()> isCancelled = nullptr) override; // threadId is optional and for debugging purpose only
  std::string SchedulerGetStatus() const override;
  bool InitializeScheduler(uint32_t sourceBufferSize, uint32_t dropSourceBuffers, uint32_t numInferenceRequests,
                           const std::string source_fifo_file="") override;

 private:
  std::unique_ptr<DeviceMemoryAllocator[]> ddrAllocator_;   // one allocator per instance
  std::vector<std::unique_ptr<GraphJob>> allGraphJobs_;     // owns every graph loaded via CreateGraphJob
  int numInstances_;                                        // number of DLA IP instances on the board
  MmdWrapper mmdWrapper_;                                   // MMD session; shared with the ISR via isrData_
  InterruptServiceRoutineData isrData_;                     // state shared with the interrupt service routine
  std::vector<uint64_t> jobsWaited_;                        // per-instance count of jobs already waited on
#ifndef USE_OLD_COREDLA_DEVICE
  std::vector<uint64_t> startClocksActive;
  std::vector<uint64_t> startClockAllJobs;
#endif
  // Snapshots taken so the public counters can report deltas — confirm in .cpp.
  std::vector<uint64_t> startNumInputFeatureMemoryReads;
  std::vector<uint64_t> startNumFilterMemoryReads;
  std::vector<uint64_t> startNumOutputFeatureMemoryWrites;
  std::shared_ptr<StreamControllerComms> spStreamControllerComms_;
  bool runtimePolling_;                                     // poll for completion instead of interrupts — confirm in .cpp
  uint32_t waitForDlaTimeoutSeconds_;                       // timeout used by WaitForDla
};
diff --git a/python/openvino/runtime/coredla_device/inc/coredla_graph_job.h b/python/openvino/runtime/coredla_device/inc/coredla_graph_job.h
new file mode 100644
index 0000000..3dc91bc
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/inc/coredla_graph_job.h
@@ -0,0 +1,83 @@
+// Copyright 2020-2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#pragma once
+
+#include "compiled_result.h" //dla::CompiledResult
+#include "coredla_batch_job.h" //BatchJob
+#include "device.h" //DLA_LOG
+#include "device_memory_allocator.h" //DeviceMemoryAllocator
+#include "graph_job.h" //GraphJob
+#include "mmd_wrapper.h" //MmdWrapper
+
+// TODO:integrate with dla compiler later
+//#include "dla_types.h"
+//#include "compiled_result_runtime_required_elements.h"
+
+#include <cstdint> //uint64_t
+#include <memory> //std::unique_ptr
+#include <mutex> //std::mutex
+#include <vector> //std::vector
+
+/*! GraphJob is a DLA compiled graph loaded onto a device
+ * Initialized with DlaDevice object
+ * GraphJob allocates space in DDR for filter, bias, config, inputs and outputs
+ * It provides handle to "batch job" objects that are used to load input and start DLA for one batch
+ */
+
+class CoreDlaGraphJob : public GraphJob {
+ public:
+ // Function to construct and return a unique pointer GraphJob object to the runtime user
+ // TODO: Provide DLA compiled result object which will contain all the necessary rutime elements as below
+ // @param configFilterBiasBufferSizeDDR - total size of the constants - config, filter and bias
+ // @param configFilterBiasBuffer - ptr to one contigous CPU array for config, filter and bias (obtained from DLA
+ // compiler's output)
+ // @param totalConfigWords - size of config data in words (size of 1 config word is defined in dla_device.h
+ // "CONFIG_READER_DATA_BYTES")
+ // @param intermediateBufferSizeDDR - size of the buffer space required in DDR for feature data of intermediate layers
+ // @param inputSizeDDR - size of one batch input data in DDR. Multiple images in one batch should be contigously
+ // placed
+ // @param outputSizeDDR - size of one batch output data in DDR
+ // @param numPipelines - number of I/O bufffer pairs created for CPU-FPGA pipelining of multiple batch runs
+ // @param spStreamControllerComms - optional interface to stream controller
+ static std::unique_ptr<GraphJob> MakeUnique(DeviceMemoryAllocator* ddrBufferAllocator,
+ MmdWrapper* mmdWrapper,
+ const dla::CompiledResult* compiled_result,
+ uint64_t numPipelines,
+ int instance,
+ std::shared_ptr<StreamControllerComms> spStreamControllerComms);
+ // Returns an unused batch job object
+ // If all batch jobs are used, returns null
+ // Increments batchJobsRequested_
+ // Thread safe
+ BatchJob* GetBatchJob();
+ CoreDlaGraphJob(const GraphJob&) = delete;
+ CoreDlaGraphJob(CoreDlaGraphJob&) = delete;
+ CoreDlaGraphJob& operator=(const CoreDlaGraphJob&) = delete;
+
+ private:
+ uint64_t configFilterBiasBufferSizeDDR_;
+ uint64_t intermediateBufferSizeDDR_;
+ DeviceMemoryAllocator* ddrBufferAllocator_;
+ MmdWrapper* mmdWrapper_;
+ std::vector<std::unique_ptr<BatchJob>> batchJobs_;
+ unsigned int batchJobsRequested_;
+ unsigned int instance_;
+ std::mutex graphJobMutex;
+ CoreDlaGraphJob(DeviceMemoryAllocator* ddrBufferAllocator,
+ MmdWrapper* mmdWrapper,
+ const dla::CompiledResult* compiledResult,
+ uint64_t numPipelines,
+ int instance,
+ std::shared_ptr<StreamControllerComms> spStreamControllerComms);
+};
diff --git a/python/openvino/runtime/coredla_device/inc/device.h b/python/openvino/runtime/coredla_device/inc/device.h
new file mode 100644
index 0000000..e506578
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/inc/device.h
@@ -0,0 +1,81 @@
+// Copyright 2020-2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
#ifndef DEVICE_H
#define DEVICE_H

#include <functional>
#include <map>
#include <memory>
#include <string>
#include <vector>

#include "dla_runtime_log.h"

// NOTE(review): `using namespace std;` in a header leaks into every includer;
// class Device below relies on it for bare `unique_ptr`. Prefer qualifying names.
// NOTE(review): uint64_t is used below without including <cstdint> — presumably
// pulled in via dla_runtime_log.h; confirm.
using namespace std;
using DebugNetworkData = std::map<std::string, uint64_t>;

// dla log macros — both currently print to stdout (DLA_ERROR arguably belongs on stderr)
#define DLA_LOG(fmt, ...) printf(fmt, ##__VA_ARGS__);
#define DLA_ERROR(fmt, ...) printf(fmt, ##__VA_ARGS__);

// Forward declarations to keep this header light.
class GraphJob;
class arch_params;
namespace dla {
class CompiledResult;
}
+class Device {
+ public:
+ static unique_ptr<Device> MakeUnique(const arch_params* archParams, uint32_t waitForDlaTimeoutSeconds);
+ virtual GraphJob* CreateGraphJob(const dla::CompiledResult* compiledResult,
+ size_t numPipelines,
+ int instance,
+ std::string AES_key,
+ std::string IV_key,
+ bool encryption_enabled,
+ const std::string export_dir,
+ const std::string parameter_rom_export_dir) = 0;
+ // Return number of DLA jobs completed till now
+ // Used for debugging
+ virtual int GetNumInferencesCompleted(int instance) const = 0;
+ // Must be called when there are no active jobs on DLA
+ // Returns the total time taken by DLA jobs on hardware (in milliseconds)
+ virtual double GetActiveHWTimeMs(int instance) const = 0;
+ // Must be called when there are no active jobs on DLA
+ // Returns the average of time taken per job (in milliseconds)
+ // Avg Time per job < Active Time
+ virtual double GetAvgHWTimePerJobMs(uint64_t num_jobs, int instance) const = 0;
+ // Must be called when there are no active jobs on DLA
+ // Returns the number of memory read made by the input feature reader
+ virtual uint64_t GetNumInputFeatureMemoryReads(int instance) const = 0;
+ // Must be called when there are no active jobs on DLA
+ // Returns the number of memory read made by the filter reader
+ virtual uint64_t GetNumFilterMemoryReads(int instance) const = 0;
+ // Must be called when there are no active jobs on DLA
+ // Returns the number of memory writes made by the output feature writer
+ virtual uint64_t GetNumOutputFeatureMemoryWrites(int instance) const = 0;
+ // Waits for a job to finish on specified instance
+ virtual void WaitForDla(int instance, size_t threadId = 0, std::function<bool()> isCancelled = nullptr) = 0;
+ virtual int GetNumInstances() const = 0;
+ virtual double GetCoreDlaClockFreq() const = 0;
+ virtual int GetSizeCsrDescriptorQueue() const = 0;
+ virtual std::string SchedulerGetStatus() const = 0;
+ virtual bool InitializeScheduler(uint32_t sourceBufferSize,
+ uint32_t dropSourceBuffers,
+ uint32_t numInferenceRequests,
+ const std::string source_fifo_file="") = 0;
+ virtual DebugNetworkData ReadDebugNetwork(int instance) const = 0;
+ virtual ~Device(){}
+};
+
+#endif // DEVICE_H
diff --git a/python/openvino/runtime/coredla_device/inc/device_memory_allocator.h b/python/openvino/runtime/coredla_device/inc/device_memory_allocator.h
new file mode 100644
index 0000000..adc0a71
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/inc/device_memory_allocator.h
@@ -0,0 +1,61 @@
+// Copyright 2020 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#pragma once
+
+#include "mmd_wrapper.h" //MmdWrapper
+
+#include <cstdint> //uint64_t
+
/*! DeviceMemoryAllocator class allocates multiple DLA graph buffers in DDR
 * Each graph is expected to have one contiguous buffer containing all data (config, filter, bias, I/O)
 * A graph buffer is allocated in DDR from right to left
 * A scratchpad space is allocated in DDR to be shared across all graphs for intermediate feature data
 * This intermediate buffer space is allocated from left to right (starting address is 0)
 * and is expanded based on graph's requirement
 */
class DeviceMemoryAllocator {
 public:
  // @param totalSize - total DDR bytes this allocator manages
  // @param mmdWrapper - non-owning; kept for device access by the allocator
  void Initialize(uint64_t totalSize, MmdWrapper *mmdWrapper);
  // NOTE(review): user-declared destructor with implicit copy operations —
  // consider deleting copies (Rule of Five) if the destructor releases anything.
  ~DeviceMemoryAllocator();

  // Buffers that can be shared across multiple graphs may grow in size after they are allocated. The intermediate
  // buffer is an example of this. We have decided to allocate this at the lowest address and let it grow upwards.
  // @param bufferSize - the size of the buffer in bytes
  // @param instance - there can be multiple instances of DLA on FPGA, specify which DLA instance is this buffer for
  void AllocateSharedBuffer(uint64_t bufferSize, int instance);

  // Buffers that are private to one graph will not change in size after allocation. The config/filter buffer is
  // an example of this. We have decided to allocate this at the upper address and allocate downwards from there.
  // Hardware requires the starting address of each buffer to have some alignment, and the allocator will add
  // as much padding as needed to ensure this. Each contiguous section in device memory should have its own call
  // to the allocator.
  // @param bufferSize - the size of the buffer in bytes
  // @param bufferAlignment - specify how much address alignment is needed for this buffer, must be a power of 2
  // @param bufferAddr - the allocator indicates where it placed this buffer
  void AllocatePrivateBuffer(uint64_t bufferSize, uint64_t bufferAlignment, uint64_t &bufferAddr);

  // Clears whole DDR space including the intermediate buffer
  void Clear();

 private:
  // total DDR size (BSP parameter)
  uint64_t totalGlobalMemSize_;
  // For access to MMD
  MmdWrapper *mmdWrapper_;
  // current starting address of allocated graph buffer region
  // graph buffers are allocated right to left
  uint64_t currentStartAddressGraphBufferSpace_;
  // current maximum allocated size for intermediate data
  uint64_t currentIntermediateMaxBufferSizeAllocated_;
};
diff --git a/python/openvino/runtime/coredla_device/inc/dla_dma_constants.h b/python/openvino/runtime/coredla_device/inc/dla_dma_constants.h
new file mode 100644
index 0000000..13fb56b
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/inc/dla_dma_constants.h
@@ -0,0 +1,27 @@
+// Copyright 2020 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
#pragma once

// Reuses the hardware's SystemVerilog constants header directly from C++ by
// temporarily redefining `localparam` so the declarations parse as constexpr.

// save a copy (restored below, so any prior definition of `localparam` is preserved)
#pragma push_macro("localparam")

// convert the syntax of verilog into C++, replace "localparam int MY_VAR = 123;" with "constexpr int MY_VAR = 123;"
#undef localparam
#define localparam constexpr

// include the verilog header
#include "dla_dma_constants.svh"

// undo the syntax change
#pragma pop_macro("localparam")
diff --git a/python/openvino/runtime/coredla_device/inc/graph_job.h b/python/openvino/runtime/coredla_device/inc/graph_job.h
new file mode 100644
index 0000000..b04dde1
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/inc/graph_job.h
@@ -0,0 +1,28 @@
+// Copyright 2020-2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#ifndef GRAPH_JOB_H
+#define GRAPH_JOB_H
+
+#include "batch_job.h"
+using namespace std;
+class GraphJob {
+ public:
+ // Returns an unused batch job object
+ // If all batch jobs are used, returns null
+ virtual BatchJob* GetBatchJob() = 0;
+
+ virtual ~GraphJob(){}
+};
+
+#endif
diff --git a/python/openvino/runtime/coredla_device/inc/mmd_wrapper.h b/python/openvino/runtime/coredla_device/inc/mmd_wrapper.h
new file mode 100644
index 0000000..4014454
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/inc/mmd_wrapper.h
@@ -0,0 +1,63 @@
+// Copyright 2020 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#pragma once
+
+#include <cstdint> //uint32_t
+
+using interrupt_service_routine_signature = void (*)(int handle, void *data);
+
// Thin wrapper over the board's MMD library: manages the MMD session handle
// and exposes CSR/DDR/stream-controller accessors plus board constants.
// Method bodies live in the corresponding .cpp.
class MmdWrapper {
 public:
  MmdWrapper();
  // Note that ~MmdWrapper() can call std::exit(1) if aocl_mmd_close()
  // fails. Ideally we would find some way to re-order the code so that it
  // can throw an exception (before calling the destructor) if aocl_mmd_close()
  // fails.
  ~MmdWrapper();

  // class cannot be copied
  MmdWrapper(const MmdWrapper &) = delete;
  MmdWrapper &operator=(const MmdWrapper &) = delete;

  // Register a function to run as the interrupt service routine
  void RegisterISR(interrupt_service_routine_signature func, void *data) const;

  // 32-bit handshake with each CSR
  void WriteToCsr(int instance, uint32_t addr, uint32_t data) const;
  uint32_t ReadFromCsr(int instance, uint32_t addr) const;

  // Copy data between host and device memory
  void WriteToDDR(int instance, uint64_t addr, uint64_t length, const void *data) const;
  void ReadFromDDR(int instance, uint64_t addr, uint64_t length, void *data) const;

  // If the mmd layer supports accesses to the STREAM CONTROLLER
  bool bIsStreamControllerValid(int instance) const;

  // 32-bit handshake with each Stream Controller CSR
  void WriteToStreamController(int instance, uint32_t addr, uint64_t length, const void *data) const;
  void ReadFromStreamController(int instance, uint32_t addr, uint64_t length, void *data) const;

  // Provide read-only access to board-specific constants
  int GetMaxInstances() const { return maxInstances_; }
  uint64_t GetDDRSizePerInstance() const { return ddrSizePerInstance_; }
  double GetCoreDlaClockFreq() const { return coreDlaClockFreq_; }
  double GetDDRClockFreq() const { return ddrClockFreq_; }

 private:
  int handle_;                   // MMD device handle — presumably set by the ctor; confirm in .cpp
  int maxInstances_;             // number of DLA IP instances on the board
  uint64_t ddrSizePerInstance_;  // bytes of device DDR per instance
  double coreDlaClockFreq_;      // DLA core clock frequency
  double ddrClockFreq_;          // DDR interface clock frequency
};
diff --git a/python/openvino/runtime/coredla_device/inc/stream_controller_comms.h b/python/openvino/runtime/coredla_device/inc/stream_controller_comms.h
new file mode 100644
index 0000000..e2fcdfc
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/inc/stream_controller_comms.h
@@ -0,0 +1,69 @@
+// Copyright 2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#pragma once
+#include <mutex>
+#include <string>
+#include <vector>
+#include "mmd_wrapper.h"
+#include "stream_controller_messages.h"
+
// Adapter that lets a plain message-payload struct T expose the raw
// pointer/size pair needed to ship it over the stream-controller channel.
template <class T>
struct Payload : public T {
  // Raw bytes of the payload (the object itself).
  void* GetPayload() { return this; }
  // Size in bytes of the concrete payload struct.
  size_t GetSize() { return sizeof(*this); }
};
+
// Mutex-protected busy indicator. Lock()/Release() bodies live in the .cpp —
// presumably Lock() returns whether the flag was successfully claimed; confirm there.
class BusyFlag {
 public:
  bool Lock();
  void Release();

 private:
  std::recursive_mutex _mutex;  // guards _busy
  bool _busy = false;           // current busy state
};
+
// Scoped helper around BusyFlag — presumably acquires in the constructor,
// reports acquisition via operator bool, and releases in the destructor when
// _haveLocked is set; confirm in the .cpp.
class BusyCheck {
 public:
  BusyCheck(BusyFlag& busyFlag);
  ~BusyCheck();
  operator bool();

 private:
  BusyFlag& _busyFlag;  // the flag being guarded (non-owning reference)
  bool _haveLocked;     // whether this instance holds the flag
};
+
// Host-side message channel to the on-device stream controller, built on
// MmdWrapper's stream-controller accessors. Method bodies live in the .cpp.
class StreamControllerComms {
 public:
  StreamControllerComms();
  // Whether a stream controller is reachable — see .cpp for the check used.
  bool IsPresent();
  // Fetch the controller's current status payload.
  Payload<StatusMessagePayload> GetStatus();
  // Human-readable rendering of a status payload.
  std::string GetStatusString(Payload<StatusMessagePayload>& statusPayload);
  // Submit a batch of CoreDLA job descriptors; returns whether scheduling succeeded.
  bool ScheduleItems(std::vector<Payload<CoreDlaJobPayload>> items);
  bool Ping();
  bool Initialize(uint32_t sourceBufferSize, uint32_t dropSourceBuffers, uint32_t numInferenceRequests);

 private:
  bool StatusMessageHandler(uint32_t payloadOffset);
  MessageType ReceiveMessage();
  bool SendMessage(MessageType, void* pPayload = nullptr, size_t size = 0);
  MmdWrapper _mmdWrapper;                    // owns its own MMD session for controller traffic
  uint32_t _lastReceiveSequenceID = 0;       // sequence ID of the last received message
  uint32_t _sendSequenceID = 0;              // sequence ID stamped on outgoing messages
  uint32_t _numBadMessages = 0;              // count of malformed/unexpected messages observed
  const int _streamControllerInstance = 0;   // instance used for controller accesses
  Payload<StatusMessagePayload> _receivedStatusMessage;  // most recent status payload
  BusyFlag _busyFlag;                        // serializes use of the channel — confirm in .cpp
};
diff --git a/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/CMakeLists.txt b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/CMakeLists.txt
new file mode 100644
index 0000000..445a304
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/CMakeLists.txt
@@ -0,0 +1,62 @@
# (C) 2017 Intel Corporation. All rights reserved.
# Your use of Intel Corporation's design tools, logic functions and other
# software and tools, and its AMPP partner logic functions, and any output
# files any of the foregoing (including device programming or simulation
# files), and any associated documentation or information are expressly subject
# to the terms and conditions of the Intel Program License Subscription
# Agreement, Intel MegaCore Function License Agreement, or other applicable
# license agreement, including, without limitation, that your use is for the
# sole purpose of programming logic devices manufactured by Intel and sold by
# Intel or its authorized distributors. Please refer to the applicable
# agreement for further details.

cmake_minimum_required(VERSION 2.8.12)
project(mmd)

# AFU interface GUID baked into the MMD; must match the loaded bitstream's
# accelerator UUID (see get_offline_num_acl_boards in host/mmd.cpp).
add_definitions(-DI_DK_AFU_ID="11446C9D-AA42-4085-9B3D-4EEF9429A4AD")

# Local Find modules for OPAE and libnuma live under cmake/modules.
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules")

find_package(OPAE REQUIRED)
find_package(NUMA REQUIRED)

# DLA specific modifications made to the MMD
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDLA_MMD")

enable_language(C ASM)

set(ASM_OPTIONS "-x assembler-with-cpp")
if(${CMAKE_C_COMPILER_ID} STREQUAL "Clang")
  set(ASM_OPTIONS "${ASM_OPTIONS} -no-integrated-as")
endif()

# NOTE(review): ${CFLAGS} is not a standard CMake variable (did you mean
# ${CMAKE_C_FLAGS} or $ENV{CFLAGS}?) -- as written it likely expands empty.
# Confirm the intent before changing.
set(CMAKE_ASM_FLAGS "${CFLAGS} ${ASM_OPTIONS}")

set(MMD_SRC
  ./host/mmd.cpp
  ./host/mmd_device.cpp
  ./host/mmd_dma.cpp
  ./host/mmd_helper.cpp
  ./host/kernel_interrupt.cpp
)

# Add a shared library target called intel_opae_mmd
# and build it from the MMD_SRC files
add_library(intel_opae_mmd SHARED ${MMD_SRC})

# Specify the include directories to be used when compiling intel_opae_mmd library
target_include_directories(intel_opae_mmd PUBLIC
  ${CMAKE_CURRENT_SOURCE_DIR}/include
  )

# Specify libraries needed when linking the intel_opae_mmd library
# (imported targets created by the Find modules above).
target_link_libraries(intel_opae_mmd
  libopae-c
  libnuma
)

# Set the installation rules for the project
install(TARGETS intel_opae_mmd
  LIBRARY DESTINATION lib
  COMPONENT intel_opae_mmd
)
diff --git a/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/cmake/modules/FindNUMA.cmake b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/cmake/modules/FindNUMA.cmake
new file mode 100755
index 0000000..c981150
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/cmake/modules/FindNUMA.cmake
@@ -0,0 +1,34 @@
# - Try to find libnuma
# Once done will define:
#
#  NUMA_FOUND - system has libnuma
#  NUMA_INCLUDE_DIRS - include directory with numa.h
#  NUMA_LIBRARIES - link with this for libnuma

# Fix: FIND_PACKAGE_HANDLE_STANDARD_ARGS is provided by the
# FindPackageHandleStandardArgs module; a standalone Find module must
# include it explicitly, otherwise the call below fails with
# "Unknown CMake command" on a clean configure.
include(FindPackageHandleStandardArgs)

find_path(NUMA_INCLUDE_DIRS
  NAMES numa.h
  PATHS
  ${LIBNUMA_ROOT}/include
  /usr/include
  /p/psg/swip/dla/resources/numactl/2.0.16/include

  )

find_library(NUMA_LIBRARIES
  NAMES numa
  PATHS
  ${LIBNUMA_ROOT}/lib
  ${LIBNUMA_ROOT}/lib64
  /usr/lib
  /usr/lib64
  /p/psg/swip/dla/resources/numactl/2.0.16/lib

  )

FIND_PACKAGE_HANDLE_STANDARD_ARGS(NUMA
  REQUIRED_VARS NUMA_INCLUDE_DIRS NUMA_LIBRARIES)

# Imported target so consumers simply link `libnuma`.
add_library(libnuma IMPORTED SHARED)
set_target_properties(libnuma PROPERTIES
  IMPORTED_LOCATION ${NUMA_LIBRARIES}
  INTERFACE_INCLUDE_DIRECTORIES ${NUMA_INCLUDE_DIRS})
diff --git a/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/cmake/modules/FindOPAE.cmake b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/cmake/modules/FindOPAE.cmake
new file mode 100755
index 0000000..6395d7c
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/cmake/modules/FindOPAE.cmake
@@ -0,0 +1,44 @@
# - Try to find libintelfpga
# Once done, this will define
#
#  libopae-c_FOUND - system has libopae-c
#  libopae-c_INCLUDE_DIRS - the libopae-c include directories
#  libopae-c_LIBRARIES - link these to use libopae-c

# Fix: FIND_PACKAGE_HANDLE_STANDARD_ARGS is provided by the
# FindPackageHandleStandardArgs module and must be included explicitly
# in a standalone Find module.
include(FindPackageHandleStandardArgs)

find_package(PkgConfig)
pkg_check_modules(PC_OPAE QUIET opae-c)

# Use pkg-config to get hints about paths.
# NOTE(review): `cut -d I -f 2` splits on the letter 'I' to strip a leading
# "-I" -- this breaks if the include path itself contains an 'I' or if
# multiple -I flags are emitted; consider PC_OPAE_INCLUDE_DIRS from
# pkg_check_modules above instead.
execute_process(COMMAND pkg-config --cflags opae-c --silence-errors
  COMMAND cut -d I -f 2
  OUTPUT_VARIABLE OPAE-C_PKG_CONFIG_INCLUDE_DIRS)
set(OPAE-C_PKG_CONFIG_INCLUDE_DIRS "${OPAE-C_PKG_CONFIG_INCLUDE_DIRS}" CACHE STRING "Compiler flags for OPAE-C library")

# Include dir
find_path(libopae-c_INCLUDE_DIRS
  NAMES opae/fpga.h
  PATHS ${LIBOPAE-C_ROOT}/include
  ${OPAE-C_PKG_CONFIG_INCLUDE_DIRS}
  /usr/local/include
  /usr/include
  ${CMAKE_EXTRA_INCLUDES})

# The library itself
find_library(libopae-c_LIBRARIES
  NAMES opae-c
  PATHS ${LIBOPAE-C_ROOT}/lib
  ${LIBOPAE-C_ROOT}/lib64
  /usr/local/lib
  /usr/lib
  /lib
  /usr/lib/x86_64-linux-gnu
  ${CMAKE_EXTRA_LIBS})

FIND_PACKAGE_HANDLE_STANDARD_ARGS(OPAE
  REQUIRED_VARS libopae-c_LIBRARIES libopae-c_INCLUDE_DIRS)

# Imported target so consumers simply link `libopae-c`.
add_library(libopae-c IMPORTED SHARED)
set_target_properties(libopae-c PROPERTIES
  IMPORTED_LOCATION ${libopae-c_LIBRARIES}
  INTERFACE_INCLUDE_DIRECTORIES ${libopae-c_INCLUDE_DIRS})
+
diff --git a/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/kernel_interrupt.cpp b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/kernel_interrupt.cpp
new file mode 100644
index 0000000..97882d4
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/kernel_interrupt.cpp
@@ -0,0 +1,257 @@
+// (c) 1992-2024 Intel Corporation.
+// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words
+// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S.
+// and/or other countries. Other marks and brands may be claimed as the property
+// of others. See Trademarks on intel.com for full list of Intel trademarks or
+// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera)
+// Your use of Intel Corporation's design tools, logic functions and other
+// software and tools, and its AMPP partner logic functions, and any output
+// files any of the foregoing (including device programming or simulation
+// files), and any associated documentation or information are expressly subject
+// to the terms and conditions of the Altera Program License Subscription
+// Agreement, Intel MegaCore Function License Agreement, or other applicable
+// license agreement, including, without limitation, that your use is for the
+// sole purpose of programming logic devices manufactured by Intel and sold by
+// Intel or its authorized distributors. Please refer to the applicable
+// agreement for further details.
+
+#include "kernel_interrupt.h"
+
#include <poll.h>
#include <sys/eventfd.h>
#include <unistd.h>

#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <string>
#include <thread>
+
+#include "mmd_device.h"
+
+using namespace intel_opae_mmd;
+
+static const int mmd_kernel_interrupt_line_num = 1;
+static const uint32_t enable_int_mask = 0x00000001;
+static const uint32_t disable_int_mask = 0x00000000;
+
+bool KernelInterrupt::enable_thread = false;
+
+static const int debug_log_level = 0;
+
+// TODO: use consistent function throughout MMD for controlling debug
+// messages. This debug_print function is from OFS.
+static void debug_print(std::string &err_msg, int msglog) {
+ if (debug_log_level >= msglog) {
+ std::cerr << "KernelInterrupt: " << err_msg << std::endl;
+ }
+}
+
+static inline void check_result(fpga_result res, const char *err_str) {
+ if (res == FPGA_OK) {
+ return;
+ }
+ std::string opae_err_str =
+ std::string("KernelInterrupt: ") + std::string(err_str) + std::string(": ") + std::string(fpgaErrStr(res));
+}
+
/** KernelInterrupt constructor
 * Initializes all members to inert values, configures the process-wide
 * interrupt mode (set_member_for_interrupts), then registers the OPAE
 * interrupt event and starts the poller thread (enable_interrupts).
 */
KernelInterrupt::KernelInterrupt(fpga_handle fpga_handle_arg, int mmd_handle)
    : m_work_thread_active(false),
      m_eventfd(0),
      m_kernel_interrupt_fn(nullptr),
      m_kernel_interrupt_user_data(nullptr),
      m_fpga_handle(fpga_handle_arg),
      m_mmd_handle(mmd_handle),
      m_event_handle(nullptr) {
  if (std::getenv("MMD_ENABLE_DEBUG")) {
    MMD_DEBUG("DEBUG LOG : KernelInterrupt Constructor\n");
  }
  // Order matters: the global enable_thread flag must be set before
  // enable_interrupts() consults it.
  set_member_for_interrupts();
  enable_interrupts();
}
+
/** KernelInterrupt destructor
 * calls disable_interrupts()
 * The catch-all keeps the destructor from propagating exceptions (which
 * would terminate the process); failures are only logged via debug_print.
 */
KernelInterrupt::~KernelInterrupt() {
  if (std::getenv("MMD_ENABLE_DEBUG")) {
    MMD_DEBUG("DEBUG LOG : KernelInterrupt Destructor\n");
  }
  try {
    disable_interrupts();
  } catch (...) {
    std::string err("destructor error");
    debug_print(err, 0);
  }
}
+
/** disable_interrupts() function is used in KernelInterrupt destructor
 * if interupt not enabled , !enable_thread
 *   then nothing was started, so there is nothing to tear down
 * else if interrupts are used,
 *   call notify_work_thread(), join the thread
 *   we call OPAE API fpgaUnregisterEvent() to unregister FPGA event,
 *   it tells driver caller is no longer interested in notification for event associated with m_event_handle
 *   we call OPAE API fpgaDestroyEventHandle() to free resources
 */
void KernelInterrupt::disable_interrupts() {
  if (!enable_thread) {
    if (std::getenv("MMD_ENABLE_DEBUG")) {
      MMD_DEBUG("DEBUG LOG : KernelInterrupt disabling interrupts\n");
    }
    // Interrupt mode was never enabled, so the worker must never have run.
    assert(m_work_thread_active == false);
    return;
  }

  // Stop the poller: clear the loop flag first, then wake it via the
  // eventfd so it observes the flag instead of blocking in poll().
  m_work_thread_active = false;
  notify_work_thread();
  m_work_thread->join();

  if (m_event_handle != nullptr) {
    fpga_result res;

    res = fpgaUnregisterEvent(m_fpga_handle, FPGA_EVENT_INTERRUPT, m_event_handle);
    check_result(res, "error fpgaUnregisterEvent");

    res = fpgaDestroyEventHandle(&m_event_handle);
    check_result(res, "error fpgaDestroyEventHandle");
  }
  if (std::getenv("MMD_ENABLE_DEBUG")) {
    MMD_DEBUG("DEBUG LOG : KernelInterrupt disabling interrupts\n");
  }
}
+
+/** notify_work_thread() function is called by disable_interrupts() function
+ * eventfd object created by OPAE API fpgaGetOSObjectFromEventHandle() , m_eventfd,
+ * can be used as an event wait/notify mechanism by user space applications and by kernel,
+ * to notify user space applications of events
+ * every time write() is performed on eventfd,
+ * the value of uint64_t being written is added to count and wakeup is performed.
+ * We dont use read() below but read() will return count value to user space and reset count to 0
+ */
+void KernelInterrupt::notify_work_thread() {
+ uint64_t val = 1;
+ ssize_t res = write(m_eventfd, &val, sizeof(val));
+ if (res < 0) {
+ std::cerr << "Warning: KernelInterrupts::notify_work_thread()"
+ " write to eventfd failed: "
+ << strerror(errno) << std::endl;
+ }
+}
+
/** enable_interrupts() function is called by Kernel Interrupt constructor
 * if interrupt is not enabled it will mark the worker thread inactive and return
 * if interrupt is enabled, it will use OPAE APIs to create event handle fpgaCreateEventHandle()
 * OPAE event APIs provide functions for handling asynchronous events such as errors and interrupts
 * Associated with every event a process has registered for is an fpga_event_handle,
 * which encapsulates OS specific data structure for event objects
 * On Linux fpga_event_handle can be used as file descriptor
 * and passed to select(), poll() and similar functions to wait for asynchronous events
 * OPAE API fpgaRegisterEvent() is used to tell driver that caller is interested in notification for event specified
 * OPAE API fpgaGetOSObjectFromEventHandle() checks validity of event handle and
 * gets OS object used to subscribe and unsubscribe to events
 * we create a thread and call work_thread()
 */
void KernelInterrupt::enable_interrupts() {
  if (!enable_thread) {
    if (std::getenv("MMD_ENABLE_DEBUG")) {
      MMD_DEBUG("DEBUG LOG : KernelInterrupt enabling interrupts\n");
    }
    m_work_thread_active = false;
    return;
  }

  fpga_result res;

  // Sequence matters: create handle -> register for the interrupt line ->
  // extract the eventfd -> only then start the poller thread.
  res = fpgaCreateEventHandle(&m_event_handle);
  check_result(res, "error creating event handle");

  res = fpgaRegisterEvent(m_fpga_handle, FPGA_EVENT_INTERRUPT, m_event_handle, mmd_kernel_interrupt_line_num);
  check_result(res, "error registering event");

  res = fpgaGetOSObjectFromEventHandle(m_event_handle, &m_eventfd);
  check_result(res, "error getting event file handle");

  // Flag must be true before the thread starts or work_thread() exits
  // immediately.
  m_work_thread_active = true;
  m_work_thread = std::unique_ptr<std::thread>(new std::thread([this] { this->work_thread(); }));
  if (std::getenv("MMD_ENABLE_DEBUG")) {
    MMD_DEBUG("DEBUG LOG : KernelInterrupt enabling interrupts\n");
  }
}
+
+/** work_thread() is called from enable_interrupts() function while creating new thread
+ * it calls wait_for_event(), disables interrupt mask
+ * creates lock_guard with m_mutex, calls kernel interrupt function and then enables interrupt mask
+ */
+void KernelInterrupt::work_thread() {
+ while (m_work_thread_active) {
+ wait_for_event();
+ std::lock_guard<std::mutex> lock(m_mutex);
+ if (m_kernel_interrupt_fn != nullptr) {
+ m_kernel_interrupt_fn(m_mmd_handle, m_kernel_interrupt_user_data);
+ }
+ }
+}
+
+/** wait_for_event() is called from work_thread() function
+ * it uses poll() function to wait for event on a file descriptor,
+ * the m_event_fd file descriptor which we got from fpgaOSObjectFromEventHandle()
+ * poll() uses pollfd struct, which inncludes
+ * fd - file descriptor, events - requested events, revents - returned events
+ * timeout argument in poll() specifies number of milliseconds,
+ * poll() will block waiting for file descriptor
+ * On success, poll() returns a nonnegative value which is the
+ * number of elements in the pollfds whose revents fields have been
+ * set to a nonzero value (indicating an event or an error). A
+ * return value of zero indicates that the system call timed out
+ * before any file descriptors became read
+ */
+void KernelInterrupt::wait_for_event() {
+ // Use timeout when polling eventfd because sometimes interrupts are missed.
+ // This may be caused by knonw race condition with runtime, or there may
+ // be occasional events lost from OPAE.
+
+ MMD_DEBUG("DEBUG LOG : KernelInterrupt waiting for event using poll()\n");
+ const int timeout_ms = 250;
+ struct pollfd pfd = {.fd = m_eventfd, .events = POLLIN, .revents = 0};
+ int num_events = poll(&pfd, 1, timeout_ms);
+ if (num_events <= 0) {
+ std::string err(num_events < 0 ? strerror(errno) : "timed out");
+ std::string err_str("poll(): ");
+ debug_print(err_str.append(err), 1);
+ } else if (pfd.revents != POLLIN) {
+ std::string err("poll error num: ", pfd.revents);
+ debug_print(err, 0);
+ } else {
+ uint64_t val = 0;
+ ssize_t bytes_read = read(pfd.fd, &val, sizeof(val));
+ if (bytes_read < 0) {
+ std::string err(strerror(errno));
+ std::string err_str("read: ");
+ debug_print(err_str.append(err), 1);
+ }
+ }
+}
+
+void KernelInterrupt::set_kernel_interrupt(aocl_mmd_interrupt_handler_fn fn, void *user_data) {
+ MMD_DEBUG("DEBUG LOG : KernelInterrupt setting kernel interrupt\n");
+ std::lock_guard<std::mutex> lock(m_mutex);
+ m_kernel_interrupt_fn = fn;
+ m_kernel_interrupt_user_data = user_data;
+}
+
/** Configure interrupts
 * set_member_for_interrupts() called from KernelInterrupts constructor
 * Runs its body once per process: the first call switches the class into
 * interrupt mode by setting the static enable_thread flag.
 * NOTE(review): the function-local `initialized` and static `enable_thread`
 * are written without synchronization -- appears to assume constructors are
 * not invoked concurrently from multiple threads; confirm.
 */
void KernelInterrupt::set_member_for_interrupts() {
  static bool initialized = false;
  if (initialized) {
    return;
  }
  // Use interrupts
  MMD_DEBUG("DEBUG LOG : Using interrupts\n");

  enable_thread = true;
  initialized = true;
}
diff --git a/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/kernel_interrupt.h b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/kernel_interrupt.h
new file mode 100644
index 0000000..9ea6e68
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/kernel_interrupt.h
@@ -0,0 +1,68 @@
+// (c) 1992-2024 Intel Corporation.
+// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words
+// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S.
+// and/or other countries. Other marks and brands may be claimed as the property
+// of others. See Trademarks on intel.com for full list of Intel trademarks or
+// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera)
+// Your use of Intel Corporation's design tools, logic functions and other
+// software and tools, and its AMPP partner logic functions, and any output
+// files any of the foregoing (including device programming or simulation
+// files), and any associated documentation or information are expressly subject
+// to the terms and conditions of the Altera Program License Subscription
+// Agreement, Intel MegaCore Function License Agreement, or other applicable
+// license agreement, including, without limitation, that your use is for the
+// sole purpose of programming logic devices manufactured by Intel and sold by
+// Intel or its authorized distributors. Please refer to the applicable
+// agreement for further details.
+
+#ifndef KERNEL_INTERRUPT_H_
+#define KERNEL_INTERRUPT_H_
+
+#include <opae/fpga.h>
+
+#include <atomic>
+#include <chrono>
+#include <mutex>
+#include <thread>
+
+#include "aocl_mmd.h"
+
+namespace intel_opae_mmd {
+
// Owns the interrupt-service path for one opened device: registers an OPAE
// interrupt event, polls the resulting eventfd on a dedicated worker
// thread, and forwards each event to the runtime-registered callback.
class KernelInterrupt final {
 public:
  KernelInterrupt(fpga_handle fpga_handle_arg, int mmd_handle);
  ~KernelInterrupt();

  void enable_interrupts();
  void disable_interrupts();
  // Register the runtime callback invoked on each interrupt event.
  void set_kernel_interrupt(aocl_mmd_interrupt_handler_fn fn, void *user_data);

  // Non-copyable / non-movable: owns a worker thread and OS event handles.
  KernelInterrupt(const KernelInterrupt &) = delete;
  KernelInterrupt &operator=(const KernelInterrupt &) = delete;
  KernelInterrupt(KernelInterrupt &&) = delete;
  KernelInterrupt &operator=(KernelInterrupt &&) = delete;

 private:
  // One-time, process-wide selection of interrupt mode.
  static void set_member_for_interrupts();

  void notify_work_thread();
  void wait_for_event();
  void work_thread();

  // Process-wide flag: true once interrupt mode has been configured.
  static bool enable_thread;

  std::mutex m_mutex;                          // guards the callback fn/user-data pair
  std::unique_ptr<std::thread> m_work_thread;  // poller thread running work_thread()
  std::atomic<bool> m_work_thread_active;      // loop-exit flag for the poller
  int m_eventfd;                               // eventfd from fpgaGetOSObjectFromEventHandle()
  aocl_mmd_interrupt_handler_fn m_kernel_interrupt_fn;
  void *m_kernel_interrupt_user_data;
  fpga_handle m_fpga_handle;
  int m_mmd_handle;
  fpga_event_handle m_event_handle;
};
+
+}; // namespace intel_opae_mmd
+
+#endif // KERNEL_INTERRUPT_H_
diff --git a/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd.cpp b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd.cpp
new file mode 100644
index 0000000..58cd8e0
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd.cpp
@@ -0,0 +1,830 @@
+// (c) 1992-2024 Intel Corporation.
+// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words
+// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S.
+// and/or other countries. Other marks and brands may be claimed as the property
+// of others. See Trademarks on intel.com for full list of Intel trademarks or
+// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera)
+// Your use of Intel Corporation's design tools, logic functions and other
+// software and tools, and its AMPP partner logic functions, and any output
+// files any of the foregoing (including device programming or simulation
+// files), and any associated documentation or information are expressly subject
+// to the terms and conditions of the Altera Program License Subscription
+// Agreement, Intel MegaCore Function License Agreement, or other applicable
+// license agreement, including, without limitation, that your use is for the
+// sole purpose of programming logic devices manufactured by Intel and sold by
+// Intel or its authorized distributors. Please refer to the applicable
+// agreement for further details.
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <zlib.h>
+
+#include <linux/mman.h>
+#include <sys/mman.h>
+
+// On some systems MAP_HUGE_2MB is not defined. It should be defined for all
+// platforms that DCP supports, but we also want ability to compile MMD on
+// CentOS 6 systems.
+#ifndef MAP_HUGE_SHIFT
+#define MAP_HUGE_SHIFT 26
+#endif
+
+#ifndef MAP_HUGE_2MB
+#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT)
+#endif
+
+#ifndef MAP_HUGE_1GB
+#define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT)
+#endif
+
+#include <algorithm>
+#include <cassert>
+#include <cstdio>
+#include <iomanip>
+#include <iostream>
+#include <map>
+#include <sstream>
+#include <unordered_map>
+#include <vector>
+#ifdef DLA_MMD
+#include <chrono>
+#include <thread>
+#endif
+
+#include "aocl_mmd.h"
+#include "mmd_device.h"
+
+bool diagnose = 0;
+
/** If the MMD is loaded dynamically, destructors in the MMD will execute before
 * the destructors in the runtime upon program termination. The DeviceMapManager
 * guards accesses to the device/handle maps to make sure the runtime doesn't
 * get to reference them after MMD destructors have been called. Destructor
 * makes sure that all devices are closed at program termination regardless of
 * what the runtime does. Implemented as a singleton.
 */
class DeviceMapManager final {
 public:
  /** C++ std map data structure to keep track of
   * object id -> handle and handle -> device
   */
  typedef std::map<int, Device *> t_handle_to_dev_map;
  typedef std::map<uint64_t, int> t_id_to_handle_map;

  static const int SUCCESS = 0;
  static const int FAILURE = -1;

  /** Returns handle and device pointer to the device with the specified name
   * Creates a new entry for this device if it doesn't already exist
   * Return 0 on success, -1 on failure
   */
  int get_or_create_device(const char *board_name, int *handle, Device **device);

  /** Return obj id based on ASP name.*/
  uint64_t id_from_name(const char *board_name);

  /** Return MMD handle based on obj id. Returned value is negative if board
   * doesn't exist
   */
  inline int handle_from_id(uint64_t obj_id);

  /** Return pointer to device based on MMD handle. Returned value is null
   * if board doesn't exist
   */
  Device *device_from_handle(int handle);

  /** Closes specified device if it exists */
  void close_device_if_exists(int handle);

  /* Returns a reference to the class singleton */
  static DeviceMapManager &get_instance() {
    static DeviceMapManager instance;
    return instance;
  }

  DeviceMapManager(DeviceMapManager const &) = delete;
  void operator=(DeviceMapManager const &) = delete;
  ~DeviceMapManager() {
    // delete all allocated Device* entries
    // NOTE(review): loop termination relies on aocl_mmd_close() erasing the
    // handle from handle_to_dev_map; if a close ever fails to erase, this
    // loops forever -- confirm aocl_mmd_close() always erases.
    while (handle_to_dev_map->size() > 0) {
      int handle = handle_to_dev_map->begin()->first;
      aocl_mmd_close(handle);
#ifdef SIM
      // Under ASE simulation, close one device and assume the rest follow.
      std::cout << "# mmd.cpp: When destroying DeviceMapManager in ASE, assume it worked.\n";
      break;
#endif
      MMD_DEBUG("DEBUG LOG : In DeviceMapManager destructor, closing device with handle %d \n", handle);
    }
    delete handle_to_dev_map;
    delete id_to_handle_map;
    // Null the pointers so late calls from runtime destructors see "no map"
    // instead of dangling memory.
    handle_to_dev_map = nullptr;
    id_to_handle_map = nullptr;
  }

 private:
  DeviceMapManager() {
    handle_to_dev_map = new t_handle_to_dev_map();
    id_to_handle_map = new t_id_to_handle_map();

    MMD_DEBUG("DEBUG LOG : Constructing DeviceMapManager object\n");
  }
  t_handle_to_dev_map *handle_to_dev_map = nullptr;
  t_id_to_handle_map *id_to_handle_map = nullptr;
};
// Process-wide singleton used by the free functions below.
static DeviceMapManager &device_manager = DeviceMapManager::get_instance();
+
+/** Returns handle and device pointer to the device with the specified name
+ * Creates a new entry for this device if it doesn't already exist
+ * Return 0 on success, -1 on failure
+ */
+int DeviceMapManager::get_or_create_device(const char *board_name, int *handle, Device **device) {
+ int _handle = MMD_INVALID_PARAM;
+ Device *_device = nullptr;
+
+ if (id_to_handle_map == nullptr || handle_to_dev_map == nullptr) {
+ MMD_DEBUG(
+ "DEBUG LOG : Failure in DeviceMapManager::get_or_create_device,id_to_handle_map or handle_to_dev_map is "
+ "NULL\n");
+ return DeviceMapManager::FAILURE;
+ }
+
+ uint64_t obj_id = id_from_name(board_name);
+ if (!obj_id) {
+ MMD_DEBUG("DEBUG LOG : Failure in DeviceMapManager::get_or_create_device. obj_id : %ld \n", obj_id);
+ return false;
+ }
+ if (id_to_handle_map->count(obj_id) == 0) {
+ try {
+ _device = new Device(obj_id);
+ _handle = _device->get_mmd_handle();
+ id_to_handle_map->insert({obj_id, _handle});
+ handle_to_dev_map->insert({_handle, _device});
+ } catch (std::runtime_error &e) {
+ MMD_DEBUG("DEBUG LOG : Failure in DeviceMapManager::get_or_create_device %s\n", e.what());
+ delete _device;
+ return DeviceMapManager::FAILURE;
+ }
+ MMD_DEBUG("DEBUG LOG : Success in creating new device object handle : %d \n", _handle);
+ } else {
+ _handle = id_to_handle_map->at(obj_id);
+ _device = handle_to_dev_map->at(_handle);
+ MMD_DEBUG("DEBUG LOG : Success in retrieving device metadata(handle , object) , handle : %d\n", _handle);
+ }
+
+ (*handle) = _handle;
+ (*device) = _device;
+
+ MMD_DEBUG("DEBUG LOG : Success in creating new device object , handle : %d\n", _handle);
+ return DeviceMapManager::SUCCESS;
+}
+
+/** Return obj id based on ASP name.*/
+uint64_t DeviceMapManager::id_from_name(const char *board_name) {
+ uint64_t obj_id = 0;
+ if (Device::parse_board_name(board_name, obj_id)) {
+ MMD_DEBUG("DEBUG LOG : Success in retrieving object id from board name\n");
+ return obj_id;
+ } else {
+ MMD_DEBUG("DEBUG LOG : Failed to retrieve object id from board name\n");
+ return 0;
+ }
+}
+
+/** Return MMD handle based on obj id. Returned value is negative if board
+ * doesn't exist
+ */
+inline int DeviceMapManager::handle_from_id(uint64_t obj_id) {
+ int handle = MMD_INVALID_PARAM;
+ if (id_to_handle_map) {
+ auto it = id_to_handle_map->find(obj_id);
+ if (it != id_to_handle_map->end()) {
+ handle = it->second;
+ }
+ MMD_DEBUG("DEBUG LOG : Success in retrieving handle from object id. handle : %d \n", handle);
+ } else {
+ MMD_DEBUG("DEBUG LOG : Failed to retrieve handle from object id \n");
+ }
+ return handle;
+}
+
+/** Return pointer to device based on MMD handle. Returned value is null
+ * if board doesn't exist
+ */
+Device *DeviceMapManager::device_from_handle(int handle) {
+ Device *dev = nullptr;
+ if (handle_to_dev_map) {
+ auto it = handle_to_dev_map->find(handle);
+ if (it != handle_to_dev_map->end()) {
+ return it->second;
+ }
+ MMD_DEBUG("DEBUG LOG : Success in retrieving device from handle. handle : %d \n", handle);
+ } else {
+ MMD_DEBUG("DEBUG LOG : Failed to retrieve device from handle\n");
+ }
+ return dev;
+}
+
/** Closes specified device if it exists
 * Deletes the Device object and removes both map entries (handle -> device
 * and object id -> handle) so subsequent lookups report "not found".
 * Safe to call with an already-closed or unknown handle.
 */
void DeviceMapManager::close_device_if_exists(int handle) {
  if (handle_to_dev_map) {
    if (handle_to_dev_map->count(handle) > 0) {
      Device *dev = handle_to_dev_map->at(handle);
      // Capture the object id before deleting the device; it is the key for
      // the reverse map.
      uint64_t obj_id = dev->get_fpga_obj_id();
      delete dev;

      handle_to_dev_map->erase(handle);
      id_to_handle_map->erase(obj_id);
      MMD_DEBUG("DEBUG LOG : Closing device with handle : %d\n", handle);
    } else {
      MMD_DEBUG("DEBUG LOG : Nothing to close. Device with handle : %d already closed\n", handle);
    }
  } else {
    // Maps are gone (MMD destructors already ran); nothing left to close.
    MMD_DEBUG("DEBUG LOG : Error, no handle to device map entry found for handle : %d \n", handle);
  }
}
+
/** Interface for checking if AFU has ASP loaded
 * If the board is already open, asks the cached Device; otherwise
 * constructs a temporary Device just to probe, so the check works before
 * aocl_mmd_open() has ever been called.
 */
bool mmd_asp_loaded(const char *name) {
  uint64_t obj_id = device_manager.id_from_name(name);
  if (!obj_id) {
    MMD_DEBUG("DEBUG LOG : Error, no object id found for board : %s \n", name);
    return false;
  }

  int handle = device_manager.handle_from_id(obj_id);
  if (handle > 0) {
    // Board already opened: query the existing Device object.
    Device *dev = device_manager.device_from_handle(handle);
    if (dev) {
      MMD_DEBUG("DEBUG LOG : ASP loaded for handle : %d \n", handle);
      return dev->asp_loaded();
    } else {
      MMD_DEBUG("DEBUG LOG : ASP not loaded for handle : %d \n", handle);
      return false;
    }
  } else {
    // Board not opened yet: probe with a short-lived Device instance.
    bool asp_loaded = false;
    try {
      Device dev(obj_id);
      asp_loaded = dev.asp_loaded();
    } catch (std::runtime_error &e) {
      // Device construction failed; treat as "ASP not loaded".
      MMD_DEBUG("DEBUG LOG : ASP not loaded for handle : %d , %s\n", handle, e.what());
      return false;
    }

    MMD_DEBUG("DEBUG LOG : ASP loaded : %d (0 - not loaded , 1 - loaded) for handle : %d \n", asp_loaded, handle);
    return asp_loaded;
  }
}
+
/** Function called as part of aocl_mmd_get_offline_info()
 * to determine number of baords in system
 * Enumerates FPGA accelerators whose GUID matches asp_uuid; uses the
 * goto-out pattern so the fpga_properties filter is destroyed on every
 * exit path.
 * NOTE(review): returns MMD_AOCL_ERR through an unsigned return type on
 * error -- callers must compare against MMD_AOCL_ERR, not < 0; confirm.
 */
static unsigned int get_offline_num_acl_boards(const char *asp_uuid) {
  bool asp_only = true;
  fpga_guid guid;
  fpga_result res = FPGA_OK;
  uint32_t num_matches = 0;
  bool ret_err = false;
  fpga_properties filter = NULL;

  if (uuid_parse(asp_uuid, guid) < 0) {
    MMD_DEBUG("Error parsing guid '%s'\n", asp_uuid);
    ret_err = true;
    goto out;
  }

  res = fpgaGetProperties(NULL, &filter);
  if (res != FPGA_OK) {
    MMD_DEBUG("Error creating properties object: %s\n", fpgaErrStr(res));
    ret_err = true;
    goto out;
  }

  if (asp_only) {
    // Restrict the match to accelerators carrying our ASP GUID.
    res = fpgaPropertiesSetGUID(filter, guid);
    if (res != FPGA_OK) {
      MMD_DEBUG("Error setting GUID: %s\n", fpgaErrStr(res));
      ret_err = true;
      goto out;
    }
  }

  res = fpgaPropertiesSetObjectType(filter, FPGA_ACCELERATOR);
  if (res != FPGA_OK) {
    MMD_DEBUG("Error setting object type: %s\n", fpgaErrStr(res));
    ret_err = true;
    goto out;
  }

  // Enumerate with a null token array: we only want the match count.
  res = fpgaEnumerate(&filter, 1, NULL, 0, &num_matches);
  if (res != FPGA_OK) {
    MMD_DEBUG("Error enumerating AFCs: %s\n", fpgaErrStr(res));
    ret_err = true;
    goto out;
  }

out:
  if (filter) fpgaDestroyProperties(&filter);

  if (ret_err) {
    return MMD_AOCL_ERR;
  } else {
    return num_matches;
  }
}
+
/** Used by aocl_mmd_get_offline_info() to report the board names in the
 * system.  This platform exposes exactly one fixed board name; asp_only is
 * accepted for interface compatibility but has no effect here.  Always
 * succeeds.
 */
static bool get_offline_board_names(std::string &boards, bool asp_only = true) {
  boards.assign("dla_agx7_ofs_board");
  return true;
}
+
// Macros used for acol_mmd_get_offline_info and aocl_mmd_get_info
// Each writes a result into the caller-supplied param_value buffer and, when
// param_size_ret is non-null, reports how many bytes were written.  They rely
// on param_value / param_value_size / param_size_ret being in scope at the
// expansion site.

// Store an int result.
#define RESULT_INT(X) \
  { \
    *((int *)param_value) = X; \
    if (param_size_ret) *param_size_ret = sizeof(int); \
  }
// Store a size_t result.
#define RESULT_SIZE_T(X) \
  { \
    *((size_t *)param_value) = X; \
    if (param_size_ret) *param_size_ret = sizeof(size_t); \
  }

// Copy a NUL-terminated string result, truncated to param_value_size bytes
// (source length capped at 4096 via strnlen).
#define RESULT_STR(X) \
  do { \
    unsigned Xlen = strnlen(X, 4096) + 1; \
    unsigned Xcpylen = (param_value_size <= Xlen) ? param_value_size : Xlen; \
    memcpy((void *)param_value, X, Xcpylen); \
    if (param_size_ret) *param_size_ret = Xcpylen; \
  } while (0)
+
+/** Get information about the board using the enum aocl_mmd_offline_info_t for
+ * offline info (called without a handle), and the enum aocl_mmd_info_t for
+ * info specific to a certain board.
+ * Arguments:
+ *
+ * requested_info_id - a value from the aocl_mmd_offline_info_t enum
+ *
+ * param_value_size - size of the param_value field in bytes. This should
+ * match the size of the return type expected as indicated in the enum
+ * definition.
+ *
+ * param_value - pointer to the variable that will receive the returned info
+ *
+ * param_size_ret - receives the number of bytes of data actually returned
+ *
+ * Returns: a negative value to indicate error.
+ */
+
+// From DLA perspective, only AOCL_MMD_BOARD_NAMES info we care
// From DLA perspective, only AOCL_MMD_BOARD_NAMES info we care
int aocl_mmd_get_offline_info(aocl_mmd_offline_info_t requested_info_id,
                              size_t param_value_size,
                              void *param_value,
                              size_t *param_size_ret) {
  /** aocl_mmd_get_offline_info can be called many times by the runtime
   * and it is expensive to query the system. Only compute values first
   * time aocl_mmd_get_offline_info called future iterations use saved results
   */
  static bool initialized = false;
  static int mem_type_info;
  static unsigned int num_acl_boards;
  static std::string boards;
  static bool success;

  if (!initialized) {
    mem_type_info = (int)AOCL_MMD_PHYSICAL_MEMORY;
    // I_DK_AFU_ID is the AFU GUID injected by the build (see CMakeLists).
    num_acl_boards = get_offline_num_acl_boards(I_DK_AFU_ID);
    success = get_offline_board_names(boards, true);
    initialized = true;
  }

  // Each case writes through param_value via the RESULT_* macros above.
  switch (requested_info_id) {
    case AOCL_MMD_VERSION:
      RESULT_STR(AOCL_MMD_VERSION_STRING);
      break;
    case AOCL_MMD_NUM_BOARDS: {
      RESULT_INT(num_acl_boards);
      break;
    }
    case AOCL_MMD_VENDOR_NAME:
      RESULT_STR("Intel Corp");
      break;
    case AOCL_MMD_BOARD_NAMES: {
      if (success) {
        RESULT_STR(boards.c_str());
      } else {
        return MMD_AOCL_ERR;
      }
      break;
    }
    case AOCL_MMD_VENDOR_ID:
      RESULT_INT(0);
      break;
    case AOCL_MMD_USES_YIELD:
      RESULT_INT(0);
      break;
    case AOCL_MMD_MEM_TYPES_SUPPORTED:
      RESULT_INT(mem_type_info);
      break;
  }

  // NOTE(review): unhandled info ids fall through the switch and return 0
  // (success) without writing param_value -- presumably intentional for
  // forward compatibility with newer runtimes; confirm.
  return 0;
}
+
+/** Get information about the board using the enum aocl_mmd_info_t for
+ * info specific to a certain board.
+ * Arguments:
+ *
+ * requested_info_id - a value from the aocl_mmd_info_t enum
+ *
+ * param_value_size - size of the param_value field in bytes. This should
+ * match the size of the return type expected as indicated in the enum
+ * definition. For example, the AOCL_MMD_TEMPERATURE returns a float, so
+ * the param_value_size should be set to sizeof(float) and you should
+ * expect the same number of bytes returned in param_size_ret.
+ *
+ * param_value - pointer to the variable that will receive the returned info
+ *
+ * param_size_ret - receives the number of bytes of data actually returned
+ *
+ * Returns: a negative value to indicate error.
+ */
+int aocl_mmd_get_info(
+ int handle, aocl_mmd_info_t requested_info_id, size_t param_value_size, void *param_value, size_t *param_size_ret) {
+ MMD_DEBUG("DEBUG LOG : called aocl_mmd_get_info\n");
+ Device *dev = device_manager.device_from_handle(handle);
+ if (dev == NULL) return 0;
+
+ assert(param_value);
+ switch (requested_info_id) {
+ case AOCL_MMD_BOARD_NAME: {
+ std::ostringstream board_name;
+ board_name << "Intel OFS Platform"
+ << " (" << dev->get_dev_name() << ")";
+ RESULT_STR(board_name.str().c_str());
+ break;
+ }
+ case AOCL_MMD_NUM_KERNEL_INTERFACES:
+ RESULT_INT(1);
+ break;
+ case AOCL_MMD_KERNEL_INTERFACES:
+ RESULT_INT(AOCL_MMD_KERNEL);
+ break;
+#ifdef SIM
+ case AOCL_MMD_PLL_INTERFACES:
+ RESULT_INT(-1);
+ break;
+#else
+ case AOCL_MMD_PLL_INTERFACES:
+ RESULT_INT(-1);
+ break;
+#endif
+ case AOCL_MMD_MEMORY_INTERFACE:
+ RESULT_INT(AOCL_MMD_MEMORY);
+ break;
+ case AOCL_MMD_PCIE_INFO: {
+ RESULT_STR(dev->get_bdf().c_str());
+ break;
+ }
+ case AOCL_MMD_BOARD_UNIQUE_ID:
+ RESULT_INT(0);
+ break;
+ case AOCL_MMD_TEMPERATURE: {
+ if (param_value_size == sizeof(float)) {
+ float *ptr = static_cast<float *>(param_value);
+ *ptr = dev->get_temperature();
+ if (param_size_ret) *param_size_ret = sizeof(float);
+ }
+ break;
+ }
+ case AOCL_MMD_CONCURRENT_READS:
+ RESULT_INT(1);
+ break;
+ case AOCL_MMD_CONCURRENT_WRITES:
+ RESULT_INT(1);
+ break;
+ case AOCL_MMD_CONCURRENT_READS_OR_WRITES:
+ RESULT_INT(2);
+ break;
+
+ case AOCL_MMD_MIN_HOST_MEMORY_ALIGNMENT:
+ RESULT_SIZE_T(64);
+ break;
+
+ case AOCL_MMD_HOST_MEM_CAPABILITIES: {
+ RESULT_INT(0);
+ break;
+ }
+ case AOCL_MMD_SHARED_MEM_CAPABILITIES: {
+ RESULT_INT(0);
+ break;
+ }
+
+ case AOCL_MMD_DEVICE_MEM_CAPABILITIES:
+ RESULT_INT(0);
+ break;
+ case AOCL_MMD_HOST_MEM_CONCURRENT_GRANULARITY:
+ RESULT_SIZE_T(0);
+ break;
+ case AOCL_MMD_SHARED_MEM_CONCURRENT_GRANULARITY:
+ RESULT_SIZE_T(0);
+ break;
+ case AOCL_MMD_DEVICE_MEM_CONCURRENT_GRANULARITY:
+ RESULT_SIZE_T(0);
+ break;
+ }
+ return 0;
+}
+
+#undef RESULT_INT
+#undef RESULT_STR
+
+/** Set the interrupt handler for the opened device.
+ * The interrupt handler is called whenever the client needs to be notified
+ * of an asynchronous event signaled by the device internals.
+ * For example, the kernel has completed or is stalled.
+ *
+ * Important: Interrupts from the kernel must be ignored until this handler is
+ * set
+ *
+ * Arguments:
+ * fn - the callback function to invoke when a kernel interrupt occurs
+ * user_data - the data that should be passed to fn when it is called.
+ *
+ * Returns: 0 if successful, negative on error
+ */
+int AOCL_MMD_CALL aocl_mmd_set_interrupt_handler(int handle, aocl_mmd_interrupt_handler_fn fn, void *user_data) {
+ Device *dev = device_manager.device_from_handle(handle);
+ if (dev) {
+ dev->set_kernel_interrupt(fn, user_data);
+ MMD_DEBUG("DEBUG LOG : Set kernel interrupt handler for device handle : %d\n", handle);
+ } else {
+ MMD_DEBUG("DEBUG LOG : Error setting kernel interrupt handler for device handle : %d\n", handle);
+ return MMD_AOCL_ERR;
+ }
+ return 0;
+}
+
+/** Set the operation status handler for the opened device.
+ * The operation status handler is called with
+ * status 0 when the operation has completed successfully.
+ * status negative when the operation completed with errors.
+ *
+ * Arguments:
+ * fn - the callback function to invoke when a status update is to be
+ * performed.
+ * user_data - the data that should be passed to fn when it is called.
+ *
+ * Returns: 0 if successful, negative on error
+ */
+
+int AOCL_MMD_CALL aocl_mmd_set_status_handler(int handle, aocl_mmd_status_handler_fn fn, void *user_data) {
+ Device *dev = device_manager.device_from_handle(handle);
+ if (dev) {
+ dev->set_status_handler(fn, user_data);
+ MMD_DEBUG("DEBUG LOG : Set status handler for device handle : %d\n", handle);
+ }
+ return 0;
+}
+
+/** Host to device-global-memory write (HOST DDR -> FPGA DDR)
+ * If op is NULL
+ * - Then these calls must block until the operation is complete.
+ * - The status handler is not called for this operation.
+ *
+ * If op is non-NULL, then:
+ * - These may be non-blocking calls
+ * - The status handler must be called upon completion, with status 0
+ * for success, and a negative value for failure.
+ *
+ * Arguments:
+ * op - the operation object used to track this operations progress
+ *
+ * len - the size in bytes to transfer
+ *
+ * src - the host buffer being read from
+ *
+ * dst - the host buffer being written to
+ *
+ * mmd_interface - the handle to the interface being accessed. E.g. To
+ * access global memory this handle will be whatever is returned by
+ * aocl_mmd_get_info when called with AOCL_MMD_MEMORY_INTERFACE.
+ *
+ * offset/src_offset/dst_offset - the byte offset within the interface that
+ * the transfer will begin at.
+ *
+ * The return value is 0 if the operation launch was successful, and
+ * negative otherwise.
+ */
+int AOCL_MMD_CALL
+aocl_mmd_write(int handle, aocl_mmd_op_t op, size_t len, const void *src, int mmd_interface, size_t offset) {
+ MMD_DEBUG(
+ "DEBUG LOG : aocl_mmd_write: handle : %d\t operation : %p\t len : 0x%zx\t src : %p\t mmd_interface : %d\t offset "
+ ": 0x%zx\n",
+ handle,
+ op,
+ len,
+ src,
+ mmd_interface,
+ offset);
+ Device *dev = device_manager.device_from_handle(handle);
+ if (dev){
+ return dev->write_block(op, mmd_interface, src, offset, len);
+ }
+ else {
+ MMD_DEBUG("DEBUG LOG : Error in aocl_mmd_write , device not found for handle : %d\n", handle);
+ return -1;
+ }
+}
+
+/** Host reading from device-global-memory (FPGA DDR -> HOST DDR)
+ * If op is NULL
+ * - Then these calls must block until the operation is complete.
+ * - The status handler is not called for this operation.
+ *
+ * If op is non-NULL, then:
+ * - These may be non-blocking calls
+ * - The status handler must be called upon completion, with status 0
+ * for success, and a negative value for failure.
+ *
+ * Arguments:
+ * op - the operation object used to track this operations progress
+ *
+ * len - the size in bytes to transfer
+ *
+ * src - the host buffer being read from
+ *
+ * dst - the host buffer being written to
+ *
+ * mmd_interface - the handle to the interface being accessed. E.g. To
+ * access global memory this handle will be whatever is returned by
+ * aocl_mmd_get_info when called with AOCL_MMD_MEMORY_INTERFACE.
+ *
+ * offset/src_offset/dst_offset - the byte offset within the interface that
+ * the transfer will begin at.
+ *
+ * The return value is 0 if the operation launch was successful, and
+ * negative otherwise.
+ */
+
+int AOCL_MMD_CALL aocl_mmd_read(int handle, aocl_mmd_op_t op, size_t len, void *dst, int mmd_interface, size_t offset) {
+ MMD_DEBUG(
+ "DEBUG LOG : aocl_mmd_read: handle : %d\t operation : %p\t len : 0x%zx\t dst : %p\t mmd_interface : %d\t offset "
+ ": 0x%zx\n",
+ handle,
+ op,
+ len,
+ dst,
+ mmd_interface,
+ offset);
+ Device *dev = device_manager.device_from_handle(handle);
+ if (dev){
+ return dev->read_block(op, mmd_interface, dst, offset, len);
+ }
+ else {
+ MMD_DEBUG("DEBUG LOG : Error in aocl_mmd_read , device not found for handle : %d\n", handle);
+ return -1;
+ }
+}
+
+/** Open and initialize the named device.
+ *
+ * The name is typically one specified by the AOCL_MMD_BOARD_NAMES offline
+ * info.
+ *
+ * Arguments:
+ * name - open the board with this name (provided as a C-style string,
+ * i.e. NUL terminated ASCII.)
+ *
+ * Returns: the non-negative integer handle for the board, otherwise a
+ * negative value to indicate error. Upon receiving the error, the OpenCL
+ * runtime will proceed to open other known devices, hence the MMD mustn't
+ * exit the application if an open call fails.
+ */
+
+int AOCL_MMD_CALL aocl_mmd_open(const char *name) {
+
+ MMD_DEBUG("DEBUG LOG : aocl_mmd_open, Opening device: %s\n", name);
+
+ uint64_t obj_id = device_manager.id_from_name(name);
+ if (!obj_id) {
+ MMD_DEBUG("DEBUG LOG : Error while aocl_mmd_open, object id not found for board : %s\n", name);
+ return MMD_INVALID_PARAM;
+ }
+
+ int handle;
+ Device *dev = nullptr;
+ if (device_manager.get_or_create_device(name, &handle, &dev) != DeviceMapManager::SUCCESS) {
+ if (std::getenv("MMD_PROGRAM_DEBUG") || std::getenv("MMD_DMA_DEBUG") || std::getenv("MMD_ENABLE_DEBUG")) {
+ MMD_DEBUG("DEBUG LOG : Error while aocl_mmd_open, device not found for board : %s\n", name);
+ }
+ return MMD_AOCL_ERR;
+ }
+
+ assert(dev);
+ if (dev->asp_loaded()) {
+ if (!dev->initialize_asp()) {
+ MMD_DEBUG("DEBUG LOG : Error while aocl_mmd_open, Error initializing asp for board : %s\n", name);
+ return MMD_ASP_INIT_FAILED;
+ }
+ } else {
+ MMD_DEBUG("DEBUG LOG : Error while aocl_mmd_open, asp not loaded for board : %s\n", name);
+ return MMD_ASP_NOT_LOADED;
+ }
+ MMD_DEBUG("end of aocl_mmd_open \n");
+ MMD_DEBUG("DEBUG LOG : Success aocl_mmd_open for board : %s, handle : %d \n", name, handle);
+ return handle;
+}
+
+/** Close an opened device, by its handle.
+ * Returns: 0 on success, negative values on error.
+ */
+int AOCL_MMD_CALL aocl_mmd_close(int handle) {
+#ifndef SIM
+ device_manager.close_device_if_exists(handle);
+#else
+ std::cout << "# mmd.cpp: During simulation (ASE) we are not closing the device.\n";
+#endif
+ return 0;
+}
+
+// CoreDLA modifications
+// To support multiple different FPGA boards, anything board specific must be implemented in a
+// board-specific MMD instead of the CoreDLA runtime layer.
+#ifdef DLA_MMD
+// Query functions to get board-specific values
// Maximum number of DLA IP instances this board's AFU can expose.
AOCL_MMD_CALL int dla_mmd_get_max_num_instances() { return 4; }
+
+// DLA can only uses 4GB DDR as of 2024.2
// 4 GB (1 << 32 bytes) of board DDR is reported per DLA instance.
AOCL_MMD_CALL uint64_t dla_mmd_get_ddr_size_per_instance() { return 1ULL << 32; }
// DDR interface clock in MHz; the value is board-specific.
AOCL_MMD_CALL double dla_mmd_get_ddr_clock_freq() {
  #ifdef USE_N6001_BOARD
  return 300.0; // MHz
  #else
  return 333.333333; // MHz
  #endif
}
+
+// Helper functions for the wrapper functions around CSR and DDR
// Translate a per-instance CSR offset into the global MMIO address space:
// the CSR region starts at 0x10000 and each instance owns a 0x800-byte stride.
uint64_t dla_get_raw_csr_address(int instance, uint64_t addr) {
  constexpr uint64_t kCsrBase = 0x10000;
  constexpr uint64_t kCsrStridePerInstance = 0x800;
  return kCsrBase + kCsrStridePerInstance * instance + addr;
}
// Translate a per-instance DDR offset into the board's global DDR address
// space; the per-instance window stride is board-specific.
uint64_t dla_get_raw_ddr_address(int instance, uint64_t addr) {
  #ifdef USE_N6001_BOARD
  const uint64_t window_stride = 1ULL << 32;
  #else
  const uint64_t window_stride = 1ULL << 33;
  #endif
  return window_stride * instance + addr;
}
+
+// Wrappers around CSR and DDR reads and writes to abstract away board-specific offsets
// Blocking (op == NULL) write of one 32-bit word into a DLA instance's CSR space.
AOCL_MMD_CALL int dla_mmd_csr_write(int handle, int instance, uint64_t addr, const uint32_t *data) {
  return aocl_mmd_write(
      handle, NULL, sizeof(uint32_t), data, AOCL_MMD_DLA_CSR, dla_get_raw_csr_address(instance, addr));
}
+
// Blocking (op == NULL) read of one 32-bit word from a DLA instance's CSR space.
AOCL_MMD_CALL int dla_mmd_csr_read(int handle, int instance, uint64_t addr, uint32_t *data) {
  return aocl_mmd_read(handle, NULL, sizeof(uint32_t), data, AOCL_MMD_DLA_CSR, dla_get_raw_csr_address(instance, addr));
}
+
// Blocking write of 'length' bytes into the instance's DDR window (DMA path).
AOCL_MMD_CALL int dla_mmd_ddr_write(int handle, int instance, uint64_t addr, uint64_t length, const void *data) {
  return aocl_mmd_write(handle, NULL, length, data, AOCL_MMD_MEMORY, dla_get_raw_ddr_address(instance, addr));
}
+
// Blocking read of 'length' bytes from the instance's DDR window (DMA path).
AOCL_MMD_CALL int dla_mmd_ddr_read(int handle, int instance, uint64_t addr, uint64_t length, void *data) {
  return aocl_mmd_read(handle, NULL, length, data, AOCL_MMD_MEMORY, dla_get_raw_ddr_address(instance, addr));
}
+
// Measure the DLA clock frequency (MHz) using the on-chip hardware counter at
// CSR offset 0x37000: start it, sleep ~10 ms, stop it, then divide the counter
// value by the precisely measured wall-clock interval between start and stop.
AOCL_MMD_CALL double dla_mmd_get_coredla_clock_freq(int handle) {
  constexpr uint64_t hw_timer_address = 0x37000;
  const uint32_t start_bit = 1;
  const uint32_t stop_bit = 2;

  // Send the start command to the hardware counter
  std::chrono::high_resolution_clock::time_point time_before = std::chrono::high_resolution_clock::now();
  int status = aocl_mmd_write(handle, NULL, sizeof(uint32_t), &start_bit, AOCL_MMD_DLA_CSR, hw_timer_address);
  // NOTE(review): these asserts compile away under NDEBUG, leaving 'status'
  // unused and MMD errors silently ignored in release builds - confirm intended.
  assert(status == 0);

  // Unlikely to sleep for exactly 10 milliseconds, but it doesn't matter since we use a high resolution clock to
  // determine the amount of time between the start and stop commands for the hardware counter
  std::this_thread::sleep_for(std::chrono::milliseconds(10));

  // Send the stop command to the hardware counter
  std::chrono::high_resolution_clock::time_point time_after = std::chrono::high_resolution_clock::now();
  status = aocl_mmd_write(handle, NULL, sizeof(uint32_t), &stop_bit, AOCL_MMD_DLA_CSR, hw_timer_address);
  assert(status == 0);

  // Read back the value of the counter
  uint32_t counter = 0;
  status = aocl_mmd_read(handle, NULL, sizeof(uint32_t), &counter, AOCL_MMD_DLA_CSR, hw_timer_address);
  assert(status == 0);

  // Calculate the clock frequency of the counter, which is running on clk_dla
  double elapsed_seconds = std::chrono::duration_cast<std::chrono::duration<double>>(time_after - time_before).count();
  return 1.0e-6 * counter / elapsed_seconds; // 1.0e-6 is to convert to MHz
}
+#endif
diff --git a/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_device.cpp b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_device.cpp
new file mode 100644
index 0000000..dd4ca42
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_device.cpp
@@ -0,0 +1,448 @@
+// (c) 1992-2024 Intel Corporation.
+// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words
+// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S.
+// and/or other countries. Other marks and brands may be claimed as the property
+// of others. See Trademarks on intel.com for full list of Intel trademarks or
+// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera)
+// Your use of Intel Corporation's design tools, logic functions and other
+// software and tools, and its AMPP partner logic functions, and any output
+// files any of the foregoing (including device programming or simulation
+// files), and any associated documentation or information are expressly subject
+// to the terms and conditions of the Altera Program License Subscription
+// Agreement, Intel MegaCore Function License Agreement, or other applicable
+// license agreement, including, without limitation, that your use is for the
+// sole purpose of programming logic devices manufactured by Intel and sold by
+// Intel or its authorized distributors. Please refer to the applicable
+// agreement for further details.
+
+#include <assert.h>
+#include <numa.h>
+
+#include <inttypes.h>
+#include <string.h>
+#include <unistd.h>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <limits>
+#include <sstream>
+
+#include "mmd_device.h"
+#include "mmd_helper.h"
+
// Next handle to hand out from the Device constructor; wraps back to 1 at
// INT_MAX (not thread-safe, see note in Device::Device).
int Device::next_mmd_handle{1};
+
+/**
+ * The Device object is created for each device/board opened and
+ * it has methods to interact with fpga device.
+ * The entry point for Device is in DeviceMapManager Class
+ * which maintains mapping between device names and handles.
+ * Device Object is foundation for interacting with device.
+ */
+Device::Device(uint64_t obj_id)
+ : fpga_obj_id(obj_id),
+ kernel_interrupt_thread(NULL),
+ event_update(NULL),
+ event_update_user_data(NULL),
+ enable_set_numa(false),
+ fme_sysfs_temp_initialized(false),
+ bus(0),
+ device(0),
+ function(0),
+ afu_initialized(false),
+ asp_initialized(false),
+ mmio_is_mapped(false),
+ filter(NULL),
+ mmio_token(NULL),
+ mmio_handle(NULL),
+ fme_token(NULL),
+ guid(),
+ mmd_dma(NULL) {
+ // Note that this constructor is not thread-safe because next_mmd_handle
+ // is shared between all class instances
+ MMD_DEBUG("DEBUG LOG : Constructing Device object\n");
+
+ mmd_handle = next_mmd_handle;
+ if (next_mmd_handle == std::numeric_limits<int>::max())
+ next_mmd_handle = 1;
+ else
+ next_mmd_handle++;
+
+ fpga_properties filter = NULL;
+ uint32_t num_matches;
+ fpga_result r;
+
+ // Set up a filter that will search for an accelerator
+ fpgaGetProperties(NULL, &filter);
+ fpgaPropertiesSetObjectType(filter, FPGA_ACCELERATOR);
+
+ // Add the desired UUID to the filter
+ uuid_parse(I_DK_AFU_ID, guid);
+ fpgaPropertiesSetGUID(filter, guid);
+
+ // Do the search across the available FPGA contexts
+ num_matches = 1;
+ fpgaEnumerate(&filter, 1, &mmio_token, 1, &num_matches);
+
+ fpgaPropertiesGetParent(filter, &fme_token);
+
+ // Not needed anymore so we destroy the filter
+ fpgaDestroyProperties(&filter);
+
+ if (num_matches < 1) {
+ throw std::runtime_error(std::string("Cannot find accelerator"));
+ }
+
+ // Open accelerator
+ r = fpgaOpen(mmio_token, &mmio_handle, 0);
+ assert(FPGA_OK == r);
+
+ // While the token is available, check whether it is for HW
+ // or for ASE simulation.
+ fpga_properties accel_props;
+ uint16_t vendor_id, dev_id;
+ fpgaGetProperties(mmio_token, &accel_props);
+ fpgaPropertiesGetVendorID(accel_props, &vendor_id);
+ fpgaPropertiesGetDeviceID(accel_props, &dev_id);
+
+ afu_initialized = true;
+ MMD_DEBUG("DEBUG LOG : Done constructing Device object\n");
+}
+
+/** Return true if board name parses correctly, false if it does not
+ * Return the parsed object_id in obj_id as an [out] parameter
+ */
+bool Device::parse_board_name(const char *board_name_str, uint64_t &obj_id) {
+ MMD_DEBUG("DEBUG LOG : Parsing board name\n");
+ std::string prefix(ASP_NAME);
+ std::string board_name(board_name_str);
+
+ obj_id = 0;
+ if (board_name.length() <= prefix.length() && board_name.compare(0, prefix.length(), prefix)) {
+ MMD_DEBUG("DEBUG LOG : Error parsing device name '%s'\n", board_name_str);
+ return false;
+ }
+
+ std::string device_num_str = board_name.substr(prefix.length());
+ obj_id = std::stol(device_num_str, 0, 16);
+
+ // Assume that OPAE does not use 0 as a valid object ID. This is true for now
+ // but relies somewhat on an implementaion dependent feature.
+ assert(obj_id > 0);
+ return true;
+}
+
+/** initialize_asp() function is used in aocl_mmd_open() API
+ * It resets AFC and reinitializes DMA, Kernel Interrupts if in use
+ */
+bool Device::initialize_asp() {
+ MMD_DEBUG("DEBUG LOG : Initializing ASP ... \n");
+ if (asp_initialized) {
+ MMD_DEBUG("DEBUG LOG : ASP already initialized \n");
+ return true;
+ }
+
+ fpga_result res = fpgaMapMMIO(mmio_handle, 0, NULL);
+ if (res != FPGA_OK) {
+ MMD_DEBUG("Error mapping MMIO space: %s\n", fpgaErrStr(res));
+ return false;
+ }
+ mmio_is_mapped = true;
+
+ // Trigger an user reset
+ uint64_t reset = 1;
+ fpgaWriteMMIO64(mmio_handle, 0, 0x40000, reset);
+
+ AFU_RESET_DELAY();
+
+ // DMA performance is heavily dependent on the memcpy operation that transfers
+ // data from user allocated buffer to the pinned buffer that is used for
+ // DMA. On some machines with multiple NUMA nodes it is critical for
+ // performance that the pinned buffer is located on the NUMA node as the
+ // threads that performs the DMA operation.
+ //
+ // The performance also improves slighlty if the DMA threads are on the same
+ // NUMA node as the FPGA PCI device.
+ //
+ // This code pins memory allocation to occur from FPGA NUMA node prior to
+ // initializing the DMA buffers. It also pins all threads in the process
+ // to run on this same node.
+ struct bitmask *mask = NULL;
+ if (enable_set_numa) {
+ mask = numa_parse_nodestring(fpga_numa_node.c_str());
+ numa_set_membind(mask);
+ int ret = numa_run_on_node_mask_all(mask);
+ if (ret < 0) {
+ fprintf(stderr, " Error setting NUMA node mask\n");
+ }
+ }
+
+ MMD_DEBUG("DEBUG LOG : Initializing HOST -> FPGA DMA channel \n");
+
+ mmd_dma = new intel_opae_mmd::mmd_dma(mmio_handle, mmd_handle);
+ if (!mmd_dma->initialized()) {
+ MMD_DEBUG("DEBUG LOG : Error initializing DMA channel \n");
+ delete mmd_dma;
+ return false;
+ }
+
+ // Turn off membind restriction in order to allow future allocation to
+ // occur on different NUMA nodes if needed. Hypothesis is that only
+ // the pinned buffers are performance critical for the memcpy. Other
+ // allocations in the process can occur on other NUMA nodes if needed.
+ if (enable_set_numa) {
+ numa_set_membind(numa_nodes_ptr);
+ numa_free_nodemask(mask);
+ }
+
+// Do not enable interrupt if polling mode is enabled in the DLA runtime.
+#ifndef COREDLA_RUNTIME_POLLING
+ try {
+ kernel_interrupt_thread = new intel_opae_mmd::KernelInterrupt(mmio_handle, mmd_handle);
+ } catch (const std::system_error &e) {
+ std::cerr << "Error initializing kernel interrupt thread: " << e.what() << e.code() << std::endl;
+ return false;
+ } catch (const std::exception &e) {
+ std::cerr << "Error initializing kernel interrupt thread: " << e.what() << std::endl;
+ return false;
+ }
+#endif
+
+ asp_initialized = true;
+ MMD_DEBUG("DEBUG LOG : ASP Initialized ! \n");
+ return asp_initialized;
+}
+
/** Device class destructor.
 * Releases every OPAE resource this object owns, in the reverse order of
 * acquisition: interrupt thread, DMA engine, MMIO mapping, accelerator
 * handle, token, and (if set) the properties filter. Errors are counted
 * and logged but never thrown - destructors must not throw.
 */
Device::~Device() {
  MMD_DEBUG("DEBUG LOG : Destructing Device object \n");
  int num_errors = 0;

  // Stop interrupt delivery before tearing down anything it may reference.
  if (kernel_interrupt_thread != nullptr) {
    delete kernel_interrupt_thread;
    kernel_interrupt_thread = NULL;
  }

  if (mmd_dma) {
    delete mmd_dma;
    mmd_dma = NULL;
  }

  // Unmap MMIO before closing the handle that owns the mapping.
  if (mmio_is_mapped) {
    if (fpgaUnmapMMIO(mmio_handle, 0)) {
      MMD_DEBUG("DEBUG LOG : fpgaUnmapMMIO failed\n");
      num_errors++;
    }
  }

  if (mmio_handle) {
    if (fpgaClose(mmio_handle) != FPGA_OK) {
      MMD_DEBUG("DEBUG LOG : fpgaClose mmio_handle failed\n");
      num_errors++;
    }
  }

  if (mmio_token) {
    if (fpgaDestroyToken(&mmio_token) != FPGA_OK) {
      MMD_DEBUG("DEBUG LOG : fpgaDestroyToken mmio_token failed\n");
      num_errors++;
    }
  }

  // 'filter' stays NULL in the current constructor (a local shadows it),
  // so this is normally a no-op; kept for safety.
  if (filter) {
    if (fpgaDestroyProperties(&filter) != FPGA_OK) {
      MMD_DEBUG("DEBUG LOG : fpgaDestroyProperties filter failed\n");
      num_errors++;
    }
  }

  if (num_errors > 0) {
    MMD_DEBUG("DEBUG LOG : Error freeing resources in Device destructor\n");
  }
}
+
+/** asp_loaded() function which checks if asp is loaded on board
+ * it is used in aocl_mmd_open() API
+ */
+bool Device::asp_loaded() {
+ fpga_guid pci_guid;
+ fpga_guid afu_guid;
+ fpga_properties prop;
+ fpga_result res;
+
+ if (uuid_parse(I_DK_AFU_ID, pci_guid) < 0) {
+ MMD_DEBUG("DEBUG LOG : Error parsing guid\n");
+ return false;
+ }
+
+ res = fpgaGetProperties(mmio_token, &prop);
+ if (res != FPGA_OK) {
+ MMD_DEBUG("DEBUG LOG : Error reading properties: %s \n", fpgaErrStr(res));
+ fpgaDestroyProperties(&prop);
+ return false;
+ }
+
+ if (!mmio_token) {
+ fpgaDestroyProperties(&prop);
+ MMD_DEBUG("DEBUG LOG : Error reading the mmio_token\n");
+ return false;
+ }
+
+ res = fpgaPropertiesGetGUID(prop, &afu_guid);
+ if (res != FPGA_OK) {
+ MMD_DEBUG("DEBUG LOG : Error reading GUID \n");
+ fpgaDestroyProperties(&prop);
+ return false;
+ }
+
+ fpgaDestroyProperties(&prop);
+ if (uuid_compare(pci_guid, afu_guid) == 0) {
+ MMD_DEBUG("DEBUG LOG : asp loaded : true \n");
+ return true;
+ } else {
+ MMD_DEBUG("DEBUG LOG : asp loaded : false \n");
+ return false;
+ }
+}
+
+/** get_bdf() function is called
+ * in aocl_mmd_get_info() API
+ */
+std::string Device::get_bdf() {
+ std::ostringstream bdf;
+ bdf << std::setfill('0') << std::setw(2) << std::hex << unsigned(bus) << ":" << std::setfill('0') << std::setw(2)
+ << std::hex << unsigned(device) << "." << std::hex << unsigned(function);
+
+ return bdf.str();
+}
+
+/** get_temperature() function is called
+ * in aocl_mmd_get_info() API
+ * We currently use hardcoded paths to retrieve temperature information
+ * We will replace with OPAE APIs in future
+ */
+float Device::get_temperature() {
+ if (std::getenv("MMD_ENABLE_DEBUG")) {
+ MMD_DEBUG("DEBUG LOG : Reading temperature ... \n");
+ }
+ float temp = 0;
+ fpga_object obj;
+ const char *name;
+ name = "dfl_dev.*/spi_master/spi*/spi*.*/*-hwmon.*.auto/hwmon/hwmon*/temp1_input";
+ fpga_result res;
+ res = fpgaTokenGetObject(fme_token, name, &obj, FPGA_OBJECT_GLOB);
+ if (res != FPGA_OK) {
+ MMD_DEBUG("DEBUG LOG : Error reading temperature monitor from BMC :");
+ MMD_DEBUG(" %s \n", fpgaErrStr(res));
+ temp = -999;
+ return temp;
+ }
+
+ uint64_t value = 0;
+ fpgaObjectRead64(obj, &value, FPGA_OBJECT_SYNC);
+ fpgaDestroyObject(&obj);
+ temp = value / 1000;
+ return temp;
+}
+
+/** set_kernel_interrupt() function is used in aocl_mmd_set_interrupt_handler() API
+ */
+void Device::set_kernel_interrupt(aocl_mmd_interrupt_handler_fn fn, void *user_data) {
+ MMD_DEBUG("DEBUG LOG : Device::set_kernel_interrupt() \n");
+ if (kernel_interrupt_thread) {
+ kernel_interrupt_thread->set_kernel_interrupt(fn, user_data);
+ }
+}
+
/** set_status_handler() is used in aocl_mmd_set_status_handler() API.
 * Stores the runtime's status callback and its opaque context; they are
 * invoked later from event_update_fn().
 */
void Device::set_status_handler(aocl_mmd_status_handler_fn fn, void *user_data) {
  MMD_DEBUG("DEBUG LOG : Device::set_status_handler() \n");
  event_update = fn;
  event_update_user_data = user_data;
}
+
+/** event_update_fn() is used in read_block(), write_block(), copy_block() functions
+ * OPAE provides event API for handling asynchronous events sucj as errors and interrupts
+ * under the hood those are used
+ */
+void Device::event_update_fn(aocl_mmd_op_t op, int status) {
+ MMD_DEBUG("DEBUG LOG : Device::event_update_fn() \n");
+ event_update(mmd_handle, event_update_user_data, op, status);
+}
+
/** read_block() is used in aocl_mmd_read() API
 * as name suggests its used for fpga->host DMA and MMIO transfers
 */
int Device::read_block(aocl_mmd_op_t op, int mmd_interface, void *host_addr, size_t offset, size_t size) {
  MMD_DEBUG("DEBUG LOG : Device::read_block()\n");
  int res;

  // The mmd_interface is defined as the base address of the MMIO write. Access
  // to memory requires special functionality. Otherwise do direct MMIO read.

  if (mmd_interface == AOCL_MMD_MEMORY) {
    // Serialize access to the shared DMA engine.
    std::unique_lock<std::mutex> dma_mutex_lock(m_dma_mutex);
    MMD_DEBUG("DEBUG LOG : Using DMA to read block\n");
    res = mmd_dma->fpga_to_host(host_addr, (uint64_t)offset, size);
  } else if (mmd_interface == AOCL_MMD_DLA_CSR) {
    assert(size == 4); // DLA CSR read should be always size ==4 as of 2024.2
    MMD_DEBUG("DEBUG LOG : Using MMIO to read block in the DLA CSR space\n");
    res = read_mmio(host_addr, offset, size);
  } else {
    MMD_DEBUG("DEBUG LOG : Using MMIO to read block\n");
    res = read_mmio(host_addr, mmd_interface + offset, size);

    // NOTE(review): the status handler is only notified on this plain-MMIO
    // path; the DMA and DLA-CSR branches never call event_update_fn even when
    // 'op' is non-NULL - confirm this is intentional.
    if (op) {
      this->event_update_fn(op, res);
    }
  }
  return res;
}
+
+/** write_block() is used in aocl_mmd_write() API
+ * as name suggests its used for DMA and MMIO transfers
+ */
+int Device::write_block(aocl_mmd_op_t op, int mmd_interface, const void *host_addr, size_t offset, size_t size) {
+ MMD_DEBUG("DEBUG LOG : Device::write_block()\n");
+ int res;
+
+ // The mmd_interface is defined as the base address of the MMIO write. Access
+ // to memory requires special functionality. Otherwise do direct MMIO write
+ if (mmd_interface == AOCL_MMD_MEMORY) {
+ std::unique_lock<std::mutex> dma_mutex_lock(m_dma_mutex);
+ MMD_DEBUG("DEBUG LOG : Using DMA to write block\n");
+ res = mmd_dma->host_to_fpga(host_addr, (uint64_t)offset, size);
+ } else if (mmd_interface == AOCL_MMD_DLA_CSR) {
+ assert(size == 4); // DLA CSR read should be always size ==4 as of 2024.2
+ MMD_DEBUG("DEBUG LOG : Using MMIO to read block in the DLA CSR space\n");
+ res = write_mmio(host_addr, offset, size);
+ } else {
+ MMD_DEBUG("DEBUG LOG : Using MMIO to write block\n");
+ res = write_mmio(host_addr, mmd_interface + offset, size);
+ if (op) {
+ this->event_update_fn(op, res);
+ }
+ }
+
+ return res;
+}
+
/** read_mmio() is used in read_block() function.
 * Thin wrapper that delegates to mmd_helper::read_mmio, which per the original
 * note uses the OPAE APIs fpgaReadMMIO64() and fpgaReadMMIO32().
 */
int Device::read_mmio(void *host_addr, size_t mmio_addr, size_t size) {
  return mmd_helper::read_mmio(mmio_handle, host_addr, mmio_addr, size);
}
+
/** write_mmio() is used in write_block() function.
 * Thin wrapper that delegates to mmd_helper::write_mmio, which per the original
 * note uses the OPAE APIs fpgaWriteMMIO64() and fpgaWriteMMIO32().
 */
int Device::write_mmio(const void *host_addr, size_t mmio_addr, size_t size) {
  return mmd_helper::write_mmio(mmio_handle, host_addr, mmio_addr, size);
}
diff --git a/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_device.h b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_device.h
new file mode 100644
index 0000000..1cded83
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_device.h
@@ -0,0 +1,151 @@
+// (c) 1992-2024 Intel Corporation.
+// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words
+// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S.
+// and/or other countries. Other marks and brands may be claimed as the property
+// of others. See Trademarks on intel.com for full list of Intel trademarks or
+// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera)
+// Your use of Intel Corporation's design tools, logic functions and other
+// software and tools, and its AMPP partner logic functions, and any output
+// files any of the foregoing (including device programming or simulation
+// files), and any associated documentation or information are expressly subject
+// to the terms and conditions of the Altera Program License Subscription
+// Agreement, Intel MegaCore Function License Agreement, or other applicable
+// license agreement, including, without limitation, that your use is for the
+// sole purpose of programming logic devices manufactured by Intel and sold by
+// Intel or its authorized distributors. Please refer to the applicable
+// agreement for further details.
+
+#ifndef MMD_DEVICE_H
+#define MMD_DEVICE_H
+
+#include <limits.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <string>
+
+#include <opae/fpga.h>
+
+#include <uuid/uuid.h>
+
+#include "aocl_mmd.h"
+#include "mmd_dma.h"
+#include "mmd_helper.h"
+
+#include "kernel_interrupt.h"
+
+// Tune delay for simulation or HW. Eventually delay
+// should be removed for HW, may still be needed for ASE simulation
+#ifdef SIM
+#define DELAY_MULTIPLIER 100
+#else
+#define DELAY_MULTIPLIER 1
+#endif
+
// Most AOCL_MMD_CALL functions return a negative number in case of error.
// MMD_AOCL_ERR is used to indicate an error from the MMD that is being
// returned to the runtime. Simply set to -1 for now since neither interface
// defines a meaning to return codes for errors.
+#define MMD_AOCL_ERR -1
+
+// NOTE: some of the code relies on invalid handle returning -1
+// future TODO eliminate dependency on specific error values
+#define MMD_INVALID_PARAM -1
+
+// Our diagnostic script relies on handle values < -1 to determine when
+// a valid device is present but a functioning ASP is not loaded.
+#define MMD_ASP_NOT_LOADED -2
+#define MMD_ASP_INIT_FAILED -3
+
+// Delay settings
+#define MMIO_DELAY()
+#define YIELD_DELAY() usleep(1 * DELAY_MULTIPLIER)
+#define OPENCL_SW_RESET_DELAY() usleep(5000 * DELAY_MULTIPLIER)
+#define AFU_RESET_DELAY() usleep(20000 * DELAY_MULTIPLIER)
+
+#define KERNEL_SW_RESET_BASE (AOCL_MMD_KERNEL + 0x30)
+
+#define ASP_NAME "ofs_"
+
+#define SVM_MMD_MPF 0x24000
+
+#define SVM_DDR_OFFSET 0x1000000000000
+#define PCI_DDR_OFFSET 0
+
enum {
  // IRQ offsets no longer exist in DLA hardware (removed from board.qsys)
  AOCL_IRQ_POLLING_BASE = 0x0100,  // CSR to polling interrupt status
  AOCL_IRQ_MASKING_BASE = 0x0108,  // CSR to set/unset interrupt mask
  // Interface ids passed as 'mmd_interface' to read_block()/write_block():
  AOCL_MMD_KERNEL = 0,   // kernel interface (MMIO)
  AOCL_MMD_MEMORY = 1,   // board DDR, transferred via DMA
  AOCL_MMD_DLA_CSR = 2,  // DLA CSR space, transferred via MMIO
};
+
// Classification of an enumerated AFU. (The "AfuStatu" spelling is kept as-is:
// it is part of this public header's interface.)
enum AfuStatu { MMD_INVALID_ID = 0, MMD_ASP, MMD_AFU };
+
// One opened FPGA board: owns the OPAE token/handle/MMIO mapping, the DMA
// engine, and (optionally) the kernel-interrupt thread.
class Device final {
 public:
  // Constructed from an OPAE object id; throws std::runtime_error when the
  // accelerator cannot be found. Not copyable: owns OPAE resources.
  Device(uint64_t);
  Device(const Device &) = delete;
  Device &operator=(const Device &) = delete;
  ~Device();

  // Parse an ASP_NAME-prefixed hex-object-id board name ("ofs_<hex>").
  static bool parse_board_name(const char *board_name, uint64_t &obj_id);

  int get_mmd_handle() { return mmd_handle; }
  uint64_t get_fpga_obj_id() { return fpga_obj_id; }
  std::string get_dev_name() { return mmd_dev_name; }
  std::string get_bdf();       // PCIe "bus:device.function" string
  float get_temperature();     // degrees Celsius; -999 when unavailable

  bool initialize_asp();
  void set_kernel_interrupt(aocl_mmd_interrupt_handler_fn fn, void *user_data);
  void set_status_handler(aocl_mmd_status_handler_fn fn, void *user_data);
  void event_update_fn(aocl_mmd_op_t op, int status);
  bool asp_loaded();

  // Host<->device transfers; DMA for AOCL_MMD_MEMORY, MMIO otherwise.
  int read_block(aocl_mmd_op_t op, int mmd_interface, void *host_addr, size_t dev_addr, size_t size);
  int write_block(aocl_mmd_op_t op, int mmd_interface, const void *host_addr, size_t dev_addr, size_t size);

 private:
  static int next_mmd_handle;  // shared counter; see ctor thread-safety note

  int mmd_handle;       // handle reported to the runtime
  uint64_t fpga_obj_id; // OPAE object id this device was opened from
  std::string mmd_dev_name;
  intel_opae_mmd::KernelInterrupt *kernel_interrupt_thread;  // NULL in polling builds
  aocl_mmd_status_handler_fn event_update;  // runtime status callback (may be NULL)
  void *event_update_user_data;

  std::string fpga_numa_node;
  bool enable_set_numa;  // pin DMA buffers/threads to the FPGA's NUMA node
  bool fme_sysfs_temp_initialized;
  void initialize_fme_sysfs();
  void initialize_local_cpus_sysfs();
  bool find_dma_dfh_offsets();

  // PCIe address components used by get_bdf()
  uint8_t bus;
  uint8_t device;
  uint8_t function;

  bool afu_initialized;
  bool asp_initialized;  // set once initialize_asp() has succeeded
  bool mmio_is_mapped;

  // OPAE resources owned by this object (released in the destructor)
  fpga_properties filter;
  fpga_token mmio_token;
  fpga_handle mmio_handle;
  fpga_token fme_token;
  fpga_guid guid;
  intel_opae_mmd::mmd_dma *mmd_dma;  // HOST<->FPGA DMA engine
  std::mutex m_dma_mutex;            // serializes DMA transfers

  // Helper functions
  int read_mmio(void *host_addr, size_t dev_addr, size_t size);
  int write_mmio(const void *host_addr, size_t dev_addr, size_t size);
};
+
+#endif // MMD_DEVICE_H
diff --git a/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_dma.cpp b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_dma.cpp
new file mode 100644
index 0000000..6a4e13c
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_dma.cpp
@@ -0,0 +1,573 @@
+// (c) 1992-2024 Intel Corporation.
+// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words
+// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S.
+// and/or other countries. Other marks and brands may be claimed as the property
+// of others. See Trademarks on intel.com for full list of Intel trademarks or
+// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera)
+// Your use of Intel Corporation's design tools, logic functions and other
+// software and tools, and its AMPP partner logic functions, and any output
+// files any of the foregoing (including device programming or simulation
+// files), and any associated documentation or information are expressly subject
+// to the terms and conditions of the Altera Program License Subscription
+// Agreement, Intel MegaCore Function License Agreement, or other applicable
+// license agreement, including, without limitation, that your use is for the
+// sole purpose of programming logic devices manufactured by Intel and sold by
+// Intel or its authorized distributors. Please refer to the applicable
+// agreement for further details.
+
+#include <memory.h>
+#include <sys/mman.h>
+#include <cassert>
+#include <chrono>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <unordered_map>
+
+#include <inttypes.h>
+#include <sstream>
+
+#include "mmd_device.h"
+#include "mmd_dma.h"
+#include "mmd_helper.h"
+
+namespace intel_opae_mmd {
+
+/** mmd_dma class constructor
+ */
+mmd_dma::mmd_dma(fpga_handle fpga_handle_arg, int mmd_handle) : m_initialized(false), m_fpga_handle(fpga_handle_arg) {
+ MMD_DEBUG("DEBUG LOG : Constructing DMA \n");
+ // Initialize shared buffer
+ auto res = fpgaPrepareBuffer(m_fpga_handle, DMA_BUFFER_SIZE, (void **)&dma_buf_ptr, &dma_buf_wsid, 0);
+
+ assert(FPGA_OK == res && "Allocating DMA Buffer failed");
+
+ memset((void *)dma_buf_ptr, 0x0, DMA_BUFFER_SIZE);
+
+ // Get the IO virtual address (IOVA) of the shared buffer — the address the device uses for DMA
+ res = fpgaGetIOAddress(m_fpga_handle, dma_buf_wsid, &dma_buf_iova);
+ assert(FPGA_OK == res && "getting dma DMA_BUF_IOVA failed");
+
+ m_initialized = true;
+}
+
+/** mmd_dma destructor
+ * Releases the shared DMA buffer registered with OPAE during construction;
+ * freeing these resources promptly aids system stability and avoids leaks.
+ */
+mmd_dma::~mmd_dma() {
+ MMD_DEBUG("DEBUG LOG : Destructing DMA \n");
+ auto res = fpgaReleaseBuffer(m_fpga_handle, dma_buf_wsid);
+ assert(FPGA_OK == res && "Release DMA Buffer failed");
+ m_initialized = false;
+}
+
+// Called in dma_transfer() to send DMA descriptor
+int mmd_dma::send_descriptor(uint64_t mmio_dst, dma_descriptor_t desc) {
+ // mmio requires 8 byte alignment
+ assert(mmio_dst % 8 == 0);
+
+ fpgaWriteMMIO64(m_fpga_handle, 0, mmio_dst, desc.src_address);
+ MMD_DEBUG("Writing %lX to address %lX\n", desc.src_address, mmio_dst);
+ mmio_dst += 8;
+ fpgaWriteMMIO64(m_fpga_handle, 0, mmio_dst, desc.dest_address);
+ MMD_DEBUG("Writing %lX to address %lX\n", desc.dest_address, mmio_dst);
+ mmio_dst += 8;
+ fpgaWriteMMIO64(m_fpga_handle, 0, mmio_dst, desc.len);
+ MMD_DEBUG("Writing %X to address %lX\n", desc.len, mmio_dst);
+ mmio_dst += 8;
+ fpgaWriteMMIO64(m_fpga_handle, 0, mmio_dst, desc.control);
+ MMD_DEBUG("Writing %X to address %lX\n", desc.control, mmio_dst);
+
+ return 0;
+}
+
+// Use ASE to handle unaligned transfer and DMA to do aligned transfer.
+int mmd_dma::fpga_to_host(void *host_addr, uint64_t dev_src, size_t size) {
+ fpga_result res = FPGA_OK;
+ uint64_t count_left = size;
+ uint64_t aligned_addr = 0;
+ uint64_t align_bytes = 0;
+ uint64_t curr_dev_src = dev_src;
+ void *curr_host_addr = host_addr;
+
+ if (dev_src % 64 != 0) {
+ // We use ASE to handle unaligned DMA transfer
+ MMD_DEBUG("DEBUG LOG : mmd_dma::fpga_to_host dev_src is non 64B aligned\n");
+ if (count_left < 64) {
+ MMD_DEBUG("DEBUG LOG : mmd_dma::fpga_to_host dev_src count < 64\n");
+ res = _ase_fpga_to_host(curr_dev_src, curr_host_addr, count_left);
+ assert(FPGA_OK == res && "_ase_fpga_to_host failed");
+ return res;
+ } else {
+ aligned_addr = ((curr_dev_src / 64) + 1) * 64;
+ align_bytes = aligned_addr - curr_dev_src;
+ res = _ase_fpga_to_host(curr_dev_src, curr_host_addr, align_bytes);
+ assert(FPGA_OK == res && "_ase_fpga_to_host failed");
+
+ // Update the processed data
+ count_left -= align_bytes;
+ curr_dev_src += align_bytes;
+ curr_host_addr = (void *)(static_cast<char *>(curr_host_addr) + align_bytes);
+ }
+ }
+
+ if (count_left) {
+ uint64_t dma_chunks = count_left / DMA_BUFFER_SIZE;
+ for (uint64_t i = 0; i < dma_chunks; i++) {
+ // constant size transfer
+
+ uint64_t dev_dest = dma_buf_iova | DMA_HOST_MASK;
+ int len = ((DMA_BUFFER_SIZE - 1) / DMA_LINE_SIZE) + 1; // Ceiling of DMA_BUFFER_SIZE / DMA_LINE_SIZE
+
+ dma_transfer(curr_dev_src, dev_dest, len, ddr_to_host);
+
+ // Copy data from shared buffer to host addr
+ memcpy(curr_host_addr, (void *)dma_buf_ptr, DMA_BUFFER_SIZE);
+
+ memset((void *)dma_buf_ptr, 0x0, DMA_BUFFER_SIZE);
+
+ // Update the curr source and dest
+ curr_host_addr = (void *)(static_cast<char *>(curr_host_addr) + DMA_BUFFER_SIZE);
+ curr_dev_src += DMA_BUFFER_SIZE;
+ }
+
+ // Updated the count_left for the for loop
+ count_left -= (dma_chunks * DMA_BUFFER_SIZE);
+
+ if (count_left) {
+ uint64_t dma_tx_bytes = (count_left / 64) * 64;
+ if (dma_tx_bytes != 0) {
+ assert(dma_tx_bytes <= DMA_BUFFER_SIZE && "Illegal transfer size\n");
+
+ uint64_t dev_dest = dma_buf_iova | DMA_HOST_MASK;
+ int len = ((dma_tx_bytes - 1) / DMA_LINE_SIZE) + 1; // Ceiling of dma_tx_bytes / DMA_LINE_SIZE
+
+ dma_transfer(curr_dev_src, dev_dest, len, ddr_to_host);
+
+ // Copy data from shared buffer to host addr
+ memcpy(curr_host_addr, (void *)dma_buf_ptr, dma_tx_bytes);
+
+ memset((void *)dma_buf_ptr, 0x0, DMA_BUFFER_SIZE);
+
+ // Update the address
+ curr_host_addr = (void *)(static_cast<char *>(curr_host_addr) + dma_tx_bytes);
+ curr_dev_src += dma_tx_bytes;
+ count_left -= dma_tx_bytes;
+ }
+ if (count_left) {
+ MMD_DEBUG("DEBUG LOG : mmd_dma::fpga_to_host count_left after DMA transfer is ");
+ MMD_DEBUG("%" PRIu64 "\n", count_left);
+ // Handle the rest unaligned transfer using ASE
+ res = _ase_fpga_to_host(curr_dev_src, curr_host_addr, count_left);
+ if (FPGA_OK != res) {
+ MMD_DEBUG("DEBUG LOG : mmd_dma::_ase_fpga_to_host failed\n");
+ return -1;
+ }
+ count_left = 0;
+
+ // No need to update address as the transaction is done.
+ }
+ }
+ }
+ assert(count_left==0 && "fpga_to_host failed");
+ return 0;
+}
+
+// Use ASE to handle unaligned transfer and DMA to do aligned transfer.
+int mmd_dma::host_to_fpga(const void *host_addr, uint64_t dev_dest, size_t size) {
+ fpga_result res = FPGA_OK;
+ uint64_t count_left = size;
+ uint64_t aligned_addr = 0;
+ uint64_t align_bytes = 0;
+ uint64_t curr_dest = dev_dest;
+ const void *curr_host_addr = host_addr;
+
+ if (dev_dest % 64 != 0) {
+ // We use ASE to handle unaligned DMA transfer
+ MMD_DEBUG("DEBUG LOG : mmd_dma::host_to_fpga dev_dest is non 64B aligned\n");
+ if (count_left < 64) {
+ res = _ase_host_to_fpga(dev_dest, host_addr, count_left);
+ assert(FPGA_OK == res && "_ase_host_to_fpga failed");
+ return res;
+ } else {
+ aligned_addr = ((dev_dest / 64) + 1) * 64;
+ align_bytes = aligned_addr - dev_dest;
+ res = _ase_host_to_fpga(dev_dest, host_addr, align_bytes);
+ assert(FPGA_OK == res && "_ase_host_to_fpga failed");
+
+ // Update the processed data
+ count_left -= align_bytes;
+ curr_dest += align_bytes;
+ curr_host_addr = (const void *)(static_cast<const char *>(curr_host_addr) + align_bytes);
+ }
+ }
+
+ if (count_left) {
+ uint64_t dma_chunks = count_left / DMA_BUFFER_SIZE;
+ for (uint64_t i = 0; i < dma_chunks; i++) {
+ // constant size transfer
+ // Copy host_src value to the shared buffer
+ memcpy((void *)dma_buf_ptr, curr_host_addr, DMA_BUFFER_SIZE);
+ uint64_t dev_src = dma_buf_iova | DMA_HOST_MASK;
+
+ int len = ((DMA_BUFFER_SIZE - 1) / DMA_LINE_SIZE) + 1; // Ceiling of DMA_BUFFER_SIZE / DMA_LINE_SIZE
+
+ dma_transfer(dev_src, curr_dest, len, host_to_ddr);
+
+ memset((void *)dma_buf_ptr, 0x0, DMA_BUFFER_SIZE);
+
+ // Update the curr source and dest
+ curr_host_addr = (const void *)(static_cast<const char *>(curr_host_addr) + DMA_BUFFER_SIZE);
+ curr_dest += DMA_BUFFER_SIZE;
+ }
+
+ // Updated the count_left for the for loop
+ count_left -= (dma_chunks * DMA_BUFFER_SIZE);
+
+ if (count_left) {
+ uint64_t dma_tx_bytes = (count_left / 64) * 64;
+ if (dma_tx_bytes != 0) {
+ assert(dma_tx_bytes <= DMA_BUFFER_SIZE && "Illegal transfer size\n");
+
+ // Copy host_src value to the shared buffer
+ memcpy((void *)dma_buf_ptr, curr_host_addr, dma_tx_bytes);
+ uint64_t dev_src = dma_buf_iova | DMA_HOST_MASK;
+
+ int len = ((dma_tx_bytes - 1) / DMA_LINE_SIZE) + 1; // Ceiling of dma_tx_bytes / DMA_LINE_SIZE
+ dma_transfer(dev_src, curr_dest, len, host_to_ddr);
+
+ memset((void *)dma_buf_ptr, 0x0, DMA_BUFFER_SIZE);
+ }
+
+ // Update the address
+ curr_host_addr = (const void *)(static_cast<const char *>(curr_host_addr) + dma_tx_bytes);
+ curr_dest += dma_tx_bytes;
+ count_left -= dma_tx_bytes;
+
+ if (count_left) {
+ MMD_DEBUG("DEBUG LOG : mmd_dma::host_to_fpga count_left after DMA transfer is ");
+ MMD_DEBUG("%" PRIu64 "\n", count_left);
+ // Handle the rest unaligned transfer using ASE
+ res = _ase_host_to_fpga(curr_dest, curr_host_addr, count_left);
+ assert(FPGA_OK == res && "_ase_host_to_fpga failed");
+ count_left = 0;
+ }
+ }
+ }
+ assert(count_left==0 && "host_to_fpga failed");
+ return 0;
+}
+
+int mmd_dma::dma_transfer(uint64_t dev_src, uint64_t dev_dest, int len, dma_mode descriptor_mode) {
+
+ // Get debug information for thread id
+ std::stringstream ss;
+ ss << std::this_thread::get_id();
+ uint64_t id = std::stoull(ss.str());
+ MMD_DEBUG("dma_transfer start current thread_id is %04lX\n", id);
+
+ // Native DMA transfer requires 64 byte alignment
+ assert(dev_src % 64 == 0);
+ assert(dev_dest % 64 == 0);
+
+ const uint64_t MASK_FOR_35BIT_ADDR = 0x7FFFFFFFF;
+
+ dma_descriptor_t desc;
+
+ MMD_DEBUG("DEBUG LOG : mmd_dma::dma_transfer starts\n");
+ MMD_DEBUG("DEBUG LOG dev_dest = %04lX\n", dev_dest);
+
+ desc.src_address = dev_src & MASK_FOR_35BIT_ADDR;
+ desc.dest_address = dev_dest & MASK_FOR_35BIT_ADDR;
+ desc.len = len;
+ desc.control = 0x80000000 | (descriptor_mode << MODE_SHIFT);
+
+ const uint64_t DMA_DESC_BASE = 8 * DMA_CSR_IDX_SRC_ADDR;
+ const uint64_t DMA_STATUS_BASE = 8 * DMA_CSR_IDX_STATUS;
+ uint64_t mmio_data = 0;
+
+ int desc_size = sizeof(desc);
+
+ MMD_DEBUG("Descriptor size = %d\n", desc_size);
+ MMD_DEBUG("desc.src_address = %04lX\n", desc.src_address);
+ MMD_DEBUG("desc.dest_address = %04lX\n", desc.dest_address);
+ MMD_DEBUG("desc.len = %d\n", desc.len);
+ MMD_DEBUG("desc.control = %04X\n", desc.control);
+ MMD_DEBUG("descriptor_mode = %04X\n", descriptor_mode);
+
+ // send descriptor
+ send_descriptor(DMA_DESC_BASE, desc);
+
+ fpga_result r;
+ r = fpgaReadMMIO64(m_fpga_handle, 0, DMA_STATUS_BASE, &mmio_data);
+ MMD_DEBUG("DMA_STATUS_BASE before = %04lX\n", mmio_data);
+ if (FPGA_OK != r) return -1;
+
+ // If the busy bit is empty, then we are done.
+ while ((mmio_data & 0x1) == 0x1) {
+ r = fpgaReadMMIO64(m_fpga_handle, 0, DMA_STATUS_BASE, &mmio_data);
+ assert(FPGA_OK == r);
+ }
+ MMD_DEBUG("dma_transfer end current thread_id is %04lX\n", id);
+ return 0;
+}
+
+// Transfer "count" bytes from HOST to FPGA using Address span expander(ASE)- will internally make
+// calls to handle unaligned and aligned MMIO writes.
+fpga_result mmd_dma::_ase_host_to_fpga(uint64_t dev_dest, const void *src_ptr, uint64_t count) {
+ MMD_DEBUG("DEBUG LOG: _ase_host_to_fpga is being called\n ");
+
+ MMD_DEBUG("DEBUG LOG : dev_dest is ");
+ MMD_DEBUG("%" PRIu64 "\n", dev_dest);
+
+ assert(count < 64); // DLA only uses ASE transfer with less than 64 Byte transfer.
+
+ fpga_result res = FPGA_OK;
+ uint64_t count_left = count;
+ uint64_t unaligned_size = 0;
+
+ // For ASE window
+ uint64_t ase_window;
+ uint64_t ase_addr;
+ uint64_t dev_addr;
+
+ const void *curr_src_ptr = src_ptr;
+
+ if (count == 0) return res;
+
+ if (dev_dest % 8 == 0) {
+ while (count > 0) {
+ ase_window = dev_dest & ~(0xfff);
+ ase_addr = (dev_dest & 0xfff); // only keep the lower 12 bits.
+
+ uint64_t mmio_base_control = ASE_MMIO_BASE + ASE_MMIO_CTRL;
+
+ MMD_DEBUG("DEBUG LOG : ase_window is ");
+ MMD_DEBUG("%" PRIu64 "\n", ase_window);
+
+ // Write to ASE control
+ res = fpgaWriteMMIO64(m_fpga_handle, 0, mmio_base_control, ase_window);
+ assert(res == FPGA_OK && "Write to ASE control failed");
+
+ // Set final dev_addr
+ // dev_addr will be 8 byte aligned as long as dev_dest is 8 byte aligned.
+ dev_addr = ASE_MMIO_BASE + ASE_MMIO_WINDOW + ase_addr;
+
+ assert(dev_addr % 8 == 0);
+
+ MMD_DEBUG("DEBUG LOG : _ase_host_to_fpga count is ");
+ MMD_DEBUG("%" PRIu64 "\n", count);
+
+ MMD_DEBUG("DEBUG LOG : dev addr is ");
+ MMD_DEBUG("%" PRIu64 "\n", dev_addr);
+
+ size_t size = (count > 8) ? 8 : count;
+ mmd_helper::write_mmio(m_fpga_handle, curr_src_ptr, dev_addr, size);
+
+ count -= size;
+ dev_dest += size;
+ curr_src_ptr = (const void *)(static_cast<const char *>(curr_src_ptr) + size);
+ }
+
+ assert(count == 0);
+
+ } else {
+ // First we need to handle the non byte aligned transfer
+
+ MMD_DEBUG("DEBUG LOG : _ase_host_to_fpga count is ");
+ MMD_DEBUG("%" PRIu64 "\n", count);
+
+ // Aligns address to 8 byte using dst masking method
+ unaligned_size = 8 - (dev_dest % 8);
+ if (unaligned_size > count_left) unaligned_size = count_left;
+
+ // Write to the unaligned address
+ assert(unaligned_size < 8);
+ uint64_t shift = dev_dest % 8;
+
+ // Write to ASE control to switch page.
+ ase_window = dev_dest & ~(0xfff);
+
+ MMD_DEBUG("DEBUG LOG : ase_window in non-aligned is ");
+ MMD_DEBUG("%" PRIu64 "\n", ase_window);
+
+ fpgaWriteMMIO64(m_fpga_handle, 0, ASE_MMIO_BASE + ASE_MMIO_CTRL, ase_window);
+
+ // Get aligned dest address
+ uint64_t dev_aligned_addr = dev_dest - shift;
+ assert(dev_aligned_addr % 8 == 0);
+
+ // read data from device memory with aligned dev dest
+ ase_addr = (dev_aligned_addr & 0xfff);
+ dev_addr = ASE_MMIO_BASE + ASE_MMIO_WINDOW + ase_addr;
+ uint64_t read_tmp = 0;
+ fpgaReadMMIO64(m_fpga_handle, 0, dev_addr, &read_tmp);
+
+ // overlay our data, check if the shift is correct here
+ memcpy((reinterpret_cast<char *>(&read_tmp) + shift), src_ptr, unaligned_size);
+
+ // Write back data to the device
+ fpgaWriteMMIO64(m_fpga_handle, 0, dev_addr, read_tmp);
+
+ count_left -= unaligned_size;
+
+ // Check if there is any byte left
+ if (count_left == 0) {
+ return res;
+ }
+
+ // Now the dest address should be byte aligned now
+ // Start the regular ASE transfer
+
+ const void *curr_src_ptr = (const void *)(static_cast<const char *>(src_ptr) + unaligned_size);
+ uint64_t next_dev_dest = dev_dest + unaligned_size;
+
+ while (count_left > 0) {
+ ase_window = next_dev_dest & ~(0xfff);
+ ase_addr = (next_dev_dest & 0xfff); // only keep the lower 12 bits.
+
+ MMD_DEBUG("DEBUG LOG : ase_window in non-aligned loop is ");
+ MMD_DEBUG("%" PRIu64 "\n", ase_window);
+
+ // Write to ASE control
+ fpgaWriteMMIO64(m_fpga_handle, 0, ASE_MMIO_BASE + ASE_MMIO_CTRL, ase_window);
+
+ // Set final dev_addr
+ dev_addr = ASE_MMIO_BASE + ASE_MMIO_WINDOW + ase_addr;
+
+ assert(dev_addr % 8 == 0);
+
+ size_t size = (count_left > 8) ? 8 : count_left;
+ mmd_helper::write_mmio(m_fpga_handle,
+ curr_src_ptr,
+ dev_addr,
+ size);
+
+ count_left -= size;
+ next_dev_dest += size;
+ curr_src_ptr = (const void *)(static_cast<const char *>(curr_src_ptr) + size);
+ }
+ assert(count_left == 0);
+ }
+
+ return FPGA_OK;
+}
+
+// Transfer "count" bytes from FPGA to HOST using Address span expander(ASE)- will internally make
+// calls to handle unaligned and aligned MMIO reads.
+fpga_result mmd_dma::_ase_fpga_to_host(uint64_t dev_dest, void *host_ptr, uint64_t count) {
+ MMD_DEBUG("DEBUG LOG : _ase_fpga_to_host is being called\n ");
+
+ assert(count < 64);
+
+ fpga_result res = FPGA_OK;
+ uint64_t count_left = count;
+ uint64_t unaligned_size = 0;
+
+ // For ASE window
+
+ uint64_t ase_window;
+ uint64_t ase_addr;
+ uint64_t dev_addr;
+
+ if (count == 0) return res;
+
+ void *curr_host_ptr = host_ptr;
+
+ if (dev_dest % 8 == 0) {
+ while (count > 0) {
+ ase_window = dev_dest & ~(0xfff);
+ ase_addr = (dev_dest & 0xfff); // only keep the lower 12 bits.
+
+ MMD_DEBUG("DEBUG LOG : ase_window is ");
+ MMD_DEBUG("%" PRIu64 "\n", ase_window);
+
+ // Write to ASE control to switch page.
+ fpgaWriteMMIO64(m_fpga_handle, 0, ASE_MMIO_BASE + ASE_MMIO_CTRL, ase_window);
+
+ // Set final dev_addr
+ // dev_addr will be 8 byte aligned as long as dev_dest is 8 byte aligned.
+ dev_addr = ASE_MMIO_BASE + ASE_MMIO_WINDOW + ase_addr;
+
+ assert(dev_addr % 8 == 0);
+
+ size_t size = (count > 8) ? 8 : count;
+
+ mmd_helper::read_mmio(m_fpga_handle, curr_host_ptr, dev_addr, size);
+
+ count -= size;
+ dev_dest += size;
+ curr_host_ptr = (void *)(static_cast<char *>(curr_host_ptr) + size);
+ }
+
+ } else {
+ // First we need to handle the non byte aligned transfer
+
+ // Aligns address to 8 byte using dst masking method
+ unaligned_size = 8 - (dev_dest % 8);
+ if (unaligned_size > count_left) unaligned_size = count_left;
+
+ // Write to the unaligned address
+ assert(unaligned_size < 8);
+ uint64_t shift = dev_dest % 8;
+
+ // Write to ASE control to switch page.
+ ase_window = dev_dest & ~(0xfff);
+
+ MMD_DEBUG("DEBUG LOG : ase_window is ");
+ MMD_DEBUG("%" PRIu64 "\n", ase_window);
+
+ fpgaWriteMMIO64(m_fpga_handle, 0, ASE_MMIO_BASE + ASE_MMIO_CTRL, ase_window);
+
+ // Get aligned dest address
+ uint64_t dev_aligned_addr = dev_dest - shift;
+ assert(dev_aligned_addr % 8 == 0);
+
+ // read data from device memory with aligned dev dest
+ ase_addr = (dev_aligned_addr & 0xfff);
+ dev_addr = ASE_MMIO_BASE + ASE_MMIO_WINDOW + ase_addr;
+
+ uint64_t read_tmp = 0;
+ fpgaReadMMIO64(m_fpga_handle, 0, dev_addr, &read_tmp);
+
+ // overlay our data
+ memcpy(host_ptr, (reinterpret_cast<char *>(&read_tmp) + shift), unaligned_size);
+
+ count_left -= unaligned_size;
+
+ // Check if there is any byte left
+ if (count_left == 0) {
+ return res;
+ }
+
+ // Now the dest address should be byte aligned now
+ // Start the regular ASE transfer
+ curr_host_ptr = (void *)(static_cast<char *>(host_ptr) + unaligned_size);
+ uint64_t next_dev_dest = dev_dest + unaligned_size;
+
+ while (count_left > 0) {
+ ase_window = next_dev_dest & ~(0xfff);
+ ase_addr = (next_dev_dest & 0xfff); // only keep the lower 12 bits.
+
+ // Write to ASE control to switch page.
+ fpgaWriteMMIO64(m_fpga_handle, 0, ASE_MMIO_BASE + ASE_MMIO_CTRL, ase_window);
+
+ // Set final dev_addr
+ dev_addr = ASE_MMIO_BASE + ASE_MMIO_WINDOW + ase_addr;
+
+ assert(dev_addr % 8 == 0);
+
+ size_t size = (count_left > 8) ? 8 : count_left;
+ mmd_helper::read_mmio(m_fpga_handle, curr_host_ptr, dev_addr, size);
+
+ count_left -= size;
+ next_dev_dest += size;
+ curr_host_ptr = (void *)(static_cast<char *>(curr_host_ptr) + size);
+ }
+
+ assert(count_left == 0);
+ }
+ return FPGA_OK;
+}
+} // namespace intel_opae_mmd
diff --git a/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_dma.h b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_dma.h
new file mode 100644
index 0000000..a2841b1
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_dma.h
@@ -0,0 +1,89 @@
+// (c) 1992-2024 Intel Corporation.
+// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words
+// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S.
+// and/or other countries. Other marks and brands may be claimed as the property
+// of others. See Trademarks on intel.com for full list of Intel trademarks or
+// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera)
+// Your use of Intel Corporation's design tools, logic functions and other
+// software and tools, and its AMPP partner logic functions, and any output
+// files any of the foregoing (including device programming or simulation
+// files), and any associated documentation or information are expressly subject
+// to the terms and conditions of the Altera Program License Subscription
+// Agreement, Intel MegaCore Function License Agreement, or other applicable
+// license agreement, including, without limitation, that your use is for the
+// sole purpose of programming logic devices manufactured by Intel and sold by
+// Intel or its authorized distributors. Please refer to the applicable
+// agreement for further details.
+#ifndef MMD_DMA_H_
+#define MMD_DMA_H_
+
+#include <opae/fpga.h>
+#include <poll.h>
+
+#include <atomic>
+#include <chrono>
+#include <condition_variable>
+#include <mutex>
+#include <queue>
+#include <thread>
+#include <unordered_map>
+
+#include "aocl_mmd.h"
+#include "mmd_helper.h"
+
+#define DMA_CSR_IDX_SRC_ADDR 0x5
+#define DMA_CSR_IDX_STATUS 0x9
+#define MODE_SHIFT 26
+// For now limits to 16K to avoid DMA transfer hang in hw, further testing required to increase the value.
+#define DMA_BUFFER_SIZE (1024 * 16)
+#define DMA_LINE_SIZE 64
+#define DMA_HOST_MASK 0x2000000000000
+
+#define ASE_MMIO_BASE 0x20000
+#define ASE_MMIO_CTRL 0x200
+#define ASE_MMIO_WINDOW 0x1000
+
+namespace intel_opae_mmd {
+
+enum dma_mode { stand_by = 0x0, host_to_ddr = 0x1, ddr_to_host = 0x2, ddr_to_ddr = 0x3 };
+
+struct dma_descriptor_t {
+ uint64_t src_address;
+ uint64_t dest_address;
+ uint32_t len;
+ uint32_t control;
+};
+
+class mmd_dma final {
+ public:
+  mmd_dma(fpga_handle fpga_handle_arg, int mmd_handle);
+  ~mmd_dma();
+  // Non-copyable: instance owns an OPAE-registered DMA buffer (wsid).
+  mmd_dma(const mmd_dma &other) = delete;
+  mmd_dma &operator=(const mmd_dma &other) = delete;
+
+  bool initialized() { return m_initialized; }
+  int fpga_to_host(void *host_addr, uint64_t dev_src, size_t size);
+  int host_to_fpga(const void *host_addr, uint64_t dev_dest, size_t size);
+  int dma_transfer(uint64_t dev_src, uint64_t dev_dest, int len, dma_mode descriptor_mode);
+  fpga_result _ase_host_to_fpga(uint64_t dev_dest, const void *src_ptr, uint64_t count);
+  fpga_result _ase_fpga_to_host(uint64_t dev_dest, void *host_ptr, uint64_t count);
+
+ private:
+  // Helper functions
+  int send_descriptor(uint64_t mmio_dst, dma_descriptor_t desc);
+  // Member variables
+  bool m_initialized;
+  fpga_handle m_fpga_handle;
+
+  // Shared buffer in host memory
+  uint64_t *dma_buf_ptr = nullptr;
+  // Workspace ID used by OPAE to identify buffer
+  uint64_t dma_buf_wsid = 0;
+  // IO virtual address of the shared buffer
+  uint64_t dma_buf_iova = 0;
+};
+
+}; // namespace intel_opae_mmd
+
+#endif // MMD_DMA_H_
diff --git a/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_helper.cpp b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_helper.cpp
new file mode 100644
index 0000000..4af482a
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_helper.cpp
@@ -0,0 +1,163 @@
+// (c) 1992-2024 Intel Corporation.
+// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words
+// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S.
+// and/or other countries. Other marks and brands may be claimed as the property
+// of others. See Trademarks on intel.com for full list of Intel trademarks or
+// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera)
+// Your use of Intel Corporation's design tools, logic functions and other
+// software and tools, and its AMPP partner logic functions, and any output
+// files any of the foregoing (including device programming or simulation
+// files), and any associated documentation or information are expressly subject
+// to the terms and conditions of the Altera Program License Subscription
+// Agreement, Intel MegaCore Function License Agreement, or other applicable
+// license agreement, including, without limitation, that your use is for the
+// sole purpose of programming logic devices manufactured by Intel and sold by
+// Intel or its authorized distributors. Please refer to the applicable
+// agreement for further details.
+
+#include "mmd_helper.h"
+#include <inttypes.h>
+
+namespace mmd_helper {
+
+int read_mmio(fpga_handle mmio_handle, void *host_addr, size_t mmio_addr, size_t size) {
+ fpga_result res = FPGA_OK;
+
+ MMD_DEBUG("DEBUG LOG : Device::read_mmio start: host_addr : %p\t mmio_addr : 0x%zx\t size : 0x%zx\n",
+ host_addr,
+ mmio_addr,
+ size);
+
+ if (mmio_addr % 4 != 0) {
+ MMD_DEBUG("DEBUG LOG : read_mmio function doesn't support non 4 Byte aligned mmio_addr due to OPAE\n");
+ return -1;
+ }
+
+ uint64_t *host_addr64 = static_cast<uint64_t *>(host_addr);
+
+ while (size >= 8) {
+ MMD_DEBUG("DEBUG LOG : Using fpgaReadMMIO64() host_addr : %p\t mmio_addr : 0x%zx\t size : 0x8\n",
+ host_addr64,
+ mmio_addr);
+ res = fpgaReadMMIO64(mmio_handle, 0, mmio_addr, host_addr64);
+ if (res != FPGA_OK) {
+ MMD_DEBUG(
+ "DEBUG LOG : Error in read_mmio() host_addr : %p\t mmio_addr : 0x%zx\t size : 0x8\n", host_addr64, mmio_addr);
+ return -1;
+ }
+ MMD_DEBUG("DEBUG LOG : the host_addr64 value is ");
+ MMD_DEBUG("%" PRIu64 "\n", *host_addr64);
+ host_addr64 += 1;
+ mmio_addr += 8;
+ size -= 8;
+ }
+
+ uint32_t *host_addr32 = reinterpret_cast<uint32_t *>(host_addr64);
+ while (size >= 4) {
+ MMD_DEBUG("DEBUG LOG : Using fpgaReadMMIO32() host_addr : %p\t mmio_addr : 0x%zx\t size : 0x4\n",
+ host_addr32,
+ mmio_addr);
+ res = fpgaReadMMIO32(mmio_handle, 0, mmio_addr, host_addr32);
+ if (res != FPGA_OK) {
+ MMD_DEBUG(
+ "DEBUG LOG : Error in read_mmio() host_addr : %p\t mmio_addr : 0x%zx\t size : 0x4\n", host_addr32, mmio_addr);
+ return -1;
+ }
+ host_addr32 += 1;
+ mmio_addr += 4;
+ size -= 4;
+ }
+
+ if (size > 0) {
+ uint32_t read_data;
+ MMD_DEBUG("DEBUG LOG : Using fpgaReadMMIO32() host_addr : %p\t mmio_addr : 0x%zx\t size : 0x%zx\n",
+ host_addr,
+ mmio_addr,
+ size);
+ res = fpgaReadMMIO32(mmio_handle, 0, mmio_addr, &read_data);
+ if (res != FPGA_OK) {
+ MMD_DEBUG("DEBUG LOG : Error in read_mmio() host_addr : %p\t mmio_addr : 0x%zx\t size : 0x%zx\n",
+ host_addr,
+ mmio_addr,
+ size);
+ MMD_DEBUG("result is %d \n", res);
+ return -1;
+ }
+
+ memcpy(host_addr32, &read_data, size);
+ }
+
+ return res;
+}
+
+int write_mmio(fpga_handle mmio_handle, const void *host_addr, size_t mmio_addr, size_t size) {
+ fpga_result res = FPGA_OK;
+
+ MMD_DEBUG("DEBUG LOG : Device::write_mmio start: host_addr : %p\t mmio_addr : 0x%zx\t size : 0x%zx\n",
+ host_addr,
+ mmio_addr,
+ size);
+
+ const uint64_t *host_addr64 = static_cast<const uint64_t *>(host_addr);
+ while (size >= 8) {
+ MMD_DEBUG("DEBUG LOG : Using fpgaWriteMMIO64() host_addr : %p\t mmio_addr : 0x%zx\t \n",
+ host_addr64,
+ mmio_addr);
+ res = fpgaWriteMMIO64(mmio_handle, 0, mmio_addr, *host_addr64);
+ if (res != FPGA_OK) {
+ MMD_DEBUG("DEBUG LOG : Error in write_mmio() host_addr : %p\t mmio_addr : 0x%zx\t \n",
+ host_addr64,
+ mmio_addr);
+ return -1;
+ }
+ host_addr64 += 1;
+ mmio_addr += 8;
+ size -= 8;
+ }
+
+ const uint32_t *host_addr32 = reinterpret_cast<const uint32_t *>(host_addr64);
+
+ while (size >= 4) {
+ MMD_DEBUG("DEBUG LOG : Using fpgaWriteMMIO32() host_addr : %p\t mmio_addr : 0x%zx\t \n",
+ host_addr32,
+ mmio_addr);
+ res = fpgaWriteMMIO32(mmio_handle, 0, mmio_addr, *host_addr32);
+ if (res != FPGA_OK) {
+ MMD_DEBUG("DEBUG LOG : Error in write_mmio() host_addr : %p\t mmio_addr : 0x%zx\t\n",
+ host_addr32,
+ mmio_addr);
+ return -1;
+ }
+ host_addr32 += 1;
+ mmio_addr += 4;
+ size -= 4;
+ }
+
+ while (size > 0) {
+ MMD_DEBUG("DEBUG LOG : Using fpgaWriteMMIO32() host_addr : %p\t mmio_addr : 0x%zx\t size : 0x%zx\n",
+ host_addr32,
+ mmio_addr,
+ size);
+ uint32_t tmp_data32 = 0;
+ fpgaReadMMIO32(mmio_handle, 0, mmio_addr, &tmp_data32); // First read the data back
+ size_t chunk_size = (size >= 4) ? 4 : size;
+
+ memcpy(&tmp_data32, host_addr32, chunk_size); // Apply our data overlay
+
+ res = fpgaWriteMMIO32(mmio_handle, 0, mmio_addr, tmp_data32);
+ if (res != FPGA_OK) {
+ MMD_DEBUG("DEBUG LOG : Error in write_mmio() host_addr : %p\t mmio_addr : 0x%zx\t size : 0x%zx\n",
+ host_addr32,
+ mmio_addr,
+ size);
+ return -1;
+ }
+ host_addr32 += 1;
+ mmio_addr += chunk_size;
+ size -= chunk_size;
+ }
+
+ return 0;
+}
+
+}; // namespace mmd_helper
diff --git a/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_helper.h b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_helper.h
new file mode 100644
index 0000000..b7e2667
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_helper.h
@@ -0,0 +1,48 @@
+// (c) 1992-2024 Intel Corporation.
+// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words
+// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S.
+// and/or other countries. Other marks and brands may be claimed as the property
+// of others. See Trademarks on intel.com for full list of Intel trademarks or
+// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera)
+// Your use of Intel Corporation's design tools, logic functions and other
+// software and tools, and its AMPP partner logic functions, and any output
+// files any of the foregoing (including device programming or simulation
+// files), and any associated documentation or information are expressly subject
+// to the terms and conditions of the Altera Program License Subscription
+// Agreement, Intel MegaCore Function License Agreement, or other applicable
+// license agreement, including, without limitation, that your use is for the
+// sole purpose of programming logic devices manufactured by Intel and sold by
+// Intel or its authorized distributors. Please refer to the applicable
+// agreement for further details.
+
+#ifndef MMD_HELPER_H
+#define MMD_HELPER_H
+
+#include <opae/fpga.h>
+#include <stdarg.h>
+
+#include <cstdio>   // vprintf, fflush, stdout
+#include <cstdlib>  // std::getenv
+
+// Printf-style debug logger: forwards to stdout only when the
+// MMD_ENABLE_DEBUG environment variable is set; otherwise a no-op.
+inline void MMD_DEBUG(const char *format, ...) {
+  if (std::getenv("MMD_ENABLE_DEBUG")) {
+    va_list arglist;
+    va_start(arglist, format);
+    vprintf(format, arglist);
+    va_end(arglist);
+    fflush(stdout);
+  }
+}
+
+namespace mmd_helper {
+
+// Chunked MMIO accessors built on fpgaRead/WriteMMIO64/32;
+// return 0 on success, -1 on any OPAE error.
+int read_mmio(fpga_handle mmio_handle, void *host_addr, size_t mmio_addr, size_t size);
+int write_mmio(fpga_handle mmio_handle, const void *host_addr, size_t mmio_addr, size_t size);
+
+}  // namespace mmd_helper
+
+#endif  // MMD_HELPER_H
diff --git a/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/include/aocl_mmd.h b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/include/aocl_mmd.h
new file mode 100644
index 0000000..16992da
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/include/aocl_mmd.h
@@ -0,0 +1,377 @@
+// Copyright 2022 Intel Corporation
+// SPDX-License-Identifier: MIT
+
+#ifndef AOCL_MMD_H
+#define AOCL_MMD_H
+
+/* TODO: this file comes from OpenCL SDK and should be formatted there first */
+/* clang-format off */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Support for memory mapped ACL devices.
+ *
+ * Typical API lifecycle, from the perspective of the caller.
+ *
+ * 1. aocl_mmd_open must be called first, to provide a handle for further
+ * operations.
+ *
+ * 2. The interrupt and status handlers must be set.
+ *
+ * 3. Read and write operations are performed.
+ *
+ * 4. aocl_mmd_close may be called to shut down the device. No further
+ * operations are permitted until a subsequent aocl_mmd_open call.
+ *
+ * aocl_mmd_get_offline_info can be called anytime including before
+ * open. aocl_mmd_get_info can be called anytime between open and close.
+ */
+
+// #ifndef AOCL_MMD_CALL
+// #if defined(_WIN32)
+// #define AOCL_MMD_CALL __declspec(dllimport)
+// #else
+// #define AOCL_MMD_CALL
+// #endif
+// #endif
+
+#ifndef AOCL_MMD_CALL
+#if defined(_WIN32)
+#define AOCL_MMD_CALL __declspec(dllimport)
+#else
+#define AOCL_MMD_CALL __attribute__((visibility ("default")))
+#endif
+#endif
+
+#ifndef WEAK
+#if defined(_WIN32)
+#define WEAK
+#else
+#define WEAK __attribute__((weak))
+#endif
+#endif
+
+#ifdef __cplusplus
+#include <cstddef> //size_t
+#else
+#include <stddef.h> //size_t
+#endif
+
+/* The MMD API's version - the runtime expects this string when
+ * AOCL_MMD_VERSION is queried. This changes only if the API has changed */
+#define AOCL_MMD_VERSION_STRING "20.3"
+
+/* Memory types that can be supported - bitfield. Other than physical memory
+ * these types closely align with the OpenCL SVM types.
+ *
+ * AOCL_MMD_PHYSICAL_MEMORY - The vendor interface includes IP to communicate
+ * directly with physical memory such as DDR, QDR, etc.
+ *
+ * AOCL_MMD_SVM_COARSE_GRAIN_BUFFER - The vendor interface includes support for
+ * caching SVM pointer data and requires explicit function calls from the user
+ * to synchronize the cache between the host processor and the FPGA. This level
+ * of SVM is not currently supported by Altera except as a subset of
+ * SVM_FINE_GRAIN_SYSTEM support.
+ *
+ * AOCL_MMD_SVM_FINE_GRAIN_BUFFER - The vendor interface includes support for
+ * caching SVM pointer data and requires additional information from the user
+ * and/or host runtime that can be collected during pointer allocation in order
+ * to synchronize the cache between the host processor and the FPGA. Once this
+ * additional data is provided for an SVM pointer, the vendor interface handles
+ * cache synchronization between the host processor & the FPGA automatically.
+ * This level of SVM is not currently supported by Altera except as a subset
+ * of SVM_FINE_GRAIN_SYSTEM support.
+ *
+ * AOCL_MMD_SVM_FINE_GRAIN_SYSTEM - The vendor interface includes support for
+ * caching SVM pointer data and does not require any additional information to
+ * synchronize the cache between the host processor and the FPGA. The vendor
+ * interface handles cache synchronization between the host processor & the
+ * FPGA automatically for all SVM pointers. This level of SVM support is
+ * currently under development by Altera and some features may not be fully
+ * supported.
+ */
+#define AOCL_MMD_PHYSICAL_MEMORY (1 << 0)
+#define AOCL_MMD_SVM_COARSE_GRAIN_BUFFER (1 << 1)
+#define AOCL_MMD_SVM_FINE_GRAIN_BUFFER (1 << 2)
+#define AOCL_MMD_SVM_FINE_GRAIN_SYSTEM (1 << 3)
+
+/* program modes - bitfield
+ *
+ * AOCL_MMD_PROGRAM_PRESERVE_GLOBAL_MEM - preserve contents of global memory
+ * when this bit is set to 1. If programming can't occur without preserving
+ * global memory contents, the program function must fail, in which case the
+ * runtime may re-invoke program with this bit set to 0, allowing programming
+ * to occur even if doing so destroys global memory contents.
+ *
+ * more modes are reserved for stacking on in the future
+ */
+#define AOCL_MMD_PROGRAM_PRESERVE_GLOBAL_MEM (1 << 0)
+typedef int aocl_mmd_program_mode_t;
+
+
+typedef void* aocl_mmd_op_t;
+
+typedef struct {
+ unsigned lo; /* 32 least significant bits of time value. */
+ unsigned hi; /* 32 most significant bits of time value. */
+} aocl_mmd_timestamp_t;
+
+
+/* Defines the set of characteristics that can be probed about the board before
+ * opening a device. The type of data returned by each is specified in
+ * parentheses in the adjacent comment.
+ *
+ * AOCL_MMD_NUM_BOARDS and AOCL_MMD_BOARD_NAMES
+ * These two fields can be used to implement multi-device support. The MMD
+ * layer may have a list of devices it is capable of interacting with, each
+ * identified with a unique name. The length of the list should be returned
+ * in AOCL_MMD_NUM_BOARDS, and the names of these devices returned in
+ * AOCL_MMD_BOARD_NAMES. The OpenCL runtime will try to call aocl_mmd_open
+ * for each board name returned in AOCL_MMD_BOARD_NAMES.
+ */
+typedef enum {
+ AOCL_MMD_VERSION = 0, /* Version of MMD (char*)*/
+ AOCL_MMD_NUM_BOARDS = 1, /* Number of candidate boards (int)*/
+ AOCL_MMD_BOARD_NAMES = 2, /* Names of boards available delimiter=; (char*)*/
+ AOCL_MMD_VENDOR_NAME = 3, /* Name of vendor (char*) */
+ AOCL_MMD_VENDOR_ID = 4, /* An integer ID for the vendor (int) */
+ AOCL_MMD_USES_YIELD = 5, /* 1 if yield must be called to poll hw (int) */
+ /* The following can be combined in a bit field:
+ * AOCL_MMD_PHYSICAL_MEMORY, AOCL_MMD_SVM_COARSE_GRAIN_BUFFER, AOCL_MMD_SVM_FINE_GRAIN_BUFFER, AOCL_MMD_SVM_FINE_GRAIN_SYSTEM.
+ * Prior to 14.1, all existing devices supported physical memory and no types of SVM memory, so this
+ * is the default when this operation returns '0' for board MMDs with a version prior to 14.1
+ */
+ AOCL_MMD_MEM_TYPES_SUPPORTED = 6,
+} aocl_mmd_offline_info_t;
+
+
+/** Possible capabilities to return from AOCL_MMD_*_MEM_CAPABILITIES query */
+/**
+ * If not set allocation function is not supported, even if other capabilities are set.
+ */
+#define AOCL_MMD_MEM_CAPABILITY_SUPPORTED (1 << 0)
+/**
+ * Supports atomic access to the memory by either the host or device.
+ */
+#define AOCL_MMD_MEM_CAPABILITY_ATOMIC (1 << 1)
+/**
+ * Supports concurrent access to the memory either by host or device if the
+ * accesses are not on the same block. Block granularity is defined by
+ * AOCL_MMD_*_MEM_CONCURRENT_GRANULARITY., blocks are aligned to this
+ * granularity
+ */
+#define AOCL_MMD_MEM_CAPABILITY_CONCURRENT (1 << 2)
+/**
+ * Memory can be accessed by multiple devices at the same time.
+ */
+#define AOCL_MMD_MEM_CAPABILITY_P2P (1 << 3)
+
+
+/* Defines the set of characteristics that can be probed about the board after
+ * opening a device. This can involve communication to the device
+ *
+ * AOCL_MMD_NUM_KERNEL_INTERFACES - The number of kernel interfaces, usually 1
+ *
+ * AOCL_MMD_KERNEL_INTERFACES - the handle for each kernel interface.
+ * param_value will have size AOCL_MMD_NUM_KERNEL_INTERFACES * sizeof int
+ *
+ * AOCL_MMD_PLL_INTERFACES - the handle for each pll associated with each
+ * kernel interface. If a kernel interface is not clocked by acl_kernel_clk
+ * then return -1
+ *
+ * */
+typedef enum {
+ AOCL_MMD_NUM_KERNEL_INTERFACES = 1, /* Number of Kernel interfaces (int) */
+ AOCL_MMD_KERNEL_INTERFACES = 2, /* Kernel interface (int*) */
+ AOCL_MMD_PLL_INTERFACES = 3, /* Kernel clk handles (int*) */
+ AOCL_MMD_MEMORY_INTERFACE = 4, /* Global memory handle (int) */
+ AOCL_MMD_TEMPERATURE = 5, /* Temperature measurement (float) */
+ AOCL_MMD_PCIE_INFO = 6, /* PCIe information (char*) */
+ AOCL_MMD_BOARD_NAME = 7, /* Name of board (char*) */
+ AOCL_MMD_BOARD_UNIQUE_ID = 8, /* Unique ID of board (int) */
+ AOCL_MMD_CONCURRENT_READS = 9, /* # of parallel reads; 1 is serial*/
+ AOCL_MMD_CONCURRENT_WRITES = 10, /* # of parallel writes; 1 is serial*/
+ AOCL_MMD_CONCURRENT_READS_OR_WRITES = 11, /* total # of concurrent operations read + writes*/
+ AOCL_MMD_MIN_HOST_MEMORY_ALIGNMENT = 12, /* Min alignment that the ASP supports for host allocations (size_t) */
+ AOCL_MMD_HOST_MEM_CAPABILITIES = 13, /* Capabilities of aocl_mmd_host_alloc() (unsigned int)*/
+ AOCL_MMD_SHARED_MEM_CAPABILITIES = 14, /* Capabilities of aocl_mmd_shared_alloc (unsigned int)*/
+ AOCL_MMD_DEVICE_MEM_CAPABILITIES = 15, /* Capabilities of aocl_mmd_device_alloc (unsigned int)*/
+ AOCL_MMD_HOST_MEM_CONCURRENT_GRANULARITY = 16, /*(size_t)*/
+ AOCL_MMD_SHARED_MEM_CONCURRENT_GRANULARITY = 17, /*(size_t)*/
+ AOCL_MMD_DEVICE_MEM_CONCURRENT_GRANULARITY = 18, /*(size_t)*/
+} aocl_mmd_info_t;
+
+typedef struct {
+ unsigned long long int exception_type;
+ void *user_private_info;
+ size_t user_cb;
+}aocl_mmd_interrupt_info;
+
+typedef void (*aocl_mmd_interrupt_handler_fn)( int handle, void* user_data );
+typedef void (*aocl_mmd_device_interrupt_handler_fn)( int handle, aocl_mmd_interrupt_info* data_in, void* user_data );
+typedef void (*aocl_mmd_status_handler_fn)( int handle, void* user_data, aocl_mmd_op_t op, int status );
+
+
+/* Get information about the board using the enum aocl_mmd_offline_info_t for
+ * offline info (called without a handle), and the enum aocl_mmd_info_t for
+ * info specific to a certain board.
+ * Arguments:
+ *
+ * requested_info_id - a value from the aocl_mmd_offline_info_t enum
+ *
+ * param_value_size - size of the param_value field in bytes. This should
+ * match the size of the return type expected as indicated in the enum
+ * definition. For example, the AOCL_MMD_TEMPERATURE returns a float, so
+ * the param_value_size should be set to sizeof(float) and you should
+ * expect the same number of bytes returned in param_size_ret.
+ *
+ * param_value - pointer to the variable that will receive the returned info
+ *
+ * param_size_ret - receives the number of bytes of data actually returned
+ *
+ * Returns: a negative value to indicate error.
+ */
+AOCL_MMD_CALL int aocl_mmd_get_offline_info(
+ aocl_mmd_offline_info_t requested_info_id,
+ size_t param_value_size,
+ void* param_value,
+ size_t* param_size_ret ) WEAK;
+
+AOCL_MMD_CALL int aocl_mmd_get_info(
+ int handle,
+ aocl_mmd_info_t requested_info_id,
+ size_t param_value_size,
+ void* param_value,
+ size_t* param_size_ret ) WEAK;
+
+/* Open and initialize the named device.
+ *
+ * The name is typically one specified by the AOCL_MMD_BOARD_NAMES offline
+ * info.
+ *
+ * Arguments:
+ * name - open the board with this name (provided as a C-style string,
+ * i.e. NUL terminated ASCII.)
+ *
+ * Returns: the non-negative integer handle for the board, otherwise a
+ * negative value to indicate error. Upon receiving the error, the OpenCL
+ * runtime will proceed to open other known devices, hence the MMD mustn't
+ * exit the application if an open call fails.
+ */
+AOCL_MMD_CALL int aocl_mmd_open(const char *name) WEAK;
+
+/* Close an opened device, by its handle.
+ * Returns: 0 on success, negative values on error.
+ */
+AOCL_MMD_CALL int aocl_mmd_close(int handle) WEAK;
+
+/* Set the interrupt handler for the opened device.
+ * The interrupt handler is called whenever the client needs to be notified
+ * of an asynchronous event signaled by the device internals.
+ * For example, the kernel has completed or is stalled.
+ *
+ * Important: Interrupts from the kernel must be ignored until this handler is
+ * set
+ *
+ * Arguments:
+ * fn - the callback function to invoke when a kernel interrupt occurs
+ * user_data - the data that should be passed to fn when it is called.
+ *
+ * Returns: 0 if successful, negative on error
+ */
+AOCL_MMD_CALL int aocl_mmd_set_interrupt_handler( int handle, aocl_mmd_interrupt_handler_fn fn, void* user_data ) WEAK;
+
+/* Set the operation status handler for the opened device.
+ * The operation status handler is called with
+ * status 0 when the operation has completed successfully.
+ * status negative when the operation completed with errors.
+ *
+ * Arguments:
+ * fn - the callback function to invoke when a status update is to be
+ * performed.
+ * user_data - the data that should be passed to fn when it is called.
+ *
+ * Returns: 0 if successful, negative on error
+ */
+AOCL_MMD_CALL int aocl_mmd_set_status_handler( int handle, aocl_mmd_status_handler_fn fn, void* user_data ) WEAK;
+
+/* Read, write and copy operations on a single interface.
+ * If op is NULL
+ * - Then these calls must block until the operation is complete.
+ * - The status handler is not called for this operation.
+ *
+ * If op is non-NULL, then:
+ * - These may be non-blocking calls
+ * - The status handler must be called upon completion, with status 0
+ * for success, and a negative value for failure.
+ *
+ * Arguments:
+ *   op - the operation object used to track this operation's progress
+ *
+ * len - the size in bytes to transfer
+ *
+ * src - the host buffer being read from
+ *
+ * dst - the host buffer being written to
+ *
+ * mmd_interface - the handle to the interface being accessed. E.g. To
+ * access global memory this handle will be whatever is returned by
+ * aocl_mmd_get_info when called with AOCL_MMD_MEMORY_INTERFACE.
+ *
+ * offset/src_offset/dst_offset - the byte offset within the interface that
+ * the transfer will begin at.
+ *
+ * The return value is 0 if the operation launch was successful, and
+ * negative otherwise.
+ */
+AOCL_MMD_CALL int aocl_mmd_read(
+ int handle,
+ aocl_mmd_op_t op,
+ size_t len,
+ void* dst,
+ int mmd_interface, size_t offset) WEAK;
+AOCL_MMD_CALL int aocl_mmd_write(
+ int handle,
+ aocl_mmd_op_t op,
+ size_t len,
+ const void* src,
+ int mmd_interface, size_t offset ) WEAK;
+
+/** Error values*/
+#define AOCL_MMD_ERROR_SUCCESS 0
+#define AOCL_MMD_ERROR_INVALID_HANDLE -1
+#define AOCL_MMD_ERROR_OUT_OF_MEMORY -2
+#define AOCL_MMD_ERROR_UNSUPPORTED_ALIGNMENT -3
+#define AOCL_MMD_ERROR_UNSUPPORTED_PROPERTY -4
+#define AOCL_MMD_ERROR_INVALID_POINTER -5
+#define AOCL_MMD_ERROR_INVALID_MIGRATION_SIZE -6
+
+// CoreDLA modifications
+// To support multiple different FPGA boards, anything board specific must be implemented in a
+// board-specific MMD instead of the CoreDLA runtime layer.
+#ifdef DLA_MMD
+#include <cstdint>
+// Query functions to get board-specific values
+AOCL_MMD_CALL int dla_mmd_get_max_num_instances() WEAK;
+AOCL_MMD_CALL uint64_t dla_mmd_get_ddr_size_per_instance() WEAK;
+AOCL_MMD_CALL double dla_mmd_get_ddr_clock_freq() WEAK;
+
+// Wrappers around CSR and DDR reads and writes to abstract away board-specific offsets
+AOCL_MMD_CALL int dla_mmd_csr_write(int handle, int instance, uint64_t addr, const uint32_t* data) WEAK;
+AOCL_MMD_CALL int dla_mmd_csr_read(int handle, int instance, uint64_t addr, uint32_t* data) WEAK;
+AOCL_MMD_CALL int dla_mmd_ddr_write(int handle, int instance, uint64_t addr, uint64_t length, const void* data) WEAK;
+AOCL_MMD_CALL int dla_mmd_ddr_read(int handle, int instance, uint64_t addr, uint64_t length, void* data) WEAK;
+
+// Get the clk_dla PLL clock frequency in MHz, returns a negative value if there is an error
+AOCL_MMD_CALL double dla_mmd_get_coredla_clock_freq(int handle) WEAK;
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+/* clang-format on */
+#endif
diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/.gitignore b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/.gitignore
new file mode 100644
index 0000000..66e06bf
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/.gitignore
@@ -0,0 +1,18 @@
+*~
+*#
+*.marks
+release_build/
+build/
+example_designs/mem_bandwidth/bin/
+example_designs/mem_bandwidth/simulation.tar.gz
+example_designs/mem_bandwidth/temp_simulation/
+linux64/lib/
+linux64/libexec/diagnose
+linux64/libexec/program
+ase/mpf_src
+*.pyc
+*.swp
+*.kwlp
+*.kwps
+temp_simulation/
+simulation.tar.gz
diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/CMakeLists.txt b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/CMakeLists.txt
new file mode 100644
index 0000000..28dcfa4
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/CMakeLists.txt
@@ -0,0 +1,63 @@
+# (C) 2017 Intel Corporation. All rights reserved.
+# Your use of Intel Corporation's design tools, logic functions and other
+# software and tools, and its AMPP partner logic functions, and any output
+# files any of the foregoing (including device programming or simulation
+# files), and any associated documentation or information are expressly subject
+# to the terms and conditions of the Intel Program License Subscription
+# Agreement, Intel MegaCore Function License Agreement, or other applicable
+# license agreement, including, without limitation, that your use is for the
+# sole purpose of programming logic devices manufactured by Intel and sold by
+# Intel or its authorized distributors. Please refer to the applicable
+# agreement for further details.
+
+cmake_minimum_required(VERSION 2.8.12)
+project(mmd)
+
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules")
+
+find_package(OPAE REQUIRED)
+find_package(NUMA REQUIRED)
+
+# DLA specific modifications made to the MMD
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDLA_MMD")
+
+enable_language(C ASM)
+
+set(ASM_OPTIONS "-x assembler-with-cpp")
+if(${CMAKE_C_COMPILER_ID} STREQUAL "Clang")
+ set(ASM_OPTIONS "${ASM_OPTIONS} -no-integrated-as")
+endif()
+
+set(CMAKE_ASM_FLAGS "${CFLAGS} ${ASM_OPTIONS}")
+
+set(MMD_SRC
+ ./host/ccip_mmd.cpp
+ ./host/ccip_mmd_device.cpp
+ ./host/dma_work_thread.cpp
+ ./host/fpga_dma.c
+ ./host/kernel_interrupt.cpp
+ ./host/mmd_dma.cpp
+ ./host/memcpy_s_fast.c
+ ./host/x86-sse2.S
+)
+
+# Add a shared library target called intel_opae_mmd
+# and build it from the MMD_SRC files
+add_library(intel_opae_mmd SHARED ${MMD_SRC})
+
+# Specify the include directories to be used when compiling intel_opae_mmd library
+target_include_directories(intel_opae_mmd PUBLIC
+ ${CMAKE_CURRENT_SOURCE_DIR}/include
+ )
+
+# Specify libraries needed when linking the intel_opae_mmd library
+target_link_libraries(intel_opae_mmd
+ libopae-c
+ libnuma
+)
+
+# Set the installation rules for the project
+install(TARGETS intel_opae_mmd
+ LIBRARY DESTINATION lib
+ COMPONENT intel_opae_mmd
+)
diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/cmake/modules/FindNUMA.cmake b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/cmake/modules/FindNUMA.cmake
new file mode 100644
index 0000000..c981150
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/cmake/modules/FindNUMA.cmake
@@ -0,0 +1,34 @@
+# - Try to find libnuma
+# Once done will define:
+#
+# NUMA_FOUND - system has libnuma
+# NUMA_INCLUDE_DIRS - include directory with numa.h
+# NUMA_LIBRARIES - link with this for libnuma
+
+find_path(NUMA_INCLUDE_DIRS
+ NAMES numa.h
+ PATHS
+ ${LIBNUMA_ROOT}/include
+ /usr/include
+ /p/psg/swip/dla/resources/numactl/2.0.16/include
+
+ )
+
+find_library(NUMA_LIBRARIES
+ NAMES numa
+ PATHS
+ ${LIBNUMA_ROOT}/lib
+ ${LIBNUMA_ROOT}/lib64
+ /usr/lib
+ /usr/lib64
+ /p/psg/swip/dla/resources/numactl/2.0.16/lib
+
+ )
+
+FIND_PACKAGE_HANDLE_STANDARD_ARGS(NUMA
+ REQUIRED_VARS NUMA_INCLUDE_DIRS NUMA_LIBRARIES)
+
+add_library(libnuma IMPORTED SHARED)
+set_target_properties(libnuma PROPERTIES
+ IMPORTED_LOCATION ${NUMA_LIBRARIES}
+ INTERFACE_INCLUDE_DIRECTORIES ${NUMA_INCLUDE_DIRS})
diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/cmake/modules/FindOPAE.cmake b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/cmake/modules/FindOPAE.cmake
new file mode 100644
index 0000000..6395d7c
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/cmake/modules/FindOPAE.cmake
@@ -0,0 +1,44 @@
+# - Try to find libopae-c
+# Once done, this will define
+#
+# libopae-c_FOUND - system has libopae-c
+# libopae-c_INCLUDE_DIRS - the libopae-c include directories
+# libopae-c_LIBRARIES - link these to use libopae-c
+
+find_package(PkgConfig)
+pkg_check_modules(PC_OPAE QUIET opae-c)
+
+# Use pkg-config to get hints about paths
+execute_process(COMMAND pkg-config --cflags opae-c --silence-errors
+ COMMAND cut -d I -f 2
+ OUTPUT_VARIABLE OPAE-C_PKG_CONFIG_INCLUDE_DIRS)
+set(OPAE-C_PKG_CONFIG_INCLUDE_DIRS "${OPAE-C_PKG_CONFIG_INCLUDE_DIRS}" CACHE STRING "Compiler flags for OPAE-C library")
+
+# Include dir
+find_path(libopae-c_INCLUDE_DIRS
+ NAMES opae/fpga.h
+ PATHS ${LIBOPAE-C_ROOT}/include
+ ${OPAE-C_PKG_CONFIG_INCLUDE_DIRS}
+ /usr/local/include
+ /usr/include
+ ${CMAKE_EXTRA_INCLUDES})
+
+# The library itself
+find_library(libopae-c_LIBRARIES
+ NAMES opae-c
+ PATHS ${LIBOPAE-C_ROOT}/lib
+ ${LIBOPAE-C_ROOT}/lib64
+ /usr/local/lib
+ /usr/lib
+ /lib
+ /usr/lib/x86_64-linux-gnu
+ ${CMAKE_EXTRA_LIBS})
+
+FIND_PACKAGE_HANDLE_STANDARD_ARGS(OPAE
+ REQUIRED_VARS libopae-c_LIBRARIES libopae-c_INCLUDE_DIRS)
+
+add_library(libopae-c IMPORTED SHARED)
+set_target_properties(libopae-c PROPERTIES
+ IMPORTED_LOCATION ${libopae-c_LIBRARIES}
+ INTERFACE_INCLUDE_DIRECTORIES ${libopae-c_INCLUDE_DIRS})
+
diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/.gitignore b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/.gitignore
new file mode 100644
index 0000000..1530978
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/.gitignore
@@ -0,0 +1 @@
+*.o \ No newline at end of file
diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/afu_bbb_util.h b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/afu_bbb_util.h
new file mode 100644
index 0000000..6d8f9fa
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/afu_bbb_util.h
@@ -0,0 +1,123 @@
+/* (C) 1992-2017 Intel Corporation. */
+/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */
+/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */
+/* and/or other countries. Other marks and brands may be claimed as the property */
+/* of others. See Trademarks on intel.com for full list of Intel trademarks or */
+/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */
+/* Your use of Intel Corporation's design tools, logic functions and other */
+/* software and tools, and its AMPP partner logic functions, and any output */
+/* files any of the foregoing (including device programming or simulation */
+/* files), and any associated documentation or information are expressly subject */
+/* to the terms and conditions of the Altera Program License Subscription */
+/* Agreement, Intel MegaCore Function License Agreement, or other applicable */
+/* license agreement, including, without limitation, that your use is for the */
+/* sole purpose of programming logic devices manufactured by Intel and sold by */
+/* Intel or its authorized distributors. Please refer to the applicable */
+/* agreement for further details. */
+
+/**
+ * \file afu_bbb_util.h
+ * \brief FPGA DMA BBB API Header
+ *
+ * Known Limitations
+ * - Driver does not support Address Span Extender
+ * - Implementation is not optimized for performance.
+ * User buffer data is copied into a DMA-able buffer before the transfer
+ * - Supports only synchronous (blocking) transfers
+ */
+
+#ifndef AFU_BBB_UTIL_H__
+#define AFU_BBB_UTIL_H__
+
+#include <assert.h>
+#include <opae/fpga.h>
+#include <uuid/uuid.h>
+
+#define DFH_FEATURE_EOL(dfh) (((dfh >> 40) & 1) == 1)
+#define DFH_FEATURE(dfh) ((dfh >> 60) & 0xf)
+#define DFH_FEATURE_IS_PRIVATE(dfh) (DFH_FEATURE(dfh) == 3)
+#define DFH_FEATURE_IS_BBB(dfh) (DFH_FEATURE(dfh) == 2)
+#define DFH_FEATURE_IS_AFU(dfh) (DFH_FEATURE(dfh) == 1)
+#define DFH_FEATURE_NEXT(dfh) ((dfh >> 16) & 0xffffff)
+
+static bool find_dfh_by_guid(fpga_handle afc_handle,
+ uint64_t find_id_l,
+ uint64_t find_id_h,
+ uint64_t *result_offset = NULL,
+ uint64_t *result_next_offset = NULL) {
+ assert(find_id_l);
+ assert(find_id_h);
+
+ uint64_t offset = 0;
+ if (result_offset) {
+ offset = *result_offset;
+ }
+ uint64_t dfh = 0;
+
+ // Limit the maximum number of DFH search iterations to avoid getting stuck
+  // in an infinite loop in case the DFH_FEATURE_EOL is not found.  Limit of
+  // 5000 is very conservative.  In practice search should terminate in 3 or
+ // fewer iterations.
+ int MAX_DFH_SEARCHES = 5000;
+ int dfh_search_iterations = 0;
+
+ do {
+ fpgaReadMMIO64(afc_handle, 0, offset, &dfh);
+
+ int is_bbb = DFH_FEATURE_IS_BBB(dfh);
+ int is_afu = DFH_FEATURE_IS_AFU(dfh);
+
+ if (is_afu || is_bbb) {
+ uint64_t id_l = 0;
+ uint64_t id_h = 0;
+ fpgaReadMMIO64(afc_handle, 0, offset + 8, &id_l);
+ fpgaReadMMIO64(afc_handle, 0, offset + 16, &id_h);
+
+ if (find_id_l == id_l && find_id_h == id_h) {
+ if (result_offset) *result_offset = offset;
+ if (result_next_offset) *result_next_offset = DFH_FEATURE_NEXT(dfh);
+ return true;
+ }
+ }
+ offset += DFH_FEATURE_NEXT(dfh);
+
+ dfh_search_iterations++;
+ if (dfh_search_iterations > MAX_DFH_SEARCHES) {
+ return false;
+ }
+ } while (!DFH_FEATURE_EOL(dfh));
+
+ return false;
+}
+
+static bool find_dfh_by_guid(fpga_handle afc_handle,
+ const char *guid_str,
+ uint64_t *result_offset = NULL,
+ uint64_t *result_next_offset = NULL) {
+ fpga_guid guid;
+
+ if (uuid_parse(guid_str, guid) < 0) return 0;
+
+ uint32_t i;
+ uint32_t s;
+
+ uint64_t find_id_l = 0;
+ uint64_t find_id_h = 0;
+
+ // The API expects the MSB of the GUID at [0] and the LSB at [15].
+ s = 64;
+ for (i = 0; i < 8; ++i) {
+ s -= 8;
+ find_id_h = ((find_id_h << 8) | (0xff & guid[i]));
+ }
+
+ s = 64;
+ for (i = 0; i < 8; ++i) {
+ s -= 8;
+ find_id_l = ((find_id_l << 8) | (0xff & guid[8 + i]));
+ }
+
+ return find_dfh_by_guid(afc_handle, find_id_l, find_id_h, result_offset, result_next_offset);
+}
+
+#endif // AFU_BBB_UTIL_H__
diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/ccip_mmd.cpp b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/ccip_mmd.cpp
new file mode 100644
index 0000000..b7cd06a
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/ccip_mmd.cpp
@@ -0,0 +1,655 @@
+/* (C) 1992-2017 Intel Corporation. */
+/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */
+/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */
+/* and/or other countries. Other marks and brands may be claimed as the property */
+/* of others. See Trademarks on intel.com for full list of Intel trademarks or */
+/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */
+/* Your use of Intel Corporation's design tools, logic functions and other */
+/* software and tools, and its AMPP partner logic functions, and any output */
+/* files any of the foregoing (including device programming or simulation */
+/* files), and any associated documentation or information are expressly subject */
+/* to the terms and conditions of the Altera Program License Subscription */
+/* Agreement, Intel MegaCore Function License Agreement, or other applicable */
+/* license agreement, including, without limitation, that your use is for the */
+/* sole purpose of programming logic devices manufactured by Intel and sold by */
+/* Intel or its authorized distributors. Please refer to the applicable */
+/* agreement for further details. */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <zlib.h>
+
+#include <cassert>
+#include <iomanip>
+#include <iostream>
+#include <map>
+#include <sstream>
+
+#ifdef DLA_MMD
+#include <chrono>
+#include <thread>
+#endif
+
+#include <safe_string/safe_string.h>
+#include "memcpy_s_fast.h"
+
+#include "aocl_mmd.h"
+#include "ccip_mmd_device.h"
+
+using namespace intel_opae_mmd;
+
+#define ACL_DCP_ERROR_IF(COND, NEXT, ...) \
+ do { \
+ if (COND) { \
+ printf("\nMMD ERROR: " __VA_ARGS__); \
+ fflush(stdout); \
+ NEXT; \
+ } \
+ } while (0)
+
+#define ACL_PKG_SECTION_DCP_GBS_GZ ".acl.gbs.gz"
+
+// If the MMD is loaded dynamically, destructors in the MMD will execute before the destructors in the runtime
+// upon program termination. The DeviceMapManager guards accesses to the device/handle maps to make sure
+// the runtime doesn't get to reference them after MMD destructors have been called.
+// Destructor makes sure that all devices are closed at program termination regardless of what the runtime does.
+// Implemented as a singleton.
+class DeviceMapManager final {
+ public:
+ typedef std::map<int, CcipDevice*> t_handle_to_dev_map;
+ typedef std::map<uint64_t, int> t_id_to_handle_map;
+
+ static const int SUCCESS = 0;
+ static const int FAILURE = -1;
+
+ // Returns handle and device pointer to the device with the specified name
+ // Creates a new entry for this device if it doesn't already exist
+ // Return 0 on success, -1 on failure
+ int get_or_create_device(const char* board_name, int* handle, CcipDevice** device);
+
+ // Return obj id based on BSP name.
+ uint64_t id_from_name(const char* board_name);
+
+ // Return MMD handle based on obj id. Returned value is negative if board doesn't exist
+ inline int handle_from_id(uint64_t obj_id);
+
+ // Return pointer to CCIP device based on MMD handle. Returned value is null if board doesn't exist
+ CcipDevice* device_from_handle(int handle);
+
+ // Closes specified device if it exists
+ void close_device_if_exists(int handle);
+
+ // Returns a reference to the class singleton
+ static DeviceMapManager& get_instance() {
+ static DeviceMapManager instance;
+ return instance;
+ }
+
+ DeviceMapManager(DeviceMapManager const&) = delete;
+ void operator=(DeviceMapManager const&) = delete;
+ ~DeviceMapManager() {
+ // delete all allocated CcipDevice* entries
+ while (handle_to_dev_map->size() > 0) {
+ int handle = handle_to_dev_map->begin()->first;
+ aocl_mmd_close(handle);
+ }
+ delete handle_to_dev_map;
+ delete id_to_handle_map;
+ handle_to_dev_map = nullptr;
+ id_to_handle_map = nullptr;
+ }
+
+ private:
+ DeviceMapManager() {
+ handle_to_dev_map = new t_handle_to_dev_map();
+ id_to_handle_map = new t_id_to_handle_map();
+ }
+ t_handle_to_dev_map* handle_to_dev_map = nullptr;
+ t_id_to_handle_map* id_to_handle_map = nullptr;
+};
+static DeviceMapManager& device_manager = DeviceMapManager::get_instance();
+
+int DeviceMapManager::get_or_create_device(const char* board_name, int* handle, CcipDevice** device) {
+ int _handle = CCIP_MMD_INVALID_PARAM;
+ CcipDevice* _device = nullptr;
+
+ if (id_to_handle_map == nullptr || handle_to_dev_map == nullptr) {
+ return DeviceMapManager::FAILURE;
+ }
+
+ uint64_t obj_id = id_from_name(board_name);
+ if (id_to_handle_map->count(obj_id) == 0) {
+ try {
+ _device = new CcipDevice(obj_id);
+ _handle = _device->get_mmd_handle();
+ id_to_handle_map->insert({obj_id, _handle});
+ handle_to_dev_map->insert({_handle, _device});
+ } catch (std::runtime_error& e) {
+ LOG_ERR("%s\n", e.what());
+ delete _device;
+ return DeviceMapManager::FAILURE;
+ }
+ } else {
+ _handle = id_to_handle_map->at(obj_id);
+ _device = handle_to_dev_map->at(_handle);
+ }
+
+ (*handle) = _handle;
+ (*device) = _device;
+ return DeviceMapManager::SUCCESS;
+}
+
+uint64_t DeviceMapManager::id_from_name(const char* board_name) {
+ uint64_t obj_id = 0;
+ if (CcipDevice::parse_board_name(board_name, obj_id)) {
+ return obj_id;
+ } else {
+    // TODO: add error handling for DeviceMapManager (make sure 0 is marked as invalid device)
+ return 0;
+ }
+}
+
+inline int DeviceMapManager::handle_from_id(uint64_t obj_id) {
+ int handle = CCIP_MMD_INVALID_PARAM;
+ if (id_to_handle_map) {
+ auto it = id_to_handle_map->find(obj_id);
+ if (it != id_to_handle_map->end()) {
+ handle = it->second;
+ }
+ }
+ return handle;
+}
+
+CcipDevice* DeviceMapManager::device_from_handle(int handle) {
+ CcipDevice* dev = nullptr;
+ if (handle_to_dev_map) {
+ auto it = handle_to_dev_map->find(handle);
+ if (it != handle_to_dev_map->end()) {
+ return it->second;
+ }
+ }
+ return dev;
+}
+
+// Destroy the device associated with 'handle' (if any) and remove it from
+// both lookup maps. Safe to call with an unknown handle.
+void DeviceMapManager::close_device_if_exists(int handle) {
+  if (handle_to_dev_map == nullptr) {
+    return;
+  }
+  auto it = handle_to_dev_map->find(handle);
+  if (it == handle_to_dev_map->end()) {
+    return;
+  }
+  CcipDevice* dev = it->second;
+  uint64_t obj_id = dev->get_fpga_obj_id();
+  delete dev;
+  handle_to_dev_map->erase(it);
+  // Guard id_to_handle_map like every other method does; the original
+  // dereferenced it unconditionally and could crash on a partially
+  // torn-down manager.
+  if (id_to_handle_map) {
+    id_to_handle_map->erase(obj_id);
+  }
+}
+
+// Interface for checking if AFU has BSP loaded
+// Returns true when the AFU currently programmed on the named board is the
+// OpenCL BSP. Uses the live device object if the board is already open;
+// otherwise constructs a temporary CcipDevice just to query the GUID.
+bool ccip_mmd_bsp_loaded(const char* name) {
+  uint64_t obj_id = device_manager.id_from_name(name);
+  if (!obj_id) {
+    return false;
+  }
+
+  int handle = device_manager.handle_from_id(obj_id);
+  if (handle > 0) {
+    CcipDevice* dev = device_manager.device_from_handle(handle);
+    if (dev)
+      return dev->bsp_loaded();
+    else
+      return false;
+  } else {
+    bool bsp_loaded = false;
+    // Temporary device; its constructor throws on any OPAE failure.
+    try {
+      CcipDevice dev(obj_id);
+      bsp_loaded = dev.bsp_loaded();
+    } catch (std::runtime_error& e) {
+      LOG_ERR("%s\n", e.what());
+      return false;
+    }
+    return bsp_loaded;
+  }
+}
+
+// Count FPGA accelerators visible to OPAE without opening them.
+// When bsp_only is true, only accelerators whose AFU GUID matches the OpenCL
+// BSP are counted. Returns CCIP_MMD_AOCL_ERR on any OPAE failure.
+static int get_offline_num_acl_boards(bool bsp_only = true) {
+  fpga_guid dcp_guid;
+  fpga_result res = FPGA_OK;
+  uint32_t num_matches = 0;
+  bool ret_err = false;
+  fpga_properties filter = NULL;
+
+  if (uuid_parse(DCP_OPENCL_BSP_AFU_ID, dcp_guid) < 0) {
+    LOG_ERR("Error parsing guid '%s'\n", DCP_OPENCL_BSP_AFU_ID);
+    ret_err = true;
+    goto out;
+  }
+
+  res = fpgaGetProperties(NULL, &filter);
+  if (res != FPGA_OK) {
+    LOG_ERR("Error creating properties object: %s\n", fpgaErrStr(res));
+    ret_err = true;
+    goto out;
+  }
+
+  if (bsp_only) {
+    res = fpgaPropertiesSetGUID(filter, dcp_guid);
+    if (res != FPGA_OK) {
+      LOG_ERR("Error setting GUID: %s\n", fpgaErrStr(res));
+      ret_err = true;
+      goto out;
+    }
+  }
+
+  res = fpgaPropertiesSetObjectType(filter, FPGA_ACCELERATOR);
+  if (res != FPGA_OK) {
+    LOG_ERR("Error setting object type: %s\n", fpgaErrStr(res));
+    ret_err = true;
+    goto out;
+  }
+
+  // Enumerate with a zero-length token array: we only want the match count.
+  res = fpgaEnumerate(&filter, 1, NULL, 0, &num_matches);
+  if (res != FPGA_OK) {
+    LOG_ERR("Error enumerating AFCs: %s\n", fpgaErrStr(res));
+    ret_err = true;
+    goto out;
+  }
+
+out:
+  // Single cleanup point for the goto-based error handling above.
+  if (filter) fpgaDestroyProperties(&filter);
+
+  if (ret_err) {
+    return CCIP_MMD_AOCL_ERR;
+  } else {
+    return num_matches;
+  }
+}
+
+// Build a semicolon-separated list of board names for all FPGA accelerators
+// visible to OPAE, appended to 'boards' [out]. When bsp_only is true, only
+// accelerators with the OpenCL BSP AFU GUID are listed. Returns false on any
+// OPAE failure (boards may then be partially filled).
+bool static get_offline_board_names(std::string& boards, bool bsp_only = true) {
+  fpga_guid dcp_guid;
+  fpga_result res = FPGA_OK;
+  uint32_t num_matches = 0;
+  fpga_properties filter = nullptr;
+  fpga_properties prop = nullptr;
+  std::ostringstream board_name;
+  fpga_token* toks = nullptr;
+  uint64_t obj_id;
+  bool success = true;
+
+  if (uuid_parse(DCP_OPENCL_BSP_AFU_ID, dcp_guid) < 0) {
+    LOG_ERR("Error parsing guid '%s'\n", DCP_OPENCL_BSP_AFU_ID);
+    success = false;
+    goto cleanup;
+  }
+
+  res = fpgaGetProperties(NULL, &filter);
+  if (res != FPGA_OK) {
+    LOG_ERR("Error creating properties object: %s\n", fpgaErrStr(res));
+    success = false;
+    goto cleanup;
+  }
+
+  res = fpgaPropertiesSetObjectType(filter, FPGA_ACCELERATOR);
+  if (res != FPGA_OK) {
+    LOG_ERR("Error setting object type: %s\n", fpgaErrStr(res));
+    success = false;
+    goto cleanup;
+  }
+
+  if (bsp_only) {
+    res = fpgaPropertiesSetGUID(filter, dcp_guid);
+    if (res != FPGA_OK) {
+      LOG_ERR("Error setting GUID: %s\n", fpgaErrStr(res));
+      success = false;
+      goto cleanup;
+    }
+  }
+  // First enumerate pass: count matches so we can size the token array.
+  res = fpgaEnumerate(&filter, 1, NULL, 0, &num_matches);
+  if (res != FPGA_OK) {
+    LOG_ERR("Error enumerating AFCs: %s\n", fpgaErrStr(res));
+    success = false;
+    goto cleanup;
+  }
+
+  toks = static_cast<fpga_token*>(calloc(num_matches, sizeof(fpga_token)));
+  if (toks == NULL) {
+    LOG_ERR("Error allocating memory\n");
+    success = false;
+    goto cleanup;
+  }
+
+  // Second pass: collect the actual tokens.
+  res = fpgaEnumerate(&filter, 1, toks, num_matches, &num_matches);
+  if (res != FPGA_OK) {
+    LOG_ERR("Error enumerating AFCs: %s\n", fpgaErrStr(res));
+    success = false;
+    goto cleanup;
+  }
+
+  for (unsigned int i = 0; i < num_matches; i++) {
+    // Destroy the previous iteration's properties before reusing 'prop'.
+    if (prop) fpgaDestroyProperties(&prop);
+    res = fpgaGetProperties(toks[i], &prop);
+    if (res == FPGA_OK) {
+      res = fpgaPropertiesGetObjectID(prop, &obj_id);
+      if (res != FPGA_OK) {
+        LOG_ERR("Error reading object ID: %s\n", fpgaErrStr(res));
+        success = false;
+        break;
+      }
+      boards.append(CcipDevice::get_board_name(BSP_NAME, obj_id));
+      if (i < num_matches - 1) boards.append(";");
+    } else {
+      success = false;
+      LOG_ERR("Error reading properties: %s\n", fpgaErrStr(res));
+    }
+  }
+
+cleanup:
+  // Single cleanup point for all goto-based error paths above.
+  if (prop) {
+    fpgaDestroyProperties(&prop);
+  }
+  if (filter) {
+    fpgaDestroyProperties(&filter);
+  }
+  if (toks) {
+    for (unsigned i = 0; i < num_matches; i++) {
+      if (toks[i]) {
+        fpgaDestroyToken(&toks[i]);
+      }
+    }
+    free(toks);
+  }
+
+  return success;
+}
+
+// Let the device's kernel-interrupt machinery make progress; returns the
+// device's yield status, or 0 when the handle is unknown.
+int aocl_mmd_yield(int handle) {
+  DEBUG_PRINT("* Called: aocl_mmd_yield\n");
+  YIELD_DELAY();
+
+  CcipDevice* dev = device_manager.device_from_handle(handle);
+  assert(dev);
+  return dev ? dev->yield() : 0;
+}
+
+// Macros used for aocl_mmd_get_offline_info and aocl_mmd_get_info.
+// Both expand against the enclosing function's param_value / param_value_size /
+// param_size_ret parameters. Wrapped in do/while(0) so they behave as a single
+// statement in unbraced if/else contexts (the original RESULT_INT was a bare
+// block and would break an if/else without braces).
+#define RESULT_INT(X)                                  \
+  do {                                                 \
+    *((int*)param_value) = X;                          \
+    if (param_size_ret) *param_size_ret = sizeof(int); \
+  } while (0)
+#define RESULT_STR(X)                                                         \
+  do {                                                                        \
+    unsigned Xlen = strlen(X) + 1;                                            \
+    unsigned Xcpylen = (param_value_size <= Xlen) ? param_value_size : Xlen;  \
+    memcpy_s_fast((void*)param_value, param_value_size, X, Xcpylen);          \
+    if (param_size_ret) *param_size_ret = Xcpylen;                            \
+  } while (0)
+
+// Answer runtime queries that do not require an open device (board count,
+// board names, MMD version, etc.). Results are written into param_value via
+// the RESULT_* macros; returns 0 on success, CCIP_MMD_AOCL_ERR when the
+// cached enumeration failed.
+int aocl_mmd_get_offline_info(aocl_mmd_offline_info_t requested_info_id,
+                              size_t param_value_size,
+                              void* param_value,
+                              size_t* param_size_ret) {
+  // aocl_mmd_get_offline_info can be called many times by the runtime
+  // and it is expensive to query the system. Only compute values first
+  // time aocl_mmd_get_offline_info called future iterations use saved results
+  // NOTE(review): this lazy init is not thread-safe; fine if the runtime
+  // serializes MMD calls — confirm.
+  static bool initialized = false;
+  static int mem_type_info;
+  static int num_acl_boards;
+  static std::string boards;
+  static bool success;
+
+  if (!initialized) {
+    mem_type_info = (int)AOCL_MMD_PHYSICAL_MEMORY;
+    num_acl_boards = get_offline_num_acl_boards();
+    success = get_offline_board_names(boards, true);
+    initialized = true;
+  }
+
+  switch (requested_info_id) {
+    case AOCL_MMD_VERSION:
+      RESULT_STR(AOCL_MMD_VERSION_STRING);
+      break;
+    case AOCL_MMD_NUM_BOARDS: {
+      if (num_acl_boards >= 0) {
+        RESULT_INT(num_acl_boards);
+      } else {
+        return CCIP_MMD_AOCL_ERR;
+      }
+      break;
+    }
+    case AOCL_MMD_VENDOR_NAME:
+      RESULT_STR("Intel Corp");
+      break;
+    case AOCL_MMD_BOARD_NAMES: {
+      if (success) {
+        RESULT_STR(boards.c_str());
+      } else {
+        return CCIP_MMD_AOCL_ERR;
+      }
+      break;
+    }
+    case AOCL_MMD_VENDOR_ID:
+      RESULT_INT(0);
+      break;
+    case AOCL_MMD_USES_YIELD:
+      RESULT_INT(KernelInterrupt::yield_is_enabled());
+      break;
+    case AOCL_MMD_MEM_TYPES_SUPPORTED:
+      RESULT_INT(mem_type_info);
+      break;
+  }
+
+  return 0;
+}
+
+// List ALL accelerators (bsp_only = false), not just BSP-programmed ones.
+// Writes the name list via RESULT_STR, or -1 via RESULT_INT on failure.
+int ccip_mmd_get_offline_board_names(size_t param_value_size, void* param_value, size_t* param_size_ret) {
+  std::string boards;
+  bool success = get_offline_board_names(boards, false);
+  if (success) {
+    RESULT_STR(boards.c_str());
+  } else {
+    RESULT_INT(-1);
+  }
+
+  return 0;
+}
+
+// Answer runtime queries about an open device (name, PCIe BDF, temperature,
+// concurrency limits, ...). Silently returns 0 when the handle is unknown.
+int aocl_mmd_get_info(
+    int handle, aocl_mmd_info_t requested_info_id, size_t param_value_size, void* param_value, size_t* param_size_ret) {
+  DEBUG_PRINT("called aocl_mmd_get_info\n");
+  CcipDevice* dev = device_manager.device_from_handle(handle);
+  if (dev == NULL) return 0;
+
+  assert(param_value);
+  switch (requested_info_id) {
+    case AOCL_MMD_BOARD_NAME: {
+      std::ostringstream board_name;
+      board_name << "Intel PAC Platform"
+                 << " (" << dev->get_dev_name() << ")";
+      RESULT_STR(board_name.str().c_str());
+      break;
+    }
+    case AOCL_MMD_NUM_KERNEL_INTERFACES:
+      RESULT_INT(1);
+      break;
+    case AOCL_MMD_KERNEL_INTERFACES:
+      RESULT_INT(AOCL_MMD_KERNEL);
+      break;
+// NOTE(review): both SIM and non-SIM branches below are identical (-1 = no
+// PLL interface); the #ifdef is kept as a placeholder for divergence.
+#ifdef SIM
+    case AOCL_MMD_PLL_INTERFACES:
+      RESULT_INT(-1);
+      break;
+#else
+    case AOCL_MMD_PLL_INTERFACES:
+      RESULT_INT(-1);
+      break;
+#endif
+    case AOCL_MMD_MEMORY_INTERFACE:
+      RESULT_INT(AOCL_MMD_MEMORY);
+      break;
+    case AOCL_MMD_PCIE_INFO: {
+      RESULT_STR(dev->get_bdf().c_str());
+      break;
+    }
+    case AOCL_MMD_BOARD_UNIQUE_ID:
+      RESULT_INT(0);
+      break;
+    case AOCL_MMD_TEMPERATURE: {
+      // Temperature is a float, not int/string, so it bypasses the macros.
+      if (param_value_size == sizeof(float)) {
+        float* ptr = static_cast<float*>(param_value);
+        *ptr = dev->get_temperature();
+        if (param_size_ret) *param_size_ret = sizeof(float);
+      }
+      break;
+    }
+    case AOCL_MMD_CONCURRENT_READS:
+      RESULT_INT(1);
+      break;
+    case AOCL_MMD_CONCURRENT_WRITES:
+      RESULT_INT(1);
+      break;
+    case AOCL_MMD_CONCURRENT_READS_OR_WRITES:
+      RESULT_INT(2);
+      break;
+  }
+  return 0;
+}
+
+#undef RESULT_INT
+#undef RESULT_STR
+
+// Register the runtime's kernel-interrupt callback on the device that owns
+// 'handle'. Returns CCIP_MMD_AOCL_ERR when the handle is unknown.
+int aocl_mmd_set_interrupt_handler(int handle, aocl_mmd_interrupt_handler_fn fn, void* user_data) {
+  CcipDevice* dev = device_manager.device_from_handle(handle);
+  if (dev == nullptr) {
+    return CCIP_MMD_AOCL_ERR;
+  }
+  dev->set_kernel_interrupt(fn, user_data);
+  return 0;
+}
+
+// Register the runtime's status callback on the device that owns 'handle'.
+// Returns CCIP_MMD_AOCL_ERR when the handle is unknown (resolves the previous
+// TODO and matches aocl_mmd_set_interrupt_handler's error behavior).
+int aocl_mmd_set_status_handler(int handle, aocl_mmd_status_handler_fn fn, void* user_data) {
+  CcipDevice* dev = device_manager.device_from_handle(handle);
+  if (dev == nullptr) {
+    return CCIP_MMD_AOCL_ERR;
+  }
+  dev->set_status_handler(fn, user_data);
+  return 0;
+}
+
+// Host to device-global-memory write
+// Write 'len' bytes from host memory 'src' to the interface at 'offset'.
+// Returns the device's status, or -1 when the handle is unknown.
+int aocl_mmd_write(int handle, aocl_mmd_op_t op, size_t len, const void* src, int mmd_interface, size_t offset) {
+  DCP_DEBUG_MEM("\n- aocl_mmd_write: %d\t %p\t %lu\t %p\t %d\t %lu\n", handle, op, len, src, mmd_interface, offset);
+  CcipDevice* dev = device_manager.device_from_handle(handle);
+  if (dev == nullptr) {
+    return -1;  // TODO: handle error condition if dev null
+  }
+  return dev->write_block(op, mmd_interface, src, offset, len);
+}
+
+// Read 'len' bytes from the interface at 'offset' into host memory 'dst'.
+// Returns the device's status, or -1 when the handle is unknown.
+int aocl_mmd_read(int handle, aocl_mmd_op_t op, size_t len, void* dst, int mmd_interface, size_t offset) {
+  DCP_DEBUG_MEM("\n+ aocl_mmd_read: %d\t %p\t %lu\t %p\t %d\t %lu\n", handle, op, len, dst, mmd_interface, offset);
+  CcipDevice* dev = device_manager.device_from_handle(handle);
+  if (dev == nullptr) {
+    return -1;  // TODO: handle error condition if dev null
+  }
+  return dev->read_block(op, mmd_interface, dst, offset, len);
+}
+
+// Open a board by name and return its MMD handle, or a negative CCIP_MMD_*
+// error code. On failure after the device object was created, the device is
+// removed from the manager again so a failed open does not leave a live
+// (and leaked) device registered.
+int aocl_mmd_open(const char* name) {
+  DEBUG_PRINT("Opening device: %s\n", name);
+
+  uint64_t obj_id = device_manager.id_from_name(name);
+  if (!obj_id) {
+    return CCIP_MMD_INVALID_PARAM;
+  }
+
+  int handle;
+  CcipDevice* dev = nullptr;
+  if (device_manager.get_or_create_device(name, &handle, &dev) != DeviceMapManager::SUCCESS) {
+    // get_or_create_device never hands out a device on failure; the original
+    // 'delete dev' here always deleted nullptr.
+    return CCIP_MMD_AOCL_ERR;
+  }
+
+  assert(dev);
+  if (!dev->bsp_loaded()) {
+    device_manager.close_device_if_exists(handle);
+    return CCIP_MMD_BSP_NOT_LOADED;
+  }
+  if (!dev->initialize_bsp()) {
+    LOG_ERR("Error initializing bsp\n");
+    device_manager.close_device_if_exists(handle);
+    return CCIP_MMD_BSP_INIT_FAILED;
+  }
+
+  return handle;
+}
+
+// Destroy the device behind 'handle' and forget it. Idempotent: closing an
+// unknown or already-closed handle is a no-op.
+int aocl_mmd_close(int handle) {
+  device_manager.close_device_if_exists(handle);
+
+  return 0;
+}
+
+// CoreDLA modifications
+// To support multiple different FPGA boards, anything board specific must be implemented in a
+// board-specific MMD instead of the CoreDLA runtime layer.
+#ifdef DLA_MMD
+// Query functions to get board-specific values
+// This board supports up to 2 DLA instances.
+AOCL_MMD_CALL int dla_mmd_get_max_num_instances() { return 2; }
+// Each instance owns a 4 GB (1 << 32 byte) window of board DDR.
+AOCL_MMD_CALL uint64_t dla_mmd_get_ddr_size_per_instance() { return 1ULL << 32; }
+AOCL_MMD_CALL double dla_mmd_get_ddr_clock_freq() { return 266.666667; }  // MHz
+
+// Helper functions for the wrapper functions around CSR and DDR.
+// CSR blocks start at 0x38000 and are 0x1000 apart per instance; DDR windows
+// are 4 GB apart per instance.
+uint64_t dla_get_raw_csr_address(int instance, uint64_t addr) { return 0x38000 + (0x1000 * instance) + addr; }
+uint64_t dla_get_raw_ddr_address(int instance, uint64_t addr) { return (1ULL << 32) * instance + addr; }
+
+// Wrappers around CSR and DDR reads and writes to abstract away board-specific offsets
+// Thin wrappers over aocl_mmd_read/write that translate per-instance CSR and
+// DDR addresses to the board's raw offsets (see dla_get_raw_*_address above).
+// All four forward the underlying MMD status code.
+AOCL_MMD_CALL int dla_mmd_csr_write(int handle, int instance, uint64_t addr, const uint32_t* data) {
+  return aocl_mmd_write(handle, NULL, sizeof(uint32_t), data, AOCL_MMD_KERNEL, dla_get_raw_csr_address(instance, addr));
+}
+AOCL_MMD_CALL int dla_mmd_csr_read(int handle, int instance, uint64_t addr, uint32_t* data) {
+  return aocl_mmd_read(handle, NULL, sizeof(uint32_t), data, AOCL_MMD_KERNEL, dla_get_raw_csr_address(instance, addr));
+}
+AOCL_MMD_CALL int dla_mmd_ddr_write(int handle, int instance, uint64_t addr, uint64_t length, const void* data) {
+  return aocl_mmd_write(handle, NULL, length, data, AOCL_MMD_MEMORY, dla_get_raw_ddr_address(instance, addr));
+}
+AOCL_MMD_CALL int dla_mmd_ddr_read(int handle, int instance, uint64_t addr, uint64_t length, void* data) {
+  return aocl_mmd_read(handle, NULL, length, data, AOCL_MMD_MEMORY, dla_get_raw_ddr_address(instance, addr));
+}
+
+// Get the PLL clock frequency in MHz, returns a negative value if there is an error
+// Get the PLL clock frequency in MHz, returns a negative value if there is an error
+// Measures clk_dla by running a hardware counter for ~10 ms of wall-clock time
+// and dividing ticks by the precisely-measured elapsed interval.
+AOCL_MMD_CALL double dla_mmd_get_coredla_clock_freq(int handle) {
+  constexpr uint64_t hw_timer_address = 0x37000;
+  const uint32_t start_bit = 1;
+  const uint32_t stop_bit = 2;
+
+  // Send the start command to the hardware counter
+  std::chrono::high_resolution_clock::time_point time_before = std::chrono::high_resolution_clock::now();
+  int status = aocl_mmd_write(handle, NULL, sizeof(uint32_t), &start_bit, AOCL_MMD_KERNEL, hw_timer_address);
+  assert(status == 0);
+
+  // Unlikely to sleep for exactly 10 milliseconds, but it doesn't matter since we use a high resolution clock to
+  // determine the amount of time between the start and stop commands for the hardware counter
+  std::this_thread::sleep_for(std::chrono::milliseconds(10));
+
+  // Send the stop command to the hardware counter
+  std::chrono::high_resolution_clock::time_point time_after = std::chrono::high_resolution_clock::now();
+  status = aocl_mmd_write(handle, NULL, sizeof(uint32_t), &stop_bit, AOCL_MMD_KERNEL, hw_timer_address);
+  assert(status == 0);
+
+  // Read back the value of the counter
+  uint32_t counter = 0;
+  status = aocl_mmd_read(handle, NULL, sizeof(uint32_t), &counter, AOCL_MMD_KERNEL, hw_timer_address);
+  assert(status == 0);
+
+  // Calculate the clock frequency of the counter, which is running on clk_dla
+  double elapsed_seconds = std::chrono::duration_cast<std::chrono::duration<double>>(time_after - time_before).count();
+  return 1.0e-6 * counter / elapsed_seconds;  // 1.0e-6 is to convert to MHz
+}
+
+#endif
diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/ccip_mmd_device.cpp b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/ccip_mmd_device.cpp
new file mode 100644
index 0000000..9bc055a
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/ccip_mmd_device.cpp
@@ -0,0 +1,579 @@
+/* (C) 1992-2017 Intel Corporation. */
+/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */
+/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */
+/* and/or other countries. Other marks and brands may be claimed as the property */
+/* of others. See Trademarks on intel.com for full list of Intel trademarks or */
+/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */
+/* Your use of Intel Corporation's design tools, logic functions and other */
+/* software and tools, and its AMPP partner logic functions, and any output */
+/* files any of the foregoing (including device programming or simulation */
+/* files), and any associated documentation or information are expressly subject */
+/* to the terms and conditions of the Altera Program License Subscription */
+/* Agreement, Intel MegaCore Function License Agreement, or other applicable */
+/* license agreement, including, without limitation, that your use is for the */
+/* sole purpose of programming logic devices manufactured by Intel and sold by */
+/* Intel or its authorized distributors. Please refer to the applicable */
+/* agreement for further details. */
+
+#include <assert.h>
+#include <numa.h>
+
+#include <unistd.h>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <limits>
+#include <sstream>
+
+#include <safe_string/safe_string.h>
+#include "memcpy_s_fast.h"
+
+#include "ccip_mmd_device.h"
+
+// TODO: better encapsulation of afu_bbb_util functions
+#include "afu_bbb_util.h"
+
+#define MMD_COPY_BUFFER_SIZE (1024 * 1024)
+
+#define MEM_WINDOW_BBB_GUID "72347537-7821-4125-442a-472d4b615064"
+#define MEM_WINDOW_BBB_SIZE 8192
+
+#define MSGDMA_BBB_GUID "ef82def7-f6ec-40fc-a914-9a35bace01ea"
+#define MSGDMA_BBB_SIZE 256
+
+#define NULL_DFH_BBB_GUID "da1182b1-b344-4e23-90fe-6aab12a0132f"
+#define BSP_AFU_GUID "96ef4230-dafa-cb5f-18b7-9ffa2ee54aa0"
+
+using namespace intel_opae_mmd;
+
+int CcipDevice::next_mmd_handle{1};
+
+// Board name is the BSP prefix immediately followed by the OPAE object id
+// rendered in lowercase hex (the inverse of parse_board_name).
+std::string CcipDevice::get_board_name(std::string prefix, uint64_t obj_id) {
+  std::ostringstream name;
+  name << prefix << std::hex << obj_id;
+  return name.str();
+}
+
+// Open the FPGA accelerator identified by 'obj_id'. Allocates the MMD handle,
+// enumerates and opens the AFC, reads the PCIe bus/device/function, and probes
+// sysfs for temperature/NUMA info. Throws std::runtime_error on any failure;
+// the destructor releases whatever was acquired before the throw.
+CcipDevice::CcipDevice(uint64_t obj_id)
+    : fpga_obj_id(obj_id),
+      kernel_interrupt_thread(NULL),
+      event_update(NULL),
+      event_update_user_data(NULL),
+      enable_set_numa(false),
+      fme_sysfs_temp_initialized(false),
+      bus(0),
+      device(0),
+      function(0),
+      afu_initialized(false),
+      bsp_initialized(false),
+      mmio_is_mapped(false),
+      afc_handle(NULL),
+      filter(NULL),
+      afc_token(NULL),
+      dma_ch0_dfh_offset(0),
+      dma_ch1_dfh_offset(0),
+      dma_ase_dfh_offset(0),
+      dma_host_to_fpga(NULL),
+      dma_fpga_to_host(NULL),
+      mmd_copy_buffer(NULL) {
+  // Note that this constructor is not thread-safe because next_mmd_handle
+  // is shared between all class instances
+  mmd_handle = next_mmd_handle;
+  if (next_mmd_handle == std::numeric_limits<int>::max())
+    next_mmd_handle = 1;  // wrap rather than overflow (signed overflow is UB)
+  else
+    next_mmd_handle++;
+
+  mmd_copy_buffer = (char *)malloc(MMD_COPY_BUFFER_SIZE);
+  if (mmd_copy_buffer == NULL) {
+    throw std::runtime_error(std::string("malloc failed for mmd_copy_buffer"));
+  }
+
+  fpga_result res = FPGA_OK;
+  uint32_t num_matches;
+
+  // Build a filter matching exactly this accelerator's object id.
+  res = fpgaGetProperties(NULL, &filter);
+  if (res != FPGA_OK) {
+    throw std::runtime_error(std::string("Error creating properties object: ") + std::string(fpgaErrStr(res)));
+  }
+
+  res = fpgaPropertiesSetObjectType(filter, FPGA_ACCELERATOR);
+  if (res != FPGA_OK) {
+    throw std::runtime_error(std::string("Error setting object type: ") + std::string(fpgaErrStr(res)));
+  }
+
+  res = fpgaPropertiesSetObjectID(filter, obj_id);
+  if (res != FPGA_OK) {
+    throw std::runtime_error(std::string("Error setting object ID: ") + std::string(fpgaErrStr(res)));
+  }
+
+  res = fpgaEnumerate(&filter, 1, &afc_token, 1, &num_matches);
+  if (res != FPGA_OK) {
+    throw std::runtime_error(std::string("Error enumerating AFCs: ") + std::string(fpgaErrStr(res)));
+  }
+
+  if (num_matches < 1) {
+    res = fpgaDestroyProperties(&filter);
+    throw std::runtime_error("AFC not found");
+  }
+
+  res = fpgaOpen(afc_token, &afc_handle, 0);
+  if (res != FPGA_OK) {
+    throw std::runtime_error(std::string("Error opening AFC: ") + std::string(fpgaErrStr(res)));
+  }
+
+  // Read the PCIe bus/device/function for get_bdf().
+  // NOTE(review): if one of the Get calls below throws, 'prop' is leaked —
+  // consider an RAII wrapper or destroying it before each throw.
+  fpga_properties prop = nullptr;
+  res = fpgaGetProperties(afc_token, &prop);
+  if (res != FPGA_OK) {
+    throw std::runtime_error(std::string("Error reading properties: ") + std::string(fpgaErrStr(res)));
+  }
+
+  if (prop) {
+    res = fpgaPropertiesGetBus(prop, &bus);
+    if (res != FPGA_OK) {
+      throw std::runtime_error(std::string("Error reading bus: ") + std::string(fpgaErrStr(res)));
+    }
+    res = fpgaPropertiesGetDevice(prop, &device);
+    if (res != FPGA_OK) {
+      throw std::runtime_error(std::string("Error reading device: ") + std::string(fpgaErrStr(res)));
+    }
+    res = fpgaPropertiesGetFunction(prop, &function);
+    if (res != FPGA_OK) {
+      throw std::runtime_error(std::string("Error reading function: ") + std::string(fpgaErrStr(res)));
+    }
+    fpgaDestroyProperties(&prop);
+  }
+
+  initialize_fme_sysfs();
+
+  mmd_dev_name = get_board_name(BSP_NAME, obj_id);
+  afu_initialized = true;
+}
+
+// Parse a board name of the form "<BSP_NAME><hex object id>".
+// Return true if board name parses correctly, false if it does not.
+// Return the parsed object_id in obj_id as an [out] parameter (0 on failure).
+bool CcipDevice::parse_board_name(const char *board_name_str, uint64_t &obj_id) {
+  std::string prefix(BSP_NAME);
+  std::string board_name(board_name_str);
+
+  obj_id = 0;
+  // Reject when the name is too short to hold an id after the prefix OR when
+  // it does not start with the prefix. The original condition used '&&' with
+  // the raw compare() result, which accepted any sufficiently long name even
+  // with the wrong prefix.
+  if (board_name.length() <= prefix.length() || board_name.compare(0, prefix.length(), prefix) != 0) {
+    LOG_ERR("Error parsing device name '%s'\n", board_name_str);
+    return false;
+  }
+
+  std::string device_num_str = board_name.substr(prefix.length());
+  // Object ids are 64-bit; stoull avoids the overflow std::stol could hit on
+  // large hex ids.
+  obj_id = std::stoull(device_num_str, 0, 16);
+
+  // Assume that OPAE does not use 0 as a valid object ID. This is true for now
+  // but relies somewhat on an implementation dependent feature.
+  assert(obj_id > 0);
+  return true;
+}
+
+// Read information directly from sysfs. This is non-portable and relies on
+// paths set in driver (will not interoperate between DFH driver in up-stream
+// kernel and Intel driver distributed with PAC cards). In the future hopefully
+// OPAE can provide SDK to read this information
+// Probe sysfs for the FME temperature file and the device's NUMA node.
+// Sets fme_sysfs_temp_initialized / fme_sysfs_temp_path and
+// enable_set_numa / fpga_numa_node; failures simply disable the feature.
+void CcipDevice::initialize_fme_sysfs() {
+  const int MAX_LEN = 250;
+  char temp_fmepath[MAX_LEN];
+  char numa_path[MAX_LEN];
+
+  // HACK: currently ObjectID is constructed using its lower 20 bits
+  // as the device minor number. The device minor number also matches
+  // the device ID in sysfs. This is a simple way to construct a path
+  // to the device FME using information that is already available (object_id).
+  // Eventually this code should be replaced with a direct call to OPAE C API,
+  // but API does not currently expose the device temperature.
+  int dev_num = 0xFFFFF & fpga_obj_id;
+
+  // Path to temperature value
+  snprintf(temp_fmepath,
+           MAX_LEN,
+           "/sys/class/fpga/intel-fpga-dev.%d/intel-fpga-fme.%d/thermal_mgmt/temperature",
+           dev_num,
+           dev_num);
+  // Path to NUMA node
+  snprintf(numa_path, MAX_LEN, "/sys/class/fpga/intel-fpga-dev.%d/device/numa_node", dev_num);
+
+  // Try to open the sysfs file. If open succeeds then set as initialized
+  // to be able to read temperature in future. If open fails then not
+  // initalized and skip attempt to read temperature in future.
+  FILE *tmp;
+  tmp = fopen(temp_fmepath, "r");
+  if (tmp) {
+    fme_sysfs_temp_path = std::string(temp_fmepath);
+    fme_sysfs_temp_initialized = true;
+    fclose(tmp);
+  }
+
+  // Read NUMA node and set value for future use. If not available set to -1
+  // and disable use of NUMA setting
+  std::ifstream sysfs_numa_node(numa_path, std::ifstream::in);
+  if (sysfs_numa_node.is_open()) {
+    sysfs_numa_node >> fpga_numa_node;
+    sysfs_numa_node.close();
+    // Sysfs reports -1 when the device has no NUMA affinity.
+    if (std::stoi(fpga_numa_node) >= 0) {
+      enable_set_numa = true;
+    } else {
+      enable_set_numa = false;
+    }
+  } else {
+    enable_set_numa = false;
+    fpga_numa_node = "-1";
+  }
+}
+
+// Walk the device feature header (DFH) list to locate the two MSGDMA channels
+// and the memory-window (ASE) block, recording their MMIO offsets. Returns
+// false if any of the three features is missing.
+bool CcipDevice::find_dma_dfh_offsets() {
+  uint64_t dfh_offset = 0;
+  uint64_t next_dfh_offset = 0;
+  if (find_dfh_by_guid(afc_handle, MSGDMA_BBB_GUID, &dfh_offset, &next_dfh_offset)) {
+    dma_ch0_dfh_offset = dfh_offset;
+    // NOTE(review): the debug labels say "CH1"/"CH2" while the members are
+    // ch0/ch1 — confirm intended numbering.
+    DEBUG_PRINT("DMA CH1 offset: 0x%lX\t GUID: %s\n", dma_ch0_dfh_offset, MSGDMA_BBB_GUID);
+  } else {
+    fprintf(stderr, "Error initalizing DMA: Cannot find DMA channel 0 DFH offset\n");
+    return false;
+  }
+
+  // Resume the search after channel 0 to find the second MSGDMA instance.
+  dfh_offset += next_dfh_offset;
+  if (find_dfh_by_guid(afc_handle, MSGDMA_BBB_GUID, &dfh_offset, &next_dfh_offset)) {
+    dma_ch1_dfh_offset = dfh_offset;
+    DEBUG_PRINT("DMA CH2 offset: 0x%lX\t GUID: %s\n", dma_ch1_dfh_offset, MSGDMA_BBB_GUID);
+  } else {
+    fprintf(stderr, "Error initalizing DMA. Cannot find DMA channel 2 DFH offset\n");
+    return false;
+  }
+
+  // The ASE window has a different GUID, so restart from the list head.
+  dfh_offset = 0;
+  if (find_dfh_by_guid(afc_handle, MEM_WINDOW_BBB_GUID, &dfh_offset, &next_dfh_offset)) {
+    dma_ase_dfh_offset = dfh_offset;
+    DEBUG_PRINT("DMA ASE offset: 0x%lX\t GUID: %s\n", dma_ase_dfh_offset, MEM_WINDOW_BBB_GUID);
+  } else {
+    fprintf(stderr, "Error initalizing DMA. Cannot find ASE DFH offset\n");
+    return false;
+  }
+
+  assert(dma_ch0_dfh_offset != 0);
+  assert(dma_ch1_dfh_offset != 0);
+  assert(dma_ase_dfh_offset != 0);
+  assert(dma_ch0_dfh_offset != dma_ch1_dfh_offset);
+
+  return true;
+}
+
+// Map MMIO, reset the AFC, create both DMA engines (with NUMA-pinned buffer
+// allocation when available) and the kernel-interrupt thread. Returns true on
+// success; idempotent once initialized. On failure, partially-created objects
+// are deleted AND their members nulled so the destructor does not double-free
+// them (the original code deleted without nulling).
+bool CcipDevice::initialize_bsp() {
+  if (bsp_initialized) {
+    return true;
+  }
+
+  fpga_result res = fpgaMapMMIO(afc_handle, 0, NULL);
+  if (res != FPGA_OK) {
+    LOG_ERR("Error mapping MMIO space: %s\n", fpgaErrStr(res));
+    return false;
+  }
+  mmio_is_mapped = true;
+
+  /* Reset AFC */
+  res = fpgaReset(afc_handle);
+  if (res != FPGA_OK) {
+    LOG_ERR("Error resetting AFC: %s\n", fpgaErrStr(res));
+    return false;
+  }
+  AFU_RESET_DELAY();
+
+  // DMA performance is heavily dependent on the memcpy operation that transfers
+  // data from user allocated buffer to the pinned buffer that is used for
+  // DMA. On some machines with multiple NUMA nodes it is critical for performance
+  // that the pinned buffer is located on the NUMA node as the threads that
+  // performs the DMA operation.
+  //
+  // The performance also improves slighlty if the DMA threads are on the same
+  // NUMA node as the FPGA PCI device.
+  //
+  // This code pins memory allocation to occur from FPGA NUMA node prior to
+  // initializing the DMA buffers. It also pins all threads in the process
+  // to run on this same node.
+  struct bitmask *mask = NULL;
+  if (enable_set_numa) {
+    mask = numa_parse_nodestring(fpga_numa_node.c_str());
+    numa_set_membind(mask);
+    int ret = numa_run_on_node_mask_all(mask);
+    if (ret < 0) {
+      fprintf(stderr, " Error setting NUMA node mask\n");
+    }
+  }
+
+  find_dma_dfh_offsets();
+
+  bool dma_ok = true;
+  const int dma_ch0_interrupt_num = 0;  // DMA channel 0 hardcoded to interrupt 0
+  dma_host_to_fpga = new mmd_dma(afc_handle, mmd_handle, dma_ch0_dfh_offset, dma_ase_dfh_offset, dma_ch0_interrupt_num);
+  if (!dma_host_to_fpga->initialized()) {
+    LOG_ERR("Error initializing mmd dma\n");
+    delete dma_host_to_fpga;
+    dma_host_to_fpga = NULL;  // null out: the destructor also deletes this
+    dma_ok = false;
+  }
+
+  if (dma_ok) {
+    const int dma_ch1_interrupt_num = 2;  // DMA channel 1 hardcoded to interrupt 2
+    dma_fpga_to_host =
+        new mmd_dma(afc_handle, mmd_handle, dma_ch1_dfh_offset, dma_ase_dfh_offset, dma_ch1_interrupt_num);
+    if (!dma_fpga_to_host->initialized()) {
+      fprintf(stderr, "Error initializing mmd dma\n");
+      delete dma_fpga_to_host;
+      dma_fpga_to_host = NULL;
+      dma_ok = false;
+    }
+  }
+
+  // Turn off membind restriction in order to allow future allocation to
+  // occur on different NUMA nodes if needed. Hypothesis is that only
+  // the pinned buffers are performance critical for the memcpy. Other
+  // allocations in the process can occur on other NUMA nodes if needed.
+  // Done on every path (the original leaked the mask and left membind
+  // restricted when DMA creation failed).
+  if (enable_set_numa) {
+    numa_set_membind(numa_nodes_ptr);
+    numa_free_nodemask(mask);
+  }
+
+  if (!dma_ok) {
+    return false;
+  }
+
+  kernel_interrupt_thread = new KernelInterrupt(afc_handle, mmd_handle);
+  if (!kernel_interrupt_thread->initialized()) {
+    LOG_ERR("Error initializing kernel interrupts\n");
+    delete kernel_interrupt_thread;
+    kernel_interrupt_thread = NULL;  // null out: the destructor also deletes this
+    return false;
+  }
+
+  bsp_initialized = true;
+  return bsp_initialized;
+}
+
+// Release everything the constructor / initialize_bsp() acquired: copy buffer,
+// interrupt thread, both DMA engines, MMIO mapping, AFC handle, token, and
+// the properties filter. Errors are counted and only reported via DEBUG_PRINT
+// (destructors must not throw).
+CcipDevice::~CcipDevice() {
+  int num_errors = 0;
+  if (mmd_copy_buffer) {
+    free(mmd_copy_buffer);
+    mmd_copy_buffer = NULL;
+  }
+
+  if (kernel_interrupt_thread) {
+    delete kernel_interrupt_thread;
+    kernel_interrupt_thread = NULL;
+  }
+
+  if (dma_host_to_fpga) {
+    delete dma_host_to_fpga;
+    dma_host_to_fpga = NULL;
+  }
+
+  if (dma_fpga_to_host) {
+    delete dma_fpga_to_host;
+    dma_fpga_to_host = NULL;
+  }
+
+  if (mmio_is_mapped) {
+    if (fpgaUnmapMMIO(afc_handle, 0)) num_errors++;
+  }
+
+  if (afc_handle) {
+    if (fpgaClose(afc_handle) != FPGA_OK) num_errors++;
+  }
+
+  if (afc_token) {
+    if (fpgaDestroyToken(&afc_token) != FPGA_OK) num_errors++;
+  }
+
+  if (filter) {
+    if (fpgaDestroyProperties(&filter) != FPGA_OK) num_errors++;
+  }
+
+  if (num_errors > 0) {
+    DEBUG_PRINT("Error freeing resources in destructor\n");
+  }
+}
+
+// Give the kernel-interrupt thread a chance to run; no-op before
+// initialize_bsp() has created it. Always returns 0.
+int CcipDevice::yield() {
+  if (kernel_interrupt_thread != NULL) {
+    kernel_interrupt_thread->yield();
+  }
+  return 0;
+}
+
+// Return true when the AFU GUID currently programmed on the device matches
+// the OpenCL BSP AFU id.
+bool CcipDevice::bsp_loaded() {
+  fpga_guid dcp_guid;
+  fpga_guid afu_guid;
+  fpga_properties prop = nullptr;
+  fpga_result res;
+
+  if (uuid_parse(DCP_OPENCL_BSP_AFU_ID, dcp_guid) < 0) {
+    LOG_ERR("Error parsing guid '%s'\n", DCP_OPENCL_BSP_AFU_ID);
+    return false;
+  }
+
+  res = fpgaGetProperties(afc_token, &prop);
+  if (res != FPGA_OK) {
+    LOG_ERR("Error reading properties: %s\n", fpgaErrStr(res));
+    // Do NOT destroy 'prop' here: fpgaGetProperties failed, so it was never
+    // allocated. The original passed an uninitialized handle to
+    // fpgaDestroyProperties on this path.
+    return false;
+  }
+
+  res = fpgaPropertiesGetGUID(prop, &afu_guid);
+  if (res != FPGA_OK) {
+    LOG_ERR("Error reading GUID\n");
+    fpgaDestroyProperties(&prop);
+    return false;
+  }
+
+  fpgaDestroyProperties(&prop);
+  return uuid_compare(dcp_guid, afu_guid) == 0;
+}
+
+// Format the PCIe address captured at construction as "BB:DD.F" with
+// zero-padded two-digit bus and device numbers.
+std::string CcipDevice::get_bdf() {
+  std::ostringstream bdf;
+  bdf << std::setfill('0') << std::setw(2) << unsigned(bus);
+  bdf << ":" << std::setfill('0') << std::setw(2) << unsigned(device);
+  bdf << "." << unsigned(function);
+  return bdf.str();
+}
+
+// Read the FME temperature from the sysfs path discovered at construction.
+// Returns 0 when the sysfs file was not found at init time.
+float CcipDevice::get_temperature() {
+  float temp = 0;
+  if (!fme_sysfs_temp_initialized) {
+    return temp;
+  }
+  std::ifstream sysfs_temp(fme_sysfs_temp_path, std::ifstream::in);
+  sysfs_temp >> temp;
+  sysfs_temp.close();
+  return temp;
+}
+
+// Forward the runtime's interrupt callback to the interrupt thread; no-op
+// before initialize_bsp() has created one.
+void CcipDevice::set_kernel_interrupt(aocl_mmd_interrupt_handler_fn fn, void *user_data) {
+  if (kernel_interrupt_thread == NULL) {
+    return;
+  }
+  kernel_interrupt_thread->set_kernel_interrupt(fn, user_data);
+}
+
+// Record the runtime's status callback and propagate it to both DMA engines.
+void CcipDevice::set_status_handler(aocl_mmd_status_handler_fn fn, void *user_data) {
+  event_update = fn;
+  event_update_user_data = user_data;
+  // The DMA objects only exist after initialize_bsp() succeeds; the original
+  // dereferenced them unconditionally and would crash if the handler was set
+  // before (or despite a failed) BSP initialization.
+  if (dma_host_to_fpga) dma_host_to_fpga->set_status_handler(fn, user_data);
+  if (dma_fpga_to_host) dma_fpga_to_host->set_status_handler(fn, user_data);
+}
+
+// Invoke the runtime's status callback for a completed op.
+// NOTE(review): event_update is not null-checked — assumes
+// set_status_handler() was called first; confirm the runtime guarantees this.
+void CcipDevice::event_update_fn(aocl_mmd_op_t op, int status) {
+  event_update(mmd_handle, event_update_user_data, op, status);
+}
+
+// Read 'size' bytes at 'offset' into 'host_addr'. Memory-interface reads go
+// through the FPGA->host DMA engine; everything else is a direct MMIO read at
+// mmd_interface + offset. Returns 0 on success, -1 on failure.
+int CcipDevice::read_block(aocl_mmd_op_t op, int mmd_interface, void *host_addr, size_t offset, size_t size) {
+  fpga_result res;
+
+  // The mmd_interface is defined as the base address of the MMIO write. Access
+  // to memory requires special functionality. Otherwise do direct MMIO read of
+  // base address + offset
+  if (mmd_interface == AOCL_MMD_MEMORY) {
+    res = dma_fpga_to_host->read_memory(op, static_cast<uint64_t *>(host_addr), offset, size);
+  } else {
+    res = read_mmio(host_addr, mmd_interface + offset, size);
+
+    if (op) {
+      // TODO: check what status value should really be instead of just using 0
+      // Also handle case when op is NULL
+      this->event_update_fn(op, 0);
+    }
+  }
+
+  if (res != FPGA_OK) {
+    LOG_ERR("fpgaReadMMIO error: %s\n", fpgaErrStr(res));
+    return -1;
+  } else {
+    return 0;
+  }
+}
+
+// Write 'size' bytes from 'host_addr' at 'offset'. Memory-interface writes go
+// through the host->FPGA DMA engine; everything else is a direct MMIO write at
+// mmd_interface + offset. Returns 0 on success, -1 on failure.
+int CcipDevice::write_block(aocl_mmd_op_t op, int mmd_interface, const void *host_addr, size_t offset, size_t size) {
+  fpga_result res;
+
+  // The mmd_interface is defined as the base address of the MMIO write. Access
+  // to memory requires special functionality. Otherwise do direct MMIO write
+  if (mmd_interface == AOCL_MMD_MEMORY) {
+    res = dma_host_to_fpga->write_memory(op, static_cast<const uint64_t *>(host_addr), offset, size);
+  } else {
+    res = write_mmio(host_addr, mmd_interface + offset, size);
+
+    if (op) {
+      // TODO: check what 'status' value should really be. Right now just
+      // using 0 as was done in previous CCIP MMD. Also handle case if op is NULL
+      this->event_update_fn(op, 0);
+    }
+  }
+
+  // TODO: check what status values aocl wants and also parse the result
+  if (res != FPGA_OK) {
+    LOG_ERR("fpgaWriteMMIO error: %s\n", fpgaErrStr(res));
+    return -1;
+  } else {
+    return 0;
+  }
+}
+
+// MMIO read of 'size' bytes into 'host_addr': 64-bit reads while possible,
+// then 32-bit, then a final partial-word read copied out with memcpy_s_fast.
+// Returns the first non-FPGA_OK result, or FPGA_OK.
+fpga_result CcipDevice::read_mmio(void *host_addr, size_t mmio_addr, size_t size) {
+  fpga_result res = FPGA_OK;
+
+  DCP_DEBUG_MEM("read_mmio start: %p\t %lx\t %lu\n", host_addr, mmio_addr, size);
+
+  // HACK: need extra delay for opencl sw reset
+  if (mmio_addr == KERNEL_SW_RESET_BASE) OPENCL_SW_RESET_DELAY();
+
+  uint64_t *host_addr64 = static_cast<uint64_t *>(host_addr);
+  while (size >= 8) {
+    res = fpgaReadMMIO64(afc_handle, 0, mmio_addr, host_addr64);
+    if (res != FPGA_OK) return res;
+    host_addr64 += 1;
+    mmio_addr += 8;
+    size -= 8;
+  }
+
+  uint32_t *host_addr32 = reinterpret_cast<uint32_t *>(host_addr64);
+  while (size >= 4) {
+    res = fpgaReadMMIO32(afc_handle, 0, mmio_addr, host_addr32);
+    if (res != FPGA_OK) return res;
+    host_addr32 += 1;
+    mmio_addr += 4;
+    size -= 4;
+  }
+
+  // Tail of 1-3 bytes: read a full word, copy only the bytes requested so we
+  // never write past the caller's buffer.
+  if (size > 0) {
+    uint32_t read_data;
+    res = fpgaReadMMIO32(afc_handle, 0, mmio_addr, &read_data);
+    if (res != FPGA_OK) return res;
+    memcpy_s_fast(host_addr32, size, &read_data, size);
+  }
+
+  return res;
+}
+
+// MMIO write of 'size' bytes from 'host_addr': 64-bit writes while possible,
+// then 32-bit chunks, padding a final partial word with zeros. Returns the
+// first non-FPGA_OK result, or FPGA_OK.
+fpga_result CcipDevice::write_mmio(const void *host_addr, size_t mmio_addr, size_t size) {
+  fpga_result res = FPGA_OK;
+
+  DEBUG_PRINT("write_mmio\n");
+
+  // HACK: need extra delay for opencl sw reset
+  if (mmio_addr == KERNEL_SW_RESET_BASE) OPENCL_SW_RESET_DELAY();
+
+  const uint64_t *host_addr64 = static_cast<const uint64_t *>(host_addr);
+  while (size >= 8) {
+    res = fpgaWriteMMIO64(afc_handle, 0, mmio_addr, *host_addr64);
+    if (res != FPGA_OK) return res;
+    host_addr64 += 1;
+    mmio_addr += 8;
+    size -= 8;
+  }
+
+  // Remaining bytes go out as 32-bit writes; a final partial chunk is copied
+  // into a zeroed temp word so the source buffer is never over-read.
+  const uint32_t *host_addr32 = reinterpret_cast<const uint32_t *>(host_addr64);
+  while (size > 0) {
+    uint32_t tmp_data32 = 0;
+    size_t chunk_size = (size >= 4) ? 4 : size;
+    memcpy_s_fast(&tmp_data32, sizeof(tmp_data32), host_addr32, chunk_size);
+    res = fpgaWriteMMIO32(afc_handle, 0, mmio_addr, tmp_data32);
+    if (res != FPGA_OK) return res;
+    host_addr32 += 1;
+    mmio_addr += chunk_size;
+    size -= chunk_size;
+  }
+
+  return res;
+}
diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/ccip_mmd_device.h b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/ccip_mmd_device.h
new file mode 100644
index 0000000..f8088ac
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/ccip_mmd_device.h
@@ -0,0 +1,187 @@
+/* (C) 1992-2017 Intel Corporation. */
+/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */
+/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */
+/* and/or other countries. Other marks and brands may be claimed as the property */
+/* of others. See Trademarks on intel.com for full list of Intel trademarks or */
+/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */
+/* Your use of Intel Corporation's design tools, logic functions and other */
+/* software and tools, and its AMPP partner logic functions, and any output */
+/* files any of the foregoing (including device programming or simulation */
+/* files), and any associated documentation or information are expressly subject */
+/* to the terms and conditions of the Altera Program License Subscription */
+/* Agreement, Intel MegaCore Function License Agreement, or other applicable */
+/* license agreement, including, without limitation, that your use is for the */
+/* sole purpose of programming logic devices manufactured by Intel and sold by */
+/* Intel or its authorized distributors. Please refer to the applicable */
+/* agreement for further details. */
+
+#ifndef _CCIP_MMD_DEVICE_H
+#define _CCIP_MMD_DEVICE_H
+
+#include <limits.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <string>
+
+#pragma push_macro("_GNU_SOURCE")
+#undef _GNU_SOURCE
+#define _GNU_SOURCE
+#include <sched.h>
+#pragma pop_macro("_GNU_SOURCE")
+
+#include <opae/fpga.h>
+#include <uuid/uuid.h>
+
+#include "aocl_mmd.h"
+#include "kernel_interrupt.h"
+#include "mmd_dma.h"
+
+// Tune delay for simulation or HW. Eventually delay
+// should be removed for HW, may still be needed for ASE simulation
+#ifdef SIM
+#define DELAY_MULTIPLIER 100
+#else
+#define DELAY_MULTIPLIER 1
+#endif
+
+// Most AOCL_MMD_CALL functions return negative number in case of error,
+// CCIP_MMD_AOCL_ERR is used to indicate an error from the MMD that is being
+// returned to the runtime. Simply set to -2 for now since neither interface
+// defines a meaning to return codes for errors.
+#define CCIP_MMD_AOCL_ERR -1
+
+// NOTE: some of the code relies on invalid handle returning -1
+// future TODO eliminate dependency on specific error values
+#define CCIP_MMD_INVALID_PARAM -1
+
+// Our diagnostic script relies on handle values < -1 to determine when
+// a valid device is present but a functioning BSP is not loaded.
+#define CCIP_MMD_BSP_NOT_LOADED -2
+#define CCIP_MMD_BSP_INIT_FAILED -3
+
+// Delay settings
+// TODO: Figure out why these delays are needed and
+// have requirement removed (at least for HW)
+#define MMIO_DELAY()
+#define YIELD_DELAY() usleep(1 * DELAY_MULTIPLIER)
+#define OPENCL_SW_RESET_DELAY() usleep(5000 * DELAY_MULTIPLIER)
+#define AFU_RESET_DELAY() usleep(20000 * DELAY_MULTIPLIER)
+
+#define KERNEL_SW_RESET_BASE (AOCL_MMD_KERNEL + 0x30)
+
+#define DCP_OPENCL_BSP_AFU_ID "63B3779B-8BDD-4F03-9CEB-0301181D6AEF"
+
+#define BSP_NAME "pac_"
+
+// LOG ERRORS
+#define CCIP_MMD_ERR_LOGGING 1
+#ifdef CCIP_MMD_ERR_LOGGING
+#define LOG_ERR(...) fprintf(stderr, __VA_ARGS__)
+#else
+#define LOG_ERR(...)
+#endif
+
+// debugging
+#ifdef DEBUG
+#define DEBUG_PRINT(...) fprintf(stderr, __VA_ARGS__)
+#else
+#define DEBUG_PRINT(...)
+#endif
+
+#ifdef DEBUG_MEM
+#define DCP_DEBUG_MEM(...) fprintf(stderr, __VA_ARGS__)
+#else
+#define DCP_DEBUG_MEM(...)
+#endif
+
+// MMIO address-map offsets for the interfaces exposed through the AOCL MMD
+// API (used as the `mmd_interface` selector in read_block/write_block).
+enum {
+#ifndef DLA_MMD // IRQ offsets no longer exist in DLA hardware (removed from board.qsys)
+  AOCL_IRQ_POLLING_BASE = 0x0100, // CSR to polling interrupt status
+  AOCL_IRQ_MASKING_BASE = 0x0108, // CSR to set/unset interrupt mask
+  AOCL_MMD_KERNEL = 0x4000, /* Control interface into kernel interface */
+#else
+  AOCL_MMD_KERNEL = 0, // CoreDLA completely removes the Opencl kernel interface, repurposed for CSRs
+#endif
+  AOCL_MMD_MEMORY = 0x100000 /* Data interface to device memory */
+};
+
+// Classification of a discovered FPGA object. (NOTE(review): name looks like
+// a typo for "AfuStatus", kept as-is for source compatibility.)
+enum AfuStatu { CCIP_MMD_INVALID_ID = 0, CCIP_MMD_BSP, CCIP_MMD_AFU };
+
+// One OPAE-managed FPGA board. Owns the OPAE handle/token/properties, the
+// host<->FPGA DMA engines, and the kernel-interrupt thread. Non-copyable;
+// identified to the runtime by an integer MMD handle.
+class CcipDevice final {
+ public:
+  CcipDevice(uint64_t);
+  CcipDevice(const CcipDevice &) = delete;
+  CcipDevice &operator=(const CcipDevice &) = delete;
+  ~CcipDevice();
+
+  // Map an FPGA object id to/from the board-name string used by the runtime.
+  static std::string get_board_name(std::string prefix, uint64_t obj_id);
+  static bool parse_board_name(const char *board_name, uint64_t &obj_id);
+
+  int get_mmd_handle() { return mmd_handle; }
+  uint64_t get_fpga_obj_id() { return fpga_obj_id; }
+  std::string get_dev_name() { return mmd_dev_name; }
+  std::string get_bdf();
+  float get_temperature();
+  bool initialize_bsp();
+  void set_kernel_interrupt(aocl_mmd_interrupt_handler_fn fn, void *user_data);
+  void set_status_handler(aocl_mmd_status_handler_fn fn, void *user_data);
+  int yield();
+  void event_update_fn(aocl_mmd_op_t op, int status);
+  bool bsp_loaded();
+
+  // Transfer `size` bytes between host_addr and device address dev_addr on
+  // the interface selected by mmd_interface (see enum above).
+  int read_block(aocl_mmd_op_t op, int mmd_interface, void *host_addr, size_t dev_addr, size_t size);
+
+  int write_block(aocl_mmd_op_t op, int mmd_interface, const void *host_addr, size_t dev_addr, size_t size);
+
+ private:
+  static int next_mmd_handle;
+
+  int mmd_handle;
+  uint64_t fpga_obj_id;
+  std::string mmd_dev_name;
+  intel_opae_mmd::KernelInterrupt *kernel_interrupt_thread;
+  aocl_mmd_status_handler_fn event_update;
+  void *event_update_user_data;
+
+  // HACK: use the sysfs path to read temperature value and NUMA node
+  // this should be replaced with OPAE call once that is
+  // available
+  std::string fme_sysfs_temp_path;
+  std::string fpga_numa_node;
+  bool enable_set_numa;
+  bool fme_sysfs_temp_initialized;
+  void initialize_fme_sysfs();
+
+  void initialize_local_cpus_sysfs();
+
+  bool find_dma_dfh_offsets();
+
+  // PCIe bus/device/function of this board.
+  uint8_t bus;
+  uint8_t device;
+  uint8_t function;
+
+  bool afu_initialized;
+  bool bsp_initialized;
+  bool mmio_is_mapped;
+
+  fpga_handle afc_handle;
+  fpga_properties filter;
+  fpga_token afc_token;
+  uint64_t dma_ch0_dfh_offset;
+  uint64_t dma_ch1_dfh_offset;
+  uint64_t dma_ase_dfh_offset;
+  intel_opae_mmd::mmd_dma *dma_host_to_fpga;
+  intel_opae_mmd::mmd_dma *dma_fpga_to_host;
+
+  char *mmd_copy_buffer;
+
+  // Helper functions: raw MMIO block read/write (see ccip_mmd_device.cpp).
+  fpga_result read_mmio(void *host_addr, size_t dev_addr, size_t size);
+  fpga_result write_mmio(const void *host_addr, size_t dev_addr, size_t size);
+};
+
+#endif // _CCIP_MMD_DEVICE_H
diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/dma_work_thread.cpp b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/dma_work_thread.cpp
new file mode 100644
index 0000000..30113eb
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/dma_work_thread.cpp
@@ -0,0 +1,151 @@
+/* (C) 1992-2017 Intel Corporation. */
+/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */
+/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */
+/* and/or other countries. Other marks and brands may be claimed as the property */
+/* of others. See Trademarks on intel.com for full list of Intel trademarks or */
+/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */
+/* Your use of Intel Corporation's design tools, logic functions and other */
+/* software and tools, and its AMPP partner logic functions, and any output */
+/* files any of the foregoing (including device programming or simulation */
+/* files), and any associated documentation or information are expressly subject */
+/* to the terms and conditions of the Altera Program License Subscription */
+/* Agreement, Intel MegaCore Function License Agreement, or other applicable */
+/* license agreement, including, without limitation, that your use is for the */
+/* sole purpose of programming logic devices manufactured by Intel and sold by */
+/* Intel or its authorized distributors. Please refer to the applicable */
+/* agreement for further details. */
+
+#include "dma_work_thread.h"
+#include <assert.h>
+#include <poll.h>
+#include <stdlib.h>
+#include <string.h>
+#include <cstdint>
+#include <iostream>
+#include <thread>
+#include "ccip_mmd_device.h"
+#include "eventfd_wrapper.h"
+#include "mmd_dma.h"
+
+using namespace intel_opae_mmd;
+
+// Construct the DMA worker: create the eventfd used to wake the worker, then
+// start the worker thread. If eventfd creation fails, m_initialized stays
+// false and the thread is never launched (the destructor handles cleanup).
+dma_work_thread::dma_work_thread(mmd_dma &mmd_dma_arg)
+    : m_initialized(false),
+      m_thread_wake_event(NULL),
+      m_thread(NULL),
+      m_work_queue_mutex(),
+      m_work_queue(),
+      m_mmd_dma(mmd_dma_arg) {
+  m_thread_wake_event = new eventfd_wrapper();
+  if (!m_thread_wake_event->initialized()) return;
+
+  m_thread = new std::thread(work_thread, std::ref(*this));
+
+  m_initialized = true;
+}
+
+// Shut down the worker: UINT64_MAX - 1 is the sentinel count that tells
+// work_thread() to exit once its queue is drained (see work_thread).
+dma_work_thread::~dma_work_thread() {
+  // kill the thread
+  if (m_thread) {
+    // send message to thread to end it
+    m_thread_wake_event->notify(UINT64_MAX - 1);
+
+    // join with thread until it ends
+    m_thread->join();
+
+    delete m_thread;
+    m_thread = NULL;
+  }
+
+  if (m_thread_wake_event) {
+    delete m_thread_wake_event;
+    m_thread_wake_event = NULL;
+  }
+
+  m_initialized = false;
+}
+
+// Worker loop run on the dedicated DMA thread.
+//
+// Blocks in poll() on the eventfd until enqueue_dma() (or the destructor)
+// notifies it. The eventfd counter read is the number of queued work items;
+// the sentinel UINT64_MAX - 1, sent by ~dma_work_thread when the queue is
+// empty, makes the loop exit. Pending items are moved to a local queue under
+// the mutex and executed outside the lock so enqueue_dma() callers are not
+// blocked while DMA transfers run.
+void dma_work_thread::work_thread(dma_work_thread &obj) {
+  int res;
+
+  // get eventfd handle
+  int thread_signal_fd = obj.m_thread_wake_event->get_fd();
+
+  struct pollfd pollfd_setup;
+  while (1) {
+    pollfd_setup.fd = thread_signal_fd;
+    pollfd_setup.events = POLLIN;
+    pollfd_setup.revents = 0;
+    res = poll(&pollfd_setup, 1, -1);
+    if (res < 0) {
+      fprintf(stderr, "Poll error errno = %s\n", strerror(errno));
+    } else if (res > 0 && pollfd_setup.revents == POLLIN) {
+      uint64_t count_work_items = 0;
+      ssize_t bytes_read = read(thread_signal_fd, &count_work_items, sizeof(count_work_items));
+      if (bytes_read > 0) {
+        // BUGFIX: previously printed `count`, a variable that does not exist
+        // in this scope -- broke builds with DEBUG defined. Cast keeps %lu
+        // correct regardless of how uint64_t is typedef'd.
+        DEBUG_PRINT("Poll success. Return=%d count=%lu\n", res, (unsigned long)count_work_items);
+      } else {
+        // TODO: the MMD should not exit. But I have a different branch
+        // I'm working on that will change synchronization to use
+        // condition variable instead of eventfd in synchronization
+        // within the same process. Will remove this exit() call at
+        // when PR for that change is submitted.
+        fprintf(stderr, "Error: poll failed: %s\n", bytes_read < 0 ? strerror(errno) : "zero bytes read");
+        exit(-1);
+      }
+
+      // Ensure count is in proper range (guards against a corrupted eventfd
+      // value; the shutdown sentinel is explicitly allowed through)
+      const unsigned long MAX_WORK_ITEMS = 1000000000;
+      if (count_work_items > MAX_WORK_ITEMS && count_work_items != (UINT64_MAX - 1)) {
+        fprintf(stderr, "Error: poll value is out of range");
+        exit(-1);
+      }
+
+      obj.m_work_queue_mutex.lock();
+      if (obj.m_work_queue.empty() && count_work_items == UINT64_MAX - 1) {
+        // The maximum value of count is set when there is no work left
+        // The work queue must also be empty
+        // This thread can break out of the loop
+        obj.m_work_queue_mutex.unlock();
+        break;
+      }
+
+      // Move the requested number of items to a local queue while holding
+      // the lock ...
+      std::queue<dma_work_item> items;
+      for (uint64_t i = 0; i < count_work_items; i++) {
+        // Check if there are enough jobs in the work queue as requested (count)
+        if (obj.m_work_queue.empty()) {
+          fprintf(stderr, "Poll error. Not enough tasks in queue.");
+          exit(-1);
+        }
+        dma_work_item item = obj.m_work_queue.front();
+        items.push(item);
+        obj.m_work_queue.pop();
+      }
+      obj.m_work_queue_mutex.unlock();
+
+      // ... and run the transfers outside the lock.
+      while (!items.empty()) {
+        dma_work_item item = items.front();
+        obj.do_dma(item);
+        items.pop();
+      }
+    }
+  }
+}
+
+// Submit one DMA work item. If item.op is set the item is queued for the
+// worker thread (asynchronous) and the eventfd counter is bumped by 1 --
+// matching the one-item-per-notify contract work_thread() relies on. If
+// item.op is null the transfer runs synchronously on the caller's thread.
+int dma_work_thread::enqueue_dma(dma_work_item &item) {
+  if (item.op) {
+    m_work_queue_mutex.lock();
+    m_work_queue.push(item);
+    m_work_queue_mutex.unlock();
+    // send message to thread to wake it
+    // setting count to 1 as only 1 job is pushed to the work queue
+    m_thread_wake_event->notify(1);
+    return 0;
+  } else {
+    // if op is not specified, it is a blocking operation and we don't use
+    // the thread
+    return do_dma(item);
+  }
+}
+
+// Execute one item by delegating to the owning mmd_dma engine.
+int dma_work_thread::do_dma(dma_work_item &item) { return m_mmd_dma.do_dma(item); }
diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/dma_work_thread.h b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/dma_work_thread.h
new file mode 100644
index 0000000..0afb036
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/dma_work_thread.h
@@ -0,0 +1,73 @@
+/* (C) 1992-2017 Intel Corporation. */
+/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */
+/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */
+/* and/or other countries. Other marks and brands may be claimed as the property */
+/* of others. See Trademarks on intel.com for full list of Intel trademarks or */
+/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */
+/* Your use of Intel Corporation's design tools, logic functions and other */
+/* software and tools, and its AMPP partner logic functions, and any output */
+/* files any of the foregoing (including device programming or simulation */
+/* files), and any associated documentation or information are expressly subject */
+/* to the terms and conditions of the Altera Program License Subscription */
+/* Agreement, Intel MegaCore Function License Agreement, or other applicable */
+/* license agreement, including, without limitation, that your use is for the */
+/* sole purpose of programming logic devices manufactured by Intel and sold by */
+/* Intel or its authorized distributors. Please refer to the applicable */
+/* agreement for further details. */
+
+#ifndef _DMA_WORK_THREAD_H
+#define _DMA_WORK_THREAD_H
+
+#include <opae/fpga.h>
+
+#include <mutex>
+#include <queue>
+#include <thread>
+
+#include "aocl_mmd.h"
+
+namespace intel_opae_mmd {
+
+// forward class definitions
+class eventfd_wrapper;
+class mmd_dma;
+
+// One DMA request. When `op` is null, enqueue_dma() executes the transfer
+// synchronously on the caller's thread; otherwise it is queued for the
+// worker thread. rd_host_addr/wr_host_addr are presumably the host-side
+// destination/source buffers for read vs write transfers -- confirm against
+// mmd_dma::do_dma.
+class dma_work_item {
+ public:
+  aocl_mmd_op_t op;
+  uint64_t *rd_host_addr;
+  const uint64_t *wr_host_addr;
+  size_t dev_addr;
+  size_t size;
+};
+
+// Owns a background thread that executes queued DMA work items for one
+// mmd_dma engine. Wake-ups are signalled through an eventfd whose counter
+// carries the number of pending items (see dma_work_thread.cpp).
+class dma_work_thread final {
+ public:
+  dma_work_thread(mmd_dma &mmd_dma_arg);
+  ~dma_work_thread();
+
+  // False if eventfd or thread creation failed in the constructor.
+  bool initialized() { return m_initialized; }
+
+  int enqueue_dma(dma_work_item &item);
+  int do_dma(dma_work_item &item);
+
+ private:
+  static void work_thread(dma_work_thread &obj);
+
+  bool m_initialized;
+
+  eventfd_wrapper *m_thread_wake_event;
+  std::thread *m_thread;
+  std::mutex m_work_queue_mutex;   // guards m_work_queue
+  std::queue<dma_work_item> m_work_queue;
+
+  mmd_dma &m_mmd_dma;
+
+  // not used and not implemented
+  dma_work_thread(dma_work_thread &other);
+  dma_work_thread &operator=(const dma_work_thread &other);
+};  // class dma_work_thread
+
+}; // namespace intel_opae_mmd
+
+#endif // _DMA_WORK_THREAD_H
diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/eventfd_wrapper.h b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/eventfd_wrapper.h
new file mode 100644
index 0000000..2de3f74
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/eventfd_wrapper.h
@@ -0,0 +1,74 @@
+/* (C) 1992-2017 Intel Corporation. */
+/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */
+/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */
+/* and/or other countries. Other marks and brands may be claimed as the property */
+/* of others. See Trademarks on intel.com for full list of Intel trademarks or */
+/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */
+/* Your use of Intel Corporation's design tools, logic functions and other */
+/* software and tools, and its AMPP partner logic functions, and any output */
+/* files any of the foregoing (including device programming or simulation */
+/* files), and any associated documentation or information are expressly subject */
+/* to the terms and conditions of the Altera Program License Subscription */
+/* Agreement, Intel MegaCore Function License Agreement, or other applicable */
+/* license agreement, including, without limitation, that your use is for the */
+/* sole purpose of programming logic devices manufactured by Intel and sold by */
+/* Intel or its authorized distributors. Please refer to the applicable */
+/* agreement for further details. */
+
+#ifndef _EVENTFD_WRAPPER_H
+#define _EVENTFD_WRAPPER_H
+
+#include <sys/eventfd.h>
+#include <unistd.h>
+
+namespace intel_opae_mmd {
+
+// simple wrapper class for managing eventfd objects
+// Simple RAII wrapper managing one eventfd object: creates the fd on
+// construction, closes it on destruction. Non-copyable.
+class eventfd_wrapper final {
+ public:
+  eventfd_wrapper() {
+    m_initialized = false;
+    // Note: EFD_SEMAPHORE and EFD_NONBLOCK are not set
+    // The implementation of functions using eventfd assumes that
+    m_fd = eventfd(0, 0);
+    if (m_fd < 0) {
+      fprintf(stderr, "eventfd : %s", strerror(errno));
+      return;
+    }
+
+    m_initialized = true;
+  }
+
+  ~eventfd_wrapper() {
+    if (m_initialized) {
+      if (close(m_fd) < 0) {
+        fprintf(stderr, "eventfd : %s", strerror(errno));
+      }
+    }
+  }
+
+  // Add `count` to the eventfd counter, waking any poller on the fd.
+  // Returns false if the write fails.
+  bool notify(uint64_t count) {
+    ssize_t res = write(m_fd, &count, sizeof(count));
+    if (res < 0) {
+      fprintf(stderr, "eventfd : %s", strerror(errno));
+      return false;
+    }
+    return true;
+  }
+
+  int get_fd() { return m_fd; }
+  bool initialized() { return m_initialized; }
+
+ private:
+  // not used and not implemented
+  eventfd_wrapper(eventfd_wrapper& other);
+  eventfd_wrapper& operator=(const eventfd_wrapper& other);
+
+  // member variables
+  int m_fd;
+  bool m_initialized;  // BUGFIX: was `int`, but only ever holds true/false
+};  // class eventfd_wrapper
+
+}; // namespace intel_opae_mmd
+
+#endif // _EVENTFD_WRAPPER_H
diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/fpga_dma.c b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/fpga_dma.c
new file mode 100644
index 0000000..6c8df30
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/fpga_dma.c
@@ -0,0 +1,1313 @@
+// Copyright 2018-2020 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+// This is derived from OPAE + OpenCL PAC BSP
+
+/**
+ * \fpga_dma.c
+ * \brief FPGA DMA User-mode driver
+ */
+
+#include "fpga_dma.h"
+#include <assert.h>
+#include <errno.h>
+#include <opae/fpga.h>
+#include <poll.h>
+#include <safe_string/safe_string.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include "fpga_dma_internal.h"
+#include "memcpy_s_fast.h"
+
+#ifdef SIM
+#define USE_ASE
+#else
+// TODO: Need this until we can adequately sync MMIO R/W with pointer accesses.
+// Causes module to use fpgaMMIORead32() instead of foo = *ptr;
+#define USE_ASE
+#endif
+
+#ifdef FPGA_DMA_DEBUG
+static int err_cnt = 0;
+#endif
+
+#ifdef CHECK_DELAYS
+double poll_wait_count = 0;
+double buf_full_count = 0;
+#endif
+
+/*
+ * macro for checking return codes
+ */
+#define ON_ERR_GOTO(res, label, desc) \
+ do { \
+ if ((res) != FPGA_OK) { \
+ error_print("Error %s: %s\n", (desc), fpgaErrStr(res)); \
+ goto label; \
+ } \
+ } while (0)
+
+#define ON_ERR_RETURN(res, desc) \
+ do { \
+ if ((res) != FPGA_OK) { \
+ error_print("Error %s: %s\n", (desc), fpgaErrStr(res)); \
+ return (res); \
+ } \
+ } while (0)
+
+// Internal Functions
+
+/**
+ * MMIOWrite64Blk
+ *
+ * @brief Writes a block of 64-bit values to FPGA MMIO space
+ * @param[in] dma Handle to the FPGA DMA object
+ * @param[in] device FPGA address
+ * @param[in] host Host buffer address
+ * @param[in] bytes Size in bytes
+ * @return fpga_result FPGA_OK on success, return code otherwise
+ *
+ */
+static fpga_result MMIOWrite64Blk(fpga_dma_handle dma_h, uint64_t device, uint64_t host, uint64_t bytes) {
+  assert(IS_ALIGNED_QWORD(device));
+  assert(IS_ALIGNED_QWORD(bytes));
+
+  uint64_t *haddr = (uint64_t *)host;
+  uint64_t i;
+  fpga_result res = FPGA_OK;
+
+#ifndef USE_ASE
+  /* direct-pointer path: MMIO is memory-mapped via fpgaMapMMIO */
+  volatile uint64_t *dev_addr = HOST_MMIO_64_ADDR(dma_h, device);
+#endif
+
+  debug_print("copying %lld bytes from 0x%p to 0x%p\n", (long long int)bytes, haddr, (void *)device);
+  for (i = 0; i < bytes / sizeof(uint64_t); i++) {
+#ifdef USE_ASE
+    /* OPAE API path: one fpgaWriteMMIO64 call per 64-bit word */
+    res = fpgaWriteMMIO64(dma_h->fpga_h, dma_h->mmio_num, device, *haddr);
+    ON_ERR_RETURN(res, "fpgaWriteMMIO64");
+    haddr++;
+    device += sizeof(uint64_t);
+#else
+    *dev_addr++ = *haddr++;
+#endif
+  }
+  return res;
+}
+
+/**
+ * MMIOWrite32Blk
+ *
+ * @brief Writes a block of 32-bit values to FPGA MMIO space
+ * @param[in] dma Handle to the FPGA DMA object
+ * @param[in] device FPGA address
+ * @param[in] host Host buffer address
+ * @param[in] bytes Size in bytes
+ * @return fpga_result FPGA_OK on success, return code otherwise
+ *
+ */
+static fpga_result MMIOWrite32Blk(fpga_dma_handle dma_h, uint64_t device, uint64_t host, uint64_t bytes) {
+  assert(IS_ALIGNED_DWORD(device));
+  assert(IS_ALIGNED_DWORD(bytes));
+
+  uint32_t *haddr = (uint32_t *)host;
+  uint64_t i;
+  fpga_result res = FPGA_OK;
+
+#ifndef USE_ASE
+  /* direct-pointer path: MMIO is memory-mapped via fpgaMapMMIO */
+  volatile uint32_t *dev_addr = HOST_MMIO_32_ADDR(dma_h, device);
+#endif
+
+  debug_print("copying %lld bytes from 0x%p to 0x%p\n", (long long int)bytes, haddr, (void *)device);
+  for (i = 0; i < bytes / sizeof(uint32_t); i++) {
+#ifdef USE_ASE
+    /* OPAE API path: one fpgaWriteMMIO32 call per 32-bit word */
+    res = fpgaWriteMMIO32(dma_h->fpga_h, dma_h->mmio_num, device, *haddr);
+    ON_ERR_RETURN(res, "fpgaWriteMMIO32");
+    haddr++;
+    device += sizeof(uint32_t);
+#else
+    *dev_addr++ = *haddr++;
+#endif
+  }
+  return res;
+}
+
+/**
+ * MMIORead64Blk
+ *
+ * @brief Reads a block of 64-bit values from FPGA MMIO space
+ * @param[in] dma Handle to the FPGA DMA object
+ * @param[in] device FPGA address
+ * @param[in] host Host buffer address
+ * @param[in] bytes Size in bytes
+ * @return fpga_result FPGA_OK on success, return code otherwise
+ *
+ */
+static fpga_result MMIORead64Blk(fpga_dma_handle dma_h, uint64_t device, uint64_t host, uint64_t bytes) {
+  assert(IS_ALIGNED_QWORD(device));
+  assert(IS_ALIGNED_QWORD(bytes));
+
+  uint64_t *haddr = (uint64_t *)host;
+  uint64_t i;
+  fpga_result res = FPGA_OK;
+
+#ifndef USE_ASE
+  /* direct-pointer path: MMIO is memory-mapped via fpgaMapMMIO */
+  volatile uint64_t *dev_addr = HOST_MMIO_64_ADDR(dma_h, device);
+#endif
+
+  debug_print("copying %lld bytes from 0x%p to 0x%p\n", (long long int)bytes, (void *)device, haddr);
+  for (i = 0; i < bytes / sizeof(uint64_t); i++) {
+#ifdef USE_ASE
+    /* OPAE API path: one fpgaReadMMIO64 call per 64-bit word */
+    res = fpgaReadMMIO64(dma_h->fpga_h, dma_h->mmio_num, device, haddr);
+    ON_ERR_RETURN(res, "fpgaReadMMIO64");
+    haddr++;
+    device += sizeof(uint64_t);
+#else
+    *haddr++ = *dev_addr++;
+#endif
+  }
+  return res;
+}
+
+/**
+ * MMIORead32Blk
+ *
+ * @brief Reads a block of 32-bit values from FPGA MMIO space
+ * @param[in] dma Handle to the FPGA DMA object
+ * @param[in] device FPGA address
+ * @param[in] host Host buffer address
+ * @param[in] bytes Size in bytes
+ * @return fpga_result FPGA_OK on success, return code otherwise
+ *
+ */
+static fpga_result MMIORead32Blk(fpga_dma_handle dma_h, uint64_t device, uint64_t host, uint64_t bytes) {
+  assert(IS_ALIGNED_DWORD(device));
+  assert(IS_ALIGNED_DWORD(bytes));
+
+  uint32_t *haddr = (uint32_t *)host;
+  uint64_t i;
+  fpga_result res = FPGA_OK;
+
+#ifndef USE_ASE
+  /* direct-pointer path: MMIO is memory-mapped via fpgaMapMMIO */
+  volatile uint32_t *dev_addr = HOST_MMIO_32_ADDR(dma_h, device);
+#endif
+
+  debug_print("copying %lld bytes from 0x%p to 0x%p\n", (long long int)bytes, (void *)device, haddr);
+  for (i = 0; i < bytes / sizeof(uint32_t); i++) {
+#ifdef USE_ASE
+    /* OPAE API path: one fpgaReadMMIO32 call per 32-bit word */
+    res = fpgaReadMMIO32(dma_h->fpga_h, dma_h->mmio_num, device, haddr);
+    ON_ERR_RETURN(res, "fpgaReadMMIO32");
+    haddr++;
+    device += sizeof(uint32_t);
+#else
+    *haddr++ = *dev_addr++;
+#endif
+  }
+  return res;
+}
+
+// Feature type is BBB
+// Feature type is BBB: true when the DFH entry's 4-bit type field identifies
+// a Basic Building Block.
+static inline bool fpga_dma_feature_is_bbb(uint64_t dfh) {
+  // BBB is type 2
+  return ((dfh >> AFU_DFH_TYPE_OFFSET) & 0xf) == FPGA_DMA_BBB;
+}
+
+/**
+ * _switch_to_ase_page
+ *
+ * @brief Updates the current page of ASE to the address given
+ * @param[in] dma_h Handle to the FPGA DMA object
+ * @param[in] addr Address to which the ASE page should be switched
+ * @return Nothing. Side-effect is to update the current page in the DMA handle.
+ *
+ */
+static inline void _switch_to_ase_page(fpga_dma_handle dma_h, uint64_t addr) {
+  uint64_t requested_page = addr & ~DMA_ADDR_SPAN_EXT_WINDOW_MASK;
+
+  // Only touch the ASE control register when the page actually changes.
+  // NOTE(review): the MMIOWrite64Blk result is discarded, so a failed page
+  // switch goes unnoticed by callers.
+  if (requested_page != dma_h->cur_ase_page) {
+    MMIOWrite64Blk(dma_h, ASE_CNTL_BASE(dma_h), (uint64_t)&requested_page, sizeof(requested_page));
+    dma_h->cur_ase_page = requested_page;
+  }
+}
+
+/**
+ * _send_descriptor
+ *
+ * @brief Queues a DMA descriptor to the FPGA
+ * @param[in] dma_h Handle to the FPGA DMA object
+ * @param[in] desc Pointer to a descriptor structure to send
+ * @return fpga_result FPGA_OK on success, return code otherwise
+ *
+ */
+static fpga_result _send_descriptor(fpga_dma_handle dma_h, msgdma_ext_desc_t *desc) {
+  fpga_result res = FPGA_OK;
+  msgdma_status_t status = {0};
+
+  debug_print("desc.rd_address = %x\n", desc->rd_address);
+  debug_print("desc.wr_address = %x\n", desc->wr_address);
+  debug_print("desc.len = %x\n", desc->len);
+  debug_print("desc.wr_burst_count = %x\n", desc->wr_burst_count);
+  debug_print("desc.rd_burst_count = %x\n", desc->rd_burst_count);
+  debug_print("desc.wr_stride %x\n", desc->wr_stride);
+  debug_print("desc.rd_stride %x\n", desc->rd_stride);
+  debug_print("desc.rd_address_ext %x\n", desc->rd_address_ext);
+  debug_print("desc.wr_address_ext %x\n", desc->wr_address_ext);
+
+  debug_print("SGDMA_CSR_BASE = %lx SGDMA_DESC_BASE=%lx\n", dma_h->dma_csr_base, dma_h->dma_desc_base);
+
+#ifdef CHECK_DELAYS
+  bool first = true;
+#endif
+  /* Spin until the DMA engine's descriptor buffer has room (busy-poll of the
+   * CSR status register; CHECK_DELAYS builds count how often it was full). */
+  do {
+    res = MMIORead32Blk(dma_h, CSR_STATUS(dma_h), (uint64_t)&status.reg, sizeof(status.reg));
+    ON_ERR_GOTO(res, out, "MMIORead32Blk");
+#ifdef CHECK_DELAYS
+    if (first && status.st.desc_buf_full) {
+      buf_full_count++;
+      first = false;
+    }
+#endif
+  } while (status.st.desc_buf_full);
+
+  /* Push the descriptor into the engine's descriptor FIFO. */
+  res = MMIOWrite64Blk(dma_h, dma_h->dma_desc_base, (uint64_t)desc, sizeof(*desc));
+  ON_ERR_GOTO(res, out, "MMIOWrite64Blk");
+
+out:
+  return res;
+}
+
+/**
+ * _do_dma
+ *
+ * @brief Performs a DMA transaction with the FPGA
+ * @param[in] dma_h Handle to the FPGA DMA object
+ * @param[in] dst Pointer to a host or FPGA buffer to send or retrieve
+ * @param[in] src Pointer to a host or FPGA buffer to send or retrieve
+ * @param[in] count Number of bytes
+ * @param[in] is_last_desc True if this is the last buffer of a batch
+ * @param[in] type Direction of transfer
+ * @param[in] intr_en True means to ask for an interrupt from the FPGA
+ * @return fpga_result FPGA_OK on success, return code otherwise
+ *
+ */
+static fpga_result _do_dma(fpga_dma_handle dma_h,
+                           uint64_t dst,
+                           uint64_t src,
+                           int count,
+                           int is_last_desc,
+                           fpga_dma_transfer_t type,
+                           bool intr_en) {
+  msgdma_ext_desc_t desc = {0};
+  fpga_result res = FPGA_OK;
+  int alignment_offset = 0;
+  int segment_size = 0;
+
+  // src, dst and count must be 64-byte aligned
+  if (dst % FPGA_DMA_ALIGN_BYTES != 0 || src % FPGA_DMA_ALIGN_BYTES != 0 || count % FPGA_DMA_ALIGN_BYTES != 0) {
+    return FPGA_INVALID_PARAM;
+  }
+  // these fields are fixed for all DMA transfers
+  desc.seq_num = 0;
+  desc.wr_stride = 1;
+  desc.rd_stride = 1;
+
+  desc.control.go = 1;
+  if (intr_en)
+    desc.control.transfer_irq_en = 1;
+  else
+    desc.control.transfer_irq_en = 0;
+
+  // Enable "earlyreaddone" in the control field of the descriptor except the last.
+  // Setting early done causes the read logic to move to the next descriptor
+  // before the previous descriptor completes.
+  // This eliminates a few hundred clock cycles of waiting between transfers.
+  if (!is_last_desc)
+    desc.control.early_done_en = 1;
+  else
+    desc.control.early_done_en = 0;
+
+  // FPGA-to-FPGA copies have no host-side alignment restriction: one
+  // descriptor covers the whole transfer.
+  if (type == FPGA_TO_FPGA_MM) {
+    desc.rd_address = src & FPGA_DMA_MASK_32_BIT;
+    desc.wr_address = dst & FPGA_DMA_MASK_32_BIT;
+    desc.len = count;
+    desc.wr_burst_count = 4;
+    desc.rd_burst_count = 4;
+    desc.rd_address_ext = (src >> 32) & FPGA_DMA_MASK_32_BIT;
+    desc.wr_address_ext = (dst >> 32) & FPGA_DMA_MASK_32_BIT;
+
+    res = _send_descriptor(dma_h, &desc);
+    ON_ERR_GOTO(res, out, "_send_descriptor");
+  }
+  // either FPGA to Host or Host to FPGA transfer so we need to make sure the DMA transaction is aligned to the burst
+  // size (CCIP restriction)
+  else {
+    // need to determine if the CCIP (host) address is aligned to 4CL (256B). When 0 the CCIP address is aligned.
+    alignment_offset =
+        (type == HOST_TO_FPGA_MM) ? (src % (4 * FPGA_DMA_ALIGN_BYTES)) : (dst % (4 * FPGA_DMA_ALIGN_BYTES));
+
+    // Phase 1: not aligned to 4CL so performing a short transfer to get aligned
+    if (alignment_offset != 0) {
+      desc.rd_address = src & FPGA_DMA_MASK_32_BIT;
+      desc.wr_address = dst & FPGA_DMA_MASK_32_BIT;
+      desc.wr_burst_count = 1;
+      desc.rd_burst_count = 1;
+      desc.rd_address_ext = (src >> 32) & FPGA_DMA_MASK_32_BIT;
+      desc.wr_address_ext = (dst >> 32) & FPGA_DMA_MASK_32_BIT;
+
+      // count isn't large enough to hit next 4CL boundary
+      if (((4 * FPGA_DMA_ALIGN_BYTES) - alignment_offset) >= count) {
+        segment_size = count;
+        count = 0;  // only had to transfer count amount of data to reach the end of the provided buffer
+      } else {
+        segment_size = (4 * FPGA_DMA_ALIGN_BYTES) - alignment_offset;
+        src += segment_size;
+        dst += segment_size;
+        count -= segment_size;  // subtract the segment size from count since the transfer below will bring us into 4CL
+                                // alignment
+        desc.control.transfer_irq_en = 0;  // interrupt (if any) belongs to the final descriptor
+      }
+
+      // will post short transfer to align to a 4CL (256 byte) boundary
+      desc.len = segment_size;
+
+      res = _send_descriptor(dma_h, &desc);
+      ON_ERR_GOTO(res, out, "_send_descriptor");
+    }
+    // Phase 2: at this point we are 4CL (256 byte) aligned
+    // if there is at least 4CL (256 bytes) of data to transfer, post bursts of 4
+    if (count >= (4 * FPGA_DMA_ALIGN_BYTES)) {
+      desc.rd_address = src & FPGA_DMA_MASK_32_BIT;
+      desc.wr_address = dst & FPGA_DMA_MASK_32_BIT;
+      desc.wr_burst_count = 4;
+      desc.rd_burst_count = 4;
+      desc.rd_address_ext = (src >> 32) & FPGA_DMA_MASK_32_BIT;
+      desc.wr_address_ext = (dst >> 32) & FPGA_DMA_MASK_32_BIT;
+
+      // buffer ends on 4CL boundary
+      if ((count % (4 * FPGA_DMA_ALIGN_BYTES)) == 0) {
+        segment_size = count;
+        count = 0;  // transfer below will move the remainder of the buffer
+      }
+      // buffers do not end on 4CL boundary so transfer only up to the last 4CL boundary leaving a segment at the end to
+      // finish later
+      else {
+        segment_size = count - (count % (4 * FPGA_DMA_ALIGN_BYTES));  // round count down to the nearest multiple of 4CL
+        src += segment_size;
+        dst += segment_size;
+        count -= segment_size;
+        desc.control.transfer_irq_en = 0;  // interrupt (if any) belongs to the final descriptor
+      }
+
+      desc.len = segment_size;
+
+      res = _send_descriptor(dma_h, &desc);
+      ON_ERR_GOTO(res, out, "_send_descriptor");
+    }
+    // Phase 3: at this point we have posted all the bursts of length 4 we can but there might be 64, 128, or 192 bytes
+    // of data to transfer still if buffer did not end on 4CL (256 byte) boundary post short transfer to handle the
+    // remainder
+    if (count > 0) {
+      desc.rd_address = src & FPGA_DMA_MASK_32_BIT;
+      desc.wr_address = dst & FPGA_DMA_MASK_32_BIT;
+      desc.len = count;
+      desc.wr_burst_count = 1;
+      desc.rd_burst_count = 1;
+      desc.rd_address_ext = (src >> 32) & FPGA_DMA_MASK_32_BIT;
+      desc.wr_address_ext = (dst >> 32) & FPGA_DMA_MASK_32_BIT;
+      if (intr_en) desc.control.transfer_irq_en = 1;
+      // will post short transfer to move the remainder of the buffer
+      res = _send_descriptor(dma_h, &desc);
+      ON_ERR_GOTO(res, out, "_send_descriptor");
+    }
+
+  }  // end of FPGA --> Host or Host --> FPGA transfer
+
+out:
+  return res;
+}
+
+/**
+ * fpgaDmaChannelOpen
+ *
+ * @brief Opens a handle to the DMA channel whose DFH lives at dfh_offset.
+ *        Verifies the DFH is the expected DMA BBB, pins the bounce and magic
+ *        buffers, enables global interrupts, and registers the completion
+ *        interrupt event.
+ * @param[in]  fpga          Handle to the FPGA AFU object obtained via fpgaOpen()
+ * @param[in]  dfh_offset    MMIO offset of the DMA channel DFH
+ * @param[in]  interrupt_num Interrupt vector assigned to this DMA channel
+ * @param[out] dma_p         Receives the DMA handle on success, NULL on failure
+ * @return FPGA_OK on success, error code otherwise. On failure every
+ *         partially acquired resource (pinned buffers, event handle, and the
+ *         handle allocation itself) is released before returning; previously
+ *         several error paths leaked the buffers and/or the handle.
+ */
+fpga_result fpgaDmaChannelOpen(fpga_handle fpga, uint64_t dfh_offset, int interrupt_num, fpga_dma_handle *dma_p) {
+  fpga_result res = FPGA_OK;
+  fpga_dma_handle dma_h = NULL;
+  int i = 0;
+  int buf_count = 0;            // number of DMA buffers successfully pinned (for cleanup)
+  bool magic_prepared = false;  // magic buffer pinned (for cleanup)
+
+  if (!fpga) {
+    return FPGA_INVALID_PARAM;
+  }
+  if (!dma_p) {
+    return FPGA_INVALID_PARAM;
+  }
+  // init the dma handle
+  dma_h = (fpga_dma_handle)malloc(sizeof(struct _dma_handle_t));
+  if (!dma_h) {
+    return FPGA_NO_MEMORY;
+  }
+  dma_h->fpga_h = fpga;
+  for (i = 0; i < FPGA_DMA_MAX_BUF; i++) dma_h->dma_buf_ptr[i] = NULL;
+  dma_h->mmio_num = 0;
+  dma_h->cur_ase_page = 0xffffffffffffffffUll;  // invalid page: forces the first ASE page switch
+
+#ifndef USE_ASE
+  res = fpgaMapMMIO(dma_h->fpga_h, 0, (uint64_t **)&dma_h->mmio_va);
+  ON_ERR_GOTO(res, out, "fpgaMapMMIO");
+#endif
+
+  // Discover the DMA BBB by reading the device feature header at dfh_offset
+  dfh_feature_t dfh = {0};
+  res = MMIORead64Blk(dma_h, dfh_offset, (uint64_t)&dfh, sizeof(dfh));
+  ON_ERR_GOTO(res, out, "MMIORead64Blk");
+
+  if (fpga_dma_feature_is_bbb(dfh.dfh) && (dfh.feature_uuid_lo == FPGA_DMA_UUID_L) &&
+      (dfh.feature_uuid_hi == FPGA_DMA_UUID_H)) {
+    dma_h->dma_base = dfh_offset;
+    dma_h->dma_csr_base = dma_h->dma_base + FPGA_DMA_CSR;
+    dma_h->dma_desc_base = dma_h->dma_base + FPGA_DMA_DESC;
+    dma_h->dma_ase_cntl_base = dma_h->dma_base + FPGA_DMA_ADDR_SPAN_EXT_CNTL;
+    dma_h->dma_ase_data_base = dma_h->dma_base + FPGA_DMA_ADDR_SPAN_EXT_DATA;
+  } else {
+    res = FPGA_NOT_FOUND;
+    goto out;
+  }
+
+  // Buffer size must be page aligned for prepareBuffer
+  for (i = 0; i < FPGA_DMA_MAX_BUF; i++) {
+    res = fpgaPrepareBuffer(
+        dma_h->fpga_h, FPGA_DMA_BUF_SIZE, (void **)&(dma_h->dma_buf_ptr[i]), &dma_h->dma_buf_wsid[i], 0);
+    ON_ERR_GOTO(res, rel_buf, "fpgaPrepareBuffer");
+    buf_count = i + 1;  // this buffer must now be released on any later failure
+
+    // Make sure it's actually allocated
+    dma_h->dma_buf_ptr[i][0] = 0xff;
+    madvise((void *)dma_h->dma_buf_ptr[i], FPGA_DMA_BUF_SIZE, MADV_SEQUENTIAL);
+
+    res = fpgaGetIOAddress(dma_h->fpga_h, dma_h->dma_buf_wsid[i], &dma_h->dma_buf_iova[i]);
+    ON_ERR_GOTO(res, rel_buf, "fpgaGetIOAddress");
+  }
+
+  // Allocate magic number buffer (completion fence target; see _issue_magic/_wait_magic)
+  res = fpgaPrepareBuffer(dma_h->fpga_h, FPGA_DMA_ALIGN_BYTES, (void **)&(dma_h->magic_buf), &dma_h->magic_wsid, 0);
+  ON_ERR_GOTO(res, rel_buf, "fpgaPrepareBuffer");
+  magic_prepared = true;
+
+  dma_h->magic_buf[0] = 0xff;  // touch the page so it is really backed
+
+  res = fpgaGetIOAddress(dma_h->fpga_h, dma_h->magic_wsid, &dma_h->magic_iova);
+  ON_ERR_GOTO(res, rel_buf, "fpgaGetIOAddress");
+  memset((void *)dma_h->magic_buf, 0, FPGA_DMA_ALIGN_BYTES);
+
+  // turn on global interrupts
+  msgdma_ctrl_t ctrl = {0};
+  ctrl.ct.global_intr_en_mask = 1;
+  res = MMIOWrite32Blk(dma_h, CSR_CONTROL(dma_h), (uint64_t)&ctrl.reg, sizeof(ctrl.reg));
+  ON_ERR_GOTO(res, rel_buf, "MMIOWrite32Blk");
+
+  // register interrupt event handle
+  res = fpgaCreateEventHandle(&dma_h->eh);
+  ON_ERR_GOTO(res, rel_buf, "fpgaCreateEventHandle");
+
+  res = fpgaRegisterEvent(dma_h->fpga_h, FPGA_EVENT_INTERRUPT, dma_h->eh, interrupt_num /*vector id */);
+  ON_ERR_GOTO(res, destroy_eh, "fpgaRegisterEvent");
+
+  // Publish the handle only once everything succeeded.
+  *dma_p = dma_h;
+  return FPGA_OK;
+
+  // Unwind in reverse acquisition order. Cleanup calls are best-effort and
+  // deliberately do NOT overwrite 'res', so the original failure is reported.
+destroy_eh:
+  fpgaDestroyEventHandle(&dma_h->eh);
+
+rel_buf:
+  if (magic_prepared) {
+    fpgaReleaseBuffer(dma_h->fpga_h, dma_h->magic_wsid);
+  }
+  for (i = 0; i < buf_count; i++) {
+    fpgaReleaseBuffer(dma_h->fpga_h, dma_h->dma_buf_wsid[i]);
+  }
+out:
+  *dma_p = NULL;
+  free(dma_h);
+  return res;
+}
+
+/**
+ * _read_memory_mmio_unaligned
+ *
+ * @brief Performs an unaligned read (address not 4/8/64 byte aligned) from an
+ *        FPGA address (device address) by fetching the containing QWORD
+ *        through the ASE window and copying out only the requested bytes.
+ * @param[in] dma_h     Handle to the FPGA DMA object
+ * @param[in] dev_addr  FPGA address
+ * @param[in] host_addr Host buffer address
+ * @param[in] count     Size in bytes, always less than 8 bytes
+ * @return fpga_result FPGA_OK on success, return code otherwise
+ *
+ */
+static fpga_result _read_memory_mmio_unaligned(fpga_dma_handle dma_h,
+                                               uint64_t dev_addr,
+                                               uint64_t host_addr,
+                                               uint64_t count) {
+  fpga_result res = FPGA_OK;
+
+  // Caller guarantees the residue is smaller than one QWORD.
+  assert(count < QWORD_BYTES);
+
+  if (0 == count) return res;
+
+  // Byte offset of dev_addr within its containing QWORD.
+  uint64_t shift = dev_addr % QWORD_BYTES;
+  debug_print("shift = %08lx , count = %08lx \n", shift, count);
+
+  // Map the ASE window over the page containing dev_addr; the aligned offset
+  // of the QWORD is relative to the start of that window.
+  _switch_to_ase_page(dma_h, dev_addr);
+  uint64_t dev_aligned_addr = (dev_addr - shift) & DMA_ADDR_SPAN_EXT_WINDOW_MASK;
+
+  // read the whole QWORD containing the requested bytes from device memory
+  uint64_t read_tmp = 0;
+  res = MMIORead64Blk(dma_h, ASE_DATA_BASE(dma_h) + dev_aligned_addr, (uint64_t)&read_tmp, sizeof(read_tmp));
+  if (res != FPGA_OK) return res;
+
+  // copy only the requested bytes, starting at the unaligned offset
+  memcpy_s_fast((void *)host_addr, count, ((char *)(&read_tmp)) + shift, count);
+
+  return res;
+}
+
+/**
+ * _write_memory_mmio_unaligned
+ *
+ * @brief Performs an unaligned write (address not 4/8/64 byte aligned) to an
+ *        FPGA address (device address) with a read-modify-write of the
+ *        containing QWORD through the ASE window.
+ * @param[in] dma_h     Handle to the FPGA DMA object
+ * @param[in] dev_addr  FPGA address
+ * @param[in] host_addr Host buffer address
+ * @param[in] count     Size in bytes, always less than 8 bytes
+ * @return fpga_result FPGA_OK on success, return code otherwise
+ *
+ */
+static fpga_result _write_memory_mmio_unaligned(fpga_dma_handle dma_h,
+                                                uint64_t dev_addr,
+                                                uint64_t host_addr,
+                                                uint64_t count) {
+  fpga_result res = FPGA_OK;
+
+  // Caller guarantees the residue is smaller than one QWORD.
+  assert(count < QWORD_BYTES);
+
+  if (0 == count) return res;
+
+  // Byte offset of dev_addr within its containing QWORD.
+  uint64_t shift = dev_addr % QWORD_BYTES;
+  debug_print("shift = %08lx , count = %08lx \n", shift, count);
+
+  _switch_to_ase_page(dma_h, dev_addr);
+  uint64_t dev_aligned_addr = (dev_addr - (dev_addr % QWORD_BYTES)) & DMA_ADDR_SPAN_EXT_WINDOW_MASK;
+
+  // read the whole QWORD we are about to partially overwrite
+  uint64_t read_tmp = 0;
+  res = MMIORead64Blk(dma_h, ASE_DATA_BASE(dma_h) + dev_aligned_addr, (uint64_t)&read_tmp, sizeof(read_tmp));
+  if (res != FPGA_OK) return res;
+
+  // overlay the host bytes onto the read-back QWORD at the unaligned offset
+  memcpy_s_fast(((char *)(&read_tmp)) + shift, count, (void *)host_addr, count);
+
+  // write the merged QWORD back to device memory
+  res = MMIOWrite64Blk(dma_h, ASE_DATA_BASE(dma_h) + dev_aligned_addr, (uint64_t)&read_tmp, sizeof(read_tmp));
+  if (res != FPGA_OK) return res;
+
+  return res;
+}
+
+/**
+ * _write_memory_mmio
+ *
+ * @brief Writes to a DWORD/QWORD aligned memory address (FPGA address):
+ *        an optional leading DWORD to reach QWORD alignment, then bulk
+ *        QWORD blocks (never crossing an ASE window in one MMIO block),
+ *        then an optional trailing DWORD.
+ * @param[in]     dma_h   Handle to the FPGA DMA object
+ * @param[in/out] dst_ptr Pointer to the FPGA address
+ * @param[in/out] src_ptr Pointer to the Host buffer address
+ * @param[in/out] count   Pointer to the size in bytes
+ * @return fpga_result FPGA_OK on success, return code otherwise.
+ *         Updates src, dst, and count so the caller can finish the
+ *         sub-DWORD tail with the unaligned helper.
+ *
+ */
+static fpga_result _write_memory_mmio(fpga_dma_handle dma_h, uint64_t *dst_ptr, uint64_t *src_ptr, uint64_t *count) {
+  fpga_result res = FPGA_OK;
+
+  // Less than one DWORD left: nothing this helper can do; caller handles it.
+  if (*count < DWORD_BYTES) return res;
+
+  assert(*count >= DWORD_BYTES);
+  assert(IS_ALIGNED_DWORD(*dst_ptr));
+  if (!IS_ALIGNED_DWORD(*dst_ptr)) // If QWORD aligned, this will be true
+    return FPGA_EXCEPTION;
+
+  uint64_t src = *src_ptr;
+  uint64_t dst = *dst_ptr;
+  uint64_t align_bytes = *count;
+  uint64_t offset = 0;
+
+  if (!IS_ALIGNED_QWORD(dst)) {
+    // Write out a single DWORD to get QWORD aligned
+    _switch_to_ase_page(dma_h, dst);
+    offset = dst & DMA_ADDR_SPAN_EXT_WINDOW_MASK;
+    res = MMIOWrite32Blk(dma_h, ASE_DATA_BASE(dma_h) + offset, (uint64_t)src, DWORD_BYTES);
+    ON_ERR_RETURN(res, "MMIOWrite32Blk");
+    src += DWORD_BYTES;
+    dst += DWORD_BYTES;
+    align_bytes -= DWORD_BYTES;
+  }
+
+  if (0 == align_bytes) return res;
+
+  assert(IS_ALIGNED_QWORD(dst));
+
+  // Write out blocks of 64-bit values
+  while (align_bytes >= QWORD_BYTES) {
+    // A single MMIO block write must not cross the current ASE window.
+    uint64_t left_in_page = DMA_ADDR_SPAN_EXT_WINDOW;
+    left_in_page -= dst & DMA_ADDR_SPAN_EXT_WINDOW_MASK;
+    // Smaller of: bytes left in this window, or remaining whole QWORDs.
+    uint64_t size_to_copy = min(left_in_page, (align_bytes & ~(QWORD_BYTES - 1)));
+    if (size_to_copy < QWORD_BYTES) break;
+    _switch_to_ase_page(dma_h, dst);
+    offset = dst & DMA_ADDR_SPAN_EXT_WINDOW_MASK;
+    res = MMIOWrite64Blk(dma_h, ASE_DATA_BASE(dma_h) + offset, (uint64_t)src, size_to_copy);
+    ON_ERR_RETURN(res, "MMIOWrite64Blk");
+    src += size_to_copy;
+    dst += size_to_copy;
+    align_bytes -= size_to_copy;
+  }
+
+  if (align_bytes >= DWORD_BYTES) {
+    // Write out remaining DWORD
+    _switch_to_ase_page(dma_h, dst);
+    offset = dst & DMA_ADDR_SPAN_EXT_WINDOW_MASK;
+    res = MMIOWrite32Blk(dma_h, ASE_DATA_BASE(dma_h) + offset, (uint64_t)src, DWORD_BYTES);
+    ON_ERR_RETURN(res, "MMIOWrite32Blk");
+    src += DWORD_BYTES;
+    dst += DWORD_BYTES;
+    align_bytes -= DWORD_BYTES;
+  }
+
+  // Only a sub-DWORD tail may remain; report progress back to the caller.
+  assert(align_bytes < DWORD_BYTES);
+
+  *src_ptr = src;
+  *dst_ptr = dst;
+  *count = align_bytes;
+  return res;
+}
+
+/**
+ * _ase_host_to_fpga
+ *
+ * @brief Tx "count" bytes from HOST to FPGA using the Address Span Expander
+ *        (ASE) - internally handles the unaligned head, the DWORD/QWORD
+ *        aligned middle, and the unaligned tail of the range.
+ * @param[in]     dma_h   Handle to the FPGA DMA object
+ * @param[in/out] dst_ptr Pointer to the FPGA address
+ * @param[in/out] src_ptr Pointer to the Host buffer address
+ * @param[in]     count   Size in bytes
+ * @return fpga_result FPGA_OK on success, return code otherwise.
+ *         Updates src and dst past the transferred range.
+ *
+ */
+static fpga_result _ase_host_to_fpga(fpga_dma_handle dma_h, uint64_t *dst_ptr, uint64_t *src_ptr, uint64_t count) {
+  fpga_result res = FPGA_OK;
+  uint64_t dst = *dst_ptr;
+  uint64_t src = *src_ptr;
+  uint64_t count_left = count;
+  uint64_t unaligned_size = 0;
+
+  debug_print("dst_ptr = %08lx , count = %08lx, src = %08lx \n", *dst_ptr, count, *src_ptr);
+
+  // Head: bring dst up to 8-byte alignment with the read-modify-write helper.
+  // NOTE(review): !IS_ALIGNED_DWORD(dst) already implies !IS_ALIGNED_QWORD(dst),
+  // so the second test below is redundant (harmless).
+  if (!IS_ALIGNED_DWORD(dst) && !IS_ALIGNED_QWORD(dst)) {
+    unaligned_size = QWORD_BYTES - (dst % QWORD_BYTES);
+    if (unaligned_size > count_left) unaligned_size = count_left;
+    res = _write_memory_mmio_unaligned(dma_h, dst, src, unaligned_size);
+    if (res != FPGA_OK) return res;
+    count_left -= unaligned_size;
+    src += unaligned_size;
+    dst += unaligned_size;
+  }
+  // Middle: 8/4 byte MMIO transfer; updates dst/src/count_left in place.
+  res = _write_memory_mmio(dma_h, &dst, &src, &count_left);
+  if (res != FPGA_OK) return res;
+
+  // Tail: remaining sub-DWORD bytes use the read-modify-write helper again.
+  unaligned_size = QWORD_BYTES - (dst % QWORD_BYTES);
+  if (unaligned_size > count_left) unaligned_size = count_left;
+
+  res = _write_memory_mmio_unaligned(dma_h, dst, src, unaligned_size);
+  if (res != FPGA_OK) return res;
+
+  count_left -= unaligned_size;
+
+  *dst_ptr = dst + unaligned_size;
+  *src_ptr = src + unaligned_size;
+
+  return FPGA_OK;
+}
+
+/**
+ * _read_memory_mmio
+ *
+ * @brief Reads from a DWORD/QWORD aligned memory address (FPGA address):
+ *        an optional leading DWORD to reach QWORD alignment, then bulk
+ *        QWORD blocks (never crossing an ASE window in one MMIO block),
+ *        then an optional trailing DWORD.
+ * @param[in]     dma_h   Handle to the FPGA DMA object
+ * @param[in/out] dst_ptr Pointer to the Host buffer address
+ * @param[in/out] src_ptr Pointer to the FPGA address
+ * @param[in/out] count   Pointer to the size in bytes
+ * @return fpga_result FPGA_OK on success, return code otherwise.
+ *         Updates src, dst, and count so the caller can finish the
+ *         sub-DWORD tail with the unaligned helper.
+ *
+ */
+static fpga_result _read_memory_mmio(fpga_dma_handle dma_h, uint64_t *src_ptr, uint64_t *dst_ptr, uint64_t *count) {
+  fpga_result res = FPGA_OK;
+
+  // Less than one DWORD left: nothing this helper can do; caller handles it.
+  if (*count < DWORD_BYTES) return res;
+
+  assert(*count >= DWORD_BYTES);
+  assert(IS_ALIGNED_DWORD(*src_ptr));
+  if (!IS_ALIGNED_DWORD(*src_ptr)) // If QWORD aligned, this will be true
+    return FPGA_EXCEPTION;
+
+  uint64_t src = *src_ptr;
+  uint64_t dst = *dst_ptr;
+  uint64_t align_bytes = *count;
+  uint64_t offset = 0;
+
+  if (!IS_ALIGNED_QWORD(src)) {
+    // Read a single DWORD to get QWORD aligned
+    _switch_to_ase_page(dma_h, src);
+    offset = src & DMA_ADDR_SPAN_EXT_WINDOW_MASK;
+    res = MMIORead32Blk(dma_h, ASE_DATA_BASE(dma_h) + offset, (uint64_t)dst, DWORD_BYTES);
+    ON_ERR_RETURN(res, "MMIORead32Blk");
+    src += DWORD_BYTES;
+    dst += DWORD_BYTES;
+    align_bytes -= DWORD_BYTES;
+  }
+
+  if (0 == align_bytes) return res;
+
+  assert(IS_ALIGNED_QWORD(src));
+
+  // Read blocks of 64-bit values
+  while (align_bytes >= QWORD_BYTES) {
+    // A single MMIO block read must not cross the current ASE window.
+    uint64_t left_in_page = DMA_ADDR_SPAN_EXT_WINDOW;
+    left_in_page -= src & DMA_ADDR_SPAN_EXT_WINDOW_MASK;
+    // Smaller of: bytes left in this window, or remaining whole QWORDs.
+    uint64_t size_to_copy = min(left_in_page, (align_bytes & ~(QWORD_BYTES - 1)));
+    if (size_to_copy < QWORD_BYTES) break;
+    _switch_to_ase_page(dma_h, src);
+    offset = src & DMA_ADDR_SPAN_EXT_WINDOW_MASK;
+    res = MMIORead64Blk(dma_h, ASE_DATA_BASE(dma_h) + offset, (uint64_t)dst, size_to_copy);
+    ON_ERR_RETURN(res, "MMIORead64Blk");
+    src += size_to_copy;
+    dst += size_to_copy;
+    align_bytes -= size_to_copy;
+  }
+
+  if (align_bytes >= DWORD_BYTES) {
+    // Read remaining DWORD
+    _switch_to_ase_page(dma_h, src);
+    offset = src & DMA_ADDR_SPAN_EXT_WINDOW_MASK;
+    res = MMIORead32Blk(dma_h, ASE_DATA_BASE(dma_h) + offset, (uint64_t)dst, DWORD_BYTES);
+    ON_ERR_RETURN(res, "MMIORead32Blk");
+    src += DWORD_BYTES;
+    dst += DWORD_BYTES;
+    align_bytes -= DWORD_BYTES;
+  }
+
+  // Only a sub-DWORD tail may remain; report progress back to the caller.
+  assert(align_bytes < DWORD_BYTES);
+
+  *src_ptr = src;
+  *dst_ptr = dst;
+  *count = align_bytes;
+  return res;
+}
+
+/**
+ * _ase_fpga_to_host
+ *
+ * @brief Tx "count" bytes from FPGA to HOST using the Address Span Expander
+ *        (ASE) - internally handles the unaligned head, the DWORD/QWORD
+ *        aligned middle, and the unaligned tail of the range.
+ * @param[in]     dma_h   Handle to the FPGA DMA object
+ * @param[in/out] dst_ptr Pointer to the Host buffer address
+ * @param[in/out] src_ptr Pointer to the FPGA address
+ * @param[in]     count   Size in bytes
+ * @return fpga_result FPGA_OK on success, return code otherwise.
+ *         Updates src and dst past the transferred range.
+ *
+ */
+static fpga_result _ase_fpga_to_host(fpga_dma_handle dma_h, uint64_t *src_ptr, uint64_t *dst_ptr, uint64_t count) {
+  fpga_result res = FPGA_OK;
+  uint64_t src = *src_ptr;
+  uint64_t dst = *dst_ptr;
+  uint64_t count_left = count;
+  uint64_t unaligned_size = 0;
+
+  debug_print("dst_ptr = %08lx , count = %08lx, src = %08lx \n", *dst_ptr, count, *src_ptr);
+
+  // Head: bring src up to 8-byte alignment with the unaligned-read helper.
+  // NOTE(review): !IS_ALIGNED_DWORD(src) already implies !IS_ALIGNED_QWORD(src),
+  // so the second test below is redundant (harmless).
+  if (!IS_ALIGNED_DWORD(src) && !IS_ALIGNED_QWORD(src)) {
+    unaligned_size = QWORD_BYTES - (src % QWORD_BYTES);
+    if (unaligned_size > count_left) unaligned_size = count_left;
+    res = _read_memory_mmio_unaligned(dma_h, src, dst, unaligned_size);
+    if (res != FPGA_OK) return res;
+    count_left -= unaligned_size;
+    dst += unaligned_size;
+    src += unaligned_size;
+  }
+  // Middle: 8/4 byte MMIO transfer; updates src/dst/count_left in place.
+  res = _read_memory_mmio(dma_h, &src, &dst, &count_left);
+  if (res != FPGA_OK) return res;
+
+  // Tail: remaining sub-DWORD bytes use the unaligned-read helper again.
+  unaligned_size = QWORD_BYTES - (src % QWORD_BYTES);
+  if (unaligned_size > count_left) unaligned_size = count_left;
+
+  res = _read_memory_mmio_unaligned(dma_h, src, dst, unaligned_size);
+  if (res != FPGA_OK) return res;
+
+  count_left -= unaligned_size;
+
+  *dst_ptr = dst + unaligned_size;
+  *src_ptr = src + unaligned_size;
+
+  return FPGA_OK;
+}
+
+/**
+ * clear_interrupt
+ *
+ * @brief Acknowledges a pending msgdma interrupt. The status register's IRQ
+ *        bit has write-1-to-clear semantics, so writing a status word with
+ *        only that bit set clears the interrupt.
+ */
+static fpga_result clear_interrupt(fpga_dma_handle dma_h) {
+  msgdma_status_t irq_ack = {0};
+  irq_ack.st.irq = 1;
+  return MMIOWrite32Blk(dma_h, CSR_STATUS(dma_h), (uint64_t)&irq_ack.reg, sizeof(irq_ack.reg));
+}
+
+/**
+ * poll_interrupt
+ *
+ * @brief Blocks (up to FPGA_DMA_TIMEOUT_MSEC) until the DMA completion
+ *        interrupt fires, then drains the event object and clears the
+ *        interrupt in the msgdma status register.
+ * @param[in] dma_h Handle to the FPGA DMA object
+ * @return FPGA_OK on success, FPGA_EXCEPTION on poll error/timeout or a
+ *         failed event read
+ */
+static fpga_result poll_interrupt(fpga_dma_handle dma_h) {
+  struct pollfd pfd = {0};
+  msgdma_status_t status = { 0 };
+  fpga_result res = FPGA_OK;
+  int poll_res;
+
+  // The event handle exposes an OS file descriptor we can poll() on.
+  res = fpgaGetOSObjectFromEventHandle(dma_h->eh, &pfd.fd);
+  ON_ERR_GOTO(res, out, "fpgaGetOSObjectFromEventHandle failed\n");
+
+  pfd.events = POLLIN;
+
+#ifdef CHECK_DELAYS
+  // Instrumentation build only: count how often we would actually have waited.
+  if (0 == poll(&pfd, 1, 0)) poll_wait_count++;
+#endif
+  poll_res = poll(&pfd, 1, FPGA_DMA_TIMEOUT_MSEC);
+  // Status register is read purely for the diagnostic messages below;
+  // its return code is intentionally ignored.
+  MMIORead32Blk(dma_h, CSR_STATUS(dma_h), (uint64_t)& status.reg, sizeof(status.reg));
+  if (poll_res < 0) {
+    fprintf(stderr, "Poll error errno = %s DMA status reg: 0x%x\n", strerror(errno), status.reg);
+    res = FPGA_EXCEPTION;
+    goto out;
+  } else if (poll_res == 0) {
+    fprintf(stderr, "Poll(interrupt) timeout DMA status reg: 0x%x\n", status.reg);
+    res = FPGA_EXCEPTION;
+  } else {
+    // Drain the event counter so the fd is not immediately readable again.
+    uint64_t count = 0;
+    ssize_t bytes_read = read(pfd.fd, &count, sizeof(count));
+    if (bytes_read > 0) {
+      debug_print("Poll success. Return = %d, count = %d\n", poll_res, (int)count);
+      res = FPGA_OK;
+    } else {
+      fprintf(stderr, "Error: poll failed read: zero bytes read");
+      res = FPGA_EXCEPTION;
+    }
+  }
+
+out:
+  // Always acknowledge the hardware interrupt, even on error/timeout.
+  clear_interrupt(dma_h);
+  return res;
+}
+
+/**
+ * _issue_magic
+ *
+ * @brief Posts a 64-byte DMA read of the magic-number ROM into the pinned
+ *        host magic buffer. Used as a completion fence: once the magic value
+ *        appears in host memory (see _wait_magic), the previously posted
+ *        descriptors have been processed.
+ * @param[in] dma_h Handle to the FPGA DMA object
+ * @return fpga_result from posting the descriptor
+ */
+static fpga_result _issue_magic(fpga_dma_handle dma_h) {
+  fpga_result res = FPGA_OK;
+  // Clear the landing zone first so _wait_magic spins on a fresh value.
+  *(dma_h->magic_buf) = 0x0ULL;
+
+  res = _do_dma(dma_h,
+                dma_h->magic_iova | FPGA_DMA_WF_HOST_MASK,
+                FPGA_DMA_WF_ROM_MAGIC_NO_MASK,
+                64,
+                1,
+                FPGA_TO_HOST_MM,
+                FPGA2HOST_IRQ_REQ /*intr_en */);
+  return res;
+}
+
+/**
+ * _wait_magic
+ *
+ * @brief Waits for the fence posted by _issue_magic: optionally waits for the
+ *        interrupt, then busy-spins until the magic value lands in the host
+ *        buffer, and finally re-arms the buffer for the next fence.
+ *        NOTE(review): the spin loop relies on *magic_buf being re-read each
+ *        iteration - confirm magic_buf is declared volatile in the handle.
+ */
+static void _wait_magic(fpga_dma_handle dma_h) {
+#ifndef SKIP_FPGA2HOST_IRQ
+  poll_interrupt(dma_h);
+#endif
+  while (*(dma_h->magic_buf) != FPGA_DMA_WF_MAGIC_NO)
+    ;
+  *(dma_h->magic_buf) = 0x0ULL;  // re-arm for the next fence
+}
+
+/**
+ * transferHostToFpga
+ *
+ * @brief Copies count bytes from host memory (src) to FPGA memory (dst).
+ *        Unaligned head/tail bytes go through the ASE MMIO path; the aligned
+ *        middle is staged into the pinned DMA buffers (round-robin over
+ *        FPGA_DMA_MAX_BUF) and moved in FPGA_DMA_BUF_SIZE chunks, overlapping
+ *        the memcpy into the next buffer with the in-flight descriptor.
+ * @param[in] dma_h Handle to the FPGA DMA object
+ * @param[in] dst   FPGA destination address
+ * @param[in] src   Host source address
+ * @param[in] count Size in bytes
+ * @param[in] type  Transfer type (HOST_TO_FPGA_MM)
+ * @return fpga_result FPGA_OK on success, return code otherwise
+ */
+fpga_result transferHostToFpga(
+    fpga_dma_handle dma_h, uint64_t dst, uint64_t src, size_t count, fpga_dma_transfer_t type) {
+  fpga_result res = FPGA_OK;
+  uint64_t i = 0;
+  uint64_t count_left = count;
+  uint64_t aligned_addr = 0;
+  uint64_t align_bytes = 0;
+  int issued_intr = 0;  // 1 while a descriptor that will raise an interrupt is outstanding
+  debug_print("Host To Fpga ----------- src = %08lx, dst = %08lx \n", src, dst);
+  // Head: bring dst up to the next DMA-aligned address via the ASE path.
+  if (!IS_DMA_ALIGNED(dst)) {
+    if (count_left < FPGA_DMA_ALIGN_BYTES) {
+      // Entire transfer is below one alignment unit: do it all via ASE.
+      res = _ase_host_to_fpga(dma_h, &dst, &src, count_left);
+      ON_ERR_GOTO(res, out, "HOST_TO_FPGA_MM Transfer failed\n");
+      return res;
+    } else {
+      aligned_addr = ((dst / FPGA_DMA_ALIGN_BYTES) + 1) * FPGA_DMA_ALIGN_BYTES;
+      align_bytes = aligned_addr - dst;
+      res = _ase_host_to_fpga(dma_h, &dst, &src, align_bytes);
+      ON_ERR_GOTO(res, out, "HOST_TO_FPGA_MM Transfer failed\n");
+      count_left = count_left - align_bytes;
+    }
+  }
+  if (count_left) {
+    // Middle: whole FPGA_DMA_BUF_SIZE chunks through the pinned buffers.
+    uint32_t dma_chunks = count_left / FPGA_DMA_BUF_SIZE;
+    count_left -= (dma_chunks * FPGA_DMA_BUF_SIZE);
+    debug_print(
+        "DMA TX : dma chuncks = %d, count_left = %08lx, dst = %08lx, src = %08lx \n", dma_chunks, count_left, dst, src);
+
+    for (i = 0; i < dma_chunks; i++) {
+      // constant size transfer, no length check required for memcpy
+      memcpy_s_fast(dma_h->dma_buf_ptr[i % FPGA_DMA_MAX_BUF],
+                    FPGA_DMA_BUF_SIZE,
+                    (void *)(src + i * FPGA_DMA_BUF_SIZE),
+                    FPGA_DMA_BUF_SIZE);
+      // The value of FPGA_DMA_MAX_BUF is 2. Thus FPGA_DMA_MAX_BUF/2 -- 1, so the comparison
+      // is always i % 1 == 0, which will always be true. This means that the i == (dma_chunks -1)
+      // portion of the conditional will never be reached. However, for clarity and in case
+      // FPGA_DMA_MAX_BUF changes, I will leave the conditional as is and apply a coverity supression
+      // coverity[deadcode:FALSE]
+      if ((i % (FPGA_DMA_MAX_BUF / 2) == (FPGA_DMA_MAX_BUF / 2) - 1) || i == (dma_chunks - 1) /*last descriptor */) {
+        if (i == (FPGA_DMA_MAX_BUF / 2) - 1) {
+          // First interrupting descriptor: nothing outstanding yet to wait on.
+          res = _do_dma(dma_h,
+                        (dst + i * FPGA_DMA_BUF_SIZE),
+                        dma_h->dma_buf_iova[i % FPGA_DMA_MAX_BUF] | FPGA_DMA_HOST_MASK,
+                        FPGA_DMA_BUF_SIZE,
+                        0,
+                        type,
+                        true);
+        } else {
+          // Reap the previous interrupt before reusing its buffer slot.
+          if (issued_intr) poll_interrupt(dma_h);
+          res = _do_dma(dma_h,
+                        (dst + i * FPGA_DMA_BUF_SIZE),
+                        dma_h->dma_buf_iova[i % FPGA_DMA_MAX_BUF] | FPGA_DMA_HOST_MASK,
+                        FPGA_DMA_BUF_SIZE,
+                        0,
+                        type,
+                        true /*intr_en */);
+        }
+        issued_intr = 1;
+      } else {
+        // Intermediate descriptor: post without an interrupt.
+        res = _do_dma(dma_h,
+                      (dst + i * FPGA_DMA_BUF_SIZE),
+                      dma_h->dma_buf_iova[i % FPGA_DMA_MAX_BUF] | FPGA_DMA_HOST_MASK,
+                      FPGA_DMA_BUF_SIZE,
+                      0,
+                      type,
+                      false /*intr_en */);
+      }
+    }
+    // Drain the last outstanding interrupt before touching the buffers again.
+    if (issued_intr) {
+      poll_interrupt(dma_h);
+      issued_intr = 0;
+    }
+    if (count_left) {
+      // Tail: one short DMA for the remaining aligned bytes...
+      uint64_t dma_tx_bytes = (count_left / FPGA_DMA_ALIGN_BYTES) * FPGA_DMA_ALIGN_BYTES;
+      if (dma_tx_bytes != 0) {
+        debug_print("dma_tx_bytes = %08lx was transfered using DMA\n", dma_tx_bytes);
+        if (dma_tx_bytes > FPGA_DMA_BUF_SIZE) {
+          res = FPGA_NO_MEMORY;
+          ON_ERR_GOTO(res, out, "Illegal transfer size\n");
+        }
+
+        memcpy_s_fast(
+            dma_h->dma_buf_ptr[0], dma_tx_bytes, (void *)(src + dma_chunks * FPGA_DMA_BUF_SIZE), dma_tx_bytes);
+        res = _do_dma(dma_h,
+                      (dst + dma_chunks * FPGA_DMA_BUF_SIZE),
+                      dma_h->dma_buf_iova[0] | FPGA_DMA_HOST_MASK,
+                      dma_tx_bytes,
+                      1,
+                      type,
+                      true /*intr_en */);
+        ON_ERR_GOTO(res, out, "HOST_TO_FPGA_MM Transfer failed\n");
+        poll_interrupt(dma_h);
+      }
+      count_left -= dma_tx_bytes;
+      if (count_left) {
+        // ...and ASE for the final sub-alignment bytes.
+        dst = dst + dma_chunks * FPGA_DMA_BUF_SIZE + dma_tx_bytes;
+        src = src + dma_chunks * FPGA_DMA_BUF_SIZE + dma_tx_bytes;
+        res = _ase_host_to_fpga(dma_h, &dst, &src, count_left);
+        ON_ERR_GOTO(res, out, "HOST_TO_FPGA_MM Transfer failed\n");
+      }
+    }
+  }
+out:
+  return res;
+}
+
+/**
+ * transferFpgaToHost
+ *
+ * @brief Copies count bytes from FPGA memory (src) to host memory (dst).
+ *        Unaligned head/tail bytes go through the ASE MMIO path; the aligned
+ *        middle is DMAed into the pinned buffers (round-robin) and copied out
+ *        to dst. Completion of posted descriptors is detected with a magic
+ *        "write fence" (_issue_magic/_wait_magic) rather than interrupts.
+ * @param[in] dma_h Handle to the FPGA DMA object
+ * @param[in] dst   Host destination address
+ * @param[in] src   FPGA source address
+ * @param[in] count Size in bytes
+ * @param[in] type  Transfer type (FPGA_TO_HOST_MM)
+ * @return fpga_result FPGA_OK on success, return code otherwise
+ */
+fpga_result transferFpgaToHost(
+    fpga_dma_handle dma_h, uint64_t dst, uint64_t src, size_t count, fpga_dma_transfer_t type) {
+  fpga_result res = FPGA_OK;
+  uint64_t i = 0;
+  uint64_t j = 0;
+  uint64_t count_left = count;
+  uint64_t aligned_addr = 0;
+  uint64_t align_bytes = 0;
+  int wf_issued = 0;  // 1 while a magic write-fence is outstanding
+
+  debug_print("FPGA To Host ----------- src = %08lx, dst = %08lx \n", src, dst);
+  // Head: bring src up to the next DMA-aligned address via the ASE path.
+  if (!IS_DMA_ALIGNED(src)) {
+    if (count_left < FPGA_DMA_ALIGN_BYTES) {
+      // Entire transfer is below one alignment unit: do it all via ASE.
+      res = _ase_fpga_to_host(dma_h, &src, &dst, count_left);
+      ON_ERR_GOTO(res, out, "FPGA_TO_HOST_MM Transfer failed");
+      return res;
+    } else {
+      aligned_addr = ((src / FPGA_DMA_ALIGN_BYTES) + 1) * FPGA_DMA_ALIGN_BYTES;
+      align_bytes = aligned_addr - src;
+      res = _ase_fpga_to_host(dma_h, &src, &dst, align_bytes);
+      ON_ERR_GOTO(res, out, "FPGA_TO_HOST_MM Transfer failed");
+      count_left = count_left - align_bytes;
+    }
+  }
+  if (count_left) {
+    // Middle: whole FPGA_DMA_BUF_SIZE chunks through the pinned buffers.
+    uint32_t dma_chunks = count_left / FPGA_DMA_BUF_SIZE;
+    count_left -= (dma_chunks * FPGA_DMA_BUF_SIZE);
+    debug_print(
+        "DMA TX : dma chunks = %d, count_left = %08lx, dst = %08lx, src = %08lx \n", dma_chunks, count_left, dst, src);
+    uint64_t pending_buf = 0;  // index of the oldest chunk not yet copied to dst
+    for (i = 0; i < dma_chunks; i++) {
+      res = _do_dma(dma_h,
+                    dma_h->dma_buf_iova[i % (FPGA_DMA_MAX_BUF)] | FPGA_DMA_HOST_MASK,
+                    (src + i * FPGA_DMA_BUF_SIZE),
+                    FPGA_DMA_BUF_SIZE,
+                    1,
+                    type,
+                    false /*intr_en */);
+      ON_ERR_GOTO(res, out, "FPGA_TO_HOST_MM Transfer failed");
+
+      const int num_pending = i - pending_buf + 1;
+      if (num_pending == (FPGA_DMA_MAX_BUF / 2)) { // Enters this loop only once,after first batch of descriptors.
+        res = _issue_magic(dma_h);
+        ON_ERR_GOTO(res, out, "Magic number issue failed");
+        wf_issued = 1;
+      }
+      // Buffers nearly exhausted (or last chunk): wait for the fence, then
+      // drain the completed half of the ring into dst before reusing it.
+      if (num_pending > (FPGA_DMA_MAX_BUF - 1) || i == (dma_chunks - 1) /*last descriptor */) {
+        if (wf_issued) {
+          _wait_magic(dma_h);
+          for (j = 0; j < (FPGA_DMA_MAX_BUF / 2); j++) {
+            // constant size transfer; no length check required
+            memcpy_s_fast((void *)(dst + pending_buf * FPGA_DMA_BUF_SIZE),
+                          FPGA_DMA_BUF_SIZE,
+                          dma_h->dma_buf_ptr[pending_buf % (FPGA_DMA_MAX_BUF)],
+                          FPGA_DMA_BUF_SIZE);
+            pending_buf++;
+          }
+          wf_issued = 0;
+        }
+        // Fence the descriptors posted since the last fence.
+        res = _issue_magic(dma_h);
+        ON_ERR_GOTO(res, out, "Magic number issue failed");
+        wf_issued = 1;
+      }
+    }
+
+    if (wf_issued) _wait_magic(dma_h);
+
+    // clear out final dma memcpy operations
+    while (pending_buf < dma_chunks) {
+      // constant size transfer; no length check required
+      memcpy_s_fast((void *)(dst + pending_buf * FPGA_DMA_BUF_SIZE),
+                    FPGA_DMA_BUF_SIZE,
+                    dma_h->dma_buf_ptr[pending_buf % (FPGA_DMA_MAX_BUF)],
+                    FPGA_DMA_BUF_SIZE);
+      pending_buf++;
+    }
+    if (count_left > 0) {
+      // Tail: one short DMA for the remaining aligned bytes...
+      uint64_t dma_tx_bytes = (count_left / FPGA_DMA_ALIGN_BYTES) * FPGA_DMA_ALIGN_BYTES;
+      if (dma_tx_bytes != 0) {
+        debug_print("dma_tx_bytes = %08lx was transfered using DMA\n", dma_tx_bytes);
+        res = _do_dma(dma_h,
+                      dma_h->dma_buf_iova[0] | FPGA_DMA_HOST_MASK,
+                      (src + dma_chunks * FPGA_DMA_BUF_SIZE),
+                      dma_tx_bytes,
+                      1,
+                      type,
+                      false /*intr_en */);
+        ON_ERR_GOTO(res, out, "FPGA_TO_HOST_MM Transfer failed");
+        res = _issue_magic(dma_h);
+        ON_ERR_GOTO(res, out, "Magic number issue failed");
+        _wait_magic(dma_h);
+        if (dma_tx_bytes > FPGA_DMA_BUF_SIZE) {
+          res = FPGA_NO_MEMORY;
+          ON_ERR_GOTO(res, out, "Illegal transfer size\n");
+        }
+        memcpy_s_fast(
+            (void *)(dst + dma_chunks * FPGA_DMA_BUF_SIZE), dma_tx_bytes, dma_h->dma_buf_ptr[0], dma_tx_bytes);
+      }
+      count_left -= dma_tx_bytes;
+      if (count_left) {
+        // ...and ASE for the final sub-alignment bytes.
+        dst = dst + dma_chunks * FPGA_DMA_BUF_SIZE + dma_tx_bytes;
+        src = src + dma_chunks * FPGA_DMA_BUF_SIZE + dma_tx_bytes;
+        res = _ase_fpga_to_host(dma_h, &src, &dst, count_left);
+        ON_ERR_GOTO(res, out, "FPGA_TO_HOST_MM Transfer failed");
+      }
+    }
+  }
+out:
+  return res;
+}
+
+/**
+ * transferFpgaToFpga
+ *
+ * @brief Copies count bytes between two FPGA-memory regions. When dst, src,
+ *        and count are all DMA-aligned the copy is done device-to-device;
+ *        otherwise the data is staged through a temporary host buffer using
+ *        the FPGA->host and host->FPGA paths.
+ * @param[in] dma_h Handle to the FPGA DMA object
+ * @param[in] dst   FPGA destination address
+ * @param[in] src   FPGA source address
+ * @param[in] count Size in bytes
+ * @param[in] type  Transfer type (FPGA_TO_FPGA_MM)
+ * @return fpga_result FPGA_OK on success, return code otherwise.
+ *         Fix: the staging-buffer malloc() is now checked (previously a
+ *         failed allocation was dereferenced), and the two cleanup labels
+ *         are merged - free(NULL) is a no-op, so one exit path suffices.
+ */
+fpga_result transferFpgaToFpga(
+    fpga_dma_handle dma_h, uint64_t dst, uint64_t src, size_t count, fpga_dma_transfer_t type) {
+  fpga_result res = FPGA_OK;
+  uint64_t i = 0;
+  uint64_t count_left = count;
+  uint64_t *tmp_buf = NULL;  // host staging buffer for the unaligned path
+  if (IS_DMA_ALIGNED(dst) && IS_DMA_ALIGNED(src) && IS_DMA_ALIGNED(count_left)) {
+    // Fully aligned: stream device-to-device descriptors, fencing with the
+    // magic write every FPGA_DMA_MAX_BUF chunks and after the last one.
+    uint32_t dma_chunks = count_left / FPGA_DMA_BUF_SIZE;
+    count_left -= (dma_chunks * FPGA_DMA_BUF_SIZE);
+    debug_print("!!!FPGA to FPGA!!! TX :dma chunks = %d, count = %08lx, dst = %08lx, src = %08lx \n",
+                dma_chunks,
+                count_left,
+                dst,
+                src);
+
+    for (i = 0; i < dma_chunks; i++) {
+      res = _do_dma(dma_h,
+                    (dst + i * FPGA_DMA_BUF_SIZE),
+                    (src + i * FPGA_DMA_BUF_SIZE),
+                    FPGA_DMA_BUF_SIZE,
+                    0,
+                    type,
+                    false /*intr_en */);
+      ON_ERR_GOTO(res, out, "FPGA_TO_FPGA_MM Transfer failed");
+      if ((i + 1) % FPGA_DMA_MAX_BUF == 0 || i == (dma_chunks - 1) /*last descriptor */) {
+        res = _issue_magic(dma_h);
+        ON_ERR_GOTO(res, out, "Magic number issue failed");
+        _wait_magic(dma_h);
+      }
+    }
+    if (count_left > 0) {
+      // Aligned tail smaller than one chunk.
+      debug_print("Count_left = %08lx was transfered using DMA\n", count_left);
+      res = _do_dma(dma_h,
+                    (dst + dma_chunks * FPGA_DMA_BUF_SIZE),
+                    (src + dma_chunks * FPGA_DMA_BUF_SIZE),
+                    count_left,
+                    1,
+                    type,
+                    false /*intr_en */);
+      ON_ERR_GOTO(res, out, "FPGA_TO_FPGA_MM Transfer failed");
+      res = _issue_magic(dma_h);
+      ON_ERR_GOTO(res, out, "Magic number issue failed");
+      _wait_magic(dma_h);
+    }
+  } else {
+    // Unaligned: bounce through the host. Staging cannot handle a dst that
+    // overlaps src from above, since chunks already copied would be re-read.
+    if ((src < dst) && (src + count_left >= dst)) {
+      debug_print("Overlapping addresses, Provide correct dst address\n");
+      return FPGA_NOT_SUPPORTED;
+    }
+    uint32_t tx_chunks = count_left / FPGA_DMA_BUF_ALIGN_SIZE;
+    count_left -= (tx_chunks * FPGA_DMA_BUF_ALIGN_SIZE);
+    debug_print("!!!FPGA to FPGA TX!!! : tx chunks = %d, count = %08lx, dst = %08lx, src = %08lx \n",
+                tx_chunks,
+                count_left,
+                dst,
+                src);
+    tmp_buf = (uint64_t *)malloc(FPGA_DMA_BUF_ALIGN_SIZE);
+    if (!tmp_buf) {
+      // Fix: allocation failure was previously unchecked and dereferenced.
+      res = FPGA_NO_MEMORY;
+      ON_ERR_GOTO(res, out, "allocating FPGA-to-FPGA staging buffer");
+    }
+    for (i = 0; i < tx_chunks; i++) {
+      res = transferFpgaToHost(
+          dma_h, (uint64_t)tmp_buf, (src + i * FPGA_DMA_BUF_ALIGN_SIZE), FPGA_DMA_BUF_ALIGN_SIZE, FPGA_TO_HOST_MM);
+      ON_ERR_GOTO(res, out, "FPGA_TO_FPGA_MM Transfer failed");
+      res = transferHostToFpga(
+          dma_h, (dst + i * FPGA_DMA_BUF_ALIGN_SIZE), (uint64_t)tmp_buf, FPGA_DMA_BUF_ALIGN_SIZE, HOST_TO_FPGA_MM);
+      ON_ERR_GOTO(res, out, "FPGA_TO_FPGA_MM Transfer failed");
+    }
+    if (count_left > 0) {
+      res = transferFpgaToHost(
+          dma_h, (uint64_t)tmp_buf, (src + tx_chunks * FPGA_DMA_BUF_ALIGN_SIZE), count_left, FPGA_TO_HOST_MM);
+      ON_ERR_GOTO(res, out, "FPGA_TO_FPGA_MM Transfer failed");
+      res = transferHostToFpga(
+          dma_h, (dst + tx_chunks * FPGA_DMA_BUF_ALIGN_SIZE), (uint64_t)tmp_buf, count_left, HOST_TO_FPGA_MM);
+      ON_ERR_GOTO(res, out, "FPGA_TO_FPGA_MM Transfer failed");
+    }
+  }
+out:
+  free(tmp_buf);  // NULL on the aligned path; free(NULL) is a no-op
+  return res;
+}
+
+/**
+ * fpgaDmaTransferSync
+ *
+ * @brief Blocking DMA copy of 'count' bytes; validates the handle and the
+ *        transfer type, then dispatches to the direction-specific worker.
+ * @param[in] dma_h Handle to the FPGA DMA object
+ * @param[in] dst   Destination address (meaning depends on 'type')
+ * @param[in] src   Source address (meaning depends on 'type')
+ * @param[in] count Size in bytes
+ * @param[in] type  HOST_TO_FPGA_MM, FPGA_TO_HOST_MM, or FPGA_TO_FPGA_MM
+ * @return fpga_result FPGA_OK on success, return code otherwise
+ */
+fpga_result fpgaDmaTransferSync(
+    fpga_dma_handle dma_h, uint64_t dst, uint64_t src, size_t count, fpga_dma_transfer_t type) {
+  // Reject bad handles and out-of-range transfer types up front.
+  if (!dma_h) return FPGA_INVALID_PARAM;
+  if (type >= FPGA_MAX_TRANSFER_TYPE) return FPGA_INVALID_PARAM;
+  if (!dma_h->fpga_h) return FPGA_INVALID_PARAM;
+
+  switch (type) {
+    case HOST_TO_FPGA_MM:
+      return transferHostToFpga(dma_h, dst, src, count, HOST_TO_FPGA_MM);
+    case FPGA_TO_HOST_MM:
+      return transferFpgaToHost(dma_h, dst, src, count, FPGA_TO_HOST_MM);
+    case FPGA_TO_FPGA_MM:
+      return transferFpgaToFpga(dma_h, dst, src, count, FPGA_TO_FPGA_MM);
+    default:
+      // Unreachable: every fpga_dma_transfer_t value is handled above.
+      assert(0);
+      return FPGA_OK;
+  }
+}
+
+/**
+ * fpgaDmaTransferAsync
+ *
+ * @brief Asynchronous DMA transfer - not implemented yet; always returns
+ *        FPGA_NOT_SUPPORTED. All parameters are currently unused.
+ */
+fpga_result fpgaDmaTransferAsync(fpga_dma_handle dma,
+                                 uint64_t dst,
+                                 uint64_t src,
+                                 size_t count,
+                                 fpga_dma_transfer_t type,
+                                 fpga_dma_transfer_cb cb,
+                                 void *context) {
+  // TODO: implement asynchronous transfers; silence unused-parameter warnings.
+  (void)dma;
+  (void)dst;
+  (void)src;
+  (void)count;
+  (void)type;
+  (void)cb;
+  (void)context;
+  return FPGA_NOT_SUPPORTED;
+}
+
+/**
+ * fpgaDmaClose
+ *
+ * @brief Releases every resource held by the DMA handle: the pinned DMA and
+ *        magic buffers, the interrupt event registration and handle, global
+ *        interrupt enable, and finally the handle allocation itself.
+ *        Fix: cleanup is now best-effort - previously the first failing
+ *        fpgaReleaseBuffer() aborted the function and leaked every remaining
+ *        buffer plus the event handle. The first error encountered is the
+ *        one reported.
+ * @param[in] dma_h DMA object handle
+ * @return FPGA_OK on success, first failing return code otherwise
+ */
+fpga_result fpgaDmaClose(fpga_dma_handle dma_h) {
+  fpga_result res = FPGA_OK;
+  fpga_result first_err = FPGA_OK;
+  int i = 0;
+
+  if (!dma_h) {
+    return FPGA_INVALID_PARAM;  // nothing was allocated; nothing to free
+  }
+  if (!dma_h->fpga_h) {
+    // Handle exists but was never fully opened; only the allocation to free.
+    free((void *)dma_h);
+    return FPGA_INVALID_PARAM;
+  }
+
+  for (i = 0; i < FPGA_DMA_MAX_BUF; i++) {
+    res = fpgaReleaseBuffer(dma_h->fpga_h, dma_h->dma_buf_wsid[i]);
+    if (res != FPGA_OK && first_err == FPGA_OK) first_err = res;
+  }
+
+  res = fpgaReleaseBuffer(dma_h->fpga_h, dma_h->magic_wsid);
+  if (res != FPGA_OK && first_err == FPGA_OK) first_err = res;
+
+  // Event teardown was already fire-and-forget in the original.
+  fpgaUnregisterEvent(dma_h->fpga_h, FPGA_EVENT_INTERRUPT, dma_h->eh);
+  fpgaDestroyEventHandle(&dma_h->eh);
+
+  // turn off global interrupts
+  msgdma_ctrl_t ctrl = {0};
+  ctrl.ct.global_intr_en_mask = 0;
+  res = MMIOWrite32Blk(dma_h, CSR_CONTROL(dma_h), (uint64_t)&ctrl.reg, sizeof(ctrl.reg));
+  if (res != FPGA_OK && first_err == FPGA_OK) first_err = res;
+
+  free((void *)dma_h);
+  return first_err;
+}
diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/fpga_dma.h b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/fpga_dma.h
new file mode 100644
index 0000000..e382696
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/fpga_dma.h
@@ -0,0 +1,141 @@
+// Copyright 2017-2020 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+// This is derived from OPAE + OpenCL PAC BSP
+
+/**
+ * \file fpga_dma.h
+ * \brief FPGA DMA BBB API Header
+ *
+ * Known Limitations
+ * - Supports only synchronous (blocking) transfers
+ */
+
+#ifndef __FPGA_DMA_H__
+#define __FPGA_DMA_H__
+
+#include <opae/fpga.h>
+
+//#define DEBUG_MEM 1
+//#define FPGA_DMA_DEBUG 1
+#define SKIP_FPGA2HOST_IRQ 1
+#ifdef SKIP_FPGA2HOST_IRQ
+#define FPGA2HOST_IRQ_REQ false
+#else
+#define FPGA2HOST_IRQ_REQ true
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * The DMA driver supports host to FPGA, FPGA to host and FPGA
+ * to FPGA transfers. The FPGA interface can be streaming
+ * or memory-mapped. Streaming interfaces are not currently
+ * supported.
+ */
+typedef enum {
+ HOST_TO_FPGA_MM = 0, // Memory mapped FPGA interface
+ FPGA_TO_HOST_MM, // Memory mapped FPGA interface
+ FPGA_TO_FPGA_MM, // Memory mapped FPGA interface
+ FPGA_MAX_TRANSFER_TYPE,
+} fpga_dma_transfer_t;
+
+typedef struct _dma_handle_t *fpga_dma_handle;
+
+// Callback for asynchronous DMA transfers
+typedef void (*fpga_dma_transfer_cb)(void *context);
+
+/**
+ * fpgaDmaChannelOpen
+ *
+ * @brief Open a handle to a DMA BBB.
+ * Scans the device feature chain looking for a DMA BBB.
+ *
+ * @param[in] fpga Handle to the FPGA AFU object obtained via fpgaOpen()
+ * @param[in] dma_base MMIO offset of the DMA channel DFH
+ * @param[in] interrupt_num Interrupt vector number assigned to the DMA channel
+ * @param[out] dma DMA object handle
+ * @returns FPGA_OK on success, return code otherwise
+ */
+fpga_result fpgaDmaChannelOpen(fpga_handle fpga, uint64_t dma_base, int interrupt_num, fpga_dma_handle *dma);
+
+/**
+ * fpgaDmaTransferSync
+ *
+ * @brief Perform a blocking copy of 'count' bytes from memory area pointed
+ * by src to memory area pointed by dst where fpga_dma_transfer_t specifies the
+ * type of memory transfer.
+ * @param[in] dma Handle to the FPGA DMA object
+ * @param[in] dst Address of the destination buffer
+ * @param[in] src Address of the source buffer
+ * @param[in] count Size in bytes
+ * @param[in] type Must be one of the following values:
+ * HOST_TO_FPGA_MM - Copy data from host memory to memory mapped FPGA interface.
+ * User must specify valid src and dst.
+ * FPGA_TO_HOST_MM - Copy data from memory mapped FPGA interface to host memory
+ * User must specify valid src and dst.
+ * FPGA_TO_FPGA_MM - Copy data between memory mapped FPGA interfaces
+ * User must specify valid src and dst.
+ * @return fpga_result FPGA_OK on success, return code otherwise
+ *
+ */
+fpga_result fpgaDmaTransferSync(
+ fpga_dma_handle dma, uint64_t dst, uint64_t src, size_t count, fpga_dma_transfer_t type);
+
+/**
+ * fpgaDmaTransferAsync (Not supported)
+ *
+ * @brief Perform a non-blocking copy of 'count' bytes from memory area pointed
+ * by src to memory area pointed by dst where fpga_dma_transfer_t specifies the
+ * type of memory transfer.
+ * @param[in] dma Handle to the FPGA DMA object
+ * @param[in] dst Address of the destination buffer
+ * @param[in] src Address of the source buffer
+ * @param[in] count Size in bytes
+ * @param[in] type Must be one of the following values:
+ * HOST_TO_FPGA_MM - Copy data from host memory to memory mapped FPGA interface.
+ * User must specify valid src and dst.
+ * FPGA_TO_HOST_MM - Copy data from memory mapped FPGA interface to host memory
+ * User must specify valid src and dst.
+ * FPGA_TO_FPGA_MM - Copy data between memory mapped FPGA interfaces
+ * User must specify valid src and dst.
+ * @param[in] cb Callback to invoke when DMA transfer is complete
+ * @param[in] context Pointer to define user-defined context
+ * @return fpga_result FPGA_OK on success, return code otherwise
+ *
+ */
+fpga_result fpgaDmaTransferAsync(fpga_dma_handle dma,
+ uint64_t dst,
+ uint64_t src,
+ size_t count,
+ fpga_dma_transfer_t type,
+ fpga_dma_transfer_cb cb,
+ void *context);
+
+/**
+ * fpgaDmaClose
+ *
+ * @brief Close the DMA BBB handle.
+ *
+ * @param[in] dma DMA object handle
+ * @returns FPGA_OK on success, return code otherwise
+ */
+fpga_result fpgaDmaClose(fpga_dma_handle dma);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // __FPGA_DMA_H__
diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/fpga_dma_internal.h b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/fpga_dma_internal.h
new file mode 100644
index 0000000..e4c8373
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/fpga_dma_internal.h
@@ -0,0 +1,289 @@
+// Copyright 2018-2020 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+// This is derived from OPAE + OpenCL PAC BSP
+
+/**
+ * \file fpga_dma_internal.h
+ * \brief FPGA DMA BBB Internal Header
+ */
+
+#ifndef __FPGA_DMA_INT_H__
+#define __FPGA_DMA_INT_H__
+
+#include <opae/fpga.h>
+#include "x86-sse2.h"
+
+#ifdef CHECK_DELAYS
+#pragma message "Compiled with -DCHECK_DELAYS. Not to be used in production"
+#endif
+
+#ifdef FPGA_DMA_DEBUG
+#pragma message "Compiled with -DFPGA_DMA_DEBUG. Not to be used in production"
+#endif
+
+#ifndef max
+#define max(a, b) \
+ ({ \
+ __typeof__(a) _a = (a); \
+ __typeof__(b) _b = (b); \
+ _a > _b ? _a : _b; \
+ })
+#endif
+
+#ifndef min
+#define min(a, b) \
+ ({ \
+ __typeof__(a) _a = (a); \
+ __typeof__(b) _b = (b); \
+ _a < _b ? _a : _b; \
+ })
+#endif
+
+#define FPGA_DMA_TIMEOUT_MSEC (5000)
+
+#define QWORD_BYTES 8
+#define DWORD_BYTES 4
+#define IS_ALIGNED_DWORD(addr) (addr % 4 == 0)
+#define IS_ALIGNED_QWORD(addr) (addr % 8 == 0)
+
+#define FPGA_DMA_UUID_H 0xef82def7f6ec40fc
+#define FPGA_DMA_UUID_L 0xa9149a35bace01ea
+#define FPGA_DMA_WF_MAGIC_NO 0x5772745F53796E63ULL
+#define FPGA_DMA_HOST_MASK 0x2000000000000
+#define FPGA_DMA_WF_HOST_MASK 0x3000000000000
+#define FPGA_DMA_WF_ROM_MAGIC_NO_MASK 0x1000000000000
+
+#define AFU_DFH_REG 0x0
+#define AFU_DFH_NEXT_OFFSET 16
+#define AFU_DFH_EOL_OFFSET 40
+#define AFU_DFH_TYPE_OFFSET 60
+
+// BBB Feature ID (refer CCI-P spec)
+#define FPGA_DMA_BBB 0x2
+
+// Feature ID for DMA BBB
+#define FPGA_DMA_BBB_FEATURE_ID 0x765
+
+// DMA Register offsets from base
+#define FPGA_DMA_CSR 0x40
+#define FPGA_DMA_DESC 0x60
+#define FPGA_DMA_ADDR_SPAN_EXT_CNTL 0x200
+#define FPGA_DMA_ADDR_SPAN_EXT_DATA 0x1000
+
+#define DMA_ADDR_SPAN_EXT_WINDOW (4 * 1024)
+#define DMA_ADDR_SPAN_EXT_WINDOW_MASK ((uint64_t)(DMA_ADDR_SPAN_EXT_WINDOW - 1))
+
+#define FPGA_DMA_MASK_32_BIT 0xFFFFFFFF
+
+#define FPGA_DMA_CSR_BUSY (1 << 0)
+#define FPGA_DMA_DESC_BUFFER_EMPTY 0x2
+#define FPGA_DMA_DESC_BUFFER_FULL 0x4
+
+#define FPGA_DMA_ALIGN_BYTES 64
+#define IS_DMA_ALIGNED(addr) (addr % FPGA_DMA_ALIGN_BYTES == 0)
+
+#define CSR_BASE(dma_handle) ((uint64_t)dma_handle->dma_csr_base)
+#define ASE_DATA_BASE(dma_handle) ((uint64_t)dma_handle->dma_ase_data_base)
+#define ASE_CNTL_BASE(dma_handle) ((uint64_t)dma_handle->dma_ase_cntl_base)
+#define HOST_MMIO_32_ADDR(dma_handle, offset) \
+ ((volatile uint32_t *)((uint64_t)(dma_handle)->mmio_va + (uint64_t)(offset)))
+#define HOST_MMIO_64_ADDR(dma_handle, offset) \
+ ((volatile uint64_t *)((uint64_t)(dma_handle)->mmio_va + (uint64_t)(offset)))
+#define HOST_MMIO_32(dma_handle, offset) (*HOST_MMIO_32_ADDR(dma_handle, offset))
+#define HOST_MMIO_64(dma_handle, offset) (*HOST_MMIO_64_ADDR(dma_handle, offset))
+
+#define CSR_STATUS(dma_h) (CSR_BASE(dma_h) + offsetof(msgdma_csr_t, status))
+#define CSR_CONTROL(dma_h) (CSR_BASE(dma_h) + offsetof(msgdma_csr_t, ctrl))
+
+// Granularity of DMA transfer (maximum bytes that can be packed
+// in a single descriptor).This value must match configuration of
+// the DMA IP. Larger transfers will be broken down into smaller
+// transactions.
+#define FPGA_DMA_BUF_SIZE (1024 * 1024 * 2UL)
+#define FPGA_DMA_BUF_ALIGN_SIZE FPGA_DMA_BUF_SIZE
+
+// Convenience macros
+
+#ifdef FPGA_DMA_DEBUG
+#define debug_print(fmt, ...) \
+ do { \
+ if (FPGA_DMA_DEBUG) { \
+ fprintf(stderr, "%s (%d) : ", __FUNCTION__, __LINE__); \
+ fprintf(stderr, fmt, ##__VA_ARGS__); \
+ } \
+ } while (0)
+#define error_print(fmt, ...) \
+ do { \
+ fprintf(stderr, "%s (%d) : ", __FUNCTION__, __LINE__); \
+ fprintf(stderr, fmt, ##__VA_ARGS__); \
+ err_cnt++; \
+ } while (0)
+#else
+#define debug_print(...)
+#define error_print(...)
+#endif
+
+#define FPGA_DMA_MAX_BUF 2
+
+typedef struct __attribute__((__packed__)) {
+ uint64_t dfh;
+ uint64_t feature_uuid_lo;
+ uint64_t feature_uuid_hi;
+} dfh_feature_t;
+
+typedef union {
+ uint64_t reg;
+ struct {
+ uint64_t feature_type : 4;
+ uint64_t reserved_8 : 8;
+ uint64_t afu_minor : 4;
+ uint64_t reserved_7 : 7;
+ uint64_t end_dfh : 1;
+ uint64_t next_dfh : 24;
+ uint64_t afu_major : 4;
+ uint64_t feature_id : 12;
+ } bits;
+} dfh_reg_t;
+
+struct _dma_handle_t {
+ fpga_handle fpga_h;
+ uint32_t mmio_num;
+ uint64_t mmio_va;
+ uint64_t cur_ase_page;
+ uint64_t dma_base;
+ uint64_t dma_offset;
+ uint64_t dma_csr_base;
+ uint64_t dma_desc_base;
+ uint64_t dma_ase_cntl_base;
+ uint64_t dma_ase_data_base;
+ // Interrupt event handle
+ fpga_event_handle eh;
+ // magic number buffer
+ volatile uint64_t *magic_buf;
+ uint64_t magic_iova;
+ uint64_t magic_wsid;
+ uint64_t *dma_buf_ptr[FPGA_DMA_MAX_BUF];
+ uint64_t dma_buf_wsid[FPGA_DMA_MAX_BUF];
+ uint64_t dma_buf_iova[FPGA_DMA_MAX_BUF];
+};
+
+typedef union {
+ uint32_t reg;
+ struct {
+ uint32_t tx_channel : 8;
+ uint32_t generate_sop : 1;
+ uint32_t generate_eop : 1;
+ uint32_t park_reads : 1;
+ uint32_t park_writes : 1;
+ uint32_t end_on_eop : 1;
+ uint32_t reserved_1 : 1;
+ uint32_t transfer_irq_en : 1;
+ uint32_t early_term_irq_en : 1;
+ uint32_t trans_error_irq_en : 8;
+ uint32_t early_done_en : 1;
+ uint32_t reserved_2 : 6;
+ uint32_t go : 1;
+ };
+} msgdma_desc_ctrl_t;
+
+typedef struct __attribute__((__packed__)) {
+ // 0x0
+ uint32_t rd_address;
+ // 0x4
+ uint32_t wr_address;
+ // 0x8
+ uint32_t len;
+ // 0xC
+ uint16_t seq_num;
+ uint8_t rd_burst_count;
+ uint8_t wr_burst_count;
+ // 0x10
+ uint16_t rd_stride;
+ uint16_t wr_stride;
+ // 0x14
+ uint32_t rd_address_ext;
+ // 0x18
+ uint32_t wr_address_ext;
+ // 0x1c
+ msgdma_desc_ctrl_t control;
+} msgdma_ext_desc_t;
+
+typedef union {
+ uint32_t reg;
+ struct {
+ uint32_t busy : 1;
+ uint32_t desc_buf_empty : 1;
+ uint32_t desc_buf_full : 1;
+ uint32_t rsp_buf_empty : 1;
+ uint32_t rsp_buf_full : 1;
+ uint32_t stopped : 1;
+ uint32_t resetting : 1;
+ uint32_t stopped_on_errror : 1;
+ uint32_t stopped_on_early_term : 1;
+ uint32_t irq : 1;
+ uint32_t reserved : 22;
+ } st;
+} msgdma_status_t;
+
+typedef union {
+ uint32_t reg;
+ struct {
+ uint32_t stop_dispatcher : 1;
+ uint32_t reset_dispatcher : 1;
+ uint32_t stop_on_error : 1;
+ uint32_t stopped_on_early_term : 1;
+ uint32_t global_intr_en_mask : 1;
+ uint32_t stop_descriptors : 1;
+ uint32_t rsvd : 22;
+ } ct;
+} msgdma_ctrl_t;
+
+typedef union {
+ uint32_t reg;
+ struct {
+ uint32_t rd_fill_level : 16;
+ uint32_t wr_fill_level : 16;
+ } fl;
+} msgdma_fill_level_t;
+
+typedef union {
+ uint32_t reg;
+ struct {
+ uint32_t rsp_fill_level : 16;
+ uint32_t rsvd : 16;
+ } rsp;
+} msgdma_rsp_level_t;
+
+typedef union {
+ uint32_t reg;
+ struct {
+ uint32_t rd_seq_num : 16;
+ uint32_t wr_seq_num : 16;
+ } seq;
+} msgdma_seq_num_t;
+
+typedef struct __attribute__((__packed__)) {
+ // 0x0
+ msgdma_status_t status;
+ // 0x4
+ msgdma_ctrl_t ctrl;
+ // 0x8
+ msgdma_fill_level_t fill_level;
+ // 0xc
+ msgdma_rsp_level_t rsp;
+ // 0x10
+ msgdma_seq_num_t seq_num;
+} msgdma_csr_t;
+
+#endif // __FPGA_DMA_INT_H__
diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/kernel_interrupt.cpp b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/kernel_interrupt.cpp
new file mode 100644
index 0000000..206b98a
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/kernel_interrupt.cpp
@@ -0,0 +1,278 @@
+/* (C) 1992-2017 Intel Corporation. */
+/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */
+/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */
+/* and/or other countries. Other marks and brands may be claimed as the property */
+/* of others. See Trademarks on intel.com for full list of Intel trademarks or */
+/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */
+/* Your use of Intel Corporation's design tools, logic functions and other */
+/* software and tools, and its AMPP partner logic functions, and any output */
+/* files any of the foregoing (including device programming or simulation */
+/* files), and any associated documentation or information are expressly subject */
+/* to the terms and conditions of the Altera Program License Subscription */
+/* Agreement, Intel MegaCore Function License Agreement, or other applicable */
+/* license agreement, including, without limitation, that your use is for the */
+/* sole purpose of programming logic devices manufactured by Intel and sold by */
+/* Intel or its authorized distributors. Please refer to the applicable */
+/* agreement for further details. */
+
+#include <poll.h>
+#include <stdlib.h>
+
+#include <thread>
+
+#include "ccip_mmd_device.h"
+#include "eventfd_wrapper.h"
+#include "kernel_interrupt.h"
+
+using namespace intel_opae_mmd;
+
+// if ENABLE_OPENCL_KERNEL_INTERRUPTS is set at compile time, interrupts will
+// be enabled.
+#define ENABLE_OPENCL_KERNEL_INTERRUPTS
+
+// if ENABLE_OPENCL_KERNEL_POLLING_THREAD is set at compile time, a thread will
+// replace yield and the thread will call runtime call back
+
+// DLA runtime assumes the interrupt service routine will run on its own (instead of runtime yielding to MMD) when hardware
+// interrupts
+#ifdef DLA_MMD
+#define ENABLE_OPENCL_KERNEL_POLLING_THREAD
+#endif
+
+// ccip interrupt line that is used for kernel
+#define MMD_KERNEL_INTERRUPT_LINE_NUM 1
+
+KernelInterrupt::KernelInterrupt(fpga_handle fpga_handle_arg, int mmd_handle)
+ : m_initialized(false),
+ m_eventfd_wrapper(NULL),
+ m_thread(NULL),
+ m_kernel_interrupt_fn(NULL),
+ m_kernel_interrupt_user_data(NULL),
+ m_fpga_handle(fpga_handle_arg),
+ m_mmd_handle(mmd_handle),
+ m_event_handle(0) {
+ enable_interrupts();
+}
+
+KernelInterrupt::~KernelInterrupt() { disable_interrupts(); }
+
+void KernelInterrupt::disable_interrupts() {
+ // kill the thread
+ if (m_thread) {
+ // send message to thread to end it
+ m_eventfd_wrapper->notify(1);
+
+ // join with thread until it ends
+ m_thread->join();
+
+ delete m_thread;
+ m_thread = NULL;
+ }
+
+ if (m_eventfd_wrapper) {
+ delete m_eventfd_wrapper;
+ m_eventfd_wrapper = NULL;
+ }
+
+ if (m_event_handle) {
+ fpga_result res;
+#ifdef ENABLE_OPENCL_KERNEL_INTERRUPTS
+ res = fpgaUnregisterEvent(m_fpga_handle, FPGA_EVENT_INTERRUPT, m_event_handle);
+ if (res != FPGA_OK) {
+ fprintf(stderr, "error fpgaUnregisterEvent");
+ }
+#endif
+
+ res = fpgaDestroyEventHandle(&m_event_handle);
+ if (res != FPGA_OK) {
+ fprintf(stderr, "error fpgaDestroyEventHandle");
+ }
+ }
+
+ // disable opencl kernel interrupts
+#ifndef DLA_MMD // IRQ offsets no longer exist in DLA hardware (removed from board.qsys)
+ set_interrupt_mask(0x00000000);
+#endif
+
+ m_initialized = false;
+}
+
+void KernelInterrupt::enable_interrupts() {
+ m_eventfd_wrapper = new eventfd_wrapper();
+ if (!m_eventfd_wrapper->initialized()) return;
+
+#ifdef ENABLE_OPENCL_KERNEL_POLLING_THREAD
+ m_thread = new std::thread(interrupt_polling_thread, std::ref(*this));
+#endif
+
+ fpga_result res;
+ // Create event
+ res = fpgaCreateEventHandle(&m_event_handle);
+ if (res != FPGA_OK) {
+ fprintf(stderr, "error creating event handle");
+ return;
+ }
+
+#ifdef ENABLE_OPENCL_KERNEL_INTERRUPTS
+ // Register user interrupt with event handle
+ res = fpgaRegisterEvent(m_fpga_handle, FPGA_EVENT_INTERRUPT, m_event_handle, MMD_KERNEL_INTERRUPT_LINE_NUM);
+ if (res != FPGA_OK) {
+ fprintf(stderr, "error registering event");
+ res = fpgaDestroyEventHandle(&m_event_handle);
+ return;
+ }
+
+ // enable opencl kernel interrupts
+#ifndef DLA_MMD // IRQ offsets no longer exist in DLA hardware (removed from board.qsys)
+ set_interrupt_mask(0x00000001);
+#endif
+#endif
+
+ m_initialized = true;
+}
+
+#ifndef DLA_MMD // IRQ offsets no longer exist in DLA hardware (removed from board.qsys)
+void KernelInterrupt::set_interrupt_mask(uint32_t intr_mask) {
+ fpga_result res;
+ res = fpgaWriteMMIO32(m_fpga_handle, 0, AOCL_IRQ_MASKING_BASE, intr_mask);
+ if (res != FPGA_OK) {
+ fprintf(stderr, "Error fpgaWriteMMIO32: %d\n", res);
+ return;
+ }
+}
+#endif
+
+void KernelInterrupt::interrupt_polling_thread(KernelInterrupt& obj) {
+ bool thread_is_active = true;
+ while (thread_is_active) {
+#ifdef ENABLE_OPENCL_KERNEL_INTERRUPTS
+ const int timeout = -1;
+#else
+ const int timeout = 0;
+ usleep(100);
+#endif
+ thread_is_active = obj.poll_interrupt(timeout);
+ }
+}
+
+bool KernelInterrupt::poll_interrupt(int poll_timeout_arg) {
+ fpga_result fpga_res;
+
+ int res;
+ // get eventfd handles
+ int intr_fd;
+ fpga_res = fpgaGetOSObjectFromEventHandle(m_event_handle, &intr_fd);
+ if (fpga_res != FPGA_OK) {
+ fprintf(stderr, "error getting event file handle");
+ return false;
+ }
+ int thread_signal_fd = m_eventfd_wrapper->get_fd();
+
+ struct pollfd pollfd_arr[2];
+ pollfd_arr[0].fd = intr_fd;
+ pollfd_arr[0].events = POLLIN;
+ pollfd_arr[0].revents = 0;
+ pollfd_arr[1].fd = thread_signal_fd;
+ pollfd_arr[1].events = POLLIN;
+ pollfd_arr[1].revents = 0;
+ res = poll(pollfd_arr, 2, poll_timeout_arg);
+ if (res < 0) {
+ fprintf(stderr, "Poll error errno = %s\n", strerror(errno));
+ return false;
+ } else if (res > 0 && pollfd_arr[0].revents == POLLIN) {
+ uint64_t count;
+ ssize_t bytes_read = read(intr_fd, &count, sizeof(count));
+ if (bytes_read > 0) {
+ DEBUG_PRINT("Poll success. Return=%d count=%lu\n", res, count);
+ } else {
+ fprintf(stderr, "Error: poll failed: %s\n", bytes_read < 0 ? strerror(errno) : "zero bytes read");
+      // TODO: remove exit call. Revisit this when fixing kernel interrupts
+ exit(-1);
+ }
+ } else if (res > 0 && pollfd_arr[1].revents == POLLIN) {
+ uint64_t count;
+ ssize_t bytes_read = read(thread_signal_fd, &count, sizeof(count));
+ if (bytes_read > 0) {
+ DEBUG_PRINT("Poll success. Return=%d count=%lu\n", res, count);
+ } else {
+ fprintf(stderr, "Error: poll failed: %s\n", bytes_read < 0 ? strerror(errno) : "zero bytes read");
+      // TODO: remove exit call. Revisit this when fixing kernel interrupts
+ exit(-1);
+ }
+ return false;
+ } else {
+ // no event fd event happened
+#ifdef ENABLE_OPENCL_KERNEL_INTERRUPTS
+ return false;
+#endif
+ }
+
+#ifdef DLA_MMD
+ run_kernel_interrupt_fn();
+#else // IRQ offsets no longer exist in DLA hardware (removed from board.qsys)
+
+  // probably not required for interrupt polling but we poll the interrupt
+ // csr line to make sure an interrupt was actually triggered
+ uint32_t irqval = 0;
+ fpga_res = fpgaReadMMIO32(m_fpga_handle, 0, AOCL_IRQ_POLLING_BASE, &irqval);
+ if (fpga_res != FPGA_OK) {
+ fprintf(stderr, "Error fpgaReadMMIO32: %d\n", fpga_res);
+ return false;
+ }
+
+ DEBUG_PRINT("irqval: %u\n", irqval);
+ if (irqval) run_kernel_interrupt_fn();
+
+#ifdef ENABLE_OPENCL_KERNEL_INTERRUPTS
+ // workaround for fb:530016
+ // check if irq line is still high and generate another interrupt event
+ fpga_res = fpgaReadMMIO32(m_fpga_handle, 0, AOCL_IRQ_POLLING_BASE, &irqval);
+ if (fpga_res != FPGA_OK) {
+ fprintf(stderr, "Error fpgaReadMMIO32: %d\n", fpga_res);
+ return false;
+ }
+
+ // signal intr event fd
+ if (irqval) {
+ DEBUG_PRINT("CRITICAL WARNING: irqval has not been cleared by aocl runtime\n");
+ uint64_t count = 1;
+ ssize_t res = write(intr_fd, &count, sizeof(count));
+ if (res < 0) {
+ fprintf(stderr, "eventfd : %s", strerror(errno));
+ return false;
+ }
+ }
+#endif
+#endif
+
+ return true;
+}
+
+bool KernelInterrupt::yield_is_enabled() {
+#ifdef ENABLE_OPENCL_KERNEL_POLLING_THREAD
+ return false;
+#else
+ return true;
+#endif
+}
+
+void KernelInterrupt::yield() {
+#ifdef ENABLE_OPENCL_KERNEL_POLLING_THREAD
+ usleep(0);
+#else
+ poll_interrupt(0);
+#endif
+}
+
+void KernelInterrupt::run_kernel_interrupt_fn() {
+ if (m_kernel_interrupt_fn) {
+ m_kernel_interrupt_fn(m_mmd_handle, m_kernel_interrupt_user_data);
+ } else {
+ fprintf(stderr, "m_kernel_interrupt_fn is NULL. No interrupt handler set!\n");
+ }
+}
+
+void KernelInterrupt::set_kernel_interrupt(aocl_mmd_interrupt_handler_fn fn, void* user_data) {
+ m_kernel_interrupt_fn = fn;
+ m_kernel_interrupt_user_data = user_data;
+}
diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/kernel_interrupt.h b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/kernel_interrupt.h
new file mode 100644
index 0000000..44e9b50
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/kernel_interrupt.h
@@ -0,0 +1,75 @@
+/* (C) 1992-2017 Intel Corporation. */
+/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */
+/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */
+/* and/or other countries. Other marks and brands may be claimed as the property */
+/* of others. See Trademarks on intel.com for full list of Intel trademarks or */
+/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */
+/* Your use of Intel Corporation's design tools, logic functions and other */
+/* software and tools, and its AMPP partner logic functions, and any output */
+/* files any of the foregoing (including device programming or simulation */
+/* files), and any associated documentation or information are expressly subject */
+/* to the terms and conditions of the Altera Program License Subscription */
+/* Agreement, Intel MegaCore Function License Agreement, or other applicable */
+/* license agreement, including, without limitation, that your use is for the */
+/* sole purpose of programming logic devices manufactured by Intel and sold by */
+/* Intel or its authorized distributors. Please refer to the applicable */
+/* agreement for further details. */
+
+#ifndef _KERNEL_INTERRUPT_H
+#define _KERNEL_INTERRUPT_H
+
+#include <opae/fpga.h>
+
+#include <atomic>
+#include <thread>
+
+#include "aocl_mmd.h"
+
+namespace intel_opae_mmd {
+
+class eventfd_wrapper;
+
+class KernelInterrupt final {
+ public:
+ KernelInterrupt(fpga_handle fpga_handle_arg, int mmd_handle);
+ ~KernelInterrupt();
+
+ bool initialized() { return m_initialized; }
+
+ void set_kernel_interrupt(aocl_mmd_interrupt_handler_fn fn, void* user_data);
+ void yield();
+ static bool yield_is_enabled();
+
+ void enable_interrupts();
+ void disable_interrupts();
+
+ private:
+#ifndef DLA_MMD // IRQ offsets no longer exist in DLA hardware (removed from board.qsys)
+ void set_interrupt_mask(uint32_t intr_mask);
+#endif
+ void run_kernel_interrupt_fn();
+ bool poll_interrupt(int poll_timeout_arg);
+
+ static void interrupt_polling_thread(KernelInterrupt& obj);
+
+ bool m_initialized;
+ eventfd_wrapper* m_eventfd_wrapper;
+
+ std::thread* m_thread;
+
+ aocl_mmd_interrupt_handler_fn m_kernel_interrupt_fn;
+ void* m_kernel_interrupt_user_data;
+
+ fpga_handle m_fpga_handle;
+ int m_mmd_handle;
+
+ fpga_event_handle m_event_handle;
+
+ // not used and not implemented
+ KernelInterrupt(KernelInterrupt& other);
+ KernelInterrupt& operator=(const KernelInterrupt& other);
+}; // class KernelInterrupt
+
+}; // namespace intel_opae_mmd
+
+#endif // _KERNEL_INTERRUPT_H
diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/memcpy_s_fast.c b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/memcpy_s_fast.c
new file mode 100644
index 0000000..65d7f1a
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/memcpy_s_fast.c
@@ -0,0 +1,133 @@
+// Copyright 2018-2020 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+// This is derived from OPAE + OpenCL PAC BSP
+
+#pragma push_macro("_GNU_SOURCE")
+#undef _GNU_SOURCE
+#define _GNU_SOURCE
+
+#include <assert.h>
+#include <safe_string/safe_string.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "memcpy_s_fast.h"
+#include "x86-sse2.h"
+
+#pragma pop_macro("_GNU_SOURCE")
+
+static void *memcpy_setup(void *dst, size_t max, const void *src, size_t n);
+
+memcpy_fn_t p_memcpy = memcpy_setup; // Initial value points to setup routine
+
+/**
+ * SSE2_memcpy
+ *
+ * @brief memcpy using SSE2 or REP MOVSB
+ * @param[in] dst Pointer to the destination memory
+ * @param[in] max Size in bytes of destination
+ * @param[in] src Pointer to the source memory
+ * @param[in] n Size in bytes to copy
+ * @return dst
+ *
+ */
+static void *SSE2_memcpy(void *dst, size_t max, const void *src, size_t n) {
+ assert(n <= max);
+
+ void *ldst = dst;
+ void *lsrc = (void *)src;
+ if (IS_CL_ALIGNED(src) && IS_CL_ALIGNED(dst)) // 64-byte aligned
+ {
+ if (n >= MIN_SSE2_SIZE) // Arbitrary crossover performance point
+ {
+ debug_print("copying 0x%lx bytes with SSE2\n", (uint64_t)ALIGN_TO_CL(n));
+ aligned_block_copy_sse2((int64_t * __restrict) dst, (int64_t * __restrict) src, ALIGN_TO_CL(n));
+ ldst = (void *)((uint64_t)dst + ALIGN_TO_CL(n));
+ lsrc = (void *)((uint64_t)src + ALIGN_TO_CL(n));
+ n -= ALIGN_TO_CL(n);
+ }
+ } else {
+ if (n >= MIN_SSE2_SIZE) // Arbitrary crossover performance point
+ {
+ debug_print("copying 0x%lx bytes (unaligned) with SSE2\n", (uint64_t)ALIGN_TO_CL(n));
+ unaligned_block_copy_sse2((int64_t * __restrict) dst, (int64_t * __restrict) src, ALIGN_TO_CL(n));
+ ldst = (void *)((uint64_t)dst + ALIGN_TO_CL(n));
+ lsrc = (void *)((uint64_t)src + ALIGN_TO_CL(n));
+ n -= ALIGN_TO_CL(n);
+ }
+ }
+
+ if (n) {
+ register unsigned long int dummy;
+ debug_print("copying 0x%lx bytes with REP MOVSB\n", n);
+ __asm__ __volatile__("rep movsb\n"
+ : "=&D"(ldst), "=&S"(lsrc), "=&c"(dummy)
+ : "0"(ldst), "1"(lsrc), "2"(n)
+ : "memory");
+ }
+
+ return dst;
+}
+
+/**
+ * memcpy_wrap
+ *
+ * @brief Trampoline for memcpy
+ * @param[in] dst Pointer to the destination memory
+ * @param[in] max Size in bytes of destination
+ * @param[in] src Pointer to the source memory
+ * @param[in] n Size in bytes to copy
+ * @return dst
+ *
+ */
+
+#ifdef ENABLE_MEMCPY_ENV_VAR_CHECK
+static void *memcpy_wrap(void *dst, size_t max, const void *src, size_t n) { return memcpy(dst, src, n); }
+#endif // ENABLE_MEMCPY_ENV_VAR_CHECK
+
+/**
+ * memcpy_setup
+ * Will be called on the first memcpy_s_fast invocation only.
+ *
+ * @brief Set up which memcpy routine will be used at runtime
+ * @param[in] dst Pointer to the destination memory
+ * @param[in] max Size in bytes of destination
+ * @param[in] src Pointer to the source memory
+ * @param[in] n Size in bytes to copy
+ * @return dst
+ *
+ */
+
+static void *memcpy_setup(void *dst, size_t max, const void *src, size_t n) {
+ // Default to SSE2_memcpy
+ p_memcpy = SSE2_memcpy;
+
+//
+#ifdef ENABLE_MEMCPY_ENV_VAR_CHECK
+ char *pmemcpy = getenv(USE_MEMCPY_ENV);
+
+ if (pmemcpy) {
+ if (!strcasecmp(pmemcpy, "libc")) {
+ p_memcpy = memcpy_wrap;
+ } else if (!strcasecmp(pmemcpy, "sse2")) {
+ p_memcpy = SSE2_memcpy;
+ } else if (!strcasecmp(pmemcpy, "memcpy_s")) {
+ p_memcpy = (memcpy_fn_t)memcpy_s;
+ }
+ }
+#endif // #ifdef ENABLE_MEMCPY_ENV_VAR_CHECK
+
+ return p_memcpy(dst, max, src, n);
+}
diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/memcpy_s_fast.h b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/memcpy_s_fast.h
new file mode 100644
index 0000000..08056d3
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/memcpy_s_fast.h
@@ -0,0 +1,69 @@
+/* (C) 1992-2017 Intel Corporation. */
+/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */
+/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */
+/* and/or other countries. Other marks and brands may be claimed as the property */
+/* of others. See Trademarks on intel.com for full list of Intel trademarks or */
+/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */
+/* Your use of Intel Corporation's design tools, logic functions and other */
+/* software and tools, and its AMPP partner logic functions, and any output */
+/* files any of the foregoing (including device programming or simulation */
+/* files), and any associated documentation or information are expressly subject */
+/* to the terms and conditions of the Altera Program License Subscription */
+/* Agreement, Intel MegaCore Function License Agreement, or other applicable */
+/* license agreement, including, without limitation, that your use is for the */
+/* sole purpose of programming logic devices manufactured by Intel and sold by */
+/* Intel or its authorized distributors. Please refer to the applicable */
+/* agreement for further details. */
+
+#ifndef MEMCPY_S_FAST_H_
+#define MEMCPY_S_FAST_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+// Constants needed in memcpy routines
+// Arbitrary crossover point for using SSE2 over rep movsb
+#define MIN_SSE2_SIZE 4096
+
+// TODO: hidden environment variables to experiment with performance
+// in production software are not a good idea in my opinion. Commenting out
+// for now but hopefully can remove this code completely in the long term.
+//#define USE_MEMCPY_ENV "PAC_MEMCPY"
+
+#define CACHE_LINE_SIZE 64
+#define ALIGN_TO_CL(x) ((uint64_t)(x) & ~(CACHE_LINE_SIZE - 1))
+#define IS_CL_ALIGNED(x) (((uint64_t)(x) & (CACHE_LINE_SIZE - 1)) == 0)
+
+// Convenience macros
+#ifdef DEBUG_MEM
+#define debug_print(fmt, ...) \
+ do { \
+ if (FPGA_DMA_DEBUG) { \
+ fprintf(stderr, "%s (%d) : ", __FUNCTION__, __LINE__); \
+ fprintf(stderr, fmt, ##__VA_ARGS__); \
+ } \
+ } while (0)
+
+#define error_print(fmt, ...) \
+ do { \
+ fprintf(stderr, "%s (%d) : ", __FUNCTION__, __LINE__); \
+ fprintf(stderr, fmt, ##__VA_ARGS__); \
+ err_cnt++; \
+ } while (0)
+#else
+#define debug_print(...)
+#define error_print(...)
+#endif
+
+typedef void *(*memcpy_fn_t)(void *dst, size_t max, const void *src, size_t len);
+
+extern memcpy_fn_t p_memcpy;
+
+#define memcpy_s_fast(a, b, c, d) p_memcpy(a, b, c, d)
+
+#ifdef __cplusplus
+}
+#endif // __cplusplus
+
+#endif // MEMCPY_S_FAST_H
diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/mmd_dma.cpp b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/mmd_dma.cpp
new file mode 100644
index 0000000..92337a3
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/mmd_dma.cpp
@@ -0,0 +1,434 @@
+/* (C) 1992-2017 Intel Corporation. */
+/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */
+/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */
+/* and/or other countries. Other marks and brands may be claimed as the property */
+/* of others. See Trademarks on intel.com for full list of Intel trademarks or */
+/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */
+/* Your use of Intel Corporation's design tools, logic functions and other */
+/* software and tools, and its AMPP partner logic functions, and any output */
+/* files any of the foregoing (including device programming or simulation */
+/* files), and any associated documentation or information are expressly subject */
+/* to the terms and conditions of the Altera Program License Subscription */
+/* Agreement, Intel MegaCore Function License Agreement, or other applicable */
+/* license agreement, including, without limitation, that your use is for the */
+/* sole purpose of programming logic devices manufactured by Intel and sold by */
+/* Intel or its authorized distributors. Please refer to the applicable */
+/* agreement for further details. */
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+
+#include <safe_string/safe_string.h>
+#include "memcpy_s_fast.h"
+
+#include "ccip_mmd_device.h"
+#include "mmd_dma.h"
+
+using namespace intel_opae_mmd;
+
+// disable dma and only use mmio. this is very slow.
+//#define DISABLE_DMA
+
+// Each MSGDMA_BBB DFH is now 0x100 instead of 0x2_0000 (it needed to be 0x2_0000 previously because
+// the ASE component was within the msgdma_bbb.qsys).
+// Original addressing:
+// board_afu_dfh: 0x0-0x3f.
+// msgdma_bbb_csr: 0x2_0000-0x2_1fff.
+// Original range at board.ddr_board.msgdma_bbb: 0x2_0000- 0x2_1fff.
+// DFH : 0x0-0x3f.
+// ASE.cntl : 0x200-0x207.
+// ASE.windowed_slave : 0x1000-0x1fff.
+// Current addressing (with ASE removed from the msgdma_bbb and now living on its own in ddr_board.qsys):
+// From top-level board.qsys (base address 0x0):
+// board | dfh : 0x0_0000 - 0x0_003f
+// board | ddr_board.ase : 0x1_0000 - 0x1_1fff
+// board | ddr_board.msgdma_bbb_0 : 0x2_0000 - 0x2_007f
+// board | ddr_board.msgdma_bbb_1 : 0x2_0100 - 0x2_017f
+// board | ddr_board.null_dfh : 0x2_0200 - 0x2_023f
+// From ase.qsys (base address: 0x1_0000):
+// board.ddr_board.ase.dfh_csr : 0x0-0x3f
+// board.ddr_board.ase.ASE.cntl : 0x200-0x207
+// board.ddr_board.ase.ASE.windowed_slave : 0x1000-0x1fff
+// From msgdma_bbb.qsys inst0 (base address: 0x2_0000)
+// board.ddr_board.msgdma_bbb_inst_0.dfh_csr : 0x0-0x3f
+// board.ddr_board.msgdma_bbb_inst_0.modular_sgdma_dispatcher.CSR : 0x40-0x5f
+// board.ddr_board.msgdma_bbb_inst_0.modular_sgdma_dispatcher.Descriptor_slave : 0x60-0x7f
+// From msgdma_bbb.qsys inst1 (base address: 0x2_0100)
+// board.ddr_board.msgdma_bbb_inst_1.dfh_csr : 0x0-0x3f
+// board.ddr_board.msgdma_bbb_inst_1.modular_sgdma_dispatcher.CSR : 0x40-0x5f
+// board.ddr_board.msgdma_bbb_inst_1.modular_sgdma_dispatcher.Descriptor_slave : 0x60-0x7f
+
+// ASE (address-span extender) register offsets, relative to the ASE BBB
+// base address: CRTL selects which 4 KiB page of device memory the
+// windowed slave exposes; MEM is the start of that 4 KiB window.
+#define MEM_WINDOW_CRTL 0x200
+#define MEM_WINDOW_MEM 0x1000
+#define MEM_WINDOW_SPAN (4 * 1024)
+#define MEM_WINDOW_SPAN_MASK ((long)(MEM_WINDOW_SPAN - 1))
+// Transfers smaller than MINIMUM_DMA_SIZE go over MMIO instead of DMA;
+// DMA requires device addresses/lengths aligned to DMA_ALIGNMENT bytes.
+#define MINIMUM_DMA_SIZE 256
+#define DMA_ALIGNMENT 256
+
+#ifdef DEBUG_MEM
+#define DCP_DEBUG_DMA(...) fprintf(stderr, __VA_ARGS__)
+#else
+#define DCP_DEBUG_DMA(...)
+#endif
+
+// Construct the DMA engine for one channel: open the OPAE DMA channel
+// (skipped when DISABLE_DMA is defined) and start the worker thread that
+// services queued transfers. On any failure m_initialized remains false;
+// callers must check initialized() before using the object.
+mmd_dma::mmd_dma(fpga_handle fpga_handle_arg,
+ int mmd_handle,
+ uint64_t dfh_offset_arg,
+ uint64_t ase_bbb_addr_arg,
+ int interrupt_num_arg)
+ : m_initialized(false),
+ m_dma_op_mutex(),
+ m_status_handler_fn(NULL),
+ m_status_handler_user_data(NULL),
+ m_fpga_handle(fpga_handle_arg),
+ m_mmd_handle(mmd_handle),
+ dfh_offset(dfh_offset_arg),
+ interrupt_num(interrupt_num_arg),
+ dma_h(NULL),
+ msgdma_bbb_base_addr(0),
+ ase_bbb_base_addr(ase_bbb_addr_arg) {
+#ifndef DISABLE_DMA
+
+ fpga_result res;
+ res = fpgaDmaChannelOpen(m_fpga_handle, dfh_offset, interrupt_num, &dma_h);
+ if (res != FPGA_OK) {
+ // No worker thread is created when the channel cannot be opened.
+ m_dma_work_thread = NULL;
+ fprintf(stderr, "Error initializing DMA: %s\n", fpgaErrStr(res));
+ return;
+ }
+#endif // DISABLE_DMA
+
+ // Worker thread that drains the DMA request queue (see dma_work_thread).
+ m_dma_work_thread = new dma_work_thread(*this);
+ if (!m_dma_work_thread->initialized()) {
+ return;
+ }
+
+ m_initialized = true;
+}
+
+// Tear down in the reverse order of construction: stop the worker thread
+// first so no transfer can be in flight, then close the DMA channel.
+mmd_dma::~mmd_dma() {
+ // kill the thread
+ if (m_dma_work_thread) {
+ delete m_dma_work_thread;
+ m_dma_work_thread = NULL;
+ }
+
+ if (dma_h) {
+ if (fpgaDmaClose(dma_h) != FPGA_OK) fprintf(stderr, "Error closing DMA\n");
+ }
+ m_initialized = false;
+}
+
+// Close and re-open the OPAE DMA channel (used after FPGA reconfiguration,
+// see header). m_initialized is cleared while the channel is down; if
+// either the close or the re-open fails the object is left uninitialized
+// with dma_h == NULL and subsequent calls become no-ops.
+void mmd_dma::reinit_dma() {
+ if (!m_initialized) return;
+
+ if (dma_h) {
+ m_initialized = false;
+
+ fpga_result res;
+ res = fpgaDmaClose(dma_h);
+ dma_h = NULL;
+ if (res != FPGA_OK) {
+ fprintf(stderr, "Error closing DMA\n");
+ return;
+ }
+
+ res = fpgaDmaChannelOpen(m_fpga_handle, dfh_offset, interrupt_num, &dma_h);
+ if (res != FPGA_OK) {
+ fprintf(stderr, "Error initializing DMA: %s\n", fpgaErrStr(res));
+ return;
+ }
+
+ m_initialized = true;
+ }
+}
+
+// Register the callback (and its opaque user data) that event_update_fn
+// invokes when a queued operation completes.
+void mmd_dma::set_status_handler(aocl_mmd_status_handler_fn fn, void *user_data) {
+ m_status_handler_fn = fn;
+ m_status_handler_user_data = user_data;
+}
+
+// Forward a completion status for 'op' to the registered handler.
+// NOTE(review): m_status_handler_fn is not checked for NULL here — this
+// assumes set_status_handler() was called before any transfer completes;
+// confirm against the callers.
+void mmd_dma::event_update_fn(aocl_mmd_op_t op, int status) {
+ m_status_handler_fn(m_mmd_handle, m_status_handler_user_data, op, status);
+}
+
+// Execute one queued transfer (called from the DMA worker thread).
+// Exactly one of item.rd_host_addr / item.wr_host_addr must be set; the
+// set pointer selects the transfer direction. Returns the result of the
+// underlying read_memory/write_memory call and, if item.op is non-NULL,
+// reports completion through the registered status handler.
+fpga_result mmd_dma::do_dma(dma_work_item &item) {
+  // main dma function needs to be thread safe because dma csr operations
+  // are not thread safe
+  std::lock_guard<std::mutex> lock(m_dma_op_mutex);
+
+  fpga_result res = FPGA_OK;
+  assert(item.rd_host_addr != NULL || item.wr_host_addr != NULL);
+
+  // Tell the kernel we'll need these pages and access is sequential.
+  // madvise() requires a page-aligned start address, so round the host
+  // address down to a page boundary and extend the length by the amount
+  // rounded off so the advice still covers the whole transfer.
+  // (Bug fix: the old computation masked the sub-page offset away with
+  // ~(pagesize - 1) and tested a single bit of an already-aligned
+  // address, so the rounded-off prefix was never covered.)
+  uint64_t addr = item.rd_host_addr ? (uint64_t)item.rd_host_addr : (uint64_t)item.wr_host_addr;
+  size_t page_offset = (size_t)(addr & ((uint64_t)getpagesize() - 1));
+  addr -= page_offset;  // align down to page boundary
+  madvise((void *)addr, item.size + page_offset, MADV_SEQUENTIAL);
+
+  if (item.rd_host_addr) {
+    res = read_memory(item.rd_host_addr, item.dev_addr, item.size);
+  } else {
+    assert(item.wr_host_addr);
+    res = write_memory(item.wr_host_addr, item.dev_addr, item.size);
+  }
+
+  if (item.op) {
+    // TODO: check what 'status' value should really be. Right now just
+    // using 0 as was done in previous CCIP MMD. Also handle case if op is NULL
+    event_update_fn(item.op, 0);
+  }
+
+  return res;
+}
+
+// Hand the work item to the worker thread's queue.
+// NOTE(review): the thread's return code is cast directly to fpga_result —
+// confirm the two enums' values actually correspond.
+fpga_result mmd_dma::enqueue_dma(dma_work_item &item) {
+ return static_cast<fpga_result>(m_dma_work_thread->enqueue_dma(item));
+}
+
+// Public async read: package the request as a work item (rd_host_addr set,
+// wr_host_addr NULL marks it as a device-to-host transfer) and queue it;
+// 'op' is reported back via the status handler when the item completes.
+fpga_result mmd_dma::read_memory(aocl_mmd_op_t op, uint64_t *host_addr, size_t dev_addr, size_t size) {
+ assert(host_addr);
+ dma_work_item item;
+ item.op = op;
+ item.rd_host_addr = host_addr;
+ item.wr_host_addr = NULL;
+ item.dev_addr = dev_addr;
+ item.size = size;
+
+ return enqueue_dma(item);
+}
+
+// Public async write: mirror of the read variant above, with wr_host_addr
+// set (and rd_host_addr NULL) to mark a host-to-device transfer.
+fpga_result mmd_dma::write_memory(aocl_mmd_op_t op, const uint64_t *host_addr, size_t dev_addr, size_t size) {
+ assert(host_addr);
+ dma_work_item item;
+ item.op = op;
+ item.rd_host_addr = NULL;
+ item.wr_host_addr = host_addr;
+ item.dev_addr = dev_addr;
+ item.size = size;
+
+ return enqueue_dma(item);
+}
+
+// Device-to-host transfer, splitting on alignment:
+//  - dev_addr not 8-byte aligned: read the sub-8-byte head via MMIO, then
+//    recurse on the now 8-byte-aligned remainder;
+//  - dev_addr 8-byte but not DMA_ALIGNMENT-aligned, or size below
+//    MINIMUM_DMA_SIZE: do the whole transfer via MMIO;
+//  - otherwise: DMA the DMA_ALIGNMENT-multiple bulk, then MMIO the tail.
+fpga_result mmd_dma::read_memory(uint64_t *host_addr, size_t dev_addr, size_t size) {
+ DCP_DEBUG_DMA("DCP DEBUG: read_memory %p %lx %ld\n", host_addr, dev_addr, size);
+ fpga_result res = FPGA_OK;
+
+ // check for alignment
+ if (dev_addr % DMA_ALIGNMENT != 0) {
+ // check for mmio alignment
+ uint64_t mmio_shift = dev_addr % 8;
+ if (mmio_shift != 0) {
+ size_t unaligned_size = 8 - mmio_shift;
+ if (unaligned_size > size) unaligned_size = size;
+
+ read_memory_mmio_unaligned(host_addr, dev_addr, unaligned_size);
+
+ // Recurse on the rest, which now starts 8-byte aligned.
+ if (size > unaligned_size)
+ res = read_memory(
+ (uint64_t *)(((char *)host_addr) + unaligned_size), dev_addr + unaligned_size, size - unaligned_size);
+ return res;
+ }
+
+ // TODO: need to do a shift here
+ return read_memory_mmio(host_addr, dev_addr, size);
+ }
+
+ // check size
+ if (size < MINIMUM_DMA_SIZE) return read_memory_mmio(host_addr, dev_addr, size);
+
+ // DMA the aligned bulk; the sub-DMA_ALIGNMENT tail goes over MMIO below.
+ size_t remainder = (size % DMA_ALIGNMENT);
+ size_t dma_size = size - remainder;
+
+#ifdef DISABLE_DMA
+ res = read_memory_mmio(host_addr, dev_addr, dma_size);
+#else
+ res = fpgaDmaTransferSync(dma_h, (uint64_t)host_addr /*dst*/, dev_addr /*src*/, dma_size, FPGA_TO_HOST_MM);
+#endif
+ if (res != FPGA_OK) return res;
+
+ if (remainder) res = read_memory_mmio(host_addr + dma_size / 8, dev_addr + dma_size, remainder);
+
+ if (res != FPGA_OK) return res;
+
+ DCP_DEBUG_DMA("DCP DEBUG: host_addr=%p, dev_addr=%lx, size=%ld\n", host_addr, dev_addr, size);
+ DCP_DEBUG_DMA("DCP DEBUG: remainder=%ld, dma_size=%ld, size=%ld\n", remainder, dma_size, size);
+
+ DCP_DEBUG_DMA("DCP DEBUG: mmd_dma::read_memory done!\n");
+ return FPGA_OK;
+}
+
+// Read 'size' bytes (size + (dev_addr % 8) must not exceed 8) that do not
+// start on an 8-byte boundary: program the ASE window to the page holding
+// dev_addr, read the enclosing aligned 64-bit word, then copy out just the
+// requested bytes starting at the intra-word offset.
+fpga_result mmd_dma::read_memory_mmio_unaligned(void *host_addr, size_t dev_addr, size_t size) {
+ DCP_DEBUG_DMA("DCP DEBUG: read_memory_mmio_unaligned %p %lx %ld\n", host_addr, dev_addr, size);
+ fpga_result res = FPGA_OK;
+
+ uint64_t shift = dev_addr % 8;
+
+ assert(size + shift <= 8);
+
+ // Select the 4 KiB device page visible through the windowed slave.
+ uint64_t cur_mem_page = dev_addr & ~MEM_WINDOW_SPAN_MASK;
+ res = fpgaWriteMMIO64(m_fpga_handle, 0, ase_bbb_base_addr + MEM_WINDOW_CRTL, cur_mem_page);
+ if (res != FPGA_OK) return res;
+
+ uint64_t dev_aligned_addr = dev_addr - shift;
+
+ // read data from device memory
+ uint64_t read_tmp;
+ res = fpgaReadMMIO64(
+ m_fpga_handle, 0, (ase_bbb_base_addr + MEM_WINDOW_MEM) + ((dev_aligned_addr)&MEM_WINDOW_SPAN_MASK), &read_tmp);
+ if (res != FPGA_OK) return res;
+ // overlay our data
+ memcpy_s_fast(host_addr, size, ((char *)(&read_tmp)) + shift, size);
+
+ return FPGA_OK;
+}
+
+// Read 'size' bytes via 64-bit MMIO reads through the ASE windowed slave,
+// re-programming the window whenever the transfer crosses a 4 KiB page.
+// dev_addr must be 8-byte aligned; a sub-8-byte tail is handled by
+// read_memory_mmio_unaligned.
+fpga_result mmd_dma::read_memory_mmio(uint64_t *host_addr, size_t dev_addr, size_t size) {
+ DCP_DEBUG_DMA("DCP DEBUG: read_memory_mmio %p %lx %ld\n", host_addr, dev_addr, size);
+
+ fpga_result res = FPGA_OK;
+ uint64_t cur_mem_page = dev_addr & ~MEM_WINDOW_SPAN_MASK;
+ res = fpgaWriteMMIO64(m_fpga_handle, 0, ase_bbb_base_addr + MEM_WINDOW_CRTL, cur_mem_page);
+ if (res != FPGA_OK) return res;
+ DCP_DEBUG_DMA("DCP DEBUG: set page %08lx\n", cur_mem_page);
+ for (size_t i = 0; i < size / 8; i++) {
+ // Re-aim the window when we step onto a new device page.
+ uint64_t mem_page = dev_addr & ~MEM_WINDOW_SPAN_MASK;
+ if (mem_page != cur_mem_page) {
+ cur_mem_page = mem_page;
+ res = fpgaWriteMMIO64(m_fpga_handle, 0, ase_bbb_base_addr + MEM_WINDOW_CRTL, cur_mem_page);
+ if (res != FPGA_OK) return res;
+ DCP_DEBUG_DMA("DCP DEBUG: set page %08lx\n", cur_mem_page);
+ }
+ DCP_DEBUG_DMA("DCP DEBUG: read data %8p %08lx %16p\n", host_addr, dev_addr, host_addr);
+ res = fpgaReadMMIO64(
+ m_fpga_handle, 0, (ase_bbb_base_addr + MEM_WINDOW_MEM) + (dev_addr & MEM_WINDOW_SPAN_MASK), host_addr);
+ if (res != FPGA_OK) return res;
+
+ host_addr += 1;
+ dev_addr += 8;
+ }
+
+ // Remaining 1-7 bytes cannot be read as a full 64-bit word.
+ if (size % 8 != 0) {
+ res = read_memory_mmio_unaligned(host_addr, dev_addr, size % 8);
+ if (res != FPGA_OK) return res;
+ }
+
+ DCP_DEBUG_DMA("DCP DEBUG: mmd_dma::read_memory_mmio done!\n");
+ return FPGA_OK;
+}
+
+// Host-to-device transfer; mirrors read_memory's splitting strategy:
+// MMIO for the sub-8-byte head, then either all-MMIO (small or not
+// DMA-aligned) or DMA for the aligned bulk plus a recursive call for the
+// sub-DMA_ALIGNMENT tail.
+fpga_result mmd_dma::write_memory(const uint64_t *host_addr, size_t dev_addr, size_t size) {
+ DCP_DEBUG_DMA("DCP DEBUG: write_memory %p %lx %ld\n", host_addr, dev_addr, size);
+ fpga_result res = FPGA_OK;
+
+ // check for alignment
+ if (dev_addr % DMA_ALIGNMENT != 0) {
+ // check for mmio alignment
+ uint64_t mmio_shift = dev_addr % 8;
+ if (mmio_shift != 0) {
+ size_t unaligned_size = 8 - mmio_shift;
+ if (unaligned_size > size) unaligned_size = size;
+
+ DCP_DEBUG_DMA("DCP DEBUG: write_memory %ld %ld %ld\n", mmio_shift, unaligned_size, size);
+ write_memory_mmio_unaligned(host_addr, dev_addr, unaligned_size);
+
+ // Recurse on the rest, which now starts 8-byte aligned.
+ if (size > unaligned_size)
+ res = write_memory(
+ (uint64_t *)(((char *)host_addr) + unaligned_size), dev_addr + unaligned_size, size - unaligned_size);
+ return res;
+ }
+
+ // TODO: need to do a shift here
+ return write_memory_mmio(host_addr, dev_addr, size);
+ }
+
+ // check size
+ if (size < MINIMUM_DMA_SIZE) return write_memory_mmio(host_addr, dev_addr, size);
+
+ size_t remainder = (size % DMA_ALIGNMENT);
+ size_t dma_size = size - remainder;
+
+// TODO: make switch for MMIO
+#ifdef DISABLE_DMA
+ res = write_memory_mmio(host_addr, dev_addr, dma_size);
+#else
+ res = fpgaDmaTransferSync(dma_h, dev_addr /*dst*/, (uint64_t)host_addr /*src*/, dma_size, HOST_TO_FPGA_MM);
+#endif
+ if (res != FPGA_OK) return res;
+
+ if (remainder) res = write_memory(host_addr + dma_size / 8, dev_addr + dma_size, remainder);
+
+ if (res != FPGA_OK) return res;
+
+ DCP_DEBUG_DMA("DCP DEBUG: host_addr=%p, dev_addr=%lx, size=%ld\n", host_addr, dev_addr, size);
+ DCP_DEBUG_DMA("DCP DEBUG: remainder=%ld, dma_size=%ld, size=%ld\n", remainder, dma_size, size);
+
+ DCP_DEBUG_DMA("DCP DEBUG: mmd_dma::write_memory done!\n");
+ return FPGA_OK;
+}
+
+// Write 'size' bytes (size + (dev_addr % 8) must not exceed 8) that do not
+// start on an 8-byte boundary. The device is only addressable in aligned
+// 64-bit words, so do a read-modify-write: fetch the enclosing word,
+// overlay the caller's bytes at the intra-word offset, write it back.
+fpga_result mmd_dma::write_memory_mmio_unaligned(const uint64_t *host_addr, size_t dev_addr, size_t size) {
+ DCP_DEBUG_DMA("DCP DEBUG: write_memory_mmio_unaligned %p %lx %ld\n", host_addr, dev_addr, size);
+ fpga_result res = FPGA_OK;
+
+ uint64_t shift = dev_addr % 8;
+
+ assert(size + shift <= 8);
+
+ // Select the 4 KiB device page visible through the windowed slave.
+ uint64_t cur_mem_page = dev_addr & ~MEM_WINDOW_SPAN_MASK;
+ res = fpgaWriteMMIO64(m_fpga_handle, 0, ase_bbb_base_addr + MEM_WINDOW_CRTL, cur_mem_page);
+ if (res != FPGA_OK) return res;
+
+ uint64_t dev_aligned_addr = dev_addr - shift;
+
+ // read data from device memory
+ uint64_t read_tmp;
+ res = fpgaReadMMIO64(
+ m_fpga_handle, 0, (ase_bbb_base_addr + MEM_WINDOW_MEM) + ((dev_aligned_addr)&MEM_WINDOW_SPAN_MASK), &read_tmp);
+ if (res != FPGA_OK) return res;
+ // overlay our data
+ memcpy_s_fast(((char *)(&read_tmp)) + shift, size, host_addr, size);
+
+ // write back to device
+ res = fpgaWriteMMIO64(
+ m_fpga_handle, 0, (ase_bbb_base_addr + MEM_WINDOW_MEM) + (dev_aligned_addr & MEM_WINDOW_SPAN_MASK), read_tmp);
+ if (res != FPGA_OK) return res;
+
+ return FPGA_OK;
+}
+
+// Write 'size' bytes via 64-bit MMIO writes through the ASE windowed
+// slave, re-programming the window on each 4 KiB page crossing. dev_addr
+// must be 8-byte aligned; a sub-8-byte tail goes through the unaligned
+// read-modify-write helper.
+fpga_result mmd_dma::write_memory_mmio(const uint64_t *host_addr, size_t dev_addr, size_t size) {
+ DCP_DEBUG_DMA("DCP DEBUG: write_memory_mmio %p %lx %ld\n", host_addr, dev_addr, size);
+
+ fpga_result res = FPGA_OK;
+ uint64_t cur_mem_page = dev_addr & ~MEM_WINDOW_SPAN_MASK;
+ res = fpgaWriteMMIO64(m_fpga_handle, 0, ase_bbb_base_addr + MEM_WINDOW_CRTL, cur_mem_page);
+ if (res != FPGA_OK) return res;
+ DCP_DEBUG_DMA("DCP DEBUG: set page %08lx\n", cur_mem_page);
+ for (size_t i = 0; i < size / 8; i++) {
+ // Re-aim the window when we step onto a new device page.
+ uint64_t mem_page = dev_addr & ~MEM_WINDOW_SPAN_MASK;
+ if (mem_page != cur_mem_page) {
+ cur_mem_page = mem_page;
+ res = fpgaWriteMMIO64(m_fpga_handle, 0, ase_bbb_base_addr + MEM_WINDOW_CRTL, cur_mem_page);
+ if (res != FPGA_OK) return res;
+ DCP_DEBUG_DMA("DCP DEBUG: set page %08lx\n", cur_mem_page);
+ }
+ DCP_DEBUG_DMA("DCP DEBUG: write data %8p %08lx %016lx\n", host_addr, dev_addr, *host_addr);
+ res = fpgaWriteMMIO64(
+ m_fpga_handle, 0, (ase_bbb_base_addr + MEM_WINDOW_MEM) + (dev_addr & MEM_WINDOW_SPAN_MASK), *host_addr);
+ if (res != FPGA_OK) return res;
+
+ host_addr += 1;
+ dev_addr += 8;
+ }
+
+ // Remaining 1-7 bytes cannot be written as a full 64-bit word.
+ if (size % 8 != 0) {
+ res = write_memory_mmio_unaligned(host_addr, dev_addr, size % 8);
+ if (res != FPGA_OK) return res;
+ }
+
+ DCP_DEBUG_DMA("DCP DEBUG: aocl_mmd_write done!\n");
+ return FPGA_OK;
+}
diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/mmd_dma.h b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/mmd_dma.h
new file mode 100644
index 0000000..ff33aed
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/mmd_dma.h
@@ -0,0 +1,97 @@
+/* (C) 1992-2017 Intel Corporation. */
+/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */
+/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */
+/* and/or other countries. Other marks and brands may be claimed as the property */
+/* of others. See Trademarks on intel.com for full list of Intel trademarks or */
+/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */
+/* Your use of Intel Corporation's design tools, logic functions and other */
+/* software and tools, and its AMPP partner logic functions, and any output */
+/* files any of the foregoing (including device programming or simulation */
+/* files), and any associated documentation or information are expressly subject */
+/* to the terms and conditions of the Altera Program License Subscription */
+/* Agreement, Intel MegaCore Function License Agreement, or other applicable */
+/* license agreement, including, without limitation, that your use is for the */
+/* sole purpose of programming logic devices manufactured by Intel and sold by */
+/* Intel or its authorized distributors. Please refer to the applicable */
+/* agreement for further details. */
+
+#ifndef _MMD_DMA_H
+#define _MMD_DMA_H
+
+#pragma push_macro("_GNU_SOURCE")
+#undef _GNU_SOURCE
+#define _GNU_SOURCE
+#include <sched.h>
+#pragma pop_macro("_GNU_SOURCE")
+
+#include <opae/fpga.h>
+
+#include <mutex>
+
+#include "aocl_mmd.h"
+#include "dma_work_thread.h"
+#include "fpga_dma.h"
+
+namespace intel_opae_mmd {
+
+class eventfd_wrapper;
+
+// DMA engine for one channel of the PAC board: queues host<->device
+// transfers on a worker thread and falls back to MMIO through the ASE
+// windowed slave for unaligned or small transfers (see mmd_dma.cpp).
+class mmd_dma final {
+ public:
+ mmd_dma(fpga_handle fpga_handle_arg,
+ int mmd_handle,
+ uint64_t dfh_offset_arg,
+ uint64_t ase_bbb_addr_arg,
+ int interrupt_num_arg);
+ ~mmd_dma();
+
+ // True only when both the DMA channel and the worker thread came up.
+ bool initialized() { return m_initialized; }
+
+ // Async transfers: queued on the worker thread; 'op' is reported back
+ // through the status handler on completion.
+ fpga_result read_memory(aocl_mmd_op_t op, uint64_t *host_addr, size_t dev_addr, size_t size);
+ fpga_result write_memory(aocl_mmd_op_t op, const uint64_t *host_addr, size_t dev_addr, size_t size);
+ fpga_result do_dma(dma_work_item &item);
+
+ void set_status_handler(aocl_mmd_status_handler_fn fn, void *user_data);
+
+ // used after reconfiguration
+ void reinit_dma();
+
+ // NOTE(review): declared here but not defined in this change — confirm
+ // an implementation exists before calling.
+ void bind_to_node(void);
+
+ private:
+ // Helper functions
+ fpga_result enqueue_dma(dma_work_item &item);
+ fpga_result read_memory(uint64_t *host_addr, size_t dev_addr, size_t size);
+ fpga_result write_memory(const uint64_t *host_addr, size_t dev_addr, size_t size);
+ fpga_result read_memory_mmio(uint64_t *host_addr, size_t dev_addr, size_t size);
+ fpga_result write_memory_mmio(const uint64_t *host_addr, size_t dev_addr, size_t size);
+ fpga_result write_memory_mmio_unaligned(const uint64_t *host_addr, size_t dev_addr, size_t size);
+ fpga_result read_memory_mmio_unaligned(void *host_addr, size_t dev_addr, size_t size);
+
+ void event_update_fn(aocl_mmd_op_t op, int status);
+
+ bool m_initialized;
+
+ dma_work_thread *m_dma_work_thread;
+ // Serializes DMA CSR access; held for the whole of do_dma().
+ std::mutex m_dma_op_mutex;
+
+ aocl_mmd_status_handler_fn m_status_handler_fn;
+ void *m_status_handler_user_data;
+
+ fpga_handle m_fpga_handle;
+ int m_mmd_handle;
+
+ uint64_t dfh_offset;
+ int interrupt_num;
+ fpga_dma_handle dma_h;
+ uint64_t msgdma_bbb_base_addr;
+ // MMIO base of the address-span extender used for the MMIO fallback path.
+ uint64_t ase_bbb_base_addr;
+
+ // not used and not implemented
+ mmd_dma(mmd_dma &other);
+ mmd_dma &operator=(const mmd_dma &other);
+}; // class mmd_dma
+
+}; // namespace intel_opae_mmd
+
+#endif // _MMD_DMA_H
diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/x86-sse2.S b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/x86-sse2.S
new file mode 100644
index 0000000..e1fb5d3
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/x86-sse2.S
@@ -0,0 +1,269 @@
+// From TinyMembench v0.4, with slight modifications for Windows.
+/*
+ * Copyright © 2011 Siarhei Siamashka <siarhei.siamashka@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#if defined(__i386__) || defined(__amd64__)
+
+.intel_syntax noprefix
+.text
+
+#define PREFETCH_DISTANCE 256
+
+/* Define a global function and bind DST/SRC/SIZE to the argument
+   locations of the active calling convention: Win64 (rcx/rdx/r8),
+   SysV amd64 (rdi/rsi/rdx), or 32-bit cdecl, where the arguments are
+   loaded from the stack into eax/ecx/edx on entry. */
+.macro asm_function_helper function_name
+ .global \function_name
+.func \function_name
+\function_name:
+#ifdef __amd64__
+ #ifdef _WIN64
+ .set DST, rcx
+ .set SRC, rdx
+ .set SIZE, r8
+ #else
+ .set DST, rdi
+ .set SRC, rsi
+ .set SIZE, rdx
+ #endif
+#else
+ mov eax, [esp + 4]
+ mov ecx, [esp + 8]
+ mov edx, [esp + 12]
+ .set DST, eax
+ .set SRC, ecx
+ .set SIZE, edx
+#endif
+.endm
+
+/* 32-bit Windows prepends an underscore to C symbol names. */
+.macro asm_function function_name
+#if defined(_WIN32) && !defined(_WIN64)
+ asm_function_helper _\function_name
+#else
+ asm_function_helper \function_name
+#endif
+.endm
+
+/* push3/pop3 save and restore three registers; pop3 pops in reverse
+   order so the pairing push3 a b c / pop3 a b c round-trips. */
+.macro push3 a, b, c
+ push \a
+ push \b
+ push \c
+.endm
+
+.macro pop3 a, b, c
+ pop \c
+ pop \b
+ pop \a
+.endm
+
+/*****************************************************************************/
+
+/* Copy SIZE bytes from SRC to DST with "rep movsb". rdi/rsi/rcx are
+   saved, loaded from DST/SRC/SIZE via the push3/pop3 shuffle, and
+   restored afterwards. */
+asm_function aligned_block_copy_movsb
+0:
+#ifdef __amd64__
+ push3 rdi rsi rcx
+ push3 DST SRC SIZE
+ pop3 rdi rsi rcx
+ rep movsb
+ pop3 rdi rsi rcx
+#else
+ push3 edi esi ecx
+ push3 DST SRC SIZE
+ pop3 edi esi ecx
+ rep movsb
+ pop3 edi esi ecx
+#endif
+ ret
+.endfunc
+
+/* Same as above but copies 4-byte units: SIZE is divided by 4 (sar 2)
+   before "rep movsd". */
+asm_function aligned_block_copy_movsd
+0:
+#ifdef __amd64__
+ push3 rdi rsi rcx
+ push3 DST SRC SIZE
+ pop3 rdi rsi rcx
+ sar rcx, 2
+ rep movsd
+ pop3 rdi rsi rcx
+#else
+ push3 edi esi ecx
+ push3 DST SRC SIZE
+ pop3 edi esi ecx
+ sar ecx, 2
+ rep movsd
+ pop3 edi esi ecx
+#endif
+ ret
+.endfunc
+
+/* SSE2 copy, 64 bytes per iteration, unaligned loads/stores (movdqu);
+   loops while SIZE stays positive after subtracting 64. */
+asm_function unaligned_block_copy_sse2
+0:
+ movdqu xmm0, [SRC + 0]
+ movdqu xmm1, [SRC + 16]
+ movdqu xmm2, [SRC + 32]
+ movdqu xmm3, [SRC + 48]
+ movdqu [DST + 0], xmm0
+ movdqu [DST + 16], xmm1
+ movdqu [DST + 32], xmm2
+ movdqu [DST + 48], xmm3
+ add SRC, 64
+ add DST, 64
+ sub SIZE, 64
+ jg 0b
+ ret
+.endfunc
+
+/* SSE2 copy with aligned loads/stores (movdqa): SRC and DST must be
+   16-byte aligned. */
+asm_function aligned_block_copy_sse2
+0:
+ movdqa xmm0, [SRC + 0]
+ movdqa xmm1, [SRC + 16]
+ movdqa xmm2, [SRC + 32]
+ movdqa xmm3, [SRC + 48]
+ movdqa [DST + 0], xmm0
+ movdqa [DST + 16], xmm1
+ movdqa [DST + 32], xmm2
+ movdqa [DST + 48], xmm3
+ add SRC, 64
+ add DST, 64
+ sub SIZE, 64
+ jg 0b
+ ret
+.endfunc
+
+/* Aligned copy with non-temporal stores (movntdq): bypasses the cache
+   on the store side. */
+asm_function aligned_block_copy_nt_sse2
+0:
+ movdqa xmm0, [SRC + 0]
+ movdqa xmm1, [SRC + 16]
+ movdqa xmm2, [SRC + 32]
+ movdqa xmm3, [SRC + 48]
+ movntdq [DST + 0], xmm0
+ movntdq [DST + 16], xmm1
+ movntdq [DST + 32], xmm2
+ movntdq [DST + 48], xmm3
+ add SRC, 64
+ add DST, 64
+ sub SIZE, 64
+ jg 0b
+ ret
+.endfunc
+
+/* Aligned copy with two prefetchnta hints per iteration (32-byte
+   granularity) at PREFETCH_DISTANCE ahead of SRC. */
+asm_function aligned_block_copy_pf32_sse2
+0:
+ prefetchnta [SRC + PREFETCH_DISTANCE]
+ prefetchnta [SRC + PREFETCH_DISTANCE + 32]
+ movdqa xmm0, [SRC + 0]
+ movdqa xmm1, [SRC + 16]
+ movdqa xmm2, [SRC + 32]
+ movdqa xmm3, [SRC + 48]
+ movdqa [DST + 0], xmm0
+ movdqa [DST + 16], xmm1
+ movdqa [DST + 32], xmm2
+ movdqa [DST + 48], xmm3
+ add SRC, 64
+ add DST, 64
+ sub SIZE, 64
+ jg 0b
+ ret
+.endfunc
+
+/* As pf32 above, but with non-temporal stores. */
+asm_function aligned_block_copy_nt_pf32_sse2
+0:
+ prefetchnta [SRC + PREFETCH_DISTANCE]
+ prefetchnta [SRC + PREFETCH_DISTANCE + 32]
+ movdqa xmm0, [SRC + 0]
+ movdqa xmm1, [SRC + 16]
+ movdqa xmm2, [SRC + 32]
+ movdqa xmm3, [SRC + 48]
+ movntdq [DST + 0], xmm0
+ movntdq [DST + 16], xmm1
+ movntdq [DST + 32], xmm2
+ movntdq [DST + 48], xmm3
+ add SRC, 64
+ add DST, 64
+ sub SIZE, 64
+ jg 0b
+ ret
+.endfunc
+
+/* Aligned copy with one prefetchnta hint per iteration (64-byte
+   granularity). */
+asm_function aligned_block_copy_pf64_sse2
+0:
+ prefetchnta [SRC + PREFETCH_DISTANCE]
+ movdqa xmm0, [SRC + 0]
+ movdqa xmm1, [SRC + 16]
+ movdqa xmm2, [SRC + 32]
+ movdqa xmm3, [SRC + 48]
+ movdqa [DST + 0], xmm0
+ movdqa [DST + 16], xmm1
+ movdqa [DST + 32], xmm2
+ movdqa [DST + 48], xmm3
+ add SRC, 64
+ add DST, 64
+ sub SIZE, 64
+ jg 0b
+ ret
+.endfunc
+
+/* As pf64 above, but with non-temporal stores. */
+asm_function aligned_block_copy_nt_pf64_sse2
+0:
+ prefetchnta [SRC + PREFETCH_DISTANCE]
+ movdqa xmm0, [SRC + 0]
+ movdqa xmm1, [SRC + 16]
+ movdqa xmm2, [SRC + 32]
+ movdqa xmm3, [SRC + 48]
+ movntdq [DST + 0], xmm0
+ movntdq [DST + 16], xmm1
+ movntdq [DST + 32], xmm2
+ movntdq [DST + 48], xmm3
+ add SRC, 64
+ add DST, 64
+ sub SIZE, 64
+ jg 0b
+ ret
+.endfunc
+
+/* Fill: loads a 16-byte pattern once from [SRC], then stores it across
+   DST 64 bytes per iteration; SRC is never written. */
+asm_function aligned_block_fill_sse2
+ movdqa xmm0, [SRC + 0]
+0:
+ movdqa [DST + 0], xmm0
+ movdqa [DST + 16], xmm0
+ movdqa [DST + 32], xmm0
+ movdqa [DST + 48], xmm0
+ add DST, 64
+ sub SIZE, 64
+ jg 0b
+ ret
+.endfunc
+
+/* Fill with non-temporal stores. */
+asm_function aligned_block_fill_nt_sse2
+ movdqa xmm0, [SRC + 0]
+0:
+ movntdq [DST + 0], xmm0
+ movntdq [DST + 16], xmm0
+ movntdq [DST + 32], xmm0
+ movntdq [DST + 48], xmm0
+ add DST, 64
+ sub SIZE, 64
+ jg 0b
+ ret
+.endfunc
+
+/*****************************************************************************/
+
+#endif
diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/x86-sse2.h b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/x86-sse2.h
new file mode 100644
index 0000000..6ebe2ef
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/x86-sse2.h
@@ -0,0 +1,54 @@
+// From TinyMembench v0.4, with slight modifications for Windows.
+/*
+ * Copyright © 2011 Siarhei Siamashka <siarhei.siamashka@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __X86_SSE2_H__
+#define __X86_SSE2_H__
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* All routines are implemented in x86-sse2.S. 'size' is in bytes; the
+   SSE2 loops consume 64 bytes per iteration and loop while the remaining
+   size is positive, so size should be a multiple of 64. The "aligned"
+   SSE2 variants use movdqa and therefore require 16-byte-aligned
+   pointers. */
+void aligned_block_copy_movsb(int64_t* __restrict dst, int64_t* __restrict src, int size);
+void aligned_block_copy_movsd(int64_t* __restrict dst, int64_t* __restrict src, int size);
+
+void aligned_block_copy_sse2(int64_t* __restrict dst, int64_t* __restrict src, int size);
+void unaligned_block_copy_sse2(int64_t* __restrict dst, int64_t* __restrict src, int size);
+void aligned_block_copy_nt_sse2(int64_t* __restrict dst, int64_t* __restrict src, int size);
+
+/* pf32/pf64 issue prefetchnta hints at 32- / 64-byte granularity;
+   the nt_* variants use non-temporal (cache-bypassing) stores. */
+void aligned_block_copy_pf32_sse2(int64_t* __restrict dst, int64_t* __restrict src, int size);
+void aligned_block_copy_pf64_sse2(int64_t* __restrict dst, int64_t* __restrict src, int size);
+
+void aligned_block_copy_nt_pf32_sse2(int64_t* __restrict dst, int64_t* __restrict src, int size);
+void aligned_block_copy_nt_pf64_sse2(int64_t* __restrict dst, int64_t* __restrict src, int size);
+
+/* For the fill routines, 'src' points to a 16-byte pattern that is
+   replicated across dst; src itself is only read. */
+void aligned_block_fill_sse2(int64_t* __restrict dst, int64_t* __restrict src, int size);
+
+void aligned_block_fill_nt_sse2(int64_t* __restrict dst, int64_t* __restrict src, int size);
+
+#ifdef __cplusplus
+}
+#endif
+#endif // __X86_SSE2_H__
diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/include/aocl_mmd.h b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/include/aocl_mmd.h
new file mode 100644
index 0000000..edb46c7
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/include/aocl_mmd.h
@@ -0,0 +1,489 @@
+#ifndef AOCL_MMD_H
+#define AOCL_MMD_H
+
+/* (C) 1992-2019 Intel Corporation. */
+/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */
+/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */
+/* and/or other countries. Other marks and brands may be claimed as the property */
+/* of others. See Trademarks on intel.com for full list of Intel trademarks or */
+/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */
+/* Your use of Intel Corporation's design tools, logic functions and other */
+/* software and tools, and its AMPP partner logic functions, and any output */
+/* files any of the foregoing (including device programming or simulation */
+/* files), and any associated documentation or information are expressly subject */
+/* to the terms and conditions of the Altera Program License Subscription */
+/* Agreement, Intel MegaCore Function License Agreement, or other applicable */
+/* license agreement, including, without limitation, that your use is for the */
+/* sole purpose of programming logic devices manufactured by Intel and sold by */
+/* Intel or its authorized distributors. Please refer to the applicable */
+/* agreement for further details. */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Support for memory mapped ACL devices.
+ *
+ * Typical API lifecycle, from the perspective of the caller.
+ *
+ * 1. aocl_mmd_open must be called first, to provide a handle for further
+ * operations.
+ *
+ * 2. The interrupt and status handlers must be set.
+ *
+ * 3. Read and write operations are performed.
+ *
+ * 4. aocl_mmd_close may be called to shut down the device. No further
+ * operations are permitted until a subsequent aocl_mmd_open call.
+ *
+ * aocl_mmd_get_offline_info can be called anytime including before
+ * open. aocl_mmd_get_info can be called anytime between open and close.
+ */
+
+#ifndef AOCL_MMD_CALL
+#if defined(_WIN32)
+#define AOCL_MMD_CALL __declspec(dllimport)
+#else
+#define AOCL_MMD_CALL __attribute__((visibility ("default")))
+#endif
+#endif
+
+#ifndef WEAK
+#if defined(_WIN32)
+#define WEAK
+#else
+/* This normally comes with "__attribute__((weak))" but for reasons not presently
+ * understood, the shared library is not properly loaded on Ubuntu18 when the functions
+ * are weak.
+ */
+#define WEAK
+#endif
+#endif
+
+#include <cstddef> //size_t
+
+/* The MMD API's version - the runtime expects this string when
+ * AOCL_MMD_VERSION is queried. This changes only if the API has changed */
+#define AOCL_MMD_VERSION_STRING "18.1"
+
+/* Memory types that can be supported - bitfield. Other than physical memory
+ * these types closely align with the OpenCL SVM types.
+ *
+ * AOCL_MMD_PHYSICAL_MEMORY - The vendor interface includes IP to communicate
+ * directly with physical memory such as DDR, QDR, etc.
+ *
+ * AOCL_MMD_SVM_COARSE_GRAIN_BUFFER - The vendor interface includes support for
+ * caching SVM pointer data and requires explicit function calls from the user
+ * to synchronize the cache between the host processor and the FPGA. This level
+ * of SVM is not currently supported by Altera except as a subset of
+ * SVM_FINE_GRAIN_SYSTEM support.
+ *
+ * AOCL_MMD_SVM_FINE_GRAIN_BUFFER - The vendor interface includes support for
+ * caching SVM pointer data and requires additional information from the user
+ * and/or host runtime that can be collected during pointer allocation in order
+ * to synchronize the cache between the host processor and the FPGA. Once this
+ * additional data is provided for an SVM pointer, the vendor interface handles
+ * cache synchronization between the host processor & the FPGA automatically.
+ * This level of SVM is not currently supported by Altera except as a subset
+ * of SVM_FINE_GRAIN_SYSTEM support.
+ *
+ * AOCL_MMD_SVM_FINE_GRAIN_SYSTEM - The vendor interface includes support for
+ * caching SVM pointer data and does not require any additional information to
+ * synchronize the cache between the host processor and the FPGA. The vendor
+ * interface handles cache synchronization between the host processor & the
+ * FPGA automatically for all SVM pointers. This level of SVM support is
+ * currently under development by Altera and some features may not be fully
+ * supported.
+ */
+#define AOCL_MMD_PHYSICAL_MEMORY (1 << 0)
+#define AOCL_MMD_SVM_COARSE_GRAIN_BUFFER (1 << 1)
+#define AOCL_MMD_SVM_FINE_GRAIN_BUFFER (1 << 2)
+#define AOCL_MMD_SVM_FINE_GRAIN_SYSTEM (1 << 3)
+
+/* program modes - bitfield
+ *
+ * AOCL_MMD_PROGRAM_PRESERVE_GLOBAL_MEM - preserve contents of global memory
+ * when this bit is set to 1. If programming can't occur without preserving
+ * global memory contents, the program function must fail, in which case the
+ * runtime may re-invoke program with this bit set to 0, allowing programming
+ * to occur even if doing so destroys global memory contents.
+ *
+ * more modes are reserved for stacking on in the future
+ */
+#define AOCL_MMD_PROGRAM_PRESERVE_GLOBAL_MEM (1 << 0)
+typedef int aocl_mmd_program_mode_t;
+
+typedef void* aocl_mmd_op_t;
+
+typedef struct {
+ unsigned lo; /* 32 least significant bits of time value. */
+ unsigned hi; /* 32 most significant bits of time value. */
+} aocl_mmd_timestamp_t;
+
+/* Defines the set of characteristics that can be probed about the board before
+ * opening a device. The type of data returned by each is specified in
+ * parentheses in the adjacent comment.
+ *
+ * AOCL_MMD_NUM_BOARDS and AOCL_MMD_BOARD_NAMES
+ * These two fields can be used to implement multi-device support. The MMD
+ * layer may have a list of devices it is capable of interacting with, each
+ * identified with a unique name. The length of the list should be returned
+ * in AOCL_MMD_NUM_BOARDS, and the names of these devices returned in
+ * AOCL_MMD_BOARD_NAMES. The OpenCL runtime will try to call aocl_mmd_open
+ * for each board name returned in AOCL_MMD_BOARD_NAMES.
+ *
+ * */
+typedef enum {
+ AOCL_MMD_VERSION = 0, /* Version of MMD (char*)*/
+ AOCL_MMD_NUM_BOARDS = 1, /* Number of candidate boards (int)*/
+ AOCL_MMD_BOARD_NAMES = 2, /* Names of boards available delimiter=; (char*)*/
+ AOCL_MMD_VENDOR_NAME = 3, /* Name of vendor (char*) */
+ AOCL_MMD_VENDOR_ID = 4, /* An integer ID for the vendor (int) */
+ AOCL_MMD_USES_YIELD = 5, /* 1 if yield must be called to poll hw (int) */
+ /* The following can be combined in a bit field:
+ * AOCL_MMD_PHYSICAL_MEMORY, AOCL_MMD_SVM_COARSE_GRAIN_BUFFER, AOCL_MMD_SVM_FINE_GRAIN_BUFFER,
+ * AOCL_MMD_SVM_FINE_GRAIN_SYSTEM. Prior to 14.1, all existing devices supported physical memory and no types of SVM
+ * memory, so this is the default when this operation returns '0' for board MMDs with a version prior to 14.1
+ */
+ AOCL_MMD_MEM_TYPES_SUPPORTED = 6,
+} aocl_mmd_offline_info_t;
+
+/* Defines the set of characteristics that can be probed about the board after
+ * opening a device. This can involve communication to the device
+ *
+ * AOCL_MMD_NUM_KERNEL_INTERFACES - The number of kernel interfaces, usually 1
+ *
+ * AOCL_MMD_KERNEL_INTERFACES - the handle for each kernel interface.
+ * param_value will have size AOCL_MMD_NUM_KERNEL_INTERFACES * sizeof int
+ *
+ * AOCL_MMD_PLL_INTERFACES - the handle for each pll associated with each
+ * kernel interface. If a kernel interface is not clocked by acl_kernel_clk
+ * then return -1
+ *
+ * */
+typedef enum {
+ AOCL_MMD_NUM_KERNEL_INTERFACES = 1, /* Number of Kernel interfaces (int) */
+ AOCL_MMD_KERNEL_INTERFACES = 2, /* Kernel interface (int*) */
+ AOCL_MMD_PLL_INTERFACES = 3, /* Kernel clk handles (int*) */
+ AOCL_MMD_MEMORY_INTERFACE = 4, /* Global memory handle (int) */
+ AOCL_MMD_TEMPERATURE = 5, /* Temperature measurement (float) */
+ AOCL_MMD_PCIE_INFO = 6, /* PCIe information (char*) */
+ AOCL_MMD_BOARD_NAME = 7, /* Name of board (char*) */
+ AOCL_MMD_BOARD_UNIQUE_ID = 8, /* Unique ID of board (int) */
+ AOCL_MMD_CONCURRENT_READS = 9, /* # of parallel reads; 1 is serial*/
+ AOCL_MMD_CONCURRENT_WRITES = 10, /* # of parallel writes; 1 is serial*/
+ AOCL_MMD_CONCURRENT_READS_OR_WRITES = 11 /* total # of concurrent operations read + writes*/
+} aocl_mmd_info_t;
+
+typedef struct {
+ unsigned long long int exception_type;
+ void* user_private_info;
+ size_t user_cb;
+} aocl_mmd_interrupt_info;
+
+typedef void (*aocl_mmd_interrupt_handler_fn)(int handle, void* user_data);
+typedef void (*aocl_mmd_device_interrupt_handler_fn)(int handle, aocl_mmd_interrupt_info* data_in, void* user_data);
+typedef void (*aocl_mmd_status_handler_fn)(int handle, void* user_data, aocl_mmd_op_t op, int status);
+
+/* Get information about the board using the enum aocl_mmd_offline_info_t for
+ * offline info (called without a handle), and the enum aocl_mmd_info_t for
+ * info specific to a certain board.
+ * Arguments:
+ *
+ * requested_info_id - a value from the aocl_mmd_offline_info_t enum
+ *
+ * param_value_size - size of the param_value field in bytes. This should
+ * match the size of the return type expected as indicated in the enum
+ * definition. For example, the AOCL_MMD_TEMPERATURE returns a float, so
+ * the param_value_size should be set to sizeof(float) and you should
+ * expect the same number of bytes returned in param_size_ret.
+ *
+ * param_value - pointer to the variable that will receive the returned info
+ *
+ * param_size_ret - receives the number of bytes of data actually returned
+ *
+ * Returns: a negative value to indicate error.
+ */
+AOCL_MMD_CALL int aocl_mmd_get_offline_info(aocl_mmd_offline_info_t requested_info_id,
+ size_t param_value_size,
+ void* param_value,
+ size_t* param_size_ret) WEAK;
+
+AOCL_MMD_CALL int aocl_mmd_get_info(int handle,
+ aocl_mmd_info_t requested_info_id,
+ size_t param_value_size,
+ void* param_value,
+ size_t* param_size_ret) WEAK;
+
+/* Open and initialize the named device.
+ *
+ * The name is typically one specified by the AOCL_MMD_BOARD_NAMES offline
+ * info.
+ *
+ * Arguments:
+ * name - open the board with this name (provided as a C-style string,
+ * i.e. NUL terminated ASCII.)
+ *
+ * Returns: the non-negative integer handle for the board, otherwise a
+ * negative value to indicate error. Upon receiving the error, the OpenCL
+ * runtime will proceed to open other known devices, hence the MMD mustn't
+ * exit the application if an open call fails.
+ */
+AOCL_MMD_CALL int aocl_mmd_open(const char* name) WEAK;
+
+/* Close an opened device, by its handle.
+ * Returns: 0 on success, negative values on error.
+ */
+AOCL_MMD_CALL int aocl_mmd_close(int handle) WEAK;
+
+/* Set the interrupt handler for the opened device.
+ * The interrupt handler is called whenever the client needs to be notified
+ * of an asynchronous event signalled by the device internals.
+ * For example, the kernel has completed or is stalled.
+ *
+ * Important: Interrupts from the kernel must be ignored until this handler is
+ * set
+ *
+ * Arguments:
+ * fn - the callback function to invoke when a kernel interrupt occurs
+ * user_data - the data that should be passed to fn when it is called.
+ *
+ * Returns: 0 if successful, negative on error
+ */
+AOCL_MMD_CALL int aocl_mmd_set_interrupt_handler(int handle, aocl_mmd_interrupt_handler_fn fn, void* user_data) WEAK;
+
+/* Set the device interrupt handler for the opened device.
+ * The device interrupt handler is called whenever the client needs to be notified
+ * of a device event signalled by the device internals.
+ * For example, an ECC error has been reported.
+ *
+ * Important: Interrupts from the device must be ignored until this handler is
+ * set
+ *
+ * Arguments:
+ * fn - the callback function to invoke when a device interrupt occurs
+ * user_data - the data that should be passed to fn when it is called.
+ *
+ * Returns: 0 if successful, negative on error
+ */
+AOCL_MMD_CALL int aocl_mmd_set_device_interrupt_handler(int handle,
+ aocl_mmd_device_interrupt_handler_fn fn,
+ void* user_data) WEAK;
+
+/* Set the operation status handler for the opened device.
+ * The operation status handler is called with
+ * status 0 when the operation has completed successfully.
+ * status negative when the operation completed with errors.
+ *
+ * Arguments:
+ * fn - the callback function to invoke when a status update is to be
+ * performed.
+ * user_data - the data that should be passed to fn when it is called.
+ *
+ * Returns: 0 if successful, negative on error
+ */
+AOCL_MMD_CALL int aocl_mmd_set_status_handler(int handle, aocl_mmd_status_handler_fn fn, void* user_data) WEAK;
+
+/* If AOCL_MMD_USES_YIELD is 1, this function is called when the host is idle
+ * and hence possibly waiting for events to be processed by the device.
+ * If AOCL_MMD_USES_YIELD is 0, this function is never called and the MMD is
+ * assumed to provide status/event updates via some other execution thread
+ * such as through an interrupt handler.
+ *
+ * Returns: non-zero if the yield function performed useful work such as
+ * processing DMA transactions, 0 if there is no useful work to be performed
+ *
+ * NOTE: yield may be called continuously as long as it reports that it has useful work
+ */
+AOCL_MMD_CALL int aocl_mmd_yield(int handle) WEAK;
+
+/* Read, write and copy operations on a single interface.
+ * If op is NULL
+ * - Then these calls must block until the operation is complete.
+ * - The status handler is not called for this operation.
+ *
+ * If op is non-NULL, then:
+ * - These may be non-blocking calls
+ * - The status handler must be called upon completion, with status 0
+ * for success, and a negative value for failure.
+ *
+ * Arguments:
+ * op - the operation object used to track this operation's progress
+ *
+ * len - the size in bytes to transfer
+ *
+ * src - the host buffer being read from
+ *
+ * dst - the host buffer being written to
+ *
+ * mmd_interface - the handle to the interface being accessed. E.g. To
+ * access global memory this handle will be whatever is returned by
+ * aocl_mmd_get_info when called with AOCL_MMD_MEMORY_INTERFACE.
+ *
+ * offset/src_offset/dst_offset - the byte offset within the interface that
+ * the transfer will begin at.
+ *
+ * The return value is 0 if the operation launch was successful, and
+ * negative otherwise.
+ */
+AOCL_MMD_CALL int aocl_mmd_read(
+ int handle, aocl_mmd_op_t op, size_t len, void* dst, int mmd_interface, size_t offset) WEAK;
+AOCL_MMD_CALL int aocl_mmd_write(
+ int handle, aocl_mmd_op_t op, size_t len, const void* src, int mmd_interface, size_t offset) WEAK;
+AOCL_MMD_CALL int aocl_mmd_copy(
+ int handle, aocl_mmd_op_t op, size_t len, int mmd_interface, size_t src_offset, size_t dst_offset) WEAK;
+
+/* Host Channel create operation
+ * Opens channel between host and kernel.
+ *
+ * Arguments:
+ * channel_name - name of channel to initialize. Same name as used in board_spec.xml
+ *
+ * queue_depth - the size in bytes of pinned memory queue in system memory
+ *
+ * direction - the direction of the channel
+ *
+ * The return value is negative if initialization was unsuccessful, and
+ * positive otherwise. Positive return value is handle to the channel to be used for
+ * subsequent calls for the channel.
+ */
+AOCL_MMD_CALL int aocl_mmd_hostchannel_create(int handle, char* channel_name, size_t queue_depth, int direction) WEAK;
+
+/* Host Channel destroy operation
+ * Closes channel between host and kernel.
+ *
+ * Arguments:
+ * channel - the handle to the channel to close, that was obtained with
+ * create channel
+ *
+ * The return value is 0 if the destroy was successful, and negative
+ * otherwise.
+ */
+AOCL_MMD_CALL int aocl_mmd_hostchannel_destroy(int handle, int channel) WEAK;
+
+/* Host Channel get buffer operation
+ * Provide host with pointer to buffer they can access to write or
+ * read from kernel, along with space or data available in the buffer
+ * in bytes.
+ *
+ * Arguments:
+ * channel - the handle to the channel to get the buffer for
+ *
+ * buffer_size - the address that this call will write the amount of
+ * space or data that's available in the buffer,
+ * depending on direction of the channel, in bytes
+ *
+ * status - the address that this call will write to for result of this
+ * call. Value will be 0 for success, and negative otherwise
+ *
+ * The return value is the pointer to the buffer that host can write
+ * to or read from. NULL if the status is negative.
+ */
+AOCL_MMD_CALL void* aocl_mmd_hostchannel_get_buffer(int handle, int channel, size_t* buffer_size, int* status) WEAK;
+
+/* Host Channel acknowledge buffer operation
+ * Acknowledge to the channel that the user has written or read data from
+ * it. This will make the data or additional buffer space available to
+ * write to or read from kernel.
+ *
+ * Arguments:
+ * channel - the handle to the channel that user is acknowledging
+ *
+ * send_size - the size in bytes that the user is acknowledging
+ *
+ * status - the address that this call will write to for result of this
+ * call. Value will be 0 for success, and negative otherwise
+ *
+ * The return value is equal to send_size if send_size was less than or
+ * equal to the buffer_size from get buffer call. If send_size was
+ * greater, then return value is the amount that was actually sent.
+ */
+AOCL_MMD_CALL size_t aocl_mmd_hostchannel_ack_buffer(int handle, int channel, size_t send_size, int* status) WEAK;
+
+/* Program the device
+ *
+ * The host will guarantee that no operations are currently executing on the
+ * device. That means the kernels will be idle and no read/write/copy
+ * commands are active. Interrupts should be disabled and the FPGA should
+ * be reprogrammed with the data from user_data which has size size. The host
+ * will then call aocl_mmd_set_status_handler and aocl_mmd_set_interrupt_handler
+ * again. At this point interrupts can be enabled.
+ *
+ * The new handle to the board after reprogram does not have to be the same as
+ * the one before.
+ *
+ * Arguments:
+ * user_data - The binary contents of the fpga.bin file created during
+ * Quartus II compilation.
+ * size - the size in bytes of user_data
+ * program_mode - bit field for programming attributes. See
+ * aocl_mmd_program_mode_t definition
+ *
+ * Returns: the new non-negative integer handle for the board, otherwise a
+ * negative value to indicate error.
+ */
+AOCL_MMD_CALL int aocl_mmd_program(int handle, void* user_data, size_t size, aocl_mmd_program_mode_t program_mode) WEAK;
+
+/* Shared memory allocator
+ * Allocates memory that is shared between the host and the FPGA. The
+ * host will access this memory using the pointer returned by
+ * aocl_mmd_shared_mem_alloc, while the FPGA will access the shared memory
+ * using device_ptr_out. If shared memory is not supported this should return
+ * NULL.
+ *
+ * Shared memory survives FPGA reprogramming if the CPU is not rebooted.
+ *
+ * Arguments:
+ * size - the size of the shared memory to allocate
+ * device_ptr_out - will receive the pointer value used by the FPGA (the device)
+ * to access the shared memory. Cannot be NULL. The type is
+ * unsigned long long to handle the case where the host has a
+ * smaller pointer size than the device.
+ *
+ * Returns: The pointer value to be used by the host to access the shared
+ * memory if successful, otherwise NULL.
+ */
+AOCL_MMD_CALL void* aocl_mmd_shared_mem_alloc(int handle, size_t size, unsigned long long* device_ptr_out) WEAK;
+
+/* Shared memory de-allocator
+ * Frees previously allocated shared memory. If shared memory is not supported,
+ * this function should do nothing.
+ *
+ * Arguments:
+ * host_ptr - the host pointer that points to the shared memory, as returned by
+ * aocl_mmd_shared_mem_alloc
+ * size - the size of the shared memory to free. Must match the size
+ * originally passed to aocl_mmd_shared_mem_alloc
+ */
+AOCL_MMD_CALL void aocl_mmd_shared_mem_free(int handle, void* host_ptr, size_t size) WEAK;
+
+/* DEPRECATED. Use aocl_mmd_program instead
+ * This reprogram API is only for mmd version previous than 18.1
+ */
+AOCL_MMD_CALL int aocl_mmd_reprogram(int handle, void* user_data, size_t size) WEAK;
+
+// CoreDLA modifications
+// To support multiple different FPGA boards, anything board specific must be implemented in a
+// board-specific MMD instead of the CoreDLA runtime layer.
+#ifdef DLA_MMD
+#include <cstdint>
+// Query functions to get board-specific values
+AOCL_MMD_CALL int dla_mmd_get_max_num_instances() WEAK;
+AOCL_MMD_CALL uint64_t dla_mmd_get_ddr_size_per_instance() WEAK;
+AOCL_MMD_CALL double dla_mmd_get_ddr_clock_freq() WEAK;
+
+// Wrappers around CSR and DDR reads and writes to abstract away board-specific offsets
+AOCL_MMD_CALL int dla_mmd_csr_write(int handle, int instance, uint64_t addr, const uint32_t* data) WEAK;
+AOCL_MMD_CALL int dla_mmd_csr_read(int handle, int instance, uint64_t addr, uint32_t* data) WEAK;
+AOCL_MMD_CALL int dla_mmd_ddr_write(int handle, int instance, uint64_t addr, uint64_t length, const void* data) WEAK;
+AOCL_MMD_CALL int dla_mmd_ddr_read(int handle, int instance, uint64_t addr, uint64_t length, void* data) WEAK;
+
+// Get the clk_dla PLL clock frequency in MHz, returns a negative value if there is an error
+AOCL_MMD_CALL double dla_mmd_get_coredla_clock_freq(int handle) WEAK;
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/.gitignore b/python/openvino/runtime/coredla_device/mmd/de10_agilex/.gitignore
new file mode 100644
index 0000000..66e06bf
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/.gitignore
@@ -0,0 +1,18 @@
+*~
+*#
+*.marks
+release_build/
+build/
+example_designs/mem_bandwidth/bin/
+example_designs/mem_bandwidth/simulation.tar.gz
+example_designs/mem_bandwidth/temp_simulation/
+linux64/lib/
+linux64/libexec/diagnose
+linux64/libexec/program
+ase/mpf_src
+*.pyc
+*.swp
+*.kwlp
+*.kwps
+temp_simulation/
+simulation.tar.gz
diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/.sync_master b/python/openvino/runtime/coredla_device/mmd/de10_agilex/.sync_master
new file mode 100644
index 0000000..835c7e0
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/.sync_master
@@ -0,0 +1 @@
+sc
diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/CMakeLists.txt b/python/openvino/runtime/coredla_device/mmd/de10_agilex/CMakeLists.txt
new file mode 100644
index 0000000..e7e4584
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/CMakeLists.txt
@@ -0,0 +1,144 @@
+# (C) 2017 Intel Corporation. All rights reserved.
+# Your use of Intel Corporation's design tools, logic functions and other
+# software and tools, and its AMPP partner logic functions, and any output
+# files any of the foregoing (including device programming or simulation
+# files), and any associated documentation or information are expressly subject
+# to the terms and conditions of the Intel Program License Subscription
+# Agreement, Intel MegaCore Function License Agreement, or other applicable
+# license agreement, including, without limitation, that your use is for the
+# sole purpose of programming logic devices manufactured by Intel and sold by
+# Intel or its authorized distributors. Please refer to the applicable
+# agreement for further details.
+
+cmake_minimum_required(VERSION 2.8.12)
+project(mmd)
+
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+
+# DLA specific modifications made to the MMD
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDLA_MMD")
+
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DACL_MAX_DEVICE=128")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DOPTION3=1 -DACL_USE_DMA=1")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DACL_HAS_STDLIB_STDIO")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DACL_OPENCL_HOST_BIT=64 -DACL_TARGET_BIT=64")
+
+# Select PCIE Gen3 x16
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGEN3_x16")
+
+if (WIN32)
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /DAOCL_MMD_CALL=__declspec(dllexport)")
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DACL_COMPILER_IS_MSVC=1 -DACL_HOST_RUNTIME_IS_STATIC=1")
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DACL_OPENCL_HOST_SYS=windows -DACL_TARGET_SYS=windows -DWINDOWS")
+endif()
+
+# from the opencl makefile
+if (NOT WIN32)
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DKERNEL_64BIT -O3 -DACL_COMPILER_IS_MSVC=0 -DACL_HOST_RUNTIME_IS_STATIC=0")
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wno-unknown-pragmas -fstack-protector -Wformat -Wformat-security -D_GLIBCXX_USE_CXX11_ABI=0 -O2")
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DACL_OPENCL_HOST_SYS=linux -DACL_TARGET_SYS=linux -DLINUX")
+ # Release build only
+ set (CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -D_FORTIFY_SOURCE=2")
+endif()
+
+enable_language(C ASM)
+
+set(ASM_OPTIONS "-x assembler-with-cpp")
+if(${CMAKE_C_COMPILER_ID} STREQUAL "Clang")
+ set(ASM_OPTIONS "${ASM_OPTIONS} -no-integrated-as")
+endif()
+
+set(CMAKE_ASM_FLAGS "${CFLAGS} ${ASM_OPTIONS}")
+
+set(MMD_SRC
+ ./host/acl_pcie_config.cpp
+ ./host/acl_pcie.cpp
+ ./host/acl_pcie_debug.cpp
+ ./host/acl_pcie_device.cpp
+ ./host/acl_pcie_dma_linux.cpp
+ ./host/acl_pcie_dma_windows.cpp
+ ./host/acl_pcie_hostch.cpp
+ ./host/acl_pcie_mm_io.cpp
+ ./host/acl_pcie_timer.cpp
+)
+
+add_library(de10_agilex_mmd SHARED ${MMD_SRC})
+
+target_include_directories(de10_agilex_mmd PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include)
+if (WIN32)
+ # Terasic production BSP Linux kernel space driver header files
+ set(TERASIC_KERNEL_HEADER_DIR $ENV{AOCL_BOARD_PACKAGE_ROOT}/linux64/driver)
+ set(TERASIC_KERNEL_HEADER_FILES
+ fpga_cmd_guids.h
+ hw_host_channel.h
+ hw_pcie_constants.h
+ hw_pcie_dma.h
+ )
+ if (EXISTS ${TERASIC_KERNEL_HEADER_DIR})
+ foreach(header ${TERASIC_KERNEL_HEADER_FILES})
+ if (EXISTS ${TERASIC_KERNEL_HEADER_DIR}/${header})
+ file(COPY ${TERASIC_KERNEL_HEADER_DIR}/${header} DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/include)
+ else()
+ message(WARNING "Header file ${header} does not exist in ${TERASIC_KERNEL_HEADER_DIR}")
+ endif()
+ endforeach()
+ else()
+ message(FATAL_ERROR "Source directory ${TERASIC_KERNEL_HEADER_DIR} does not exist.")
+ endif()
+
+ set(HW_PCI_DMA_H ${CMAKE_CURRENT_SOURCE_DIR}/include/hw_pcie_dma.h)
+ file(READ ${HW_PCI_DMA_H} HW_PCI_DMA_H_CONTENT)
+ # Remove any end-of-line whitespace from the file content (spaces and tabs)
+ string(REGEX REPLACE "[ \t]+(\r?\n)" "\\1" HW_PCI_DMA_H_CONTENT "${HW_PCI_DMA_H_CONTENT}")
+ set(OLD_CODE_BLOCK
+"PACK(
+struct DMA_DESC_ENTRY {
+ UINT32 src_addr_ldw;
+ UINT32 src_addr_udw;
+ UINT32 dest_addr_ldw;
+ UINT32 dest_addr_udw;
+ UINT32 ctl_dma_len;
+ UINT32 reserved[3];
+});")
+ set(NEW_CODE_BLOCK
+"#if defined(GEN3_x8)
+PACK(
+struct DMA_DESC_ENTRY {
+ UINT32 src_addr_ldw;
+ UINT32 src_addr_udw;
+ UINT32 dest_addr_ldw;
+ UINT32 dest_addr_udw;
+ UINT32 ctl_dma_len;
+ UINT32 reserved[3];
+});
+#elif defined(GEN3_x16)
+PACK(
+struct DMA_DESC_ENTRY {
+ UINT64 src_addr;
+ UINT64 dst_addr;
+ UINT32 ctrl;
+ UINT32 reserved[3];
+});
+#endif")
+ string(REPLACE "${OLD_CODE_BLOCK}" "${NEW_CODE_BLOCK}" HW_PCI_DMA_H_CONTENT "${HW_PCI_DMA_H_CONTENT}")
+ file(WRITE ${HW_PCI_DMA_H} "${HW_PCI_DMA_H_CONTENT}")
+
+ set_target_properties(de10_agilex_mmd PROPERTIES LINK_FLAGS "-subsystem:console -nologo -fixed:no -incremental:no -opt:noref -ignore:4089 /NXCOMPAT /DYNAMICBASE")
+
+ find_library(ACL_CHECK_SYS_CMD_LIB
+ acl_check_sys_cmd
+ PATHS ${CMAKE_CURRENT_SOURCE_DIR}/lib/win64)
+ find_library(FPGA_LIB
+ FpgaLib
+ PATHS ${CMAKE_CURRENT_SOURCE_DIR}/lib/win64)
+
+ target_link_libraries(de10_agilex_mmd ${ACL_CHECK_SYS_CMD_LIB} ${FPGA_LIB})
+else()
+ target_link_libraries(de10_agilex_mmd)
+endif()
+
+install(TARGETS de10_agilex_mmd
+ RUNTIME DESTINATION "dla/runtime/bin" COMPONENT de10_agilex_mmd
+ LIBRARY DESTINATION "dla/runtime/lib" COMPONENT de10_agilex_mmd
+ ARCHIVE DESTINATION "dla/runtime/lib" COMPONENT de10_agilex_mmd
+)
diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie.cpp b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie.cpp
new file mode 100644
index 0000000..527d8bf
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie.cpp
@@ -0,0 +1,951 @@
+// (c) 1992-2021 Intel Corporation.
+// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words
+// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S.
+// and/or other countries. Other marks and brands may be claimed as the property
+// of others. See Trademarks on intel.com for full list of Intel trademarks or
+// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera)
+// Your use of Intel Corporation's design tools, logic functions and other
+// software and tools, and its AMPP partner logic functions, and any output
+// files any of the foregoing (including device programming or simulation
+// files), and any associated documentation or information are expressly subject
+// to the terms and conditions of the Altera Program License Subscription
+// Agreement, Intel MegaCore Function License Agreement, or other applicable
+// license agreement, including, without limitation, that your use is for the
+// sole purpose of programming logic devices manufactured by Intel and sold by
+// Intel or its authorized distributors. Please refer to the applicable
+// agreement for further details.
+
+/* ===- acl_pcie.cpp ------------------------------------------------- C++ -*-=== */
+/* */
+/* Intel(R) OpenCL MMD Driver */
+/* */
+/* ===-------------------------------------------------------------------------=== */
+/* */
+/* This file implements the functions that are defined in aocl_mmd.h */
+/* */
+/* ===-------------------------------------------------------------------------=== */
+
+// common and its own header files
+#include "acl_pcie.h"
+
+// other header files inside MMD driver
+#include "acl_pcie_debug.h"
+#include "acl_pcie_device.h"
+#include "hw_pcie_constants.h"
+#ifndef DLA_MMD
+#include "acl_check_sys_cmd.h"
+#endif
+
+// other standard header files
+#include <stdarg.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <map>
+#include <sstream>
+#include <string>
+#include <utility>
+
+#ifdef DLA_MMD
+#include <chrono>
+#include <thread>
+#endif
+
+#if defined(LINUX)
+#include <fcntl.h>
+#include <semaphore.h>
+#include <signal.h>
+#include <unistd.h>
+#endif // LINUX
+
// MAX size of line read from pipe-ing the output of system call to MMD
#define BUF_SIZE 1024
// MAX size of command passed to system for invoking system call from MMD
// (parenthesized so the macro expands to a single value inside any expression,
// e.g. division or multiplication, instead of splicing "4 * 1024" in raw)
#define SYSTEM_CMD_SIZE (4 * 1024)
+
+#ifndef DLA_MMD
+// static helper functions
+static bool blob_has_elf_signature(void *data, size_t data_size);
+#endif
+
+// global variables used for handling multi-devices and its helper functions
+// Use a DeviceMapManager to manage a heap-allocated map for storing device information
+// instead of using a static global map because of a segmentation fault which occurs in
+// the following situation:
+// 1) Host program contains a global variable which calls clReleaseContext in its destructor.
+// When the program ends the global goes out of scope and the destructor is called.
+// 2) clReleaseContext calls a function in the MMD library which modifies the static global map in
+// the MMD library.
+// In this situation it was discovered that the destructor of the static global map is called before
+// the destructor of the global in the host program, thus resulting in a segmentation fault when
+// clReleaseContext calls a function that modifies the internal map after it has been destroyed.
+// Using a heap-allocated map avoids this issue as the lifetime of the map persists until it is
+// deleted or the process is completely terminated.
+class DeviceMapManager {
+ public:
+  // (device name, device object) pair recorded for each open handle.
+  typedef std::pair<const std::string, ACL_PCIE_DEVICE *> DeviceInfo;
+  // Maps an MMD handle to the info of the device opened under that handle.
+  typedef std::map<int, DeviceInfo> DeviceMap;
+
+  // True when no devices are open (the map is only allocated while non-empty).
+  static inline bool empty() { return !s_device_map; }
+
+  // Returns the underlying device map. The map must not be empty when this is called.
+  static inline const DeviceMap &get_device_map() {
+    ACL_PCIE_ASSERT(s_device_map, "no devices are open -- aborting\n");
+    return *s_device_map;
+  }
+
+  // Returns the device info associated with the given handle. The handle must exist.
+  static inline const DeviceInfo &get_pcie_device_info(int handle) { return get_device_it_for_handle(handle)->second; }
+
+  // Returns the device associated with the given handle. The handle must exist.
+  static inline ACL_PCIE_DEVICE *get_pcie_device(int handle) { return get_pcie_device_info(handle).second; }
+
+  // Adds a device with the specified name for the given handle. If a device with the same handle already exists
+  // it is discarded first. The caller must ensure they don't associate the same device with multiple handles.
+  static inline void add_pcie_device_handle(int handle, const std::string &name, ACL_PCIE_DEVICE *dev) {
+    // To avoid memory leaks ensure that only this function ever allocates a new device map because
+    // we only ever delete the map when the size of the map goes from non-empty to empty.
+    if (!s_device_map) s_device_map = new DeviceMap();
+
+    if (s_device_map->count(handle)) discard_pcie_device_handle(handle);
+    s_device_map->insert(std::pair<int, DeviceInfo>(handle, DeviceInfo(name, dev)));
+  }
+
+  // Removes the device associated with the given handle. The handle must exist.
+  // Deletes the ACL_PCIE_DEVICE object as well as the map entry.
+  static inline void discard_pcie_device_handle(int handle) {
+    DeviceMap::iterator it = get_device_it_for_handle(handle);
+
+    delete it->second.second;
+    s_device_map->erase(it);
+    if (s_device_map->empty()) {
+      // From a functional perspective the map can remain allocated for
+      // the entire lifetime the MMD is loaded but there
+      // is no other good place to clean it up except here.
+      delete s_device_map;
+      s_device_map = NULL;
+    }
+  }
+
+  // Removes all devices.
+  static inline void discard_all_pcie_device_handles() {
+    if (!s_device_map) return;
+
+    for (DeviceMapManager::DeviceMap::iterator it = s_device_map->begin(); it != s_device_map->end(); ++it) {
+      delete it->second.second;
+    }
+
+    delete s_device_map;
+    s_device_map = NULL;
+  }
+
+  // Returns true if any device is currently being programmed.
+  static inline bool is_any_device_being_programmed() {
+    if (!s_device_map) return false;
+
+    for (DeviceMap::iterator it = s_device_map->begin(); it != s_device_map->end(); ++it) {
+      if (it->second.second->is_being_programmed()) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+ private:
+  // Looks up the map iterator for a handle; asserts if the map or the handle is missing.
+  static inline DeviceMap::iterator get_device_it_for_handle(int handle) {
+    ACL_PCIE_ASSERT(s_device_map, "can't find handle %d -- aborting\n", handle);
+    DeviceMap::iterator it = s_device_map->find(handle);
+    ACL_PCIE_ASSERT(it != s_device_map->end(), "can't find handle %d -- aborting\n", handle);
+    return it;
+  }
+
+  // Heap-allocated on first add, freed when the last device is discarded
+  // (see the lifetime rationale in the comment block above this class).
+  static DeviceMap *s_device_map;
+};
+DeviceMapManager::DeviceMap *DeviceMapManager::s_device_map = NULL;
+
+// Signal number reserved for simulating a device exception in tests; only
+// armed when ACL_MMD_TEST_INTELFPGA is set (see pcie_linux_signal_handler).
+static int test_device_exception_signal_number = 63;
+
+// Functions for handling interrupts or signals for multiple devices
+// These functions are used inside the ACL_PCIE_DEVICE class
+#if defined(WINDOWS)
+// Interrupt callback registered per-device; 'data' is the owning device.
+void pcie_interrupt_handler(void *data) {
+  ACL_PCIE_DEVICE *device = static_cast<ACL_PCIE_DEVICE *>(data);
+  device->service_interrupt();
+}
+
+// Console control handler: refuses to terminate mid-programming, otherwise exits.
+BOOL ctrl_c_handler(DWORD fdwCtrlType) {
+  if (fdwCtrlType != CTRL_C_EVENT) return FALSE;
+
+  if (DeviceMapManager::is_any_device_being_programmed()) {
+    ACL_PCIE_INFO("The device is still being programmed, cannot terminate at this point.\n");
+    return TRUE;
+  }
+
+  // On Windows, the signal handle function is executed by another thread,
+  // so we cannot simply free all the open devices.
+  // Just exit when received a ctrl-c event, the OS will take care of the clean-up.
+  exit(1);
+}
+#endif  // WINDOWS
+#if defined(LINUX)
+// On Linux, driver will send a SIG_INT_NOTIFY *signal* to notify about an interrupt.
+// si_int encodes (handle << 1) | dma_completion_flag; unknown handles are ignored.
+void pcie_linux_signal_handler(int sig, siginfo_t *info, void *unused) {
+  // test_device_exception_signal_number is reserved for device exception testing
+  if (sig == test_device_exception_signal_number) {
+    ACL_PCIE_ERROR_IF(DeviceMapManager::get_device_map().empty(),
+                      return,
+                      "No devices available to trigger test_device_exception_signal_number on.\n");
+    // Pick the last (most recent) handle for device exception testing
+    unsigned int handle = DeviceMapManager::get_device_map().rbegin()->first;
+    DeviceMapManager::get_pcie_device(handle)->test_trigger_device_interrupt();
+  } else {
+    // the last bit indicates the DMA completion
+    unsigned int irq_type_flag = info->si_int & 0x1;
+    // other bits shows the handle value of the device that sent the interrupt
+    unsigned int handle = info->si_int >> 1;
+    if (DeviceMapManager::empty() || !DeviceMapManager::get_device_map().count(handle)) {
+      ACL_PCIE_DEBUG_MSG(":: received an unknown handle %d in signal handler, ignore this.\n", handle);
+      return;
+    }
+
+    DeviceMapManager::get_pcie_device(handle)->service_interrupt(irq_type_flag);
+  }
+}
+
+// SIGINT handler: refuses to terminate while programming; otherwise tears down
+// all open devices (so the kernel driver stops touching user memory) and exits.
+void ctrl_c_handler(int sig_num) {
+  if (DeviceMapManager::is_any_device_being_programmed()) {
+    ACL_PCIE_INFO("The device is still being programmed, cannot terminate at this point.\n");
+    return;
+  }
+
+  // Free all the resource allocated for open devices before exiting the program.
+  // It also notifies the kernel driver about the termination of the program,
+  // so that the kernel driver won't try to talk to any user-allocated memory
+  // space (mainly for the DMA) after the program exit.
+  DeviceMapManager::discard_all_pcie_device_handles();
+  exit(1);
+}
+
+// SIGABRT handler installed for assertions in upper layers: release devices, then exit.
+// NOTE(review): discard_all_pcie_device_handles() is not async-signal-safe
+// (heap frees, device teardown) -- acceptable here only because the process
+// exits immediately afterwards; confirm if this ever changes.
+void abort_signal_handler(int sig_num) {
+  DeviceMapManager::discard_all_pcie_device_handles();
+  exit(1);
+}
+
+// Finds an unused POSIX real-time signal number for this process and installs
+// the MMD signal handlers on it.
+//
+// pid - the current process id; used to read /proc/<pid>/status.
+//
+// Reads the SigCgt (caught-signals) bitmask from /proc, scans SIGRTMAX down to
+// SIGRTMIN for a signal no handler has claimed, installs pcie_linux_signal_handler
+// on it (and on the test signal when ACL_MMD_TEST_INTELFPGA is set) and installs
+// abort_signal_handler on SIGABRT. Returns the chosen signal number, or -1 on
+// any error. Must be called with the allocation mutex held (see the wrapper below).
+int allocate_and_register_linux_signal_number_helper(int pid) {
+  char buffer[4096], *locOfSigCgt;
+  FILE *fp;
+  int bytes_read, status, ret = -1;
+  unsigned long long sigmask = 0;
+  struct sigaction sigusr {}, sigabrt {};
+
+  snprintf(buffer, sizeof(buffer), "/proc/%d/status", pid);
+  fp = fopen(buffer, "rb");
+  ACL_PCIE_ERROR_IF(fp == NULL, return -1, "Unable to open file %s\n", buffer);
+  bytes_read = fread(buffer, sizeof(buffer[0]), sizeof(buffer) - 1, fp);
+  fclose(fp);
+  buffer[bytes_read] = 0;                   // null terminate the string
+  locOfSigCgt = strstr(buffer, "SigCgt:");  // returns null if can't find, shouldn't happen
+  ACL_PCIE_ERROR_IF(locOfSigCgt == NULL, return -1, "Did not find SigCgt: for PID %d\n", pid);
+  sscanf(locOfSigCgt + 7, "%llx", &sigmask);
+
+  // Find an unused signal number
+  // (scan top-down so we stay clear of low RT signals other libraries prefer)
+  for (int i = SIGRTMAX; i >= SIGRTMIN; i--) {
+    if (!((sigmask >> (i - 1)) & 1)) {
+      ret = i;
+      break;
+    }
+  }
+  ACL_PCIE_ERROR_IF(ret == -1, return -1, "Unable to find an unused signal number\n");
+
+  // Enable if driver is using signals to communicate with the host.
+  sigusr.sa_sigaction = pcie_linux_signal_handler;
+  sigusr.sa_flags = SA_SIGINFO;
+  status = sigaction(ret, &sigusr, NULL);
+  if (getenv("ACL_MMD_TEST_INTELFPGA")) {
+    ACL_PCIE_ERROR_IF(((sigmask >> (test_device_exception_signal_number - 1)) & 1),
+                      return -1,
+                      "Signal number %i cannot be occupied\n",
+                      test_device_exception_signal_number);
+    status = sigaction(test_device_exception_signal_number, &sigusr, NULL);
+  }
+  ACL_PCIE_ERROR_IF(status != 0, return -1, "sigaction failed with status %d, signal number %d\n", status, ret);
+
+  // Install signal handler for SIGABRT from assertions in the upper layers
+  sigabrt.sa_handler = abort_signal_handler;
+  sigemptyset(&sigabrt.sa_mask);
+  sigabrt.sa_flags = 0;
+  status = sigaction(SIGABRT, &sigabrt, NULL);
+  ACL_PCIE_ERROR_IF(status != 0, return -1, "sigaction failed with status %d, signal number %d\n", status, SIGABRT);
+
+  // if it makes it here, the user got an unused signal number and we installed all signal handlers
+  return ret;
+}
+
+// returns an unused signal number, -1 means ran into some error
+// Thread-safe wrapper: serializes signal-number allocation across threads via
+// the caller-supplied mutex, then delegates to the helper above.
+int allocate_and_register_linux_signal_number(pthread_mutex_t *mutex) {
+  int pid = getpid();
+  int err = pthread_mutex_lock(mutex);
+  ACL_PCIE_ERROR_IF(err != 0, return -1, "pthread_mutex_lock error %d\n", err);
+
+  // this has multiple return points, put in separate function so that we don't bypass releasing the mutex
+  int ret = allocate_and_register_linux_signal_number_helper(pid);
+
+  err = pthread_mutex_unlock(mutex);
+  ACL_PCIE_ERROR_IF(err != 0, return -1, "pthread_mutex_unlock error %d\n", err);
+
+  return ret;
+}
+#endif  // LINUX
+
// Function to install the signal handler for Ctrl-C.
// If ignore_sig != 0, the ctrl-c signal will be ignored by the program.
// If ignore_sig == 0, the custom signal handler (ctrl_c_handler) will be used.
// Always returns 0. (Fixed parameter-name typo: "ingore_sig" -> "ignore_sig";
// no caller impact since only the name changed.)
int install_ctrl_c_handler(int ignore_sig) {
#if defined(WINDOWS)
  SetConsoleCtrlHandler((ignore_sig ? NULL : (PHANDLER_ROUTINE)ctrl_c_handler), TRUE);
#endif  // WINDOWS
#if defined(LINUX)
  struct sigaction sig;
  sig.sa_handler = (ignore_sig ? SIG_IGN : ctrl_c_handler);
  sigemptyset(&sig.sa_mask);
  sig.sa_flags = 0;
  sigaction(SIGINT, &sig, NULL);
#endif  // LINUX

  return 0;
}
+
+// Function to return the number of boards installed in the system
+// Windows: enumerate Intel FPGA PCI devices through the fpga* properties API,
+// falling back to ACL_MAX_DEVICE when enumeration fails or reports zero.
+// Linux: count "acl<board_pkg>" entries listed under /sys/class/aclpci_<board_pkg>.
+// In both cases the CL_OVERRIDE_NUM_DEVICES_INTELFPGA environment variable,
+// when it parses to a value in [0, ACL_MAX_DEVICE), overrides the result.
+unsigned int get_offline_num_boards() {
+  unsigned int num_boards = 0;
+
+  // These are for reading/parsing the environment variable
+  const char *override_count_string = 0;
+  long parsed_count;
+  char *endptr;
+
+// Windows MMD will try to open all the devices
+#if defined(WINDOWS)
+  fpga_result result;
+  fpga_properties filter = NULL;
+
+  result = fpgaGetProperties(NULL, &filter);
+  if (result != FPGA_OK) {
+    num_boards = ACL_MAX_DEVICE;
+    ACL_PCIE_ERROR_IF(1, goto End, "failed to get properties.\n");
+  }
+
+  result = fpgaPropertiesSetObjectType(filter, FPGA_DEVICE);
+  if (result != FPGA_OK) {
+    num_boards = ACL_MAX_DEVICE;
+
+    if (filter != NULL) fpgaDestroyProperties(&filter);
+
+    ACL_PCIE_ERROR_IF(1, goto End, "failed to set object type.\n");
+  }
+
+  result = fpgaPropertiesSetVendorID(filter, ACL_PCI_INTELFPGA_VENDOR_ID);
+  if (result != FPGA_OK) {
+    num_boards = ACL_MAX_DEVICE;
+
+    if (filter != NULL) fpgaDestroyProperties(&filter);
+
+    ACL_PCIE_ERROR_IF(1, goto End, "failed to set vendor ID.\n");
+  }
+
+  result = fpgaEnumerate(&filter, 1, NULL, 1, &num_boards);
+  if (result != FPGA_OK) {
+    num_boards = ACL_MAX_DEVICE;
+
+    if (filter != NULL) fpgaDestroyProperties(&filter);
+
+    ACL_PCIE_ERROR_IF(1, goto End, "failed to scan for the PCI device.\n");
+  }
+
+  if (filter != NULL) fpgaDestroyProperties(&filter);
+
+  if (num_boards == 0) {
+    num_boards = ACL_MAX_DEVICE;
+  }
+
+End:
+#endif  // WINDOWS
+
+// Linux MMD will look into the number of devices
+#if defined(LINUX)
+  FILE *fp;
+  char str_line_in[BUF_SIZE];
+  char str_board_pkg_name[BUF_SIZE];
+  char str_cmd[SYSTEM_CMD_SIZE];
+
+  snprintf(str_board_pkg_name, sizeof(str_board_pkg_name), "acl%s", ACL_BOARD_PKG_NAME);
+  snprintf(str_cmd, sizeof(str_cmd), "ls /sys/class/aclpci_%s 2>/dev/null", ACL_BOARD_PKG_NAME);
+
+#ifndef DLA_MMD
+  ACL_PCIE_ASSERT(system_cmd_is_valid(str_cmd), "Invalid popen() function parameter: %s\n", str_cmd);
+#endif
+  fp = popen(str_cmd, "r");
+
+  if (fp == NULL) {
+    ACL_PCIE_INFO("Couldn't open pipe stream\n");
+    // NOTE(review): returns false (i.e. 0 boards) from an unsigned-int
+    // function -- consider returning 0 explicitly for clarity.
+    return false;
+  }
+  // Read every line from output
+  while (fgets(str_line_in, BUF_SIZE, fp) != NULL) {
+    if (strncmp(str_board_pkg_name, str_line_in, strnlen(str_board_pkg_name, MAX_NAME_SIZE)) == 0) {
+      num_boards++;
+    }
+  }
+
+  pclose(fp);
+
+#endif  // LINUX
+
+  override_count_string = getenv("CL_OVERRIDE_NUM_DEVICES_INTELFPGA");
+  if (override_count_string) {
+    endptr = 0;
+    parsed_count = strtol(override_count_string, &endptr, 10);
+    if (endptr == override_count_string  // no valid characters
+        || *endptr                       // an invalid character
+        || (parsed_count < 0 || parsed_count >= (long)ACL_MAX_DEVICE)) {
+      // malformed override string, do nothing
+    } else {
+      // Was ok.
+      num_boards = (unsigned int)parsed_count;
+    }
+  }
+
+  return num_boards;
+}
+
+// Get information about the board using the enum aocl_mmd_offline_info_t for
+// offline info (called without a handle), and the enum aocl_mmd_info_t for
+// info specific to a certain board.
// Helper macros used by aocl_mmd_get_offline_info / aocl_mmd_get_info to write
// a result of the given type into param_value and, when param_size_ret is
// non-NULL, report the full result size. Wrapped in do { } while (0) so each
// macro behaves as a single statement (safe in unbraced if/else), matching the
// pre-existing RESULT_STR style.
#define RESULT_INT(X)                                   \
  do {                                                  \
    *((int *)param_value) = X;                          \
    if (param_size_ret) *param_size_ret = sizeof(int);  \
  } while (0)
#define RESULT_UNSIGNED(X)                                   \
  do {                                                       \
    *((unsigned *)param_value) = X;                          \
    if (param_size_ret) *param_size_ret = sizeof(unsigned);  \
  } while (0)
#define RESULT_SIZE_T(X)                                   \
  do {                                                     \
    *((size_t *)param_value) = X;                          \
    if (param_size_ret) *param_size_ret = sizeof(size_t);  \
  } while (0)
#if defined(WINDOWS)
// Copies at most param_value_size bytes of the NUL-terminated string X.
#define RESULT_STR(X)                                                                                         \
  do {                                                                                                        \
    size_t Xlen = strnlen(X, MAX_NAME_SIZE) + 1;                                                              \
    memcpy_s((void *)param_value, param_value_size, X, (param_value_size <= Xlen) ? param_value_size : Xlen); \
    if (param_size_ret) *param_size_ret = Xlen;                                                               \
  } while (0)
#else
// Copies at most param_value_size bytes of the NUL-terminated string X.
#define RESULT_STR(X)                                                                      \
  do {                                                                                     \
    size_t Xlen = strnlen(X, MAX_NAME_SIZE) + 1;                                           \
    memcpy((void *)param_value, X, (param_value_size <= Xlen) ? param_value_size : Xlen);  \
    if (param_size_ret) *param_size_ret = Xlen;                                            \
  } while (0)
#endif
+// Query offline (device-independent) information about this MMD/BSP.
+// Writes the result into param_value (strings are truncated to
+// param_value_size bytes) and reports the full result size through
+// param_size_ret when it is non-NULL. Always returns 0; unrecognized
+// ids leave param_value untouched.
+int aocl_mmd_get_offline_info(aocl_mmd_offline_info_t requested_info_id,
+                              size_t param_value_size,
+                              void *param_value,
+                              size_t *param_size_ret) {
+  // It might be helpful to cache the info if function aocl_mmd_get_offline_info is called frequently.
+  unsigned int num_boards;
+  switch (requested_info_id) {
+    case AOCL_MMD_VERSION:
+      RESULT_STR(MMD_VERSION);
+      break;
+    case AOCL_MMD_NUM_BOARDS: {
+      num_boards = get_offline_num_boards();
+      RESULT_INT((int)num_boards);
+      break;
+    }
+    case AOCL_MMD_BOARD_NAMES: {
+      // Construct a list of all possible devices supported by this MMD layer
+      // as a semicolon-separated string: "acl<pkg>0;acl<pkg>1;..."
+      std::ostringstream boards;
+      num_boards = get_offline_num_boards();
+      for (unsigned i = 0; i < num_boards; i++) {
+        boards << "acl" << ACL_BOARD_PKG_NAME << i;
+        if (i < num_boards - 1) boards << ";";
+      }
+      RESULT_STR(boards.str().c_str());
+      break;
+    }
+    case AOCL_MMD_VENDOR_NAME: {
+      RESULT_STR(ACL_VENDOR_NAME);
+      break;
+    }
+    case AOCL_MMD_VENDOR_ID:
+      RESULT_INT(ACL_PCI_INTELFPGA_VENDOR_ID);
+      break;
+    case AOCL_MMD_USES_YIELD:
+      RESULT_INT(0);
+      break;
+    case AOCL_MMD_MEM_TYPES_SUPPORTED:
+      RESULT_INT(AOCL_MMD_PHYSICAL_MEMORY);
+      break;
+  }
+  return 0;
+}
+
+// Query information about a specific opened board. Returns 0 on success,
+// -1 when the device is not properly initialized or when the requested id
+// is AOCL_MMD_BOARD_UNIQUE_ID (currently not supported).
+int aocl_mmd_get_info(
+    int handle, aocl_mmd_info_t requested_info_id, size_t param_value_size, void *param_value, size_t *param_size_ret) {
+  ACL_PCIE_DEVICE *pcie_dev = DeviceMapManager::get_pcie_device(handle);
+  ACL_PCIE_ERROR_IF(!pcie_dev->is_initialized(),
+                    return -1,
+                    "aocl_mmd_get_info failed due to the target device (handle %d) is not properly initialized.\n",
+                    handle);
+
+  switch (requested_info_id) {
+    case AOCL_MMD_BOARD_NAME: {
+      // e.g. "<board name> (acl<pkg>0)"
+      std::ostringstream board_name;
+      board_name << ACL_BOARD_NAME << " (" << DeviceMapManager::get_pcie_device_info(handle).first << ")";
+      RESULT_STR(board_name.str().c_str());
+      break;
+    }
+    case AOCL_MMD_NUM_KERNEL_INTERFACES:
+      RESULT_INT(1);
+      break;
+    case AOCL_MMD_KERNEL_INTERFACES:
+      RESULT_INT(AOCL_MMD_KERNEL);
+      break;
+    case AOCL_MMD_PLL_INTERFACES:
+      RESULT_INT(AOCL_MMD_PLL);
+      break;
+    case AOCL_MMD_MEMORY_INTERFACE:
+      RESULT_INT(AOCL_MMD_MEMORY);
+      break;
+    case AOCL_MMD_PCIE_INFO:
+      RESULT_STR(pcie_dev->get_dev_pcie_info());
+      break;
+    case AOCL_MMD_CONCURRENT_READS:
+      RESULT_INT(1);
+      break;
+    case AOCL_MMD_CONCURRENT_WRITES:
+      RESULT_INT(1);
+      break;
+    case AOCL_MMD_CONCURRENT_READS_OR_WRITES:
+      RESULT_INT(1);
+      break;
+    case AOCL_MMD_MIN_HOST_MEMORY_ALIGNMENT:
+      RESULT_SIZE_T(0);
+      break;
+    case AOCL_MMD_HOST_MEM_CAPABILITIES:
+      RESULT_UNSIGNED(0);
+      break;
+    case AOCL_MMD_SHARED_MEM_CAPABILITIES:
+      RESULT_UNSIGNED(0);
+      break;
+    case AOCL_MMD_DEVICE_MEM_CAPABILITIES:
+      RESULT_UNSIGNED(0);
+      break;
+    case AOCL_MMD_HOST_MEM_CONCURRENT_GRANULARITY:
+      RESULT_SIZE_T(0);
+      break;
+    case AOCL_MMD_SHARED_MEM_CONCURRENT_GRANULARITY:
+      RESULT_SIZE_T(0);
+      break;
+    case AOCL_MMD_DEVICE_MEM_CONCURRENT_GRANULARITY:
+      RESULT_SIZE_T(0);
+      break;
+
+    case AOCL_MMD_TEMPERATURE: {
+      // NOTE(review): this path writes sizeof(float) bytes without checking
+      // param_value_size -- confirm callers always pass a float-sized buffer.
+      float *r;
+      int temp;
+      pcie_dev->get_ondie_temp_slow_call(&temp);
+      r = (float *)param_value;
+      *r = ACL_PCIE_TEMP_FORMULA;
+      if (param_size_ret) *param_size_ret = sizeof(float);
+      break;
+    }
+
+    // currently not supported
+    case AOCL_MMD_BOARD_UNIQUE_ID:
+      return -1;
+  }
+  return 0;
+}
+
+#undef RESULT_INT
+#undef RESULT_STR
+
+// Open and initialize the named device.
+// 'name' must look like "acl<board_pkg><N>". On success returns a positive
+// unique handle. If the device exists but fails its initial test, returns the
+// bitwise-NOT of the handle (a negative value that still identifies the device
+// so a reprogram executable can recover the board). Returns -1 on bad name,
+// out-of-range device number, allocation failure, or invalid device.
+// First successful call also installs the process-wide signal handlers.
+int AOCL_MMD_CALL aocl_mmd_open(const char *name) {
+  static int signal_handler_installed = 0;
+  // Monotonically increasing handle generator shared by all opens.
+  static int unique_id = 0;
+  int dev_num = -1;
+  static int user_signal_number = -1;
+#if defined(LINUX)
+  static pthread_mutex_t linux_signal_arb_mutex =
+      PTHREAD_MUTEX_INITIALIZER;  // initializes as unlocked, static = no cleanup needed
+
+  if (sscanf(name, "acl" ACL_BOARD_PKG_NAME "%d", &dev_num) != 1) {
+    return -1;
+  }
+#endif  // LINUX
+
+#if defined(WINDOWS)
+  if (sscanf_s(name, "acl" ACL_BOARD_PKG_NAME "%d", &dev_num) != 1) {
+    return -1;
+  }
+#endif
+  if (dev_num < 0 || dev_num >= ACL_MAX_DEVICE) {
+    return -1;
+  }
+  // Keep handles strictly positive, even if the counter ever wraps around.
+  if (++unique_id <= 0) {
+    unique_id = 1;
+  }
+
+  ACL_PCIE_ASSERT(DeviceMapManager::empty() || DeviceMapManager::get_device_map().count(unique_id) == 0,
+                  "unique_id %d is used before.\n",
+                  unique_id);
+
+  if (signal_handler_installed == 0) {
+#if defined(LINUX)
+    user_signal_number = allocate_and_register_linux_signal_number(&linux_signal_arb_mutex);
+    if (user_signal_number == -1) return -1;
+#endif  // LINUX
+
+    install_ctrl_c_handler(0 /* use the custom signal handler */);
+    signal_handler_installed = 1;
+  }
+
+  ACL_PCIE_DEVICE *pcie_dev = NULL;
+
+  try {
+    pcie_dev = new ACL_PCIE_DEVICE(dev_num, name, unique_id, user_signal_number);
+  }
+
+  // Catch any memory allocation failures
+  catch (std::bad_alloc &) {
+    delete pcie_dev;
+    return -1;
+  }
+
+  if (!pcie_dev->is_valid()) {
+    delete pcie_dev;
+    return -1;
+  }
+
+  DeviceMapManager::add_pcie_device_handle(unique_id, name, pcie_dev);
+  if (pcie_dev->is_initialized()) {
+    return unique_id;
+  } else {
+    // Perform a bitwise-not operation to the unique_id if the device
+    // do not pass the initial test. This negative unique_id indicates
+    // a fail to open the device, but still provide actual the unique_id
+    // to allow reprogram executable to get access to the device and
+    // reprogram the board when the board is not usable.
+    return ~unique_id;
+  }
+}
+
+// Close an opened device, by its handle.
+int AOCL_MMD_CALL aocl_mmd_close(int handle) {
+ DeviceMapManager::discard_pcie_device_handle(handle);
+
+ return 0;
+}
+
+// Set the interrupt handler for the opened device.
+int AOCL_MMD_CALL aocl_mmd_set_interrupt_handler(int handle, aocl_mmd_interrupt_handler_fn fn, void *user_data) {
+ ACL_PCIE_DEVICE *pcie_dev = DeviceMapManager::get_pcie_device(handle);
+ ACL_PCIE_ERROR_IF(
+ !pcie_dev->is_initialized(),
+ return -1,
+ "aocl_mmd_set_interrupt_handler failed due to the target device (handle %d) is not properly initialized.\n",
+ handle);
+
+ return pcie_dev->set_kernel_interrupt(fn, user_data);
+}
+
+// Set the device interrupt handler for the opened device.
+int AOCL_MMD_CALL aocl_mmd_set_device_interrupt_handler(int handle,
+ aocl_mmd_device_interrupt_handler_fn fn,
+ void *user_data) {
+ ACL_PCIE_DEVICE *pcie_dev = DeviceMapManager::get_pcie_device(handle);
+ ACL_PCIE_ERROR_IF(
+ !pcie_dev->is_initialized(),
+ return -1,
+ "aocl_mmd_set_interrupt_handler failed due to the target device (handle %d) is not properly initialized.\n",
+ handle);
+
+ return pcie_dev->set_device_interrupt(fn, user_data);
+}
+
+// Set the operation status handler for the opened device.
+int AOCL_MMD_CALL aocl_mmd_set_status_handler(int handle, aocl_mmd_status_handler_fn fn, void *user_data) {
+ ACL_PCIE_DEVICE *pcie_dev = DeviceMapManager::get_pcie_device(handle);
+ ACL_PCIE_ERROR_IF(
+ !pcie_dev->is_initialized(),
+ return -1,
+ "aocl_mmd_set_status_handler failed due to the target device (handle %d) is not properly initialized.\n",
+ handle);
+
+ return pcie_dev->set_status_handler(fn, user_data);
+}
+
+// Called when the host is idle and hence possibly waiting for events to be
+// processed by the device
+int AOCL_MMD_CALL aocl_mmd_yield(int handle) { return DeviceMapManager::get_pcie_device(handle)->yield(); }
+
+// Read, write and copy operations on a single interface.
+int AOCL_MMD_CALL aocl_mmd_read(int handle, aocl_mmd_op_t op, size_t len, void *dst, int mmd_interface, size_t offset) {
+ void *host_addr = dst;
+ size_t dev_addr = offset;
+
+ ACL_PCIE_DEVICE *pcie_dev = DeviceMapManager::get_pcie_device(handle);
+ ACL_PCIE_ERROR_IF(!pcie_dev->is_initialized(),
+ return -1,
+ "aocl_mmd_read failed due to the target device (handle %d) is not properly initialized.\n",
+ handle);
+
+ return pcie_dev->read_block(op, (aocl_mmd_interface_t)mmd_interface, host_addr, dev_addr, len);
+}
+
+int AOCL_MMD_CALL
+aocl_mmd_write(int handle, aocl_mmd_op_t op, size_t len, const void *src, int mmd_interface, size_t offset) {
+ void *host_addr = const_cast<void *>(src);
+ size_t dev_addr = offset;
+
+ ACL_PCIE_DEVICE *pcie_dev = DeviceMapManager::get_pcie_device(handle);
+ ACL_PCIE_ERROR_IF(!pcie_dev->is_initialized(),
+ return -1,
+ "aocl_mmd_write failed due to the target device (handle %d) is not properly initialized.\n",
+ handle);
+
+ return pcie_dev->write_block(op, (aocl_mmd_interface_t)mmd_interface, host_addr, dev_addr, len);
+}
+
+int AOCL_MMD_CALL
+aocl_mmd_copy(int handle, aocl_mmd_op_t op, size_t len, int mmd_interface, size_t src_offset, size_t dst_offset) {
+ ACL_PCIE_DEVICE *pcie_dev = DeviceMapManager::get_pcie_device(handle);
+ ACL_PCIE_ERROR_IF(!pcie_dev->is_initialized(),
+ return -1,
+ "aocl_mmd_copy failed due to the target device (handle %d) is not properly initialized.\n",
+ handle);
+
+ return pcie_dev->copy_block(op, (aocl_mmd_interface_t)mmd_interface, src_offset, dst_offset, len);
+}
+
+// Initialize host channel specified in channel_name
+int AOCL_MMD_CALL aocl_mmd_hostchannel_create(int handle, char *channel_name, size_t queue_depth, int direction) {
+ ACL_PCIE_DEVICE *pcie_dev = DeviceMapManager::get_pcie_device(handle);
+ ACL_PCIE_ERROR_IF(
+ !pcie_dev->is_initialized(),
+ return -1,
+ "aocl_mmd_create_hostchannel failed due to the target device (handle %d) is not properly initialized.\n",
+ handle);
+
+ return pcie_dev->create_hostchannel(channel_name, queue_depth, direction);
+}
+
+// reset the host channel specified with channel handle
+int AOCL_MMD_CALL aocl_mmd_hostchannel_destroy(int handle, int channel) {
+ ACL_PCIE_DEVICE *pcie_dev = DeviceMapManager::get_pcie_device(handle);
+ ACL_PCIE_ERROR_IF(
+ !pcie_dev->is_initialized(),
+ return -1,
+ "aocl_mmd_create_hostchannel failed due to the target device (handle %d) is not properly initialized.\n",
+ handle);
+
+ return pcie_dev->destroy_channel(channel);
+}
+
+// Get the pointer to buffer the user can write/read from the kernel with
+AOCL_MMD_CALL void *aocl_mmd_hostchannel_get_buffer(int handle, int channel, size_t *buffer_size, int *status) {
+ ACL_PCIE_DEVICE *pcie_dev = DeviceMapManager::get_pcie_device(handle);
+ ACL_PCIE_ERROR_IF(!pcie_dev->is_initialized(),
+ return NULL,
+ "aocl_mmd_read failed due to the target device (handle %d) is not properly initialized.\n",
+ handle);
+
+ return pcie_dev->hostchannel_get_buffer(buffer_size, channel, status);
+}
+
+// Acknolwedge from the user that they have written/read send_size amount of buffer obtained from get_buffer
+size_t AOCL_MMD_CALL aocl_mmd_hostchannel_ack_buffer(int handle, int channel, size_t send_size, int *status) {
+ ACL_PCIE_DEVICE *pcie_dev = DeviceMapManager::get_pcie_device(handle);
+ ACL_PCIE_ERROR_IF(
+ !pcie_dev->is_initialized(), *status = -1;
+ return 0, "aocl_mmd_read failed due to the target device (handle %d) is not properly initialized.\n", handle);
+
+ return pcie_dev->hostchannel_ack_buffer(send_size, channel, status);
+}
+
+#ifdef DLA_MMD
+
+AOCL_MMD_CALL int aocl_mmd_save_pcie(int handle)
+{
+ auto ret = DeviceMapManager::get_pcie_device(handle)->pause_and_save_pcie();
+ if (ret) {
+ return -1;
+ }
+ return 0;
+}
+AOCL_MMD_CALL int aocl_mmd_restore_pcie(int handle)
+{
+ auto ret = DeviceMapManager::get_pcie_device(handle)->restore_and_resume_pcie();
+ if (ret) {
+ return -1;
+ }
+ return 0;
+}
+// Reprogram the device given the sof file name
+int AOCL_MMD_CALL aocl_mmd_program_sof(int handle, const char *sof_filename, const bool skipSaveRestore) {
+ if (DeviceMapManager::get_pcie_device(handle)->reprogram_sof(sof_filename, skipSaveRestore))
+ {
+ return -1;
+ }
+ return 0;
+}
+#else
+// Reprogram the device based on the program mode
+// Preserve-global-mem mode performs partial reconfiguration and keeps the same
+// handle; otherwise a full JTAG reprogram is done and the device is re-opened,
+// so the RETURNED handle replaces the one passed in. Returns -1 on failure.
+int AOCL_MMD_CALL aocl_mmd_program(int handle, void *data, size_t data_size, aocl_mmd_program_mode_t program_mode) {
+  // Reject input that is not an ELF-formatted blob.
+  if (!blob_has_elf_signature(data, data_size)) {
+    ACL_PCIE_DEBUG_MSG("ad hoc fpga bin\n");
+    return -1;
+  }
+
+  // program the device based on the certain mode
+  if (program_mode & AOCL_MMD_PROGRAM_PRESERVE_GLOBAL_MEM) {
+    if (DeviceMapManager::get_pcie_device(handle)->reprogram(data, data_size, ACL_PCIE_PROGRAM_PR)) return -1;
+    return handle;
+  } else {
+    if (DeviceMapManager::get_pcie_device(handle)->reprogram(data, data_size, ACL_PCIE_PROGRAM_JTAG)) return -1;
+    // Re-open the device to reinitialize hardware
+    const std::string device_name = DeviceMapManager::get_pcie_device_info(handle).first;
+    DeviceMapManager::discard_pcie_device_handle(handle);
+
+    return aocl_mmd_open(device_name.c_str());
+  }
+}
+#endif
+// Shared memory allocator
+AOCL_MMD_CALL void *aocl_mmd_shared_mem_alloc(int handle, size_t size, unsigned long long *device_ptr_out) {
+ return DeviceMapManager::get_pcie_device(handle)->shared_mem_alloc(size, device_ptr_out);
+}
+
+// Shared memory de-allocator
+AOCL_MMD_CALL void aocl_mmd_shared_mem_free(int handle, void *host_ptr, size_t size) {
+ DeviceMapManager::get_pcie_device(handle)->shared_mem_free(host_ptr, size);
+}
+
+#ifndef DLA_MMD
// This function checks if the input data begins with an ELF-formatted blob.
// Returns true when the first four bytes are the ELF magic number.
//
// data      - candidate blob (may be NULL)
// data_size - number of valid bytes at 'data'
static bool blob_has_elf_signature(void *data, size_t data_size) {
  // 0x7f 'E' 'L' 'F' -- first four bytes of e_ident, little endian
  const unsigned char elf_signature[4] = {0177, 'E', 'L', 'F'};
  // ">= 4" (was "> 4"): a blob of exactly 4 bytes still contains the full signature.
  return data != NULL && data_size >= sizeof(elf_signature) &&
         memcmp(data, elf_signature, sizeof(elf_signature)) == 0;
}
+#endif
+
+// Return a positive number when single device open. Otherwise, return -1
+AOCL_MMD_CALL int get_open_handle() {
+ if (DeviceMapManager::empty() || DeviceMapManager::get_device_map().size() != 1) {
+ return -1;
+ }
+ return DeviceMapManager::get_device_map().begin()->first;
+}
+
+// USM-style allocation entry points. This BSP does not implement host/device/
+// shared allocations, so each of these is a stub: allocators return NULL and
+// free/migrate report success without doing anything.
+AOCL_MMD_CALL void *aocl_mmd_host_alloc(int *handles,
+                                        size_t num_devices,
+                                        size_t size,
+                                        size_t alignment,
+                                        aocl_mmd_mem_properties_t *properties,
+                                        int *error) {
+  // Not supported on this BSP
+  return NULL;
+}
+
+AOCL_MMD_CALL int aocl_mmd_free(void *mem) {
+  // Not supported on this BSP
+  return 0;
+}
+
+AOCL_MMD_CALL void *aocl_mmd_device_alloc(
+    int handle, size_t size, size_t alignment, aocl_mmd_mem_properties_t *properties, int *error) {
+  // Not supported on this BSP
+  return NULL;
+}
+
+AOCL_MMD_CALL void *aocl_mmd_shared_alloc(
+    int handle, size_t size, size_t alignment, aocl_mmd_mem_properties_t *properties, int *error) {
+  // Not supported on this BSP
+  return NULL;
+}
+
+AOCL_MMD_CALL int aocl_mmd_shared_migrate(int handle, void *shared_ptr, size_t size, aocl_mmd_migrate_t destination) {
+  // Not supported on this BSP
+  return 0;
+}
+
+#ifdef DLA_MMD
+// Query functions to get board-specific values
+// Up to 4 CoreDLA instances are supported on this board.
+AOCL_MMD_CALL int dla_mmd_get_max_num_instances() { return 4; }
+// 4 GiB of DDR per instance (instances are spaced 8 GiB apart in the raw
+// address map -- see dla_get_raw_ddr_address).
+AOCL_MMD_CALL uint64_t dla_mmd_get_ddr_size_per_instance() { return 1ULL << 32; }
+AOCL_MMD_CALL double dla_mmd_get_ddr_clock_freq() { return 333.333333; }  // MHz
+
// Board-specific address translation helpers used by the CSR/DDR wrappers below.
uint64_t dla_get_raw_csr_address(int instance, uint64_t addr) {
  const uint64_t csr_base = 0x38000;   // base of the CSR window
  const uint64_t csr_span = 0x1000;    // per-instance CSR window size
  return csr_base + csr_span * instance + addr;
}
uint64_t dla_get_raw_ddr_address(int instance, uint64_t addr) {
  const uint64_t ddr_span = 1ULL << 33;  // per-instance DDR spacing (8 GiB)
  return ddr_span * instance + addr;
}
+
+// Wrappers around CSR and DDR reads and writes to abstract away board-specific offsets
+// Each forwards to aocl_mmd_write/aocl_mmd_read on the appropriate interface
+// handle after translating (instance, addr) to a raw device address.
+// CSR accesses are single 32-bit words; DDR accesses are arbitrary-length blocks.
+AOCL_MMD_CALL int dla_mmd_csr_write(int handle, int instance, uint64_t addr, const uint32_t *data) {
+  return aocl_mmd_write(
+      handle, NULL, sizeof(uint32_t), data, ACL_MMD_KERNEL_HANDLE, dla_get_raw_csr_address(instance, addr));
+}
+AOCL_MMD_CALL int dla_mmd_csr_read(int handle, int instance, uint64_t addr, uint32_t *data) {
+  return aocl_mmd_read(
+      handle, NULL, sizeof(uint32_t), data, ACL_MMD_KERNEL_HANDLE, dla_get_raw_csr_address(instance, addr));
+}
+AOCL_MMD_CALL int dla_mmd_ddr_write(int handle, int instance, uint64_t addr, uint64_t length, const void *data) {
+  return aocl_mmd_write(handle, NULL, length, data, ACL_MMD_MEMORY_HANDLE, dla_get_raw_ddr_address(instance, addr));
+}
+AOCL_MMD_CALL int dla_mmd_ddr_read(int handle, int instance, uint64_t addr, uint64_t length, void *data) {
+  return aocl_mmd_read(handle, NULL, length, data, ACL_MMD_MEMORY_HANDLE, dla_get_raw_ddr_address(instance, addr));
+}
+
+// Get the PLL clock frequency in MHz, returns a negative value if there is an error
+// Measures clk_dla by running a hardware counter at 0x37000 for ~10 ms wall
+// time and dividing the observed ticks by the precisely measured elapsed time.
+// NOTE(review): MMD errors are only checked via assert(), which compiles out
+// under NDEBUG, and the function as written never returns a negative value --
+// confirm whether release builds should propagate write/read failures.
+AOCL_MMD_CALL double dla_mmd_get_coredla_clock_freq(int handle) {
+  constexpr uint64_t hw_timer_address = 0x37000;
+  const uint32_t start_bit = 1;
+  const uint32_t stop_bit = 2;
+
+  // Send the start command to the hardware counter
+  std::chrono::high_resolution_clock::time_point time_before = std::chrono::high_resolution_clock::now();
+  int status = aocl_mmd_write(handle, NULL, sizeof(uint32_t), &start_bit, ACL_MMD_KERNEL_HANDLE, hw_timer_address);
+  assert(status == 0);
+
+  // Unlikely to sleep for exactly 10 milliseconds, but it doesn't matter since we use a high resolution clock to
+  // determine the amount of time between the start and stop commands for the hardware counter
+  std::this_thread::sleep_for(std::chrono::milliseconds(10));
+
+  // Send the stop command to the hardware counter
+  std::chrono::high_resolution_clock::time_point time_after = std::chrono::high_resolution_clock::now();
+  status = aocl_mmd_write(handle, NULL, sizeof(uint32_t), &stop_bit, ACL_MMD_KERNEL_HANDLE, hw_timer_address);
+  assert(status == 0);
+
+  // Read back the value of the counter
+  uint32_t counter = 0;
+  status = aocl_mmd_read(handle, NULL, sizeof(uint32_t), &counter, ACL_MMD_KERNEL_HANDLE, hw_timer_address);
+  assert(status == 0);
+
+  // Calculate the clock frequency of the counter, which is running on clk_dla
+  double elapsed_seconds = std::chrono::duration_cast<std::chrono::duration<double>>(time_after - time_before).count();
+  return 1.0e-6 * counter / elapsed_seconds;  // 1.0e-6 is to convert to MHz
+}
+
+#endif
diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie.h
new file mode 100644
index 0000000..cfba6a3
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie.h
@@ -0,0 +1,177 @@
+#ifndef ACL_PCIE_H
+#define ACL_PCIE_H
+
+/* (c) 1992-2021 Intel Corporation. */
+/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */
+/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */
+/* and/or other countries. Other marks and brands may be claimed as the property */
+/* of others. See Trademarks on intel.com for full list of Intel trademarks or */
+/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */
+/* Your use of Intel Corporation's design tools, logic functions and other */
+/* software and tools, and its AMPP partner logic functions, and any output */
+/* files any of the foregoing (including device programming or simulation */
+/* files), and any associated documentation or information are expressly subject */
+/* to the terms and conditions of the Altera Program License Subscription */
+/* Agreement, Intel MegaCore Function License Agreement, or other applicable */
+/* license agreement, including, without limitation, that your use is for the */
+/* sole purpose of programming logic devices manufactured by Intel and sold by */
+/* Intel or its authorized distributors. Please refer to the applicable */
+/* agreement for further details. */
+
+/* ===- acl_pcie.h --------------------------------------------------- C++ -*-=== */
+/* */
+/* Intel(R) OpenCL MMD Driver */
+/* */
+/* ===-------------------------------------------------------------------------=== */
+/* */
+/* This file defines macros and types that are used inside the MMD driver */
+/* */
+/* ===-------------------------------------------------------------------------=== */
+
+#ifndef ACL_PCIE_EXPORT
+#define ACL_PCIE_EXPORT __declspec(dllimport)
+#endif
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdio.h>
+#ifdef DLA_MMD
+#include <cstdint>
+#else
+#include <CL/cl_platform.h>
+#endif
+#include "aocl_mmd.h"
+#include "hw_pcie_constants.h"
+
+#define MMD_VERSION AOCL_MMD_VERSION_STRING
+
+#ifdef DLA_MMD
+#include "version.h"
+#else
+#include <version.h>
+#endif
+
+#define KERNEL_DRIVER_VERSION_EXPECTED ACL_DRIVER_VERSION
+
+#if defined(_WIN32) || defined(_WIN64)
+// Need DWORD, UINT32, etc.
+// But windows.h spits out a lot of spurious warnings.
+#pragma warning(push)
+#pragma warning(disable : 4668)
+#include <windows.h>
+#pragma warning(pop)
+
+// OPAE header files
+#include <initguid.h>
+#include <opae/fpga.h>
+#include "fpga_cmd_guids.h"
+
+#define INVALID_DEVICE (NULL)
+
+// printf format-string macros for the DWORD type (width specifier differs between platforms)
+#define DWORD_FMT_U "%lu"
+#define DWORD_FMT_X "%lx"
+#define DWORD_FMT_4X "%04lX"
+
+// printf format-string macros for the size_t type
+#ifdef _WIN64
+#define SIZE_FMT_U "%zu"
+#define SIZE_FMT_X "%zx"
+#else
+#define SIZE_FMT_U "%Iu"
+#define SIZE_FMT_X "%Ix"
+#endif
+
+typedef ULONG64 KPTR;
+typedef UINT64 DMA_ADDR;
+#endif // WINDOWS
+
+#if defined(LINUX)
+typedef uintptr_t KPTR;
+typedef int fpga_handle;  // on Linux the handle is the driver file descriptor -- see the read() calls in acl_pcie_config.cpp
+typedef unsigned int fpga_result;
+#define FPGA_OK 0
+
+typedef unsigned int DWORD;
+typedef unsigned long long QWORD;
+typedef char INT8;
+typedef unsigned char UINT8;
+typedef int16_t INT16;
+typedef uint16_t UINT16;
+typedef int INT32;
+typedef unsigned int UINT32;
+typedef long long INT64;
+typedef unsigned long long UINT64;
+
+#define INVALID_HANDLE_VALUE ((int)(-1))
+
+// Linux driver-specific exports
+#include "pcie_linux_driver_exports.h"
+
+#define INVALID_DEVICE (-1)
+#define WD_STATUS_SUCCESS 0
+
+// printf format-string macros for the DWORD type
+#define DWORD_FMT_U "%u"
+#define DWORD_FMT_X "%x"
+#define DWORD_FMT_4X "%04X"
+
+// printf format-string macros for the size_t type
+#define SIZE_FMT_U "%zu"
+#define SIZE_FMT_X "%zx"
+
+#endif // LINUX
+
+#define MAX_NAME_SIZE (1204)  // NOTE(review): 1204 looks like a typo for 1024 -- confirm before relying on this bound
+
+typedef enum {
+ AOCL_MMD_KERNEL = ACL_MMD_KERNEL_HANDLE, // Control interface into kernel interface
+ AOCL_MMD_MEMORY = ACL_MMD_MEMORY_HANDLE, // Data interface to device memory
+ AOCL_MMD_PLL = ACL_MMD_PLL_HANDLE, // Interface for reconfigurable PLL
+ AOCL_MMD_HOSTCH = ACL_MMD_HOSTCH_HANDLE // Host-channel interface -- presumably streaming; confirm in aocl_mmd.h
+} aocl_mmd_interface_t;
+
+// Describes the properties of key components in a standard ACL device
+#define PCIE_INFO_STR_LEN 1024
+#define PCIE_SLOT_INFO_STR_LEN 128
+
+struct ACL_PCIE_DEVICE_DESCRIPTION {
+ DWORD vendor_id;
+ DWORD device_id;
+ char pcie_slot_info_str[PCIE_SLOT_INFO_STR_LEN];
+ char pcie_info_str[PCIE_INFO_STR_LEN];
+ bool interrupt_valid;  // when true, interrupt_data/interrupt_addr below are populated
+ UINT32 interrupt_data;
+ UINT64 interrupt_addr;
+};
+
+#define ACL_PCIE_ASSERT(COND, ...) \
+ do { \
+ if (!(COND)) { \
+ printf("\nMMD FATAL: %s:%d: ", __FILE__, __LINE__); \
+ printf(__VA_ARGS__); \
+ fflush(stdout); \
+ assert(0); \
+ } \
+ } while (0)
+
+#define ACL_PCIE_ERROR_IF(COND, NEXT, ...) \
+ do { \
+ if (COND) { \
+ printf("\nMMD ERROR: " __VA_ARGS__); \
+ fflush(stdout); \
+ NEXT; \
+ } \
+ } while (0)
+
+#define ACL_PCIE_INFO(...) \
+ do { \
+ printf("MMD INFO : " __VA_ARGS__); \
+ fflush(stdout); \
+ } while (0)
+
+// Programming-mode flags: partial reconfiguration (PR) vs. full JTAG programming
+#define ACL_PCIE_PROGRAM_PR 1
+#define ACL_PCIE_PROGRAM_JTAG 0
+
+#endif // ACL_PCIE_H
diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_config.cpp b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_config.cpp
new file mode 100644
index 0000000..03c76dd
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_config.cpp
@@ -0,0 +1,1049 @@
+// (c) 1992-2021 Intel Corporation.
+// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words
+// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S.
+// and/or other countries. Other marks and brands may be claimed as the property
+// of others. See Trademarks on intel.com for full list of Intel trademarks or
+// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera)
+// Your use of Intel Corporation's design tools, logic functions and other
+// software and tools, and its AMPP partner logic functions, and any output
+// files any of the foregoing (including device programming or simulation
+// files), and any associated documentation or information are expressly subject
+// to the terms and conditions of the Altera Program License Subscription
+// Agreement, Intel MegaCore Function License Agreement, or other applicable
+// license agreement, including, without limitation, that your use is for the
+// sole purpose of programming logic devices manufactured by Intel and sold by
+// Intel or its authorized distributors. Please refer to the applicable
+// agreement for further details.
+
+/* ===- acl_pcie_config.cpp ------------------------------------------ C++ -*-=== */
+/* */
+/* Intel(R) OpenCL MMD Driver */
+/* */
+/* ===-------------------------------------------------------------------------=== */
+/* */
+/* This file implements the class to handle functions that program the FPGA. */
+/* The declaration of the class lives in the acl_pcie_config.h. */
+/* */
+/* ===-------------------------------------------------------------------------=== */
+
+// common and its own header files
+#include "acl_pcie_config.h"
+#include "acl_pcie.h"
+
+// other header files inside MMD driver
+#include "acl_pcie_debug.h"
+#if defined(WINDOWS)
+#include "acl_pcie_dma_windows.h"
+#endif // WINDOWS
+
+// other standard header files
+#include <stdlib.h>
+#include <string.h>
+#include <iostream>
+#include <sstream>
+#if defined(WINDOWS)
+#include <process.h>
+#endif // WINDOWS
+
+#if defined(LINUX)
+#include <unistd.h>
+#endif // LINUX
+
+#if defined(WINDOWS)
+#define FREEZE_STATUS_OFFSET 0
+#define FREEZE_CTRL_OFFSET 4
+#define FREEZE_VERSION_OFFSET 12
+#define FREEZE_BRIDGE_SUPPORTED_VERSION 0xad000003
+
+#define FREEZE_REQ 1
+#define RESET_REQ 2
+#define UNFREEZE_REQ 4
+
+#define FREEZE_REQ_DONE 1
+#define UNFREEZE_REQ_DONE 2
+
+#define ALT_PR_DATA_OFST 0x00
+#define ALT_PR_CSR_OFST 0x04
+#define ALT_PR_VER_OFST 0x08
+
+#define ALT_PR_CSR_PR_START 1
+#define ALT_PR_CSR_STATUS_SFT 1
+#define ALT_PR_CSR_STATUS_MSK (7 << ALT_PR_CSR_STATUS_SFT)
+#define ALT_PR_CSR_STATUS_NRESET (0 << ALT_PR_CSR_STATUS_SFT)
+#define ALT_PR_CSR_STATUS_BUSY (1 << ALT_PR_CSR_STATUS_SFT)
+#define ALT_PR_CSR_STATUS_PR_IN_PROG (2 << ALT_PR_CSR_STATUS_SFT)
+#define ALT_PR_CSR_STATUS_PR_SUCCESS (3 << ALT_PR_CSR_STATUS_SFT)
+#define ALT_PR_CSR_STATUS_PR_ERR (4 << ALT_PR_CSR_STATUS_SFT)
+
+#define ACL_DMA_PR_ALIGNMENT_BYTES 4096
+
+#define PLL_OFFSET_VERSION_ID 0x000
+#define PLL_OFFSET_ROM 0x400
+#define PLL_OFFSET_RECONFIG_CTRL_S10 0x800
+#define PLL_OFFSET_COUNTER 0x100
+#define PLL_OFFSET_RESET 0x110
+#define PLL_OFFSET_LOCK 0x120
+
+#define PLL_M_HIGH_REG_S10 0x104
+#define PLL_M_LOW_REG_S10 0x107
+#define PLL_M_BYPASS_ENABLE_REG_S10 0x105
+#define PLL_M_EVEN_DUTY_ENABLE_REG_S10 0x106
+
+#define PLL_N_HIGH_REG_S10 0x100
+#define PLL_N_LOW_REG_S10 0x102
+#define PLL_N_BYPASS_ENABLE_REG_S10 0x101
+#define PLL_N_EVEN_DUTY_ENABLE_REG_S10 0x101 // shares 0x101 with N_BYPASS and CP1 -- the fields are written as one packed byte (see program_core_with_PR_file_s10)
+
+#define PLL_C0_HIGH_REG_S10 0x11B
+#define PLL_C0_LOW_REG_S10 0x11E
+#define PLL_C0_BYPASS_ENABLE_REG_S10 0x11C
+#define PLL_C0_EVEN_DUTY_ENABLE_REG_S10 0x11D
+
+#define PLL_C1_HIGH_REG_S10 0x11F
+#define PLL_C1_LOW_REG_S10 0x122
+#define PLL_C1_BYPASS_ENABLE_REG_S10 0x120
+#define PLL_C1_EVEN_DUTY_ENABLE_REG_S10 0x121
+
+#define PLL_LF_REG_S10 0x10A
+
+#define PLL_CP1_REG_S10 0x101
+#define PLL_CP2_REG_S10 0x10D
+
+#define PLL_REQUEST_CAL_REG_S10 0x149
+#define PLL_ENABLE_CAL_REG_S10 0x14A
+#endif // WINDOWS
+
+#ifndef DLA_MMD
+#include "acl_check_sys_cmd.h"
+#include "pkg_editor.h"
+#endif
+
+// MAX size of line read from pipe-ing the output of find_jtag_cable.tcl to MMD
+#define READ_SIZE 1024
+// MAX size of command passed to system for invoking find_jtag_cable.tcl from MMD
+#define SYSTEM_CMD_SIZE 4 * 1024 // NOTE(review): unparenthesized -- prefer (4 * 1024) so the macro survives use inside larger expressions
+
+// Function to install the signal handler for Ctrl-C
+// Implemented inside acl_pcie.cpp
+extern int install_ctrl_c_handler(int ingore_sig);
+
+ACL_PCIE_CONFIG::ACL_PCIE_CONFIG(fpga_handle Handle, ACL_PCIE_MM_IO_MGR *io, ACL_PCIE_DEVICE *pcie, ACL_PCIE_DMA *dma) { // caches the handles; on Windows also queries OPAE for the device's supported commands
+ m_handle = Handle;
+ m_io = io;
+ m_pcie = pcie;
+ m_dma = dma;
+
+#if defined(WINDOWS)
+ fpga_result result = FPGA_OK;
+ UINT32 NumCmds = 0;
+ FpgaCmd = NULL;
+
+ // Get the number of supported commands
+ result = fpgaGetSupportedCommands(Handle, NULL, &NumCmds);
+ ACL_PCIE_ERROR_IF(result != FPGA_OK, return, "fpgaGetSupportedCommands failed in ACL_PCIE_CONFIG().\n");
+
+ // Allocate memory for the guid array based on NumCmds
+ FpgaCmd = (fpga_guid *)malloc(NumCmds * sizeof(fpga_guid));
+
+ if (FpgaCmd == NULL) {
+ throw std::bad_alloc(); // allocation failure aborts construction
+ }
+
+ ACL_PCIE_ERROR_IF(FpgaCmd == NULL, return, "malloc failed in ACL_PCIE_CONFIG().\n"); // NOTE(review): unreachable -- the throw above already covers FpgaCmd == NULL
+
+ // Populate the guid array
+ result = fpgaGetSupportedCommands(Handle, FpgaCmd, &NumCmds);
+ ACL_PCIE_ERROR_IF(result != FPGA_OK, return, "fpgaGetSupportedCommands failed in ACL_PCIE_CONFIG().\n");
+#endif // WINDOWS
+
+ return;
+}
+
+ACL_PCIE_CONFIG::~ACL_PCIE_CONFIG() {
+#if defined(WINDOWS)
+ // Free the guid array allocated in the constructor (nothing to release on Linux)
+ if (FpgaCmd) {
+ free(FpgaCmd);
+ FpgaCmd = NULL;
+ }
+#endif
+}
+
+// Change the kernel region using PR only via PCIe, using an in-memory image of the core.rbf
+// For Linux, the actual implementation of PR is inside the kernel mode driver.
+// Return 0 on success.
+int ACL_PCIE_CONFIG::program_core_with_PR_file_a10(char *core_bitstream, size_t core_rbf_len) {
+ int pr_result = 1; // set to default - failure
+
+ ACL_PCIE_ERROR_IF(core_bitstream == NULL, return 1, "core_bitstream is an NULL pointer.\n");
+ ACL_PCIE_ERROR_IF(core_rbf_len < 1000000, return 1, "size of core rbf file is suspiciously small.\n"); // sanity bound only, not an exact size check
+
+#if defined(WINDOWS)
+ int i;
+ uint32_t version;
+ UINT32 to_send, status;
+ UINT32 *data;
+ fpga_result result;
+
+ /* Get version ID */
+ result = fpgaReadMMIO32(m_handle, ACL_VERSIONID_BAR, ACL_VERSIONID_OFFSET, &version);
+ ACL_PCIE_DEBUG_MSG(":: VERSION_ID is 0x%08X\n", (int)version);
+
+ /* Check if PR is supported */
+ if (version < (unsigned int)ACL_PR_PIO_VERSIONID) {
+ ACL_PCIE_DEBUG_MSG(":: Currently programmed image does not support PR\n");
+ pr_result = 1;
+ return pr_result;
+ }
+
+ ACL_PCIE_DEBUG_MSG(":: OK to proceed with PR!\n");
+
+ MemoryBarrier();
+ result = fpgaReadMMIO32(m_handle, ACL_PRCONTROLLER_BAR, ACL_PRCONTROLLER_OFFSET + 4, &status);
+ ACL_PCIE_DEBUG_MSG(":: Reading 0x%08X from PR IP status register\n", (int)status);
+ ACL_PCIE_ASSERT(result == FPGA_OK, "fpgaReadMMIO32 failed.\n");
+
+ to_send = 0x00000001;
+ ACL_PCIE_DEBUG_MSG(":: Writing 0x%08X to PR IP status register\n", (int)to_send);
+ result = fpgaWriteMMIO32(m_handle, ACL_PRCONTROLLER_BAR, ACL_PRCONTROLLER_OFFSET + 4, to_send);
+ ACL_PCIE_ASSERT(result == FPGA_OK, "fpgaWriteMMIO32 failed.\n");
+
+ MemoryBarrier();
+ result = fpgaReadMMIO32(m_handle, ACL_PRCONTROLLER_BAR, ACL_PRCONTROLLER_OFFSET + 4, &status);
+ ACL_PCIE_ASSERT(result == FPGA_OK, "fpgaReadMMIO32 failed.\n");
+ ACL_PCIE_DEBUG_MSG(":: Reading 0x%08X from PR IP status register\n", (int)status);
+
+ if ((status != 0x10) && (status != 0x0)) {
+ ACL_PCIE_ERROR_IF(1, return 1, ":: PR IP not in an usable state.\n");
+ }
+
+ data = (UINT32 *)core_bitstream;
+ ACL_PCIE_DEBUG_MSG(":: Writing %d bytes of bitstream file to PR IP at BAR %d, OFFSET 0x%08X\n",
+ (int)core_rbf_len,
+ (int)ACL_PRCONTROLLER_BAR,
+ (int)ACL_PRCONTROLLER_OFFSET);
+ for (i = 0; i < (int)core_rbf_len / 4; i++) { // NOTE(review): any trailing bytes beyond a multiple of 4 are dropped (cf. the s10 path, which handles them)
+ result = fpgaWriteMMIO32(m_handle, ACL_PRCONTROLLER_BAR, ACL_PRCONTROLLER_OFFSET, data[i]);
+ ACL_PCIE_ASSERT(result == FPGA_OK, "fpgaWriteMMIO32 failed.\n");
+ }
+
+ result = fpgaReadMMIO32(m_handle, ACL_PRCONTROLLER_BAR, ACL_PRCONTROLLER_OFFSET, &status);
+ ACL_PCIE_DEBUG_MSG(":: Reading 0x%08X from PR IP data register\n", (int)status);
+ ACL_PCIE_ASSERT(result == FPGA_OK, "fpgaReadMMIO32 failed.\n");
+
+ MemoryBarrier();
+ result = fpgaReadMMIO32(m_handle, ACL_PRCONTROLLER_BAR, ACL_PRCONTROLLER_OFFSET + 4, &status);
+ ACL_PCIE_DEBUG_MSG(":: Reading 0x%08X from PR IP status register\n", (int)status);
+ ACL_PCIE_ASSERT(result == FPGA_OK, "fpgaReadMMIO32 failed.\n");
+
+ if (status == 0x14) {
+ ACL_PCIE_DEBUG_MSG(":: PR done!: 0x%08X\n", (int)status);
+ pr_result = 0;
+ } else {
+ ACL_PCIE_DEBUG_MSG(":: PR error!: 0x%08X\n", (int)status);
+ pr_result = 1;
+ }
+
+ ACL_PCIE_DEBUG_MSG(":: PR completed!\n");
+
+#endif // WINDOWS
+#if defined(LINUX)
+ struct acl_cmd cmd_pr = {ACLPCI_CMD_BAR, ACLPCI_CMD_DO_PR, NULL, NULL};
+
+ cmd_pr.user_addr = core_bitstream;
+ cmd_pr.size = core_rbf_len;
+
+ pr_result = read(m_handle, &cmd_pr, sizeof(cmd_pr)); // the kernel driver performs the PR; read() is its command channel, 0 indicates success
+
+#endif // LINUX
+
+ return pr_result;
+}
+
+// Change the kernel region using PR only via PCIe, using an in-memory image of the core.rbf
+// For Linux, the actual implementation of PR is inside the kernel mode driver.
+// Return 0 on success.
+int ACL_PCIE_CONFIG::program_core_with_PR_file_s10(char *core_bitstream, size_t core_rbf_len, char *pll_config_str) {
+ int pr_result = 1; // set to default - failure
+#if defined(WINDOWS)
+ uint32_t pll_config_array[8] = {0};
+#else
+ int pll_config_array[8] = {0}; // NOTE(review): signed here vs. uint32_t on Windows -- the kernel driver must agree on this layout; confirm
+#endif // WINDOWS
+ std::stringstream converter(pll_config_str);
+
+ ACL_PCIE_ERROR_IF(core_bitstream == NULL, return 1, "core_bitstream is an NULL pointer.\n");
+ ACL_PCIE_ERROR_IF(core_rbf_len < 1000000, return 1, "size of core rbf file is suspiciously small.\n");
+
+ /* parse the 8 whitespace-separated integers of the PLL configuration string */
+ converter >> pll_config_array[0] >> pll_config_array[1] >> pll_config_array[2] >> pll_config_array[3] >>
+ pll_config_array[4] >> pll_config_array[5] >> pll_config_array[6] >> pll_config_array[7];
+ if (converter.fail() == true) {
+ ACL_PCIE_ERROR_IF(1, return 1, "PLL configuration string requires 8 integer elements\n");
+ };
+
+#if defined(WINDOWS)
+ int i, j, k, result, count, chunk_num, frames;
+ size_t offset;
+ uint32_t to_send, status;
+ uint32_t version;
+ uint32_t *data;
+ uint32_t pll_freq_khz, pll_m, pll_n, pll_c0, pll_c1, pll_lf, pll_cp, pll_rc;
+ uint32_t pll_m_high, pll_m_low, pll_m_bypass_enable, pll_m_even_duty_enable;
+ uint32_t pll_n_high, pll_n_low, pll_n_bypass_enable, pll_n_even_duty_enable;
+ uint32_t pll_c0_high, pll_c0_low, pll_c0_bypass_enable, pll_c0_even_duty_enable;
+ uint32_t pll_c1_high, pll_c1_low, pll_c1_bypass_enable, pll_c1_even_duty_enable;
+ uint32_t pll_cp1, pll_cp2;
+ uint32_t pll_byte;
+
+ /* Get version ID */
+ result = fpgaReadMMIO32(m_handle, ACL_VERSIONID_BAR, ACL_VERSIONID_OFFSET, &version);
+ ACL_PCIE_DEBUG_MSG(":: VERSION_ID is 0x%08X\n", (int)version);
+
+ /* Check if PR is supported */
+ if (version < (unsigned int)ACL_PR_PIO_VERSIONID) {
+ ACL_PCIE_DEBUG_MSG(":: Currently programmed image does not support PR\n");
+ pr_result = 1;
+ return pr_result;
+ }
+
+ ACL_PCIE_DEBUG_MSG(":: OK to proceed with PR!\n");
+
+ /* freeze bridge */
+ MemoryBarrier();
+ result = fpgaReadMMIO32(m_handle, ACL_PRREGIONFREEZE_BAR, ACL_PRREGIONFREEZE_OFFSET + FREEZE_VERSION_OFFSET, &status);
+ ACL_PCIE_DEBUG_MSG(":: Freeze bridge version is 0x%08X\n", (int)status);
+
+ result = fpgaReadMMIO32(m_handle, ACL_PRREGIONFREEZE_BAR, ACL_PRREGIONFREEZE_OFFSET + FREEZE_STATUS_OFFSET, &status);
+ ACL_PCIE_DEBUG_MSG(":: Freeze bridge status is 0x%08X\n", (int)status);
+
+ ACL_PCIE_DEBUG_MSG(":: Asserting region freeze\n");
+ fpgaWriteMMIO32(m_handle, ACL_PRREGIONFREEZE_BAR, ACL_PRREGIONFREEZE_OFFSET + FREEZE_CTRL_OFFSET, FREEZE_REQ);
+ Sleep(1);
+
+ result = fpgaReadMMIO32(m_handle, ACL_PRREGIONFREEZE_BAR, ACL_PRREGIONFREEZE_OFFSET + FREEZE_STATUS_OFFSET, &status);
+ ACL_PCIE_DEBUG_MSG(":: Freeze bridge status is 0x%08X\n", (int)status);
+
+ ACL_PCIE_DEBUG_MSG(":: PR Beginning\n");
+
+ /* PR IP write initialisation */
+ MemoryBarrier();
+ result = fpgaReadMMIO32(m_handle, ACL_PRCONTROLLER_BAR, ACL_PRCONTROLLER_OFFSET + ALT_PR_VER_OFST, &status);
+ ACL_PCIE_DEBUG_MSG(":: ALT_PR_VER_OFST version is 0x%08X\n", (int)status);
+
+ result = fpgaReadMMIO32(m_handle, ACL_PRCONTROLLER_BAR, ACL_PRCONTROLLER_OFFSET + ALT_PR_CSR_OFST, &status);
+ ACL_PCIE_DEBUG_MSG(":: ALT_PR_CSR_OFST status is 0x%08X\n", (int)status);
+
+ to_send = ALT_PR_CSR_PR_START;
+ ACL_PCIE_DEBUG_MSG(":: Starting PR by writing 0x%08X to ALT_PR_CSR_OFST\n", (int)to_send);
+ fpgaWriteMMIO32(m_handle, ACL_PRCONTROLLER_BAR, ACL_PRCONTROLLER_OFFSET + ALT_PR_CSR_OFST, to_send);
+
+ /* Wait for PR to be in progress */
+ MemoryBarrier();
+ result = fpgaReadMMIO32(m_handle, ACL_PRCONTROLLER_BAR, ACL_PRCONTROLLER_OFFSET + ALT_PR_CSR_OFST, &status);
+ i = 0;
+ while (status != ALT_PR_CSR_STATUS_PR_IN_PROG) { // NOTE(review): no timeout -- this spins forever if the PR IP never reports IN_PROG
+ Sleep(1);
+ i++;
+ result = fpgaReadMMIO32(m_handle, ACL_PRCONTROLLER_BAR, ACL_PRCONTROLLER_OFFSET + ALT_PR_CSR_OFST, &status);
+ };
+ ACL_PCIE_DEBUG_MSG(":: PR IP initialization took %d ms, ALT_PR_CSR_OFST status is 0x%08X\n", i, (int)status);
+
+ // ---------------------------------------------------------------
+ // Legacy PR using PIO
+ // ---------------------------------------------------------------
+ if ((version >= (unsigned int)ACL_PR_PIO_VERSIONID) && (version < (unsigned int)ACL_PR_DMA_VERSIONID)) {
+ /* PR IP write bitstream */
+ MemoryBarrier();
+ data = (UINT32 *)core_bitstream;
+ count = (int)core_rbf_len;
+ ACL_PCIE_DEBUG_MSG(":: Size of PR RBF is 0x%08X\n", (int)count);
+
+ /* Write out the complete 32-bit chunks */
+ /* Wait for a designated amount of time between 4K chunks */
+ i = 0;
+ j = 0;
+ chunk_num = 0;
+ while (count >= 4) {
+ fpgaWriteMMIO32(m_handle, ACL_PRCONTROLLER_BAR, ACL_PRCONTROLLER_OFFSET + ALT_PR_DATA_OFST, data[i]);
+ i++;
+ j++;
+ count = count - 4;
+ if (j >= 1024) {
+ chunk_num++;
+ j = 0;
+ Sleep(1);
+ }
+ }
+ ACL_PCIE_DEBUG_MSG(":: Number of 4K chunks written: %d\n", (int)chunk_num);
+ ACL_PCIE_DEBUG_MSG(":: Number of bytes in PR bitstream remaining: %d\n", (int)count);
+
+ /* Write out remaining non 32-bit chunks */
+ to_send = data[i];
+ switch (count) {
+ case 3:
+ to_send = to_send & 0x00ffffff;
+ fpgaWriteMMIO32(m_handle, ACL_PRCONTROLLER_BAR, ACL_PRCONTROLLER_OFFSET + ALT_PR_DATA_OFST, to_send);
+ break;
+ case 2:
+ to_send = to_send & 0x0000ffff;
+ fpgaWriteMMIO32(m_handle, ACL_PRCONTROLLER_BAR, ACL_PRCONTROLLER_OFFSET + ALT_PR_DATA_OFST, to_send);
+ break;
+ case 1:
+ to_send = to_send & 0x000000ff;
+ fpgaWriteMMIO32(m_handle, ACL_PRCONTROLLER_BAR, ACL_PRCONTROLLER_OFFSET + ALT_PR_DATA_OFST, to_send);
+ break;
+ case 0:
+ break;
+ default:
+ /* This will never happen */
+ return 1;
+ }
+ }
+
+ // ---------------------------------------------------------------
+ // PR using DMA
+ // ---------------------------------------------------------------
+ if (version >= (unsigned int)ACL_PR_DMA_VERSIONID) {
+ /* PR IP write bitstream */
+ MemoryBarrier();
+ ACL_PCIE_DEBUG_MSG(":: Size of PR RBF is 0x%08X, initiating DMA transfer to PR IP\n", (int)core_rbf_len);
+
+ /* Write PR bitstream using DMA */
+ frames = (int)core_rbf_len / ACL_DMA_PR_ALIGNMENT_BYTES; // NOTE(review): a non-4096-multiple tail is never sent -- confirm bitstreams are padded to 4 KiB
+ ACL_PCIE_DEBUG_MSG(
+ ":: PR bitstream will be sent in %d Byte frames, a total of %d frames\n", ACL_DMA_PR_ALIGNMENT_BYTES, frames);
+
+ // sending in 4kB frames
+ for (k = 0; k < frames; k++) {
+ offset = (size_t)k * ACL_DMA_PR_ALIGNMENT_BYTES;
+ void *host_addr_new = reinterpret_cast<void *>(core_bitstream + offset);
+ size_t dev_addr_new = ACL_PCIE_PR_DMA_OFFSET;
+
+ status = (uint32_t)m_dma->read_write(host_addr_new, dev_addr_new, ACL_DMA_PR_ALIGNMENT_BYTES, NULL, false); // NOTE(review): this status value is overwritten below without being checked
+
+ while (!m_dma->is_idle()) {
+ ACL_PCIE_DEBUG_MSG(":: DMA still in progress...\n");
+ }
+ }
+ }
+
+ // Wait for PR complete
+ MemoryBarrier();
+ result = fpgaReadMMIO32(m_handle, ACL_PRCONTROLLER_BAR, ACL_PRCONTROLLER_OFFSET + ALT_PR_CSR_OFST, &status);
+ ACL_PCIE_DEBUG_MSG(":: ALT_PR_CSR_OFST status is 0x%08X\n", (int)status);
+ i = 0;
+ // wait for PR_SUCCESS or PR_ERROR -- NOTE(review): earlier comment claimed a 1 second timeout, but 100000 iterations of Sleep(100) allow ~10000 s; confirm intended bound
+ while (status != ALT_PR_CSR_STATUS_PR_SUCCESS && status != ALT_PR_CSR_STATUS_PR_ERR && i < 100000) {
+ Sleep(100);
+ i++;
+ result = fpgaReadMMIO32(m_handle, ACL_PRCONTROLLER_BAR, ACL_PRCONTROLLER_OFFSET + ALT_PR_CSR_OFST, &status);
+ ACL_PCIE_DEBUG_MSG(":: ALT_PR_CSR_OFST status is 0x%08X\n", (int)status);
+ };
+
+ if (status == ALT_PR_CSR_STATUS_PR_SUCCESS) {
+ /* dynamically reconfigure IOPLL for kernel clock */
+ /* read kernel clock generation version ID */
+ result = fpgaReadMMIO32(
+ m_handle, ACL_PCIE_KERNELPLL_RECONFIG_BAR, ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_VERSION_ID, &status);
+ ACL_PCIE_DEBUG_MSG(":: Kernel clock generator version ID is 0x%08X\n", (int)status);
+
+ /* extract PLL settings from PLL configuration array */
+ pll_freq_khz = pll_config_array[0];
+ pll_m = pll_config_array[1];
+ pll_n = pll_config_array[2];
+ pll_c0 = pll_config_array[3];
+ pll_c1 = pll_config_array[4];
+ pll_lf = pll_config_array[5];
+ pll_cp = pll_config_array[6];
+ pll_rc = pll_config_array[7];
+
+ ACL_PCIE_DEBUG_MSG(":: PLL settings are %d %d %d %d %d %d %d %d\n",
+ pll_freq_khz,
+ pll_m,
+ pll_n,
+ pll_c0,
+ pll_c1,
+ pll_lf,
+ pll_cp,
+ pll_rc);
+
+ // Measure kernel clock frequency
+ fpgaWriteMMIO32(
+ m_handle, ACL_PCIE_KERNELPLL_RECONFIG_BAR, ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_COUNTER, 0);
+ Sleep(1000);
+ result = fpgaReadMMIO32(
+ m_handle, ACL_PCIE_KERNELPLL_RECONFIG_BAR, ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_COUNTER, &status);
+ ACL_PCIE_DEBUG_MSG(":: Before reconfig, kernel clock set to %d Hz\n", (int)status);
+
+ // extract all PLL parameters (each config word packs low/high bytes plus bypass/even-duty flag bits)
+ pll_m_high = (pll_m >> 8) & 0xFF;
+ pll_m_low = pll_m & 0xFF;
+ pll_m_bypass_enable = (pll_m >> 16) & 0x01;
+ pll_m_even_duty_enable = (pll_m >> 17) & 0x01;
+
+ pll_n_high = (pll_n >> 8) & 0xFF;
+ pll_n_low = pll_n & 0xFF;
+ pll_n_bypass_enable = (pll_n >> 16) & 0x01;
+ pll_n_even_duty_enable = (pll_n >> 17) & 0x01;
+
+ pll_c0_high = (pll_c0 >> 8) & 0xFF;
+ pll_c0_low = pll_c0 & 0xFF;
+ pll_c0_bypass_enable = (pll_c0 >> 16) & 0x01;
+ pll_c0_even_duty_enable = (pll_c0 >> 17) & 0x01;
+
+ pll_c1_high = (pll_c1 >> 8) & 0xFF;
+ pll_c1_low = pll_c1 & 0xFF;
+ pll_c1_bypass_enable = (pll_c1 >> 16) & 0x01;
+ pll_c1_even_duty_enable = (pll_c1 >> 17) & 0x01;
+
+ pll_lf = (pll_lf >> 6) & 0xFF;
+
+ pll_cp = pll_cp & 0xFF;
+ pll_cp1 = pll_cp & 0x07;
+ pll_cp2 = (pll_cp >> 3) & 0x07;
+
+ pll_rc = pll_rc & 0x03;
+
+ /* read and write PLL settings */
+ to_send = pll_m_high;
+ fpgaWriteMmio(m_handle,
+ ACL_PCIE_KERNELPLL_RECONFIG_BAR,
+ ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_RECONFIG_CTRL_S10 + PLL_M_HIGH_REG_S10,
+ &to_send,
+ 1);
+ to_send = pll_m_low;
+ fpgaWriteMmio(m_handle,
+ ACL_PCIE_KERNELPLL_RECONFIG_BAR,
+ ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_RECONFIG_CTRL_S10 + PLL_M_LOW_REG_S10,
+ &to_send,
+ 1);
+ to_send = pll_m_bypass_enable;
+ fpgaWriteMmio(m_handle,
+ ACL_PCIE_KERNELPLL_RECONFIG_BAR,
+ ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_RECONFIG_CTRL_S10 + PLL_M_BYPASS_ENABLE_REG_S10,
+ &to_send,
+ 1);
+ to_send = (pll_m_even_duty_enable << 7);
+ fpgaWriteMmio(m_handle,
+ ACL_PCIE_KERNELPLL_RECONFIG_BAR,
+ ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_RECONFIG_CTRL_S10 + PLL_M_EVEN_DUTY_ENABLE_REG_S10,
+ &to_send,
+ 1);
+
+ to_send = pll_n_high;
+ fpgaWriteMmio(m_handle,
+ ACL_PCIE_KERNELPLL_RECONFIG_BAR,
+ ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_RECONFIG_CTRL_S10 + PLL_N_HIGH_REG_S10,
+ &to_send,
+ 1);
+ to_send = pll_n_low;
+ fpgaWriteMmio(m_handle,
+ ACL_PCIE_KERNELPLL_RECONFIG_BAR,
+ ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_RECONFIG_CTRL_S10 + PLL_N_LOW_REG_S10,
+ &to_send,
+ 1);
+ to_send = (pll_n_even_duty_enable << 7) | (pll_cp1 << 4) | pll_n_bypass_enable; // three fields packed into the single byte register at 0x101
+ fpgaWriteMmio(m_handle,
+ ACL_PCIE_KERNELPLL_RECONFIG_BAR,
+ ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_RECONFIG_CTRL_S10 + PLL_N_BYPASS_ENABLE_REG_S10,
+ &to_send,
+ 1);
+
+ to_send = pll_c0_high;
+ fpgaWriteMmio(m_handle,
+ ACL_PCIE_KERNELPLL_RECONFIG_BAR,
+ ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_RECONFIG_CTRL_S10 + PLL_C0_HIGH_REG_S10,
+ &to_send,
+ 1);
+ to_send = pll_c0_low;
+ fpgaWriteMmio(m_handle,
+ ACL_PCIE_KERNELPLL_RECONFIG_BAR,
+ ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_RECONFIG_CTRL_S10 + PLL_C0_LOW_REG_S10,
+ &to_send,
+ 1);
+ to_send = pll_c0_bypass_enable;
+ fpgaWriteMmio(m_handle,
+ ACL_PCIE_KERNELPLL_RECONFIG_BAR,
+ ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_RECONFIG_CTRL_S10 + PLL_C0_BYPASS_ENABLE_REG_S10,
+ &to_send,
+ 1);
+ to_send = (pll_c0_even_duty_enable << 7);
+ fpgaWriteMmio(m_handle,
+ ACL_PCIE_KERNELPLL_RECONFIG_BAR,
+ ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_RECONFIG_CTRL_S10 + PLL_C0_EVEN_DUTY_ENABLE_REG_S10,
+ &to_send,
+ 1);
+
+ to_send = pll_c1_high;
+ fpgaWriteMmio(m_handle,
+ ACL_PCIE_KERNELPLL_RECONFIG_BAR,
+ ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_RECONFIG_CTRL_S10 + PLL_C1_HIGH_REG_S10,
+ &to_send,
+ 1);
+ to_send = pll_c1_low;
+ fpgaWriteMmio(m_handle,
+ ACL_PCIE_KERNELPLL_RECONFIG_BAR,
+ ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_RECONFIG_CTRL_S10 + PLL_C1_LOW_REG_S10,
+ &to_send,
+ 1);
+ to_send = pll_c1_bypass_enable;
+ fpgaWriteMmio(m_handle,
+ ACL_PCIE_KERNELPLL_RECONFIG_BAR,
+ ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_RECONFIG_CTRL_S10 + PLL_C1_BYPASS_ENABLE_REG_S10,
+ &to_send,
+ 1);
+ to_send = (pll_c1_even_duty_enable << 7);
+ fpgaWriteMmio(m_handle,
+ ACL_PCIE_KERNELPLL_RECONFIG_BAR,
+ ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_RECONFIG_CTRL_S10 + PLL_C1_EVEN_DUTY_ENABLE_REG_S10,
+ &to_send,
+ 1);
+
+ to_send = (pll_cp2 << 5);
+ fpgaWriteMmio(m_handle,
+ ACL_PCIE_KERNELPLL_RECONFIG_BAR,
+ ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_RECONFIG_CTRL_S10 + PLL_CP2_REG_S10,
+ &to_send,
+ 1);
+
+ to_send = (pll_lf << 3) | (pll_rc << 1);
+ fpgaWriteMmio(m_handle,
+ ACL_PCIE_KERNELPLL_RECONFIG_BAR,
+ ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_RECONFIG_CTRL_S10 + PLL_LF_REG_S10,
+ &to_send,
+ 1);
+
+ // start PLL calibration
+ /* read/modify/write the request calibration */
+ ACL_PCIE_DEBUG_MSG(":: Requesting PLL calibration\n");
+ result = fpgaReadMmio(m_handle,
+ ACL_PCIE_KERNELPLL_RECONFIG_BAR,
+ ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_RECONFIG_CTRL_S10 + PLL_REQUEST_CAL_REG_S10,
+ &pll_byte,
+ 1);
+ to_send = pll_byte | 0x40;
+ fpgaWriteMmio(m_handle,
+ ACL_PCIE_KERNELPLL_RECONFIG_BAR,
+ ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_RECONFIG_CTRL_S10 + PLL_REQUEST_CAL_REG_S10,
+ &to_send,
+ 1);
+ /* write 0x03 to enable calibration interface */
+ to_send = 0x03;
+ fpgaWriteMmio(m_handle,
+ ACL_PCIE_KERNELPLL_RECONFIG_BAR,
+ ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_RECONFIG_CTRL_S10 + PLL_ENABLE_CAL_REG_S10,
+ &to_send,
+ 1);
+ ACL_PCIE_DEBUG_MSG(":: PLL calibration done\n");
+
+ // Measure kernel clock frequency
+ fpgaWriteMMIO32(
+ m_handle, ACL_PCIE_KERNELPLL_RECONFIG_BAR, ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_COUNTER, 0);
+ Sleep(1000);
+ result = fpgaReadMMIO32(
+ m_handle, ACL_PCIE_KERNELPLL_RECONFIG_BAR, ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_COUNTER, &status);
+ ACL_PCIE_DEBUG_MSG(":: After reconfig, kernel clock set to %d Hz\n", (int)status);
+
+ /* assert reset */
+ MemoryBarrier();
+ ACL_PCIE_DEBUG_MSG(":: Asserting region reset\n");
+ fpgaWriteMMIO32(m_handle, ACL_PRREGIONFREEZE_BAR, ACL_PRREGIONFREEZE_OFFSET + FREEZE_CTRL_OFFSET, RESET_REQ);
+ Sleep(10);
+
+ /* unfreeze bridge */
+ MemoryBarrier();
+ result =
+ fpgaReadMMIO32(m_handle, ACL_PRREGIONFREEZE_BAR, ACL_PRREGIONFREEZE_OFFSET + FREEZE_VERSION_OFFSET, &status);
+ ACL_PCIE_DEBUG_MSG(":: Freeze bridge version is 0x%08X\n", (int)status);
+
+ result =
+ fpgaReadMMIO32(m_handle, ACL_PRREGIONFREEZE_BAR, ACL_PRREGIONFREEZE_OFFSET + FREEZE_STATUS_OFFSET, &status);
+ ACL_PCIE_DEBUG_MSG(":: Freeze bridge status is 0x%08X\n", (int)status);
+
+ ACL_PCIE_DEBUG_MSG(":: Removing region freeze\n");
+ fpgaWriteMMIO32(m_handle, ACL_PRREGIONFREEZE_BAR, ACL_PRREGIONFREEZE_OFFSET + FREEZE_CTRL_OFFSET, UNFREEZE_REQ);
+ Sleep(1);
+
+ ACL_PCIE_DEBUG_MSG(":: Checking freeze bridge status\n");
+ result =
+ fpgaReadMMIO32(m_handle, ACL_PRREGIONFREEZE_BAR, ACL_PRREGIONFREEZE_OFFSET + FREEZE_STATUS_OFFSET, &status);
+ ACL_PCIE_DEBUG_MSG(":: Freeze bridge status is 0x%08X\n", (int)status);
+
+ /* deassert reset */
+ MemoryBarrier();
+ ACL_PCIE_DEBUG_MSG(":: Deasserting region reset\n");
+ fpgaWriteMMIO32(m_handle, ACL_PRREGIONFREEZE_BAR, ACL_PRREGIONFREEZE_OFFSET + FREEZE_CTRL_OFFSET, 0);
+
+ MemoryBarrier();
+ result = fpgaReadMMIO32(m_handle, ACL_PRCONTROLLER_BAR, ACL_PRCONTROLLER_OFFSET + ALT_PR_CSR_OFST, &status);
+ ACL_PCIE_DEBUG_MSG(":: Reading 0x%08X from PR IP status register\n", (int)status);
+ if (status == 0x6) { // 0x6 == ALT_PR_CSR_STATUS_PR_SUCCESS (3 << ALT_PR_CSR_STATUS_SFT)
+ ACL_PCIE_DEBUG_MSG(":: PR done! Status is 0x%08X\n", (int)status);
+ pr_result = 0;
+ } else {
+ ACL_PCIE_DEBUG_MSG(":: PR error! Status is 0x%08X\n", (int)status);
+ pr_result = 1;
+ }
+ } else {
+ ACL_PCIE_DEBUG_MSG(":: PR error! Status is 0x%08X\n", (int)status);
+ pr_result = 1;
+ }
+
+ ACL_PCIE_DEBUG_MSG(":: PR completed!\n");
+
+#endif // WINDOWS
+#if defined(LINUX)
+ struct acl_cmd cmd_pr = {ACLPCI_CMD_BAR, ACLPCI_CMD_DO_PR, NULL, NULL};
+
+ cmd_pr.user_addr = core_bitstream;
+ cmd_pr.size = core_rbf_len;
+ cmd_pr.device_addr = pll_config_array; // PLL settings are forwarded to the kernel driver alongside the bitstream
+
+ pr_result = read(m_handle, &cmd_pr, sizeof(cmd_pr));
+
+#endif // LINUX
+
+ return pr_result;
+}
+
+// Windows specific code to disable PCIe advanced error reporting on the
+// upstream port.
+// No-op in Linux because save_pcie_control_regs() has already disabled
+// AER on the upstream port.
+// Returns 0 on success
+int ACL_PCIE_CONFIG::disable_AER_windows(void) {
+ fpga_result result = FPGA_OK;
+
+#if defined(WINDOWS)
+ // IOCTL call to disable AER in kernel mode
+ result = fpgaProcessDeviceCmd(m_handle, GUID_TO_FPGA_GUID(GUID_PCI_OPENCL_DISABLE_AER), NULL, NULL, 0);
+ ACL_PCIE_ERROR_IF(result != FPGA_OK, return -1, "fpgaProcessDeviceCmd failed when disabling AER.\n");
+#endif // WINDOWS
+ return result; // FPGA_OK (0) on success; the Windows failure path returned -1 above
+}
+
+// Windows specific code to enable PCIe advanced error reporting on the
+// upstream port.
+// No-op in Linux because load_pcie_control_regs() has already enabled
+// AER on the upstream port.
+// Returns 0 on success
+int ACL_PCIE_CONFIG::enable_AER_and_retrain_link_windows(void) {
+  // FPGA_OK (0) is what the non-Windows (no-op) build returns.
+  fpga_result result = FPGA_OK;
+
+#if defined(WINDOWS)
+  // IOCTL call to enable AER and retrain link in kernel mode
+  result = fpgaProcessDeviceCmd(m_handle, GUID_TO_FPGA_GUID(GUID_PCI_OPENCL_ENABLE_AER_RETRAIN_LINK), NULL, NULL, 0);
+  ACL_PCIE_ERROR_IF(result != FPGA_OK, return -1, "fpgaProcessDeviceCmd failed when enabling AER.\n");
+#endif // WINDOWS
+  return result;
+}
+
+// Program the FPGA using a given SOF file.
+// Quartus is needed for this because quartus_pgm is used to program the
+// board through the USB blaster.
+// For Linux, when the kernel driver is asked to save/load_pcie_control_regs(),
+// it will also disable/enable the AER on the upstream port, so there is no
+// need to implement that here.
+// NOTE: This function only works with single-device machines - if there
+// are multiple cards (and multiple USB-blasters) in the system, it doesn't
+// properly determine which card is which. Only the first device will be
+// programmed.
+// Return 0 on success.
+int ACL_PCIE_CONFIG::program_with_SOF_file(const char *filename, const char *ad_cable, const char *ad_device_index) {
+  const int MAX_ATTEMPTS = 3;
+  // Non-zero until system() reports a successful quartus_pgm run.
+  int program_failed = 1;
+  int status;
+  bool use_cable_autodetect = true;
+
+  // If ad_cable value is "0", either JTAG cable autodetect failed or is not
+  // supported; fall back to the default value.
+  if (strcmp(ad_cable, "0") == 0) use_cable_autodetect = false;
+
+  // Environment variables take priority over both autodetect and defaults.
+  const char *cable = getenv("ACL_PCIE_JTAG_CABLE");
+  if (!cable) {
+    if (use_cable_autodetect) {
+      cable = ad_cable;
+      ACL_PCIE_DEBUG_MSG("setting Cable to autodetect value %s\n", cable);
+    } else {
+      cable = "1";
+      ACL_PCIE_DEBUG_MSG("setting Cable to default value %s\n", cable);
+    }
+  }
+
+  const char *device_index = getenv("ACL_PCIE_JTAG_DEVICE_INDEX");
+  if (!device_index) {
+    if (use_cable_autodetect) {
+      device_index = ad_device_index;
+      ACL_PCIE_DEBUG_MSG("setting Device Index to autodetect value %s\n", device_index);
+    } else {
+      device_index = "1";
+      ACL_PCIE_DEBUG_MSG("setting Device Index to default value %s\n", device_index);
+    }
+  }
+
+  // Build the quartus_pgm command line. Output is suppressed unless the MMD
+  // debug level is at least VERBOSITY_DEFAULT.
+  char cmd[4 * 1024];
+#ifdef DLA_MMD
+#if defined(WINDOWS)
+  if ((ACL_PCIE_DEBUG | 0) >= VERBOSITY_DEFAULT) {
+    snprintf(cmd, sizeof(cmd), "quartus_pgm -c %s -m jtag -o \"P;%s@%s\"", cable, filename, device_index);
+  } else {
+    snprintf(cmd, sizeof(cmd), "quartus_pgm -c %s -m jtag -o \"P;%s@%s\" > nul 2>&1", cable, filename, device_index);
+  }
+#else
+  snprintf(cmd, sizeof(cmd), "quartus_pgm -c %s -m jtag -o \"P;%s@%s\" 2>&1 >/dev/null", cable, filename, device_index);
+#endif
+  ACL_PCIE_INFO("Executing \"%s\"\n", cmd);
+#else
+#if defined(WINDOWS)
+  snprintf(
+      cmd, sizeof(cmd), "aocl do quartus_pgm -c %s -m jtag -o \"P;%s@%s\" > nul 2>&1", cable, filename, device_index);
+#endif
+#if defined(LINUX)
+  snprintf(cmd,
+           sizeof(cmd),
+           "aocl do quartus_pgm -c %s -m jtag -o \"P;%s@%s\" 2>&1 >/dev/null",
+           cable,
+           filename,
+           device_index);
+#endif
+  ACL_PCIE_DEBUG_MSG("Executing \"%s\"\n", cmd);
+#endif
+
+  // Disable AER so the link disruption during reprogramming does not surface
+  // as a PCIe error (no-op on Linux; see function header comments above).
+  status = disable_AER_windows();
+  ACL_PCIE_ERROR_IF(status, return -1, "Failed to disable AER on Windows before programming SOF.\n");
+
+  // Set the program to ignore the ctrl-c signal
+  // This setting will be inherited by the system() function call below,
+  // so that the quartus_pgm call won't be interrupted by the ctrl-c signal.
+  install_ctrl_c_handler(1 /* ignore the signal */);
+
+  // Program FPGA by executing the command
+#ifndef DLA_MMD
+  ACL_PCIE_ASSERT(system_cmd_is_valid(cmd), "Invalid system() function parameter: %s\n", cmd);
+#endif
+  // Retry up to MAX_ATTEMPTS times; the short sleep between attempts lets the
+  // JTAG chain settle in case the failure was transient.
+  for (int attempts = 0; attempts < MAX_ATTEMPTS && program_failed; attempts++) {
+    if (attempts > 0) {
+      ACL_PCIE_INFO("Execution failed. Will try again in case the error was transient.\n");
+    }
+    program_failed = system(cmd);
+#if defined(WINDOWS)
+    Sleep(2000);
+#endif // WINDOWS
+#if defined(LINUX)
+    sleep(2);
+#endif // LINUX
+  }
+
+  // Restore the original custom ctrl-c signal handler
+  install_ctrl_c_handler(0 /* use the custom signal handler */);
+
+  // Enable AER
+  status = enable_AER_and_retrain_link_windows();
+  ACL_PCIE_ERROR_IF(status, return -1, "Failed to enable AER and retrain link on Windows after programming SOF.\n");
+
+  // 0 iff the last system(cmd) invocation succeeded.
+  return program_failed;
+}
+
+bool ACL_PCIE_CONFIG::find_cable_with_ISSP(unsigned int cade_id, char *ad_cable, char *ad_device_index) {
+  // Runs the BSP's find_jtag_cable.tcl via quartus_stp and scrapes its stdout
+  // for a "Matched Cable:...Device Name:@<index>:" line. On a match, writes
+  // the cable name into ad_cable and the device index into ad_device_index
+  // (both truncated to AD_CABLE_SIZE) and returns true.
+  FILE *fp;
+  int status;
+  char line_in[READ_SIZE];
+  bool found_cable = false;
+
+  char cmd[SYSTEM_CMD_SIZE];
+  const char *aocl_boardpkg_root = getenv("AOCL_BOARD_PACKAGE_ROOT");
+  if (!aocl_boardpkg_root) {
+    ACL_PCIE_INFO("AOCL_BOARD_PACKAGE_ROOT not set!!!");
+    return false;
+  }
+
+  // cade_id is passed to the tcl script in hex ("%X") for matching.
+  snprintf(cmd, sizeof(cmd), "aocl do quartus_stp -t %s/scripts/find_jtag_cable.tcl %X", aocl_boardpkg_root, cade_id);
+  ACL_PCIE_DEBUG_MSG("executing \"%s\"\n", cmd);
+
+  // Open PIPE to tcl script
+#ifndef DLA_MMD
+  ACL_PCIE_ASSERT(system_cmd_is_valid(cmd), "Invalid popen() function parameter: %s\n", cmd);
+#endif
+#if defined(WINDOWS)
+  fp = _popen(cmd, "r");
+#endif // WINDOWS
+#if defined(LINUX)
+  fp = popen(cmd, "r");
+#endif // LINUX
+
+  if (fp == NULL) {
+    ACL_PCIE_INFO("Couldn't open fp file\n");
+  } else {
+    // Read every line and look for the matching string from the tcl script
+    while (fgets(line_in, READ_SIZE, fp) != NULL) {
+      ACL_PCIE_DEBUG_MSG("%s", line_in);
+      const char *str_match_cable = "Matched Cable:";
+      const char *str_match_dev_name = "Device Name:@";
+      const char *str_match_end = ":";
+      // parsing the string and extracting the cable/index value
+      // from the output of find_jtag_cable.tcl script
+      char *pos_cable = strstr(line_in, str_match_cable);
+      if (pos_cable) {
+        found_cable = true;
+        // find the sub-string locations in the line
+        char *pos_dev_name = strstr(line_in, str_match_dev_name);
+        if (pos_dev_name) {
+          char *pos_end =
+              strstr(pos_dev_name + strnlen(str_match_dev_name, MAX_NAME_SIZE), str_match_end); // Find the last ":"
+          if (pos_end) {
+            // calculate the cable/index string size
+            // (pointer differences between the markers found above)
+            size_t i_cable_str_len = pos_dev_name - pos_cable - strnlen(str_match_cable, MAX_NAME_SIZE);
+            size_t i_dev_index_str_len = pos_end - pos_dev_name - strnlen(str_match_dev_name, MAX_NAME_SIZE);
+            // extract the cable/index value from the line
+            snprintf(ad_cable,
+                     AD_CABLE_SIZE,
+                     "%.*s",
+                     (int)i_cable_str_len,
+                     pos_cable + strnlen(str_match_cable, MAX_NAME_SIZE));
+            snprintf(ad_device_index,
+                     AD_CABLE_SIZE,
+                     "%.*s",
+                     (int)i_dev_index_str_len,
+                     pos_dev_name + strnlen(str_match_dev_name, MAX_NAME_SIZE));
+            ACL_PCIE_DEBUG_MSG("JTAG Autodetect device found Cable:%s, Device Index:%s\n", ad_cable, ad_device_index);
+            break;
+          }
+        }
+      }
+    }
+
+#if defined(WINDOWS)
+    status = _pclose(fp);
+#endif // WINDOWS
+#if defined(LINUX)
+    status = pclose(fp);
+#endif // LINUX
+
+    if (status == -1) {
+      /* Error reported by pclose() */
+      ACL_PCIE_INFO("Couldn't close find_cable_with_ISSP file\n");
+    } else {
+      /* Use macros described under wait() to inspect `status' in order
+       * to determine success/failure of command executed by popen()
+       * */
+    }
+  }
+
+  if (!found_cable) {
+    ACL_PCIE_INFO("Autodetect Cable not found!!\n");
+  }
+
+  return found_cable;
+}
+
+// Functions to save/load control registers from PCI Configuration Space.
+// These saved registers are used to restore the PCIe link after reprogramming
+// through methods other than PR.
+// For Windows, the register values are stored in this class, and nothing
+// else is done.
+// For Linux, the register values are stored inside the kernel driver,
+// and it will disable the interrupt and the AER on the upstream port
+// when the save_pci_control_regs() function is called. They will be
+// re-enabled when load_pci_control_regs() is called.
+// Return 0 on success
+int ACL_PCIE_CONFIG::save_pci_control_regs() {
+  // 1 (failure) until one of the platform paths below reports success.
+  int save_failed = 1;
+
+#if defined(WINDOWS)
+  fpga_result result = FPGA_OK;
+
+  // IOCTL call to save PCI control register
+  result = fpgaProcessDeviceCmd(m_handle, GUID_TO_FPGA_GUID(GUID_PCI_OPENCL_SAVE_PCI_CTRL_REG), NULL, NULL, 0);
+  ACL_PCIE_ERROR_IF(result != FPGA_OK, return -1, "fpgaProcessDeviceCmd failed when saving PCI Control registers.\n");
+
+  save_failed = (result == FPGA_OK) ? (0) : (-1);
+#endif // WINDOWS
+#if defined(LINUX)
+  // Commands are delivered to the kernel driver through read() with a
+  // struct acl_cmd payload; the driver performs the save on its side.
+  struct acl_cmd cmd_save = {ACLPCI_CMD_BAR, ACLPCI_CMD_SAVE_PCI_CONTROL_REGS, NULL, NULL};
+  save_failed = read(m_handle, &cmd_save, 0);
+#endif // LINUX
+
+  return save_failed;
+}
+
+int ACL_PCIE_CONFIG::load_pci_control_regs() {
+  // 1 (failure) until one of the platform paths below reports success.
+  int load_failed = 1;
+#if defined(WINDOWS)
+
+  fpga_result result = FPGA_OK;
+  // IOCTL call to load PCI control register
+  result = fpgaProcessDeviceCmd(m_handle, GUID_TO_FPGA_GUID(GUID_PCI_OPENCL_LOAD_PCI_CTRL_REG), NULL, NULL, 0);
+  ACL_PCIE_ERROR_IF(result != FPGA_OK, return -1, "fpgaProcessDeviceCmd failed when loading PCI Control registers.\n");
+
+  load_failed = (result == FPGA_OK) ? (0) : (-1);
+#endif // WINDOWS
+#if defined(LINUX)
+  // Mirror of save_pci_control_regs(): the kernel driver restores the
+  // registers it saved earlier (and re-enables interrupts/AER upstream).
+  struct acl_cmd cmd_load = {ACLPCI_CMD_BAR, ACLPCI_CMD_LOAD_PCI_CONTROL_REGS, NULL, NULL};
+  load_failed = read(m_handle, &cmd_load, 0);
+#endif // LINUX
+
+  return load_failed;
+}
+
+// Functions to query the PCI related information
+// Use NULL as input for the info that you don't care about
+// Return 0 on success.
+int ACL_PCIE_CONFIG::query_pcie_info(unsigned int *pcie_gen, unsigned int *pcie_num_lanes, char *pcie_slot_info_str) {
+  int status = 0;
+#if defined(WINDOWS)
+  fpga_result result = FPGA_OK;
+  // IOCTL call to obtain PCIe gen information
+  result = fpgaProcessDeviceCmd(
+      m_handle, GUID_TO_FPGA_GUID(GUID_PCI_OPENCL_GET_PCI_GEN), NULL, pcie_gen, sizeof(unsigned int));
+  ACL_PCIE_ERROR_IF(result != FPGA_OK, return -1, "fpgaProcessDeviceCmd failed when finding PCI device gen info.\n");
+
+  result = fpgaProcessDeviceCmd(
+      m_handle, GUID_TO_FPGA_GUID(GUID_PCI_OPENCL_GET_PCI_LANES), NULL, pcie_num_lanes, sizeof(unsigned int));
+  ACL_PCIE_ERROR_IF(result != FPGA_OK, return -1, "fpgaProcessDeviceCmd failed when finding PCI device lanes info.\n");
+
+  // NOTE(review): the Windows path never fills pcie_slot_info_str, so the
+  // caller's buffer stays as initialized by the caller — confirm intended.
+  status = (result == FPGA_OK) ? (0) : (-1);
+#endif // WINDOWS
+#if defined(LINUX)
+  struct acl_cmd driver_cmd;
+
+  // Each requested value is fetched with its own driver command; NULL
+  // arguments are simply skipped. Failures are OR-ed into status.
+  if (pcie_gen != NULL) {
+    driver_cmd.bar_id = ACLPCI_CMD_BAR;
+    driver_cmd.command = ACLPCI_CMD_GET_PCI_GEN;
+    driver_cmd.device_addr = NULL;
+    driver_cmd.user_addr = pcie_gen;
+    driver_cmd.size = sizeof(*pcie_gen);
+    status |= read(m_handle, &driver_cmd, sizeof(driver_cmd));
+  }
+
+  if (pcie_num_lanes != NULL) {
+    driver_cmd.bar_id = ACLPCI_CMD_BAR;
+    driver_cmd.command = ACLPCI_CMD_GET_PCI_NUM_LANES;
+    driver_cmd.device_addr = NULL;
+    driver_cmd.user_addr = pcie_num_lanes;
+    driver_cmd.size = sizeof(*pcie_num_lanes);
+    status |= read(m_handle, &driver_cmd, sizeof(driver_cmd));
+  }
+
+  if (pcie_slot_info_str != NULL) {
+    driver_cmd.bar_id = ACLPCI_CMD_BAR;
+    driver_cmd.command = ACLPCI_CMD_GET_PCI_SLOT_INFO;
+    driver_cmd.device_addr = NULL;
+    driver_cmd.user_addr = pcie_slot_info_str;
+    // NOTE(review): pcie_slot_info_str is a char* parameter, so
+    // sizeof(pcie_slot_info_str) is the size of a pointer (8 on 64-bit), NOT
+    // the caller's buffer length — this likely truncates the slot-info
+    // string. The caller allocates PCIE_SLOT_INFO_STR_LEN bytes; confirm and
+    // use that length here.
+    driver_cmd.size = sizeof(pcie_slot_info_str);
+    status |= read(m_handle, &driver_cmd, sizeof(driver_cmd));
+  }
+#endif // LINUX
+  return status;
+}
+
+void ACL_PCIE_CONFIG::wait_seconds(unsigned seconds) {
+#if defined(WINDOWS)
+  // Windows Sleep() expects milliseconds.
+  Sleep(1000 * seconds);
+#endif // WINDOWS
+
+#if defined(LINUX)
+  // POSIX sleep() expects whole seconds.
+  sleep(seconds);
+#endif // LINUX
+}
diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_config.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_config.h
new file mode 100644
index 0000000..3f07634
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_config.h
@@ -0,0 +1,109 @@
+#ifndef ACL_PCIE_CONFIG_H
+#define ACL_PCIE_CONFIG_H
+
+/* (c) 1992-2021 Intel Corporation. */
+/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */
+/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */
+/* and/or other countries. Other marks and brands may be claimed as the property */
+/* of others. See Trademarks on intel.com for full list of Intel trademarks or */
+/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */
+/* Your use of Intel Corporation's design tools, logic functions and other */
+/* software and tools, and its AMPP partner logic functions, and any output */
+/* files any of the foregoing (including device programming or simulation */
+/* files), and any associated documentation or information are expressly subject */
+/* to the terms and conditions of the Altera Program License Subscription */
+/* Agreement, Intel MegaCore Function License Agreement, or other applicable */
+/* license agreement, including, without limitation, that your use is for the */
+/* sole purpose of programming logic devices manufactured by Intel and sold by */
+/* Intel or its authorized distributors. Please refer to the applicable */
+/* agreement for further details. */
+
+/* ===- acl_pcie_config.h -------------------------------------------- C++ -*-=== */
+/* */
+/* Intel(R) OpenCL MMD Driver */
+/* */
+/* ===-------------------------------------------------------------------------=== */
+/* */
+/* This file declares the class to handle functions that program the FPGA. */
+/* The actual implementation of the class lives in the acl_pcie_config.cpp, */
+/* so look there for full documentation. */
+/* */
+/* ===-------------------------------------------------------------------------=== */
+
+#ifdef DLA_MMD
+#include <cstddef> //size_t
+#endif
+
+// Forward declaration for classes used by ACL_PCIE_DEVICE
+class ACL_PCIE_DMA;
+class ACL_PCIE_DEVICE;
+class ACL_PCIE_MM_IO_MGR;
+
+// PCIe Advanced Error Reporting (AER) extended capability: capability ID and
+// register offsets within the capability structure.
+#define PCIE_AER_CAPABILITY_ID ((DWORD)0x0001)
+#define PCIE_AER_UNCORRECTABLE_STATUS_OFFSET ((DWORD)0x4)
+#define PCIE_AER_UNCORRECTABLE_MASK_OFFSET ((DWORD)0x8)
+#define PCIE_AER_CORRECTABLE_STATUS_OFFSET ((DWORD)0x10)
+#define PCIE_AER_SURPRISE_DOWN_BIT ((DWORD)(1 << 5))
+
+// The size of the char array that holds the name of autodetect JTAG cable and device index
+#define AD_CABLE_SIZE 10
+
+#if defined(LINUX)
+// On Linux the "handle" is simply the kernel driver's file descriptor.
+typedef int fpga_handle;
+#else
+#include <opae/fpga.h>
+#endif // LINUX
+
+class ACL_PCIE_CONFIG {
+ public:
+  ACL_PCIE_CONFIG(fpga_handle handle, ACL_PCIE_MM_IO_MGR *io, ACL_PCIE_DEVICE *pcie, ACL_PCIE_DMA *dma);
+  ~ACL_PCIE_CONFIG();
+
+  // Change the core only via PCIe, using an in-memory image of the core.rbf
+  // This is supported only for Stratix V and newer devices.
+  // Return 0 on success.
+  int program_core_with_PR_file_a10(char *core_bitstream, size_t core_rbf_len);
+  int program_core_with_PR_file_s10(char *core_bitstream, size_t core_rbf_len, char *pll_config_str);
+
+  // Program the FPGA using a given SOF file
+  // Input filename, autodetect cable, autodetect device index
+  // Return 0 on success.
+  int program_with_SOF_file(const char *filename, const char *ad_cable, const char *ad_device_index);
+
+  // Look up CADEID using ISSP
+  // Return TRUE with cable value in ad_cable, ad_device_index if cable found
+  // Otherwise return FALSE
+  bool find_cable_with_ISSP(unsigned int cade_id, char *ad_cable, char *ad_device_index);
+
+  // Functions to save/load control registers from PCI Configuration Space
+  // Return 0 on success.
+  int save_pci_control_regs();
+  int load_pci_control_regs();
+
+  // Functions to query the PCI related information
+  // Use NULL as input for the info that you don't care about
+  // Return 0 on success.
+  int query_pcie_info(unsigned int *pcie_gen, unsigned int *pcie_num_lanes, char *pcie_slot_info_str);
+
+  // Windows-specific code to control AER, and retrain the link
+  // (no-ops on Linux; the kernel driver handles AER there)
+  int enable_AER_and_retrain_link_windows(void);
+  int disable_AER_windows(void);
+
+  // Platform agnostic sleep (in seconds)
+  void wait_seconds(unsigned seconds);
+
+ private:
+  // Copying is intentionally disabled (pre-C++11 idiom: private, unusable
+  // copy operations). The bodies are never meant to be called.
+  ACL_PCIE_CONFIG &operator=(const ACL_PCIE_CONFIG &) { return *this; }
+
+  ACL_PCIE_CONFIG(const ACL_PCIE_CONFIG &src) {}
+
+  // Device handle: fpgaOpen() handle on Windows, driver fd on Linux.
+  fpga_handle m_handle;
+  // Non-owning pointers to the sibling objects owned by ACL_PCIE_DEVICE.
+  ACL_PCIE_MM_IO_MGR *m_io;
+  ACL_PCIE_DEVICE *m_pcie;
+  ACL_PCIE_DMA *m_dma;
+#if defined(WINDOWS)
+  fpga_guid *FpgaCmd;
+#endif // WINDOWS
+};
+
+#endif // ACL_PCIE_CONFIG_H
diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_debug.cpp b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_debug.cpp
new file mode 100644
index 0000000..8afc1c7
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_debug.cpp
@@ -0,0 +1,61 @@
+// (c) 1992-2021 Intel Corporation.
+// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words
+// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S.
+// and/or other countries. Other marks and brands may be claimed as the property
+// of others. See Trademarks on intel.com for full list of Intel trademarks or
+// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera)
+// Your use of Intel Corporation's design tools, logic functions and other
+// software and tools, and its AMPP partner logic functions, and any output
+// files any of the foregoing (including device programming or simulation
+// files), and any associated documentation or information are expressly subject
+// to the terms and conditions of the Altera Program License Subscription
+// Agreement, Intel MegaCore Function License Agreement, or other applicable
+// license agreement, including, without limitation, that your use is for the
+// sole purpose of programming logic devices manufactured by Intel and sold by
+// Intel or its authorized distributors. Please refer to the applicable
+// agreement for further details.
+
+/* ===- acl_pcie_debug.cpp ------------------------------------------- C++ -*-=== */
+/* */
+/* Intel(R) OpenCL MMD Driver */
+/* */
+/* ===-------------------------------------------------------------------------=== */
+/* */
+/* */
+/* ===-------------------------------------------------------------------------=== */
+
+#include "acl_pcie_debug.h"
+#include <stdio.h>
+#include <stdlib.h>
+
+int ACL_PCIE_DEBUG = 0;
+int ACL_PCIE_WARNING = 1; // turn on the warning message by default
+
+int ACL_PCIE_DEBUG_FLASH_DUMP_BOOT_SECTOR = 0;
+
+void set_mmd_debug() {
+  // Parse ACL_PCIE_DEBUG from the environment; accept only a fully-numeric
+  // value in [0, VERBOSITY_EVERYTHING). Anything else leaves the level at
+  // its compiled-in default.
+  char* mmd_debug_var = getenv("ACL_PCIE_DEBUG");
+  if (mmd_debug_var) {
+    char* endptr = NULL;
+    long parsed_count;
+    parsed_count = strtol(mmd_debug_var, &endptr, 10);
+    if (endptr == mmd_debug_var // no valid characters
+        || *endptr // an invalid character
+        || (parsed_count < 0 || parsed_count >= (long)VERBOSITY_EVERYTHING)) {
+      // malformed string, do nothing
+    } else {
+      ACL_PCIE_DEBUG = (int)parsed_count;
+      printf("\n:: MMD DEBUG LEVEL set to %d\n", ACL_PCIE_DEBUG);
+    }
+  }
+
+  // Separate opt-in flag for dumping the flash boot sector during debug.
+  char* hal_debug_dump_flash_bootsect = getenv("ACL_PCIE_DEBUG_FLASH_DUMP_BOOT_SECTOR");
+  if (hal_debug_dump_flash_bootsect) ACL_PCIE_DEBUG_FLASH_DUMP_BOOT_SECTOR = atoi(hal_debug_dump_flash_bootsect);
+}
+
+void set_mmd_warn_msg() {
+  // ACL_PCIE_WARNING defaults to 1 (on); setting the environment variable
+  // ACL_PCIE_WARNING=0 disables warning messages.
+  char* mmd_warn_var = getenv("ACL_PCIE_WARNING");
+  if (mmd_warn_var) {
+    ACL_PCIE_WARNING = atoi(mmd_warn_var);
+  }
+}
diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_debug.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_debug.h
new file mode 100644
index 0000000..072eabc
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_debug.h
@@ -0,0 +1,64 @@
+#ifndef ACL_PCIE_DEBUG_H
+#define ACL_PCIE_DEBUG_H
+
+/* (c) 1992-2021 Intel Corporation. */
+/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */
+/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */
+/* and/or other countries. Other marks and brands may be claimed as the property */
+/* of others. See Trademarks on intel.com for full list of Intel trademarks or */
+/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */
+/* Your use of Intel Corporation's design tools, logic functions and other */
+/* software and tools, and its AMPP partner logic functions, and any output */
+/* files any of the foregoing (including device programming or simulation */
+/* files), and any associated documentation or information are expressly subject */
+/* to the terms and conditions of the Altera Program License Subscription */
+/* Agreement, Intel MegaCore Function License Agreement, or other applicable */
+/* license agreement, including, without limitation, that your use is for the */
+/* sole purpose of programming logic devices manufactured by Intel and sold by */
+/* Intel or its authorized distributors. Please refer to the applicable */
+/* agreement for further details. */
+
+/* ===- acl_pcie_debug.h --------------------------------------------- C++ -*-=== */
+/* */
+/* Intel(R) OpenCL MMD Driver */
+/* */
+/* ===-------------------------------------------------------------------------=== */
+/* */
+/* */
+/* ===-------------------------------------------------------------------------=== */
+
+// Debug verbosity levels compared against the runtime ACL_PCIE_DEBUG level;
+// a message is printed when ACL_PCIE_DEBUG >= its verbosity.
+enum ACL_VERBOSITY {
+  VERBOSITY_DEFAULT = 1,
+  VERBOSITY_INVOCATION = 2, // Dump kernel invocation details
+  VERBOSITY_OP = 3,         // Dump operation invocation details
+  VERBOSITY_IRQ = 5,
+  VERBOSITY_BLOCKTX = 9,    // Dump PCIe block transfers
+  VERBOSITY_PCIE = 10,      // Dump all PCIe transactions
+  VERBOSITY_EVERYTHING = 100
+};
+
+// Globals defined in acl_pcie_debug.cpp and initialized from the
+// environment by set_mmd_debug()/set_mmd_warn_msg().
+extern int ACL_PCIE_DEBUG;
+extern int ACL_PCIE_WARNING;
+extern int ACL_PCIE_DEBUG_FLASH_DUMP_BOOT_SECTOR;
+
+// This function gets the value of ACL_PCIE_DEBUG from the environment variable
+void set_mmd_debug();
+void set_mmd_warn_msg();
+
+#include <stdio.h>
+
+// Print a debug message at the default verbosity level.
+#define ACL_PCIE_DEBUG_MSG(m, ...) ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_DEFAULT, m, ##__VA_ARGS__)
+
+// Print (and flush) a debug message when the global debug level is at least
+// `verbosity`. Wrapped in a single do { ... } while (0) statement so the
+// macro is safe in every statement context; the previous
+// "if (...) do {...} while (0)" form had a dangling-else hazard: an `else`
+// written after the macro call would silently bind to the macro's hidden
+// `if` instead of the caller's own `if`.
+#define ACL_PCIE_DEBUG_MSG_VERBOSE(verbosity, m, ...) \
+  do {                                                \
+    if (ACL_PCIE_DEBUG >= (verbosity)) {              \
+      printf((m), ##__VA_ARGS__);                     \
+      fflush(stdout);                                 \
+    }                                                 \
+  } while (0)
+
+// Print (and flush) a warning unless warnings were disabled via the
+// ACL_PCIE_WARNING environment variable.
+#define ACL_PCIE_WARN_MSG(...) \
+  do { \
+    if (ACL_PCIE_WARNING) { \
+      printf("** WARNING: " __VA_ARGS__); \
+      fflush(stdout); \
+    } \
+  } while (0)
+
+#endif // ACL_PCIE_DEBUG_H
diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_device.cpp b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_device.cpp
new file mode 100644
index 0000000..8489c32
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_device.cpp
@@ -0,0 +1,2029 @@
+// (c) 1992-2021 Intel Corporation.
+// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words
+// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S.
+// and/or other countries. Other marks and brands may be claimed as the property
+// of others. See Trademarks on intel.com for full list of Intel trademarks or
+// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera)
+// Your use of Intel Corporation's design tools, logic functions and other
+// software and tools, and its AMPP partner logic functions, and any output
+// files any of the foregoing (including device programming or simulation
+// files), and any associated documentation or information are expressly subject
+// to the terms and conditions of the Altera Program License Subscription
+// Agreement, Intel MegaCore Function License Agreement, or other applicable
+// license agreement, including, without limitation, that your use is for the
+// sole purpose of programming logic devices manufactured by Intel and sold by
+// Intel or its authorized distributors. Please refer to the applicable
+// agreement for further details.
+
+/* ===- acl_pcie_device.cpp ------------------------------------------ C++ -*-=== */
+/* */
+/* Intel(R) OpenCL MMD Driver */
+/* */
+/* ===-------------------------------------------------------------------------=== */
+/* */
+/* This file implements the class to handle operations on a single device. */
+/* The declaration of the class lives in the acl_pcie_device.h */
+/* */
+/* ===-------------------------------------------------------------------------=== */
+
+#if defined(WINDOWS)
+#define NOMINMAX
+#include <time.h>
+#endif // WINDOWS
+
+// common and its own header files
+#include "acl_pcie.h"
+#include "acl_pcie_device.h"
+
+// other header files inside MMD driver
+#include "acl_pcie_config.h"
+#include "acl_pcie_debug.h"
+#include "acl_pcie_dma.h"
+#include "acl_pcie_mm_io.h"
+#if !defined(DLA_MMD) || defined(WINDOWS)
+#include "pkg_editor.h"
+#endif
+
+// other standard header files
+#include <stdlib.h>
+#include <string.h>
+#include <fstream>
+#include <limits>
+#include <random>
+#include <sstream>
+#include <stdexcept>
+#include "acl_pcie_hostch.h"
+
+#if defined(LINUX)
+#include <fcntl.h>
+#include <signal.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#endif // LINUX
+
+#define MAX_LEN 1024
+
+#define FREEZE_CTRL_OFFSET 4
+#ifndef PAGE_SIZE
+#define PAGE_SIZE 4096
+#endif
+#define ACL_VERSIONID_MIN 0xA0C7C1E0
+
+static int num_open_devices = 0;
+
+#if defined(WINDOWS)
+fpga_handle open_device_windows(ACL_PCIE_DEVICE_DESCRIPTION *info, int dev_num);
+
+// Interrupt service routine for all interrupts on the PCIe interrupt line
+// PCIe interrupts in Windows XP are level-based. The KMD is responsible for
+// masking off the interrupt until this routine can service the request at
+// user-mode priority.
+extern void pcie_interrupt_handler(void *data);
+#endif // WINDOWS
+#if defined(LINUX)
+fpga_handle open_device_linux(ACL_PCIE_DEVICE_DESCRIPTION *info, int dev_num);
+#endif // LINUX
+
+ACL_PCIE_DEVICE::ACL_PCIE_DEVICE(int dev_num, const char *name, int handle, int user_signal_number)
+    : kernel_interrupt(NULL),
+      kernel_interrupt_user_data(NULL),
+      device_interrupt(NULL),
+      device_interrupt_user_data(NULL),
+      event_update(NULL),
+      event_update_user_data(NULL),
+      m_user_signal_number(0),
+      m_io(NULL),
+      m_dma(NULL),
+      m_hostch(NULL),
+      m_config(NULL),
+      m_handle(-1),
+      m_device(INVALID_HANDLE_VALUE),
+#if ACL_USE_DMA == 1
+      m_use_dma_for_big_transfers(true),
+#else
+      m_use_dma_for_big_transfers(false),
+#endif
+      m_mmd_irq_handler_enable(false),
+      m_initialized(false),
+      m_being_programmed(false),
+      m_skip_quartus_version_check(false),
+      m_segment(0) {
+  // Construction leaves m_initialized false on every early-return path below;
+  // callers must check is_valid()/initialized state after construction.
+  if (NULL == name) {
+    // Throw an error and bail out
+    throw std::runtime_error("Invalid argument, passed in an empty name pointer when creating device object!");
+  }
+
+  int status = 0;
+
+  // Set debug level from the environment variable ACL_PCIE_DEBUG
+  // Determine if warning messages should be disabled depends on ACL_PCIE_WARNING
+  // (done once, for the first device opened in this process)
+  if (num_open_devices == 0) {
+    set_mmd_debug();
+    set_mmd_warn_msg();
+  }
+
+  // Copy the board name, always NUL-terminating at MAX_NAME_LENGTH - 1.
+#if defined(WINDOWS)
+  strncpy_s(m_name, MAX_NAME_LENGTH, name, (MAX_NAME_LENGTH - 1));
+#else
+  strncpy(m_name, name, (MAX_NAME_LENGTH - 1));
+#endif
+  m_name[(MAX_NAME_LENGTH - 1)] = '\0';
+
+  m_handle = handle;
+  m_info.vendor_id = ACL_PCI_INTELFPGA_VENDOR_ID;
+  m_info.device_id = 0; // search for all device id
+  m_info.interrupt_valid = false;
+  m_info.interrupt_data = 0x00;
+  m_info.interrupt_addr = 0x00;
+
+  // Open the underlying OS device; fills in m_info on success.
+#if defined(WINDOWS)
+  m_device = open_device_windows(&m_info, dev_num);
+#endif // WINDOWS
+#if defined(LINUX)
+  m_device = open_device_linux(&m_info, dev_num);
+#endif // LINUX
+
+  // Return to caller if this is simply an invalid device.
+  if (m_device == INVALID_HANDLE_VALUE) {
+    return;
+  }
+
+  // Initialize device IO and CONFIG objects
+  m_io = new ACL_PCIE_MM_IO_MGR(m_device);
+
+  // Initialize the DMA object and enable interrupts on the DMA controller
+  try {
+    m_dma = new ACL_PCIE_DMA(m_device, m_io, this);
+  }
+
+  // Catch any memory allocation failures
+  // NOTE(review): re-throwing a fresh std::bad_alloc() discards nothing here,
+  // but a bare `throw;` would be the conventional form — confirm intent.
+  catch (std::bad_alloc &) {
+    throw std::bad_alloc();
+  }
+
+  try {
+    m_config = new ACL_PCIE_CONFIG(m_device, m_io, this, m_dma);
+  }
+
+  catch (std::bad_alloc &) {
+    throw std::bad_alloc();
+  }
+
+  // Set the segment ID to 0 first forcing cached "segment" to all 1s
+  // (so the subsequent set_segment(0) is never skipped as a no-op)
+  m_segment = ~m_segment;
+  if (this->set_segment(0x0)) {
+    return;
+  }
+
+  // performance basic I/O tests
+  if (this->version_id_test()) {
+    return;
+  }
+  if (this->wait_for_uniphy()) {
+    return;
+  }
+
+  // Get PCIE information
+  unsigned int pcie_gen, pcie_num_lanes;
+  char pcie_slot_info_str[PCIE_SLOT_INFO_STR_LEN] = {0};
+
+  status = m_config->query_pcie_info(&pcie_gen, &pcie_num_lanes, pcie_slot_info_str);
+  ACL_PCIE_ERROR_IF(status, return, "[%s] fail to query PCIe related information.\n", m_name);
+  snprintf(m_info.pcie_info_str,
+           PCIE_INFO_STR_LEN,
+           "dev_id = " DWORD_FMT_4X ", bus:slot.func = %s, Gen%u x%u",
+           m_info.device_id,
+           pcie_slot_info_str,
+           pcie_gen,
+           pcie_num_lanes);
+
+  m_user_signal_number = user_signal_number;
+
+  // Initialize the Host Channel object
+  m_hostch = new ACL_PCIE_HOSTCH(m_device, m_io, this, m_dma);
+
+  if (this->enable_interrupts(m_user_signal_number)) {
+    return;
+  }
+
+  // Presence of the env var (any value) skips the Quartus version check.
+  char *str_test_quartus_ver = getenv("ACL_SKIP_QUARTUS_VERSION_CHECK");
+  if (str_test_quartus_ver) m_skip_quartus_version_check = 1;
+
+#if defined(WINDOWS)
+  enable_msi(true);
+#endif
+
+#ifdef DLA_MMD
+  // software reset
+  uint32_t software_reset_data = 0; // value doesn't matter, any write to software reset will cause it to trigger
+  constexpr int software_reset_offset = 0x8000;
+  status = m_io->kernel_if->write_block(software_reset_offset, sizeof(uint32_t), &software_reset_data);
+  ACL_PCIE_ERROR_IF(status, return, "[%s] failed to write block.\n", m_name);
+  // software reset applies backpressure to the avalon interface while the reset counter is running
+  // issue a read request, which will not return until the reset counter is done
+  status = m_io->kernel_if->read_block(software_reset_offset, sizeof(uint32_t), &software_reset_data);
+  ACL_PCIE_ERROR_IF(status, return, "[%s] failed to read block.\n", m_name);
+#endif
+  // Done!
+  m_initialized = true;
+  ACL_PCIE_DEBUG_MSG(":: [%s] successfully initialized (device id: " DWORD_FMT_X ").\n", m_name, m_info.device_id);
+  ACL_PCIE_DEBUG_MSG(":: Using DMA for big transfers? %s\n", (m_use_dma_for_big_transfers ? "yes" : "no"));
+}
+
+ACL_PCIE_DEVICE::~ACL_PCIE_DEVICE() {
+  // Tear down in reverse order of construction: MSI, interrupts, then the
+  // helper objects, then the OS device handle.
+#if defined(WINDOWS)
+  enable_msi(false);
+#endif
+
+  int status = this->disable_interrupts();
+  ACL_PCIE_ERROR_IF(status, /* do nothing */, "[%s] fail disable interrupt in device destructor.\n", m_name);
+
+  // Each member may be NULL if construction bailed out early.
+  if (m_hostch) {
+    delete m_hostch;
+    m_hostch = NULL;
+  }
+  if (m_config) {
+    delete m_config;
+    m_config = NULL;
+  }
+  if (m_dma) {
+    delete m_dma;
+    m_dma = NULL;
+  }
+  if (m_io) {
+    delete m_io;
+    m_io = NULL;
+  }
+
+  // Only fully-opened devices were counted; close the OS handle for them.
+  if (is_valid()) {
+    --num_open_devices;
+#if defined(WINDOWS)
+    fpga_result result = fpgaClose(m_device);
+    ACL_PCIE_ERROR_IF(result != FPGA_OK, return, "[%s] failed to close the device handle.\n", m_name);
+
+#endif // WINDOWS
+#if defined(LINUX)
+    close(m_device);
+#endif // LINUX
+  }
+}
+
+#if defined(WINDOWS)
+// Enable/Disable MSI
+void ACL_PCIE_DEVICE::enable_msi(bool enable) {
+  // NOTE(review): the write32 return codes collected in `status` are never
+  // inspected — confirm whether failures here should be reported.
+  int status;
+
+  // Nothing to do if open_device_windows() did not obtain valid MSI info.
+  if (!m_info.interrupt_valid) {
+    return;
+  }
+
+  if (!enable) {
+    // disable MSI DATA
+    m_io->pcie_cra->write32(PCIE_CRA_MSI_DATA, 0x00);
+  } else {
+    // Program the 64-bit MSI address (low/high halves) before enabling.
+    status = m_io->pcie_cra->write32(PCIE_CRA_MSI_ADDR_L, m_info.interrupt_addr & 0xffffffff);
+    status = m_io->pcie_cra->write32(PCIE_CRA_MSI_ADDR_H, (m_info.interrupt_addr >> 0x20) & 0xffffffff);
+    MemoryBarrier();
+    // enable MSI DATA
+    status = m_io->pcie_cra->write32(PCIE_CRA_MSI_DATA, PCIE_CRA_MSI_ENABLE | m_info.interrupt_data );
+  }
+  MemoryBarrier();
+}
+
+fpga_handle open_device_windows(ACL_PCIE_DEVICE_DESCRIPTION *info, int dev_num) {
+ fpga_result result;
+ fpga_handle device = INVALID_HANDLE_VALUE;
+ DWORD pci_class_code_rev = 0;
+ DWORD pci_subsystem_ids = 0;
+ DWORD pci_link_info = 0;
+
+ // Variables for fpga enumerate
+ fpga_properties filter = NULL;
+ UINT32 numMatches;
+ fpga_token afcToken;
+ volatile PUINT64 mmioPtr = NULL;
+
+ // Variables for fpga properties
+ fpga_properties prop = nullptr;
+ UINT8 bus;
+ UINT8 l_device;
+ UINT8 function;
+
+ const UINT8 CAP_PTR_ADDRESS = 0x34;
+ const UINT8 MSI_CAP_ID = 0x05;
+ UINT8 nextCapPtr;
+ UINT8 msiCapPtr;
+ UINT8 capID;
+ bool hasFound = false;
+ UINT8 capArray[2];
+ UINT16 msi_control;
+ UINT16 data16 = 0x00;
+ UINT32 data32 = 0x00;
+ UINT64 data64 = 0x00;
+
+ // Initialize filter structure
+ result = fpgaGetProperties(NULL, &filter);
+ if (result != FPGA_OK) {
+ device = INVALID_HANDLE_VALUE;
+ ACL_PCIE_ERROR_IF(1, goto End, "failed to get properties.\n");
+ }
+
+ // Set object type in filter structure
+ result = fpgaPropertiesSetObjectType(filter, FPGA_DEVICE);
+ if (result != FPGA_OK) {
+ device = INVALID_HANDLE_VALUE;
+ ACL_PCIE_ERROR_IF(1, goto DestroyProp, "failed to set object type.\n");
+ }
+
+ // Set vendor ID in the filter structure
+ result = fpgaPropertiesSetVendorID(filter, (uint16_t)info->vendor_id);
+ if (result != FPGA_OK) {
+ device = INVALID_HANDLE_VALUE;
+ ACL_PCIE_ERROR_IF(1, goto DestroyProp, "failed to set vendor ID.\n");
+ }
+
+ // Enumerate all PCI devices and find devices matching the filters
+ result = fpgaEnumerate(&filter, 1, &afcToken, 1, &numMatches);
+ if (result != FPGA_OK) {
+ device = INVALID_HANDLE_VALUE;
+ ACL_PCIE_ERROR_IF(1, goto DestroyProp, "failed to scan for the PCI device.\n");
+ }
+
+ if (numMatches < 1) {
+ ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME "%d] Device not found\n", dev_num);
+ device = INVALID_HANDLE_VALUE;
+ goto DestroyTok;
+ }
+
+ // Open the device handle
+ result = fpgaOpen(afcToken, &device, 0);
+ if (result != FPGA_OK) {
+ device = INVALID_HANDLE_VALUE;
+ ACL_PCIE_ERROR_IF(1, goto DestroyTok, "[acl" ACL_BOARD_PKG_NAME "%d] failed to open the device.\n", dev_num);
+ }
+
+ // Map MMIO number 0
+ result = fpgaMapMMIO(device, 0, (PUINT64 *)&mmioPtr);
+ if (result != FPGA_OK) {
+ ACL_PCIE_ERROR_IF(1, goto Close, "[acl" ACL_BOARD_PKG_NAME "%d] failed to map MMIO.\n", dev_num);
+ }
+
+ // Read SubSystem IDs out of PCI config space
+ result = fpgaReadPciConfigSpace(device, 0x2C, (PVOID)&pci_subsystem_ids, sizeof(pci_subsystem_ids));
+ ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME "%d] PCI SubSystem IDs found: 0x%lx\n", dev_num, pci_subsystem_ids);
+ if ((ACL_PCIE_READ_BIT_RANGE(pci_subsystem_ids, 31, 16) != ACL_PCI_SUBSYSTEM_DEVICE_ID) ||
+ (ACL_PCIE_READ_BIT_RANGE(pci_subsystem_ids, 15, 0) != ACL_PCI_SUBSYSTEM_VENDOR_ID)) {
+ ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME
+ "%d] PCI SubSystem IDs do not match, found %08lx but expected %04x%04x\n",
+ dev_num,
+ pci_subsystem_ids,
+ ACL_PCI_SUBSYSTEM_DEVICE_ID,
+ ACL_PCI_SUBSYSTEM_VENDOR_ID);
+ goto Close;
+ }
+ // Save device id
+ info->device_id = ACL_PCI_SUBSYSTEM_DEVICE_ID;
+
+ // Read Class code out of PCI config space
+ result = fpgaReadPciConfigSpace(device, 8, (PVOID)&pci_class_code_rev, sizeof(pci_class_code_rev));
+ ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME "%d] PCI Class Code and Rev is: %lx\n", dev_num, pci_class_code_rev);
+ if (((pci_class_code_rev & (0xff00ff00)) >> 8) != ACL_PCI_CLASSCODE) {
+ ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME "%d] PCI Class Code does not match, expected %x, read %ld\n",
+ dev_num,
+ ACL_PCI_CLASSCODE,
+ (pci_class_code_rev & 0xff00ff00) >> 8);
+ goto Close;
+ }
+
+ // Check PCI Revision
+ if ((pci_class_code_rev & 0x0ff) != ACL_PCI_REVISION) {
+ ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME "%d] PCI Revision does not match\n", dev_num);
+ goto Close;
+ }
+
+ // Read MSI data and address
+ info->interrupt_valid = false;
+ result = fpgaReadPciConfigSpace(device, CAP_PTR_ADDRESS, (PVOID)&nextCapPtr, sizeof(nextCapPtr));
+ while (!hasFound && nextCapPtr > CAP_PTR_ADDRESS && FPGA_OK == result) {
+ result = fpgaReadPciConfigSpace(device, nextCapPtr, (PVOID)&capArray, sizeof(capArray));
+ if (FPGA_OK == result) {
+ capID = capArray[0];
+ if (capID == MSI_CAP_ID) {
+ hasFound = true;
+ info->interrupt_valid = true;
+ info->interrupt_addr = 0x00;
+ info->interrupt_data = 0x00;
+ msiCapPtr = nextCapPtr;
+ result = fpgaReadPciConfigSpace(device, msiCapPtr + 0x02, (PVOID)&msi_control, sizeof(msi_control));
+ if (FPGA_OK == result) {
+ ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME "%d] %d-bit address, %d-bit data\n",
+ dev_num,
+ (msi_control & 0x0080) ? 64 : 32,
+ (msi_control & 0x0200) ? 32 : 16);
+ if (msi_control & 0x0080) { // 64-bit address
+ result = fpgaReadPciConfigSpace(device, msiCapPtr + 0x04, (PVOID)&data64, sizeof(data64));
+ if (FPGA_OK == result) {
+ info->interrupt_addr = data64;
+ if (msi_control & 0x0200) { // Extended message enable
+ result = fpgaReadPciConfigSpace(device, msiCapPtr + 0x0C, (PVOID)&data32, sizeof(data32));
+ if (FPGA_OK == result) {
+ info->interrupt_data = data32;
+ }
+ } else {
+ result = fpgaReadPciConfigSpace(device, msiCapPtr + 0x0C, (PVOID)&data16, sizeof(data16));
+ if (FPGA_OK == result) {
+ info->interrupt_data = data16;
+ }
+ }
+ }
+ } else { // 32-bit address
+ result = fpgaReadPciConfigSpace(device, msiCapPtr + 0x04, (PVOID)&data32, sizeof(data32));
+ if (FPGA_OK == result) {
+ info->interrupt_addr = data32;
+ if (msi_control & 0x0200) { // Extended message enable
+ result = fpgaReadPciConfigSpace(device, msiCapPtr + 0x08, (PVOID)&data32, sizeof(data32));
+ if (FPGA_OK == result) {
+ info->interrupt_data = data32;
+ }
+ } else {
+ result = fpgaReadPciConfigSpace(device, msiCapPtr + 0x08, (PVOID)&data16, sizeof(data16));
+ if (FPGA_OK == result) {
+ info->interrupt_data = data16;
+ }
+ }
+ }
+ }
+ }
+ ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME
+ "%d] MSI Control = 0x%04x, MSI Address = 0x%llx, MSI Data = 0x%x\n",
+ dev_num,
+ msi_control,
+ info->interrupt_addr,
+ info->interrupt_data);
+ } else {
+ nextCapPtr = capArray[1];
+ }
+ }
+ }
+
+ if (result != FPGA_OK || !info->interrupt_valid)
+ {
+ ACL_PCIE_ERROR_IF(1, goto Close, "[acl" ACL_BOARD_PKG_NAME "%d] failed to read MSI interrupt address/data.\n", dev_num);
+ }
+
+ result = fpgaGetProperties(afcToken, &prop);
+ if (prop) {
+ result = fpgaPropertiesGetBus(prop, &bus);
+ if (result != FPGA_OK) {
+ ACL_PCIE_ERROR_IF(1, goto Close, "failed to get bus.\n");
+ }
+ ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME "%d] bus is: %d\n", dev_num, bus);
+ result = fpgaPropertiesGetDevice(prop, &l_device);
+ if (result != FPGA_OK) {
+ ACL_PCIE_ERROR_IF(1, goto Close, "failed to get device.\n");
+ }
+ ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME "%d] device is: %d\n", dev_num, l_device);
+ result = fpgaPropertiesGetFunction(prop, &function);
+ if (result != FPGA_OK) {
+ ACL_PCIE_ERROR_IF(1, goto Close, "failed to get function.\n");
+ }
+ ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME "%d] function is: %d\n", dev_num, function);
+ snprintf(info->pcie_slot_info_str,
+ PCIE_SLOT_INFO_STR_LEN,
+ "%u:%u.%u",
+ bus, l_device, function);
+ fpgaDestroyProperties(&prop);
+ }
+ // Read Link status out of PCI config space
+ result = fpgaReadPciConfigSpace(device, 0x80, (PVOID)&pci_link_info, sizeof(pci_link_info));
+ ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME "%d] PCI Link Status is: 0x%lx\n", dev_num, pci_link_info);
+ ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME "%d] PCI Link Speed is: %d\n",
+ dev_num,
+ ((pci_link_info >> 16) & 0x0F));
+ ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME "%d] PCI Negotiated Link Width is: %d\n",
+ dev_num,
+ ((pci_link_info >> 20) & 0x3F));
+
+ // Read Maximum Payload Size out of PCI config space
+ result = fpgaReadPciConfigSpace(device, 0x78, (PVOID)&pci_link_info, sizeof(pci_link_info));
+ ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME "%d] PCI Maximum Payload Size raw data is: 0x%lx\n", dev_num, pci_link_info);
+ ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME "%d] PCI Maximum Payload Size is: %d\n", dev_num, ((pci_link_info >> 5) & 0x0007));
+ switch ((pci_link_info >> 5) & 0x0007) {
+ case 0:
+ ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME "%d] PCI Maximum Payload Size is 128-byte\n", dev_num);
+ break;
+ case 1:
+ ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME "%d] PCI Maximum Payload Size is 256-byte\n", dev_num);
+ break;
+ case 2:
+ ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME "%d] PCI Maximum Payload Size is 512-byte\n", dev_num);
+ break;
+ case 3:
+ ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME "%d] PCI Maximum Payload Size is 1024-byte\n", dev_num);
+ break;
+ case 4:
+ ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME "%d] PCI Maximum Payload Size is 2048-byte\n", dev_num);
+ break;
+ default:
+ ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME "%d] PCI Maximum Payload Size is Unknown\n", dev_num);
+ break;
+ }
+
+ ++num_open_devices;
+ goto DestroyTok;
+
+ // Resource cleanup
+
+Close:
+ fpgaClose(device);
+ device = INVALID_HANDLE_VALUE;
+
+DestroyTok:
+
+ if (afcToken != NULL) fpgaDestroyToken(&afcToken);
+
+DestroyProp:
+
+ if (filter != NULL) fpgaDestroyProperties(&filter);
+
+End:
+ return device;
+}
+#endif // WINDOWS
+
+#if defined(LINUX)
// Open the /dev/acl<PKG><N> character device on Linux, verify the kernel
// driver version matches what this host library expects, read back the PCI
// device id, and mark the fd close-on-exec.
// Returns the open fd on success, INVALID_HANDLE_VALUE on failure.
fpga_handle open_device_linux(ACL_PCIE_DEVICE_DESCRIPTION *info, int dev_num) {
  char buf[128] = {0};
  char expected_ver_string[128] = {0};
  int descriptor;
  int oldflags;
  int bytes_read;
  struct acl_cmd driver_cmd;

  snprintf(buf, sizeof(buf), "/dev/acl" ACL_BOARD_PKG_NAME "%d", dev_num);
  ssize_t device = open(buf, O_RDWR);

  // Return INVALID_DEVICE when the device is not available
  if (device == -1) {
    goto Close;
  }

  // Make sure the Linux kernel driver is recent
  // NOTE(review): passing size 0 and receiving the version string back in
  // `buf` (via driver_cmd.user_addr) is part of the custom driver's read()
  // protocol -- confirm against the kernel driver sources.
  driver_cmd = {ACLPCI_CMD_BAR, ACLPCI_CMD_GET_DRIVER_VERSION, NULL, buf, 0};
  bytes_read = read(device, &driver_cmd, 0);
  ACL_PCIE_ERROR_IF(bytes_read == -1, goto Close, "Failed to read driver command");

  snprintf(
      expected_ver_string, sizeof(expected_ver_string), "%s.%s", ACL_BOARD_PKG_NAME, KERNEL_DRIVER_VERSION_EXPECTED);
  // The returned version string must start with the expected prefix.
  ACL_PCIE_ERROR_IF(strstr(buf, expected_ver_string) != buf,
                    goto Close,
                    "Kernel driver mismatch: The board kernel driver version is %s, but\nthis host program expects "
                    "%s.\n Please reinstall the driver using aocl install.\n",
                    buf,
                    expected_ver_string);

  // Save the device id for the selected board
  driver_cmd.bar_id = ACLPCI_CMD_BAR;
  driver_cmd.command = ACLPCI_CMD_GET_PCI_DEV_ID;
  driver_cmd.device_addr = NULL;
  driver_cmd.user_addr = &info->device_id;
  driver_cmd.size = sizeof(info->device_id);
  bytes_read = read(device, &driver_cmd, sizeof(driver_cmd));
  ACL_PCIE_ERROR_IF(bytes_read == -1, goto Close, "Failed to read driver command");

  // Set the FD_CLOEXEC flag for the file handle to disable the child to
  // inherit this file handle. So the jtagd will not hold the file handle
  // of the device and keep sending bogus interrupts after we call quartus_pgm.
  oldflags = fcntl(device, F_GETFD, 0);
  descriptor = fcntl(device, F_SETFD, oldflags | FD_CLOEXEC);
  if (descriptor < 0) {
    goto Close;
  }

  ++num_open_devices;
  goto End;

// I really don't want to use goto but it's for consistency with windows version, and convenience with macros
Close:
  if (device >= 0) {
    close(device);
  }
  device = INVALID_HANDLE_VALUE;

End:
  return device;
}
+
+#endif // LINUX
+
+// This function can be used for triggering a fake device exception for testing
+void ACL_PCIE_DEVICE::test_trigger_device_interrupt() {
+ // Example:
+ // Raising ECC NON CORRECTABLE exception (exception code 2)
+ // Providing integer-type private_info (say, equals to 5)
+ unsigned long long int exception_type = 2;
+ int test_private_info = 5;
+ aocl_mmd_interrupt_info interrupt_data = {exception_type, &test_private_info, sizeof(test_private_info)};
+ this->device_interrupt(m_handle, &interrupt_data, this->device_interrupt_user_data);
+}
+
// Perform operations required when an interrupt is received for this device.
// Decodes the interrupt source (kernel vs DMA), dispatches to the installed
// kernel-interrupt callback or the DMA object, then re-enables interrupts.
void ACL_PCIE_DEVICE::service_interrupt(unsigned int irq_type_flag) {
  unsigned int kernel_update = 0;
  unsigned int dma_update = 0;

  int status = this->get_interrupt_type(&kernel_update, &dma_update, irq_type_flag);
  ACL_PCIE_ERROR_IF(status, return, "[%s] fail to service the interrupt.\n", m_name);

  ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_IRQ,
                             ":: [%s] Irq service routine called, kernel_update=%d, dma_update=%d \n",
                             m_name,
                             kernel_update,
                             dma_update);

  if (kernel_update && kernel_interrupt != NULL) {
#if defined(WINDOWS)
    // On Windows, mask IRQs while the callback runs so the level-sensitive
    // kernel interrupt does not immediately re-fire.
    status = this->mask_irqs();
    ACL_PCIE_ERROR_IF(status, return, "[%s] failed to mask kernel interrupt.\n", m_name);
#endif
    // A kernel-status interrupt - update the status of running kernels
    ACL_PCIE_ASSERT(kernel_interrupt, "[%s] received kernel interrupt before the handler is installed.\n", m_name);
    kernel_interrupt(m_handle, kernel_interrupt_user_data);
  } else if (dma_update) {
    // A DMA-status interrupt - let the DMA object handle this
    m_dma->service_interrupt();
  }

  // Unmask the kernel_irq to enable the interrupt again.
  if (m_mmd_irq_handler_enable) {
    status = this->unmask_irqs();
  } else if (kernel_update) {
    status = this->unmask_kernel_irq();
  }
  ACL_PCIE_ERROR_IF(status, return, "[%s] fail to service the interrupt.\n", m_name);

  return;
}
+
+// Enable all interrupts (DMA and Kernel)
+// Won't enable kernel irq unless kernel interrupt callback has been initialized
+// Return 0 on success
+int ACL_PCIE_DEVICE::unmask_irqs() {
+ int status = 0;
+ if (kernel_interrupt == NULL) {
+ // No masking for DMA interrupt.
+
+ } else {
+ status = m_io->pcie_cra->write32(PCIE_CRA_IRQ_ENABLE, ACL_PCIE_GET_BIT(ACL_PCIE_KERNEL_IRQ_VEC));
+ }
+ ACL_PCIE_ERROR_IF(status, return -1, "[%s] fail to unmask all interrupts.\n", m_name);
+
+ return 0; // success
+}
+
+// Disable all interrupts to service kernel that triggered interrupt
+// If other kernels finish while the interrupt is masked, MSI will trigger again when
+// interrupts are re-enabled.
+int ACL_PCIE_DEVICE::mask_irqs() {
+ int status = 0;
+ UINT32 val = 0;
+ status = m_io->pcie_cra->write32(PCIE_CRA_IRQ_ENABLE, val);
+ ACL_PCIE_ERROR_IF(status, return -1, "[%s] fail to mask the kernel interrupts.\n", m_name);
+
+ return 0; // success
+}
+
+// Enable the kernel interrupt only
+// Return 0 on success
+int ACL_PCIE_DEVICE::unmask_kernel_irq() {
+ int status = 0;
+ UINT32 val = 0;
+
+ status |= (int)(m_io->pcie_cra->read32(PCIE_CRA_IRQ_ENABLE, &val));
+ val |= ACL_PCIE_GET_BIT(ACL_PCIE_KERNEL_IRQ_VEC);
+ status |= (int)(m_io->pcie_cra->write32(PCIE_CRA_IRQ_ENABLE, val));
+
+ ACL_PCIE_ERROR_IF(status, return -1, "[%s] fail to unmask the kernel interrupts.\n", m_name);
+
+ return 0; // success
+}
+
// Disable the interrupt: mask all IRQ sources in hardware and, on Windows,
// tear down the KMD event registrations created by enable_interrupts().
// Return 0 on success
int ACL_PCIE_DEVICE::disable_interrupts() {
  int status;

  if (m_mmd_irq_handler_enable) {
    ACL_PCIE_DEBUG_MSG(":: [%s] Disabling interrupts.\n", m_name);

    // Mask every interrupt source at the hardware level first.
    status = m_io->pcie_cra->write32(PCIE_CRA_IRQ_ENABLE, 0);
    ACL_PCIE_ERROR_IF(status, return -1, "[%s] failed to disable pcie interrupt.\n", m_name);

#if defined(WINDOWS)
    // Disable KMD interrupt handling for Windows
    fpga_properties prop = {0};
    fpga_result result = FPGA_OK;
    uint32_t num_interrupts = 0;
    uint32_t i = 0;

    // Get number of interrupts in the device from the properties structure
    result = fpgaGetPropertiesFromHandle(m_device, &prop);
    ACL_PCIE_ERROR_IF(result != FPGA_OK, return -1, "[%s] fpgaGetPropertiesFromHandle Failed\n", m_name);

    result = fpgaPropertiesGetNumInterrupts(prop, &num_interrupts);
    if (result != FPGA_OK) {
      fpgaDestroyProperties(&prop);
      ACL_PCIE_ERROR_IF(1, return -1, "[%s] fpgaPropertiesGetNumInterrupts Failed\n", m_name);
    }

    if (dev_event_handle != NULL) {
      // Loop through all the interrupts and unregister the event and
      // destroy event handle associated with the interrupt
      for (i = 0; i < num_interrupts; i++) {
        result = fpgaUnregisterEvent(m_device, FPGA_EVENT_INTERRUPT, dev_event_handle[i]);

        if (result != FPGA_OK) {
          fpgaDestroyProperties(&prop);
          // NOTE(review): message says fpgaRegisterEvent but the failing call
          // is fpgaUnregisterEvent -- confirm and correct separately.
          ACL_PCIE_ERROR_IF(1, return -1, "[%s] fpgaRegisterEvent Failed\n", m_name);
        }

        result = fpgaDestroyEventHandle(&dev_event_handle[i]);
        if (result != FPGA_OK) {
          fpgaDestroyProperties(&prop);
          // NOTE(review): message says fpgaCreateEventHandle but the failing
          // call is fpgaDestroyEventHandle -- confirm and correct separately.
          ACL_PCIE_ERROR_IF(1, return -1, "[%s] fpgaCreateEventHandle Failed\n", m_name);
        }
      }
      free(dev_event_handle);
      dev_event_handle = NULL;
    }
    fpgaDestroyProperties(&prop);
#endif  // WINDOWS
    m_mmd_irq_handler_enable = false;
  }

  return 0;  // success
}
+
+#if defined(WINDOWS)
+
// Enable PCI express interrupts. Set up the KMD to mask the interrupt enable bit when
// an interrupt is received to prevent the level-sensitive interrupt from immediately
// firing again.
// Return 0 on success
// Note: user_signal_number is unused on Windows; the parameter exists so the
// signature matches the Linux implementation.
int ACL_PCIE_DEVICE::enable_interrupts(int user_signal_number) {
  int status;
  fpga_properties prop = NULL;
  fpga_result result = FPGA_OK;
  uint32_t num_interrupts = 0;
  uint32_t i = 0;
  HANDLE deviceStopWaitObj = NULL;
  BOOLEAN flag;
  int ret_value = 0;  // return 0 on success

  ACL_PCIE_DEBUG_MSG(":: [%s] Enabling PCIe interrupts.\n", m_name);

  // Mask off hardware interrupts before enabling them
  status = m_io->pcie_cra->write32(PCIE_CRA_IRQ_ENABLE, 0);
  ACL_PCIE_ERROR_IF(status, return -1, "[%s] failed to mask off all interrupts before enabling them.\n", m_name);

  // Enable interrupts in the KMD

  // Get number of interrupts in the device from the properties structure
  result = fpgaGetPropertiesFromHandle(m_device, &prop);
  ACL_PCIE_ERROR_IF(result != FPGA_OK, return -1, "[%s] fpgaGetPropertiesFromHandle Failed\n", m_name);

  result = fpgaPropertiesGetNumInterrupts(prop, &num_interrupts);
  if (result != FPGA_OK) {
    ret_value = -1;
    ACL_PCIE_ERROR_IF(1, goto End, "[%s] fpgaPropertiesGetNumInterrupts Failed\n", m_name);
  }

  // dev_event_handle is a member array freed later by disable_interrupts().
  dev_event_handle = NULL;
  dev_event_handle = (fpga_event_handle *)malloc(sizeof(fpga_event_handle) * num_interrupts);
  if (dev_event_handle == NULL) {
    ret_value = -1;
    ACL_PCIE_ERROR_IF(1, goto End, "[%s] malloc for event handle array Failed\n", m_name);
  }

  // Loop through all the interrupts and register an event and
  // create event handle associated with the interrupt
  // NOTE(review): if a call fails part-way through this loop, the event
  // handles created so far are not destroyed before returning -- confirm
  // whether disable_interrupts() is guaranteed to run afterwards.

  for (i = 0; i < num_interrupts; i++) {
    result = fpgaCreateEventHandle(&dev_event_handle[i]);
    if (result != FPGA_OK) {
      ret_value = -1;
      ACL_PCIE_ERROR_IF(1, goto End, "[%s] fpgaCreateEventHandle Failed\n", m_name);
    }

    result = fpgaRegisterEvent(m_device, FPGA_EVENT_INTERRUPT, dev_event_handle[i], i);
    if (result != FPGA_OK) {
      ret_value = -1;
      ACL_PCIE_ERROR_IF(1, goto End, "[%s] fpgaRegisterEvent Failed\n", m_name);
    }

    // Register the user-mode interrupt handler
    // Executed after interrupt is recieved and processed in kernel
    // NOTE(review): deviceStopWaitObj is overwritten each iteration and never
    // passed to UnregisterWait -- confirm whether the wait registrations need
    // explicit teardown.
    flag = (BOOLEAN)RegisterWaitForSingleObject(&deviceStopWaitObj,
                                                dev_event_handle[i],
                                                (WAITORTIMERCALLBACK)pcie_interrupt_handler,
                                                static_cast<void *>(this),
                                                INFINITE,
                                                WT_EXECUTEINWAITTHREAD);

    if (flag == 0) {
      ret_value = -1;
      // NOTE(review): message says fpgaRegisterEvent but the failing call is
      // RegisterWaitForSingleObject -- confirm and correct separately.
      ACL_PCIE_ERROR_IF(1, goto End, "[%s] fpgaRegisterEvent Failed\n", m_name);
    }
  }
  status = this->unmask_irqs();
  if (status) {
    ret_value = -1;
    ACL_PCIE_ERROR_IF(1, goto End, "[%s] failed to enable interrupts.\n", m_name);
  }

  m_mmd_irq_handler_enable = true;

  // Resource cleanup
End:
  fpgaDestroyProperties(&prop);
  return ret_value;
}
+
+// Use irq status to determine type of interrupt
+// Result is returned in kernel_update/dma_update arguments.
+// Return 0 on success
+int ACL_PCIE_DEVICE::get_interrupt_type(unsigned int *kernel_update,
+ unsigned int *dma_update,
+ unsigned int irq_type_flag) {
+ UINT32 irq_status;
+ unsigned int dma_status;
+ int status;
+
+ status = m_io->pcie_cra->read32(PCIE_CRA_IRQ_STATUS, &irq_status);
+ ACL_PCIE_ERROR_IF(status, return -1, "[%s] fail to interrupt type.\n", m_name);
+
+ *kernel_update = ACL_PCIE_READ_BIT(irq_status, ACL_PCIE_KERNEL_IRQ_VEC);
+
+ status = m_dma->check_dma_interrupt(&dma_status);
+ if (status != 1) {
+ *dma_update = dma_status;
+ }
+
+ return 0; // success
+}
+
+#endif // WINDOWS
+#if defined(LINUX)
+
// For Linux, it will set-up a signal handler for signals for kernel driver
// Return 0 on success
int ACL_PCIE_DEVICE::enable_interrupts(int user_signal_number) {
  int status;
  ACL_PCIE_DEBUG_MSG(":: [%s] Enabling PCIe interrupts on Linux (via signals).\n", m_name);

  // All interrupt controls are in the kernel driver.
  m_mmd_irq_handler_enable = false;

  // Send the globally allocated signal number to the driver
  struct acl_cmd signal_number_cmd {};
  signal_number_cmd.bar_id = ACLPCI_CMD_BAR;
  signal_number_cmd.command = ACLPCI_CMD_SET_SIGNAL_NUMBER;
  signal_number_cmd.device_addr = NULL;
  signal_number_cmd.user_addr = &user_signal_number;
  signal_number_cmd.size = sizeof(user_signal_number);
  // NOTE(review): treating any non-zero return as failure assumes the custom
  // driver's write() returns 0 on success (unlike plain POSIX write, which
  // returns the byte count) -- confirm against the kernel driver sources.
  status = write(m_device, &signal_number_cmd, sizeof(signal_number_cmd));
  ACL_PCIE_ERROR_IF(status, return -1, "[%s] failed to set signal number for interrupts.\n", m_name);

  // Sanity check, did the driver get it
  int readback_signal_number;
  signal_number_cmd.user_addr = &readback_signal_number;
  signal_number_cmd.command = ACLPCI_CMD_GET_SIGNAL_NUMBER;
  signal_number_cmd.size = sizeof(readback_signal_number);
  status = read(m_device, &signal_number_cmd, sizeof(signal_number_cmd));
  ACL_PCIE_ERROR_IF(status, return -1, "[%s] failed to get signal number for interrupts.\n", m_name);
  ACL_PCIE_ERROR_IF(readback_signal_number != user_signal_number,
                    return -1,
                    "[%s] got wrong signal number %d, expected %d\n",
                    m_name,
                    readback_signal_number,
                    user_signal_number);

  // Set "our" device id (the handle id received from acl_pcie.cpp) to correspond to
  // the device managed by the driver. Will get back this id
  // with signal from the driver. Will allow us to differentiate
  // the source of kernel-done signals with multiple boards.

  // the last bit is reserved as a flag for DMA completion
  int result = m_handle << 1;
  // NOTE(review): read_cmd's size field is left zero-initialized and the
  // write() count is sizeof(result), not sizeof(read_cmd) -- presumably part
  // of the driver protocol; verify against the driver's write handler.
  struct acl_cmd read_cmd = {ACLPCI_CMD_BAR, ACLPCI_CMD_SET_SIGNAL_PAYLOAD, NULL, &result};
  status = write(m_device, &read_cmd, sizeof(result));
  ACL_PCIE_ERROR_IF(status, return -1, "[%s] failed to enable interrupts.\n", m_name);

  return 0;  // success
}
+
+// Determine the interrupt type using the irq_type_flag
+// Return 0 on success
+int ACL_PCIE_DEVICE::get_interrupt_type(unsigned int *kernel_update,
+ unsigned int *dma_update,
+ unsigned int irq_type_flag) {
+ // For Linux, the interrupt type is mutually exclusive
+ *kernel_update = irq_type_flag ? 0 : 1;
+ *dma_update = 1 - *kernel_update;
+
+ return 0; // success
+}
+
+#endif // LINUX
+
+// Called by the host program when there are spare cycles
+int ACL_PCIE_DEVICE::yield() {
+ // Give the DMA object a chance to crunch any pending data
+ return m_dma->yield();
+}
+
+// Set kernel interrupt and event update callbacks
+// return 0 on success
+int ACL_PCIE_DEVICE::set_kernel_interrupt(aocl_mmd_interrupt_handler_fn fn, void *user_data) {
+ int status;
+
+ kernel_interrupt = fn;
+ kernel_interrupt_user_data = user_data;
+
+ if (m_device != INVALID_HANDLE_VALUE) {
+ status = this->unmask_kernel_irq();
+ ACL_PCIE_ERROR_IF(status, return -1, "[%s] failed to set kernel interrupt callback funciton.\n", m_name);
+ }
+
+ return 0; // success
+}
+
+int ACL_PCIE_DEVICE::set_device_interrupt(aocl_mmd_device_interrupt_handler_fn fn, void *user_data) {
+ int status;
+
+ device_interrupt = fn;
+ device_interrupt_user_data = user_data;
+
+ if (m_device != INVALID_HANDLE_VALUE) {
+ status = this->unmask_kernel_irq();
+ ACL_PCIE_ERROR_IF(status, return -1, "[%s] failed to set device interrupt callback funciton.\n", m_name);
+ }
+
+ return 0; // success
+}
+
+int ACL_PCIE_DEVICE::set_status_handler(aocl_mmd_status_handler_fn fn, void *user_data) {
+ event_update = fn;
+ event_update_user_data = user_data;
+
+ return 0; // success
+}
+
// The callback function set by "set_status_handler"
// It's used to notify/update the host whenever an event is finished
// op: the operation handle the host associated with the transfer;
// status: completion status forwarded verbatim to the host callback.
void ACL_PCIE_DEVICE::event_update_fn(aocl_mmd_op_t op, int status) {
  ACL_PCIE_ASSERT(event_update, "[%s] event_update is called with a empty update function pointer.\n", m_name);

  ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_OP, ":: [%s] Update for event e=%p.\n", m_name, op);
  event_update(m_handle, event_update_user_data, op, status);
}
+
+// Forward get buffer call to host channel
+void *ACL_PCIE_DEVICE::hostchannel_get_buffer(size_t *buffer_size, int channel, int *status) {
+ return m_hostch->get_buffer(buffer_size, channel, status);
+}
+// Forward ack call to host channel
+size_t ACL_PCIE_DEVICE::hostchannel_ack_buffer(size_t send_size, int channel, int *status) {
+ return m_hostch->ack_buffer(send_size, channel, status);
+}
+
+// Memory I/O
+// return 0 on success
+int ACL_PCIE_DEVICE::write_block(
+ aocl_mmd_op_t e, aocl_mmd_interface_t mmd_interface, void *host_addr, size_t dev_addr, size_t size) {
+#ifdef DLA_MMD
+ ACL_PCIE_ASSERT(e == nullptr, "DLA_MMD does not support callback events in ACL_PCIE_DEVICE::write_block");
+#else
+ ACL_PCIE_ASSERT(event_update, "[%s] event_update callback function is not provided.\n", m_name);
+#endif
+ int status = -1; // assume failure
+
+ switch (mmd_interface) {
+ case AOCL_MMD_KERNEL:
+ status = m_io->kernel_if->write_block(dev_addr, size, host_addr);
+ break;
+ case AOCL_MMD_MEMORY:
+ status = read_write_block(e, host_addr, dev_addr, size, false /*writing*/);
+ break;
+ case AOCL_MMD_PLL:
+ status = m_io->pll->write_block(dev_addr, size, host_addr);
+ break;
+ case AOCL_MMD_HOSTCH:
+ default:
+ ACL_PCIE_ASSERT(0, "[%s] unknown MMD interface.\n", m_name);
+ }
+
+ ACL_PCIE_ERROR_IF(status, return -1, "[%s] failed to write block.\n", m_name);
+
+ return 0; // success
+}
+
+int ACL_PCIE_DEVICE::read_block(
+ aocl_mmd_op_t e, aocl_mmd_interface_t mmd_interface, void *host_addr, size_t dev_addr, size_t size) {
+#ifdef DLA_MMD
+ ACL_PCIE_ASSERT(e == nullptr, "DLA_MMD does not support callback events in ACL_PCIE_DEVICE::read_block");
+#else
+ ACL_PCIE_ASSERT(event_update, "[%s] event_update callback function is not provided.\n", m_name);
+#endif
+ int status = -1; // assume failure
+
+ switch (mmd_interface) {
+ case AOCL_MMD_KERNEL:
+ status = m_io->kernel_if->read_block(dev_addr, size, host_addr);
+ break;
+ case AOCL_MMD_MEMORY:
+ status = read_write_block(e, host_addr, dev_addr, size, true /*reading*/);
+ break;
+ case AOCL_MMD_PLL:
+ status = m_io->pll->read_block(dev_addr, size, host_addr);
+ break;
+ case AOCL_MMD_HOSTCH:
+ default:
+ ACL_PCIE_ASSERT(0, "[%s] unknown MMD interface.\n", m_name);
+ }
+
+ ACL_PCIE_ERROR_IF(status, return -1, "[%s] failed to read block.\n", m_name);
+
+ return 0; // success
+}
+
+// Copy a block between two locations in device memory
+// return 0 on success
+int ACL_PCIE_DEVICE::copy_block(
+ aocl_mmd_op_t e, aocl_mmd_interface_t mmd_interface, size_t src, size_t dst, size_t size) {
+ ACL_PCIE_ASSERT(event_update, "[%s] event_update callback function is not provided.\n", m_name);
+ ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_OP,
+ ":: [%s] Copying " SIZE_FMT_U " bytes data from 0x" SIZE_FMT_X " (device) to 0x" SIZE_FMT_X
+ " (device), with e=%p\n",
+ m_name,
+ size,
+ src,
+ dst,
+ e);
+
+#define BLOCK_SIZE (8 * 1024 * 1024)
+#if defined(WINDOWS)
+ __declspec(align(128)) static unsigned char data[BLOCK_SIZE];
+#endif // WINDOWS
+#if defined(LINUX)
+ static unsigned char data[BLOCK_SIZE] __attribute__((aligned(128)));
+#endif // LINUX
+
+ do {
+ size_t transfer_size = (size > BLOCK_SIZE) ? BLOCK_SIZE : size;
+ read_block(NULL /* blocking read */, mmd_interface, data, src, transfer_size);
+ write_block(NULL /* blocking write */, mmd_interface, data, dst, transfer_size);
+
+ src += transfer_size;
+ dst += transfer_size;
+ size -= transfer_size;
+ } while (size > 0);
+
+ if (e) {
+ this->event_update_fn(e, 0);
+ }
+
+ return 0; // success
+}
+
+// Forward create hostchannel call to host channel
+int ACL_PCIE_DEVICE::create_hostchannel(char *name, size_t queue_depth, int direction) {
+ return m_hostch->create_hostchannel(name, queue_depth, direction);
+}
+
+// Forward destroy hostchannel call to host channel
+int ACL_PCIE_DEVICE::destroy_channel(int channel) { return m_hostch->destroy_hostchannel(channel); }
+
// Read or Write a block of data to device memory.
// Use either DMA or directly read/write through BAR
// Large, suitably aligned transfers go through DMA; unaligned transfers and
// the unaligned tail of a DMA transfer fall back to BAR accesses.
// Return 0 on success
int ACL_PCIE_DEVICE::read_write_block(aocl_mmd_op_t e, void *host_addr, size_t dev_addr, size_t size, bool reading) {
  const uintptr_t uintptr_host = reinterpret_cast<uintptr_t>(host_addr);

  int status = 0;
  size_t dma_size = 0;

#ifdef DLA_MMD
  // CoreDLA runtime assumes host/device transfers are thread safe, enforce that here
  // mutex will unlock when its lock goes out of scope
  std::unique_lock<std::mutex> dma_mutex_lock(m_dma_mutex);
#endif

  if (reading) {
    ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_OP,
                               ":: [%s] Reading " SIZE_FMT_U " bytes data from 0x" SIZE_FMT_X
                               " (device) to %p (host), with e=%p\n",
                               m_name,
                               size,
                               dev_addr,
                               host_addr,
                               e);
  } else {
    ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_OP,
                               ":: [%s] Writing " SIZE_FMT_U " bytes data from %p (host) to 0x" SIZE_FMT_X
                               " (device), with e=%p\n",
                               m_name,
                               size,
                               host_addr,
                               dev_addr,
                               e);
  }

  // Return immediately if size is zero
  if (size == 0) {
    if (e) {
      this->event_update_fn(e, 0);
    }
    return 0;
  }

  // DMA requires both the host pointer and the device offset to be aligned,
  // and is only worthwhile for transfers of at least 1024 bytes.
  bool aligned = ((uintptr_host & DMA_ALIGNMENT_BYTE_MASK) | (dev_addr & DMA_ALIGNMENT_BYTE_MASK)) == 0;
  if (m_use_dma_for_big_transfers && aligned && (size >= 1024)) {
    // DMA transfers must END at aligned boundary.
    // If that's not the case, use DMA up to such boundary, and regular
    // read/write for the remaining part.
    dma_size = size - (size & DMA_ALIGNMENT_BYTE_MASK);
  } else if (m_use_dma_for_big_transfers && (size >= 1024)) {
    ACL_PCIE_WARN_MSG("[%s] NOT using DMA to transfer " SIZE_FMT_U
                      " bytes from %s to %s because of lack of alignment\n"
                      "** host ptr (%p) and/or dev offset (0x" SIZE_FMT_X
                      ") is not aligned to %u bytes\n",
                      m_name,
                      size,
                      (reading ? "device" : "host"),
                      (reading ? "host" : "device"),
                      host_addr,
                      dev_addr,
                      DMA_ALIGNMENT_BYTES);
  }

  // Perform read/write through BAR if the data is not fit for DMA or if there is remaining part from DMA
  if (dma_size < size) {
    void *host_addr_new = reinterpret_cast<void *>(uintptr_host + dma_size);
    size_t dev_addr_new = dev_addr + dma_size;
    size_t remain_size = size - dma_size;

    ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_OP,
                               ":: [%s] Perform read/write through BAR for remaining " SIZE_FMT_U
                               " bytes (out of " SIZE_FMT_U " bytes)\n",
                               m_name,
                               remain_size,
                               size);

    status = read_write_block_bar(host_addr_new, dev_addr_new, remain_size, reading);
    ACL_PCIE_ERROR_IF(status, return -1, "[%s] failed to perform read/write through BAR.\n", m_name);
  }

  if (dma_size != 0) {
    // NOTE(review): the return value of m_dma->read_write (if any) is not
    // checked here -- confirm failures are surfaced via the event/interrupt
    // path instead.
    m_dma->read_write(host_addr, dev_addr, dma_size, e, reading);

    // Block if event is NULL
    if (e == NULL) {
      m_dma->stall_until_idle();
    }
  } else {
    // BAR-only path is synchronous: complete the event immediately.
    if (e != NULL) {
      this->event_update_fn(e, 0);
    }
  }

  return 0;  // success
}
+
// Read or Write a block of data to device memory through BAR
// The device memory is visible through a sliding window of
// ACL_PCIE_MEMWINDOW_SIZE bytes; the transfer is chunked per window segment.
// Return 0 on success
int ACL_PCIE_DEVICE::read_write_block_bar(void *host_addr, size_t dev_addr, size_t size, bool reading) {
  void *cur_host_addr = host_addr;
  size_t cur_dev_addr = dev_addr;
  size_t bytes_transfered = 0;

  for (bytes_transfered = 0; bytes_transfered < size;) {
    // decide the size to transfer for current iteration
    size_t cur_size = ACL_PCIE_MEMWINDOW_SIZE - (cur_dev_addr % ACL_PCIE_MEMWINDOW_SIZE);
    if (bytes_transfered + cur_size >= size) {
      cur_size = size - bytes_transfered;
    }

    // set the proper window segment
    set_segment(cur_dev_addr);
    size_t window_rel_ptr_start = cur_dev_addr % ACL_PCIE_MEMWINDOW_SIZE;
    size_t window_rel_ptr = window_rel_ptr_start;

    // A simple blocking read
    // The address should be in the global memory range, we assume
    // any offsets are already accounted for in the offset
    ACL_PCIE_ASSERT(window_rel_ptr + cur_size <= ACL_PCIE_MEMWINDOW_SIZE,
                    "[%s] trying to access out of the range of the memory window.\n",
                    m_name);

    // Workaround a bug in Jungo driver.
    // First, transfer the non 8 bytes data at the front, one byte at a time
    // Then, transfer multiple of 8 bytes (size of size_t) using read/write_block
    // At the end, transfer the remaining bytes, one byte at a time
    // NOTE(review): the return codes of read_write_small_size and
    // read/write_block below are not checked -- confirm whether BAR access
    // failures need to be propagated here.
    size_t dev_odd_start = std::min(sizeof(size_t) - window_rel_ptr % sizeof(size_t), cur_size);
    if (dev_odd_start != sizeof(size_t)) {
      read_write_small_size(cur_host_addr, window_rel_ptr, dev_odd_start, reading);
      incr_ptrs(&cur_host_addr, &window_rel_ptr, &bytes_transfered, dev_odd_start);
      cur_size -= dev_odd_start;
    }

    size_t tail_size = cur_size % sizeof(size_t);
    size_t size_mul_8 = cur_size - tail_size;

    if (size_mul_8 != 0) {
      if (reading) {
        m_io->mem->read_block(window_rel_ptr, size_mul_8, cur_host_addr);
      } else {
        m_io->mem->write_block(window_rel_ptr, size_mul_8, cur_host_addr);
      }
      incr_ptrs(&cur_host_addr, &window_rel_ptr, &bytes_transfered, size_mul_8);
    }

    if (tail_size != 0) {
      read_write_small_size(cur_host_addr, window_rel_ptr, tail_size, reading);
      incr_ptrs(&cur_host_addr, &window_rel_ptr, &bytes_transfered, tail_size);
      // cur_size is not read again this iteration; kept for symmetry.
      cur_size -= tail_size;
    }

    // increase the current device address to be transferred
    cur_dev_addr += (window_rel_ptr - window_rel_ptr_start);
  }

  return 0;  // success
}
+
+// Read or Write a small size of data to device memory, one byte at a time
+// (used for transfers that are not 8-byte aligned/sized).
+// Return 0 on success, -1 if any single-byte access fails.
+int ACL_PCIE_DEVICE::read_write_small_size(void *host_addr, size_t dev_addr, size_t size, bool reading) {
+  UINT8 *byte_ptr = static_cast<UINT8 *>(host_addr);
+
+  size_t offset = 0;
+  while (offset < size) {
+    const int status = reading ? m_io->mem->read8(dev_addr + offset, byte_ptr + offset)
+                               : m_io->mem->write8(dev_addr + offset, byte_ptr[offset]);
+    ACL_PCIE_ERROR_IF(status, return -1, "[%s] failed to read write with odd size.\n", m_name);
+    ++offset;
+  }
+
+  return 0;  // success
+}
+
+// Set the segment that the memory windows is accessing to
+// Return 0 on success
+//
+// The window register at offset 0 selects which ACL_PCIE_MEMWINDOW_SIZE-aligned
+// segment of device memory the BAR window maps. The last value written is
+// cached in m_segment so redundant PCI writes are skipped.
+int ACL_PCIE_DEVICE::set_segment(size_t addr) {
+  UINT64 segment_readback;
+  UINT64 cur_segment = addr & ~(ACL_PCIE_MEMWINDOW_SIZE - 1);
+  int status = 0;
+
+  // Only execute the PCI write if we need to *change* segments
+  if (cur_segment != m_segment) {
+    // PCIe reordering rules could cause the segment change to get reordered,
+    // so read before and after! (A read forces completion of earlier posted
+    // writes, fencing window accesses on either side of the segment change.)
+    status |= (int)(m_io->window->read64(0, &segment_readback));
+
+    status |= (int)(m_io->window->write64(0, cur_segment));
+    m_segment = cur_segment;
+    ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_BLOCKTX, ":::::: [%s] Changed segment id to %llu.\n", m_name, m_segment);
+
+    status |= (int)(m_io->window->read64(0, &segment_readback));
+  }
+
+  ACL_PCIE_ERROR_IF(status, return -1, "[%s] failed to set segment for memory access windows.\n", m_name);
+
+  return 0;  // success
+}
+
+// Advance the host pointer, device-side offset, and transfer counter by the
+// same number of bytes, keeping the three transfer cursors in lockstep.
+void ACL_PCIE_DEVICE::incr_ptrs(void **host, size_t *dev, size_t *counter, size_t incr) {
+  *host = static_cast<char *>(*host) + incr;
+  *dev += incr;
+  *counter += incr;
+}
+
+// Query the on-chip temperature sensor.
+// Returns true and stores the raw sensor reading in *temp on success;
+// returns false (without touching *temp) when the board has no sensor.
+//
+// Restructured as #ifndef/#else so that when ACL_PCIE_HAS_TEMP_SENSOR is
+// undefined the query code is not compiled at all — the original early
+// `return false` left the declaration, assert, and register read below it
+// unreachable, drawing unused-variable/unreachable-code warnings.
+bool ACL_PCIE_DEVICE::get_ondie_temp_slow_call(cl_int *temp) {
+#ifndef ACL_PCIE_HAS_TEMP_SENSOR
+  ACL_PCIE_DEBUG_MSG(":: [%s] On-chip temperature sensor not supported by this board.\n", m_name);
+  return false;
+#else
+  cl_int read_data;
+
+  // We assume this during read later: the register read below writes an
+  // INT32 through a cl_int pointer.
+  ACL_PCIE_ASSERT(sizeof(cl_int) == sizeof(INT32), "sizeof(cl_int) != sizeof(INT32)");
+
+  ACL_PCIE_DEBUG_MSG(":: [%s] Querying on-chip temperature sensor...\n", m_name);
+
+  // read temperature sensor
+  m_io->temp_sensor->read32(0, (UINT32 *)&read_data);
+
+  ACL_PCIE_DEBUG_MSG(":: [%s] Read temp sensor data. Value is: %i\n", m_name, read_data);
+  *temp = read_data;
+  return true;
+#endif
+}
+
+// Allocate host memory shared with the FPGA (Linux + ACL_HOST_MEMORY_SHARED
+// builds only; returns NULL elsewhere).
+// Returns the host virtual address, or NULL on failure. On success, if
+// device_ptr_out is non-NULL it receives the FPGA-usable physical address
+// (0 when the virtual-to-physical translation fails).
+//
+// Fix: mmap reports failure with MAP_FAILED ((void *)-1), not NULL. The
+// original only normalized that to NULL when device_ptr_out was non-NULL,
+// so callers passing device_ptr_out == NULL received (void *)-1 and would
+// treat the failed mapping as valid. Failure is now normalized uncondionally.
+void *ACL_PCIE_DEVICE::shared_mem_alloc(size_t size, unsigned long long *device_ptr_out) {
+#if defined(WINDOWS)
+  return NULL;
+#endif  // WINDOWS
+#if defined(LINUX)
+#ifdef ACL_HOST_MEMORY_SHARED
+  void *host_ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, m_device, 0);
+
+  if (host_ptr == MAP_FAILED) {
+    // Normalize mmap's failure sentinel to NULL for all callers.
+    if (device_ptr_out != NULL) {
+      *device_ptr_out = (unsigned long long)0;
+    }
+    return NULL;
+  }
+
+  if (device_ptr_out != NULL) {
+    /* map received host_ptr to FPGA-usable address. */
+    void *dev_ptr = NULL;
+    struct acl_cmd read_cmd = {ACLPCI_CMD_BAR, ACLPCI_CMD_GET_PHYS_PTR_FROM_VIRT, &dev_ptr, &host_ptr, sizeof(dev_ptr)};
+
+    // The driver returns 0 on success for this command; any other value is a
+    // translation failure.
+    bool failed_flag = (read(m_device, &read_cmd, sizeof(dev_ptr)) != 0);
+    ACL_PCIE_DEBUG_MSG(
+        "  Mapped vaddr %p to phys addr %p. %s\n", host_ptr, dev_ptr, failed_flag == 0 ? "OK" : "FAILED");
+    if (failed_flag) {
+      *device_ptr_out = (unsigned long long)NULL;
+    } else {
+      /* When change to 64-bit pointers on the device, update driver code
+       * to deal with larger-than-void* ptrs. */
+      *device_ptr_out = (unsigned long long)dev_ptr;
+
+      /* Now need to add offset of the shared system. */
+    }
+  }
+
+  return host_ptr;
+#else
+  return NULL;
+#endif
+#endif  // LINUX
+}
+
+// Unmap a shared host/FPGA buffer previously returned by shared_mem_alloc().
+// No-op on Windows and for a NULL pointer; size should match the size passed
+// to the original mmap (munmap requirement).
+void ACL_PCIE_DEVICE::shared_mem_free(void *vptr, size_t size) {
+#if defined(WINDOWS)
+  return;
+#endif  // WINDOWS
+#if defined(LINUX)
+  if (vptr != NULL) {
+    munmap(vptr, size);
+  }
+#endif  // LINUX
+}
+
+#ifdef DLA_MMD
+
+// Quiesce the device ahead of reprogramming: mark the device as being
+// programmed, disable interrupts, then save the PCI control registers so
+// restore_and_resume_pcie() can bring the link back afterwards.
+// Returns 0 on success, 1 on failure (with m_being_programmed cleared).
+int ACL_PCIE_DEVICE::pause_and_save_pcie()
+{
+  int failed_cont_reg_save;
+
+  // set the being_programmed flag
+  m_being_programmed = true;
+
+  // disable interrupt and save control registers
+  const int failed_int_disable = this->disable_interrupts();
+  ACL_PCIE_ERROR_IF(failed_int_disable, goto cleanup_save, "could not disable interrupt.\n");
+
+  // Do this last before programming
+  failed_cont_reg_save = m_config->save_pci_control_regs();
+  ACL_PCIE_ERROR_IF(failed_cont_reg_save, goto cleanup_save, "could not save control regs\n");
+
+  return 0;
+
+  cleanup_save:
+
+  // Undo the in-progress flag so the device is not left marked as
+  // being programmed after a failed pause.
+  m_being_programmed = false;
+  return 1;
+}
+
+// Restore the saved PCIe control registers (Linux) and wait for the memory
+// interfaces to recalibrate after programming.
+// Returns 0 on success, 1 if the Uniphy fails to calibrate.
+int ACL_PCIE_DEVICE::restore_and_resume_pcie()
+{
+#if defined(LINUX)
+  m_config->load_pci_control_regs();
+#endif
+
+  const int uniphy_failed = wait_for_uniphy();
+  if (uniphy_failed) {
+    ACL_PCIE_DEBUG_MSG(":: [%s] Uniphy failed to calibrate.\n", m_name);
+  }
+
+  // The programming sequence is over either way.
+  m_being_programmed = false;
+
+  return uniphy_failed ? 1 : 0;
+}
+
+// JTAG full-chip programming (using quartus_pgm via USB-Blaster) to replace periphery + core
+// Return 0 on success
+// skipSaveRestore skips the PCIe pause/save before and restore/resume after
+// programming; a failed save or restore is returned ahead of the programming
+// result itself.
+int ACL_PCIE_DEVICE::reprogram_sof(const char *sof_filename, const bool skipSaveRestore) {
+  if (!skipSaveRestore) {
+    const int save_status = pause_and_save_pcie();
+    if (save_status) {
+      return save_status;
+    }
+  }
+
+  // JTAG programming the device
+  ACL_PCIE_DEBUG_MSG(":: [%s] Starting JTAG programming of the device...\n", m_name);
+  const int reprogram_failed =
+      m_config->program_with_SOF_file(sof_filename, "0" /*ad_cable*/, "0" /*ad_device_index*/);
+
+  if (!skipSaveRestore) {
+    const int restore_status = restore_and_resume_pcie();
+    if (restore_status) {
+      return restore_status;
+    }
+  }
+
+  if (!reprogram_failed) {
+    ACL_PCIE_DEBUG_MSG(":: [%s] JTAG programming passed.\n", m_name);
+  }
+
+  return reprogram_failed;
+}
+#else
+// perform PR reprogram by attempting to program the board using an RBF. If this is not possible due to
+// 1) Envoking the user of JTAG_PROGRAMMING via ACL_PCIE_USE_JTAG_PROGRAMMING
+// 2) RBF or HASH are not present
+// 3) PR Base ID does not match that with which the RBF was compiled
+// 4) UniPhy fails to calibrate
+// Then returns 1. Returns 0 on success. Always returns flag from arguments indicating source of failure
+//
+// Output flags: *rbf_or_hash_not_provided, *hash_mismatch, and
+// *quartus_compile_version_mismatch are cleared/set along the way so the
+// caller can decide whether to fall back to JTAG; *use_jtag_programming is
+// set from the ACL_PCIE_USE_JTAG_PROGRAMMING environment variable.
+int ACL_PCIE_DEVICE::pr_reprogram(struct acl_pkg_file *pkg,
+                                  const char *SOFNAME,
+                                  int *rbf_or_hash_not_provided,
+                                  int *hash_mismatch,
+                                  unsigned *use_jtag_programming,
+                                  int *quartus_compile_version_mismatch) {
+  // Environment variable to control when to use JTAG instead of PR (overriding the default programming method: PR)
+  int reprogram_failed = 1;
+  size_t core_rbf_len = 0, pr_import_version_len = 0, quartus_version_len = 0, pll_config_len = 0;
+  *use_jtag_programming = 0;
+  char *str_use_jtag_programming = getenv("ACL_PCIE_USE_JTAG_PROGRAMMING");
+  if (str_use_jtag_programming) *use_jtag_programming = 1;
+
+  // 1. Default programming method: PR
+  if (!*use_jtag_programming) {
+    // checking that rbf and hash sections exist in fpga.bin
+    if (acl_pkg_section_exists(pkg, ACL_PKG_SECTION_CORE_RBF, &core_rbf_len) &&
+        acl_pkg_section_exists(pkg, ACL_PKG_SECTION_HASH, &pr_import_version_len) &&
+        (acl_pkg_section_exists(pkg, ACL_PKG_SECTION_QVERSION, &quartus_version_len) || m_skip_quartus_version_check)) {
+      *rbf_or_hash_not_provided = 0;
+      ACL_PCIE_DEBUG_MSG(
+          ":: [%s] Programming kernel region using PR with rbf file size %i\n", m_name, (UINT32)core_rbf_len);
+
+      // read rbf and hash from fpga.bin
+      // NOTE(review): acl_aligned_malloc's result is not NULL-checked before
+      // acl_pkg_read_section uses it — verify the helper aborts on failure.
+      char *core_rbf;
+      acl_aligned_malloc((void **)&core_rbf, core_rbf_len + 1);
+      int read_core_rbf_ok = acl_pkg_read_section(pkg, ACL_PKG_SECTION_CORE_RBF, core_rbf, core_rbf_len + 1);
+
+      if (!m_skip_quartus_version_check) {
+        char *quartus_compile_version_str = (char *)malloc(quartus_version_len + 1);
+        if (quartus_compile_version_str) {
+          int quartus_compile_version_ok =
+              acl_pkg_read_section(pkg, ACL_PKG_SECTION_QVERSION, quartus_compile_version_str, quartus_version_len + 1);
+
+          if (quartus_compile_version_ok) {
+            // Remove Linux and Windows new-line ending in .acl.qversion
+            if ((quartus_version_len > 0) && (quartus_compile_version_str[quartus_version_len - 1] == '\n' ||
+                                              quartus_compile_version_str[quartus_version_len - 1] == '\r')) {
+              quartus_compile_version_str[quartus_version_len - 1] = '\0';
+            }
+            if ((quartus_version_len > 1) && (quartus_compile_version_str[quartus_version_len - 2] == '\r')) {
+              quartus_compile_version_str[quartus_version_len - 2] = '\0';
+            }
+
+            *quartus_compile_version_mismatch = quartus_ver_test(quartus_compile_version_str);
+          } else {
+            *quartus_compile_version_mismatch = 1;
+          }
+          free(quartus_compile_version_str);
+          quartus_compile_version_str = NULL;
+        } else {
+          *quartus_compile_version_mismatch = 1;
+        }
+      } else {
+        *quartus_compile_version_mismatch = 0;
+      }
+
+      if (*quartus_compile_version_mismatch == 0) {
+        char *pr_import_version_str = (char *)malloc(pr_import_version_len + 1);
+        if (pr_import_version_str) {
+          int pr_import_version_ok =
+              acl_pkg_read_section(pkg, ACL_PKG_SECTION_HASH, pr_import_version_str, pr_import_version_len + 1);
+
+          // checking that hash was successfully read from section .acl.hash within fpga.bin
+          if (pr_import_version_ok) {
+            unsigned int pr_import_version = (unsigned int)strtol(pr_import_version_str, NULL, 10);
+
+            // checking that base revision hash matches import revision hash and aocx and programmed sof is from same
+            // Quartus version
+            if (pr_base_id_test(pr_import_version) == 0) {
+              *hash_mismatch = 0;
+
+              // Kernel driver wants it aligned to 4 bytes.
+              int aligned_to_4_bytes(0 == (3 & (uintptr_t)(core_rbf)));
+              reprogram_failed = 1;  // Default to fail before PRing
+
+              // checking that rbf was successfully read from section .acl.core.rbf within fpga.bin
+              if (read_core_rbf_ok && !(core_rbf_len % 4) && aligned_to_4_bytes && !version_id_test()) {
+                // reprogram Arria 10 devices
+                if (strcmp(ACL_BSP_TYPE, "Arria10") == 0) {
+                  ACL_PCIE_DEBUG_MSG(":: [%s] Starting PR programming of the device...\n", m_name);
+                  reprogram_failed = m_config->program_core_with_PR_file_a10((char *)core_rbf, core_rbf_len);
+                  ACL_PCIE_DEBUG_MSG(":: [%s] Finished PR programming of the device.\n", m_name);
+                };
+
+                // reprogram Stratix 10 devices (S10 PR additionally needs the
+                // PLL configuration section from fpga.bin)
+                if (strcmp(ACL_BSP_TYPE, "Stratix10") == 0) {
+                  acl_pkg_section_exists(pkg, ACL_PKG_SECTION_PLL_CONFIG, &pll_config_len);
+                  char *pll_config_str = (char *)malloc(pll_config_len + 1);
+                  if (pll_config_str) {
+                    int pll_config_ok =
+                        acl_pkg_read_section(pkg, ACL_PKG_SECTION_PLL_CONFIG, pll_config_str, pll_config_len + 1);
+                    if (pll_config_ok) {
+                      ACL_PCIE_DEBUG_MSG(":: [%s] Starting PR programming of the device...\n", m_name);
+                      reprogram_failed = m_config->program_core_with_PR_file_s10(
+                          (char *)core_rbf, core_rbf_len, (char *)pll_config_str);
+                      ACL_PCIE_DEBUG_MSG(":: [%s] Finished PR programming of the device.\n", m_name);
+                    };
+                  };
+                  free(pll_config_str);
+                  pll_config_str = NULL;
+                };
+
+                if (reprogram_failed) {
+                  ACL_PCIE_DEBUG_MSG(":: [%s] PR programming failed.\n", m_name);
+                  // PR failed. Check if device I/O is blocked.
+                  // A blocked PR region cannot be recovered from software, so
+                  // the process exits rather than continue with a dead device.
+                  if (check_kernel_region_status() == -1) {
+                    ACL_PCIE_INFO("[%s] Partial Reconfiguration of FPGA has failed.\n", m_name);
+                    ACL_PCIE_INFO("[%s] FPGA device will not be available until host has been powercycled.\n", m_name);
+                    exit(1);
+                  }
+                } else if (version_id_test()) {
+                  ACL_PCIE_DEBUG_MSG(":: [%s] version_id_test() failed.\n", m_name);
+                  reprogram_failed = 1;
+                } else if (wait_for_uniphy()) {
+                  ACL_PCIE_DEBUG_MSG(":: [%s] Uniphy failed to calibrate.\n", m_name);
+                  reprogram_failed = 1;
+                } else {
+                  ACL_PCIE_DEBUG_MSG(":: [%s] PR programming passed.\n", m_name);
+                }
+              }
+            }
+          }
+          free(pr_import_version_str);
+          pr_import_version_str = NULL;
+        }
+      }
+      acl_aligned_free(core_rbf);
+    }
+  }
+
+  return reprogram_failed;
+}
+
+// Reprogram the device with given binary file.
+// There are two ways to program:
+// 1. PR to replace the OpenCL kernel partition
+// 2. JTAG full-chip programming (using quartus_pgm via USB-Blaster) to replace periphery + core
+// Return 0 on success
+//
+// data/data_size describe an in-memory fpga.bin package. When program_mode is
+// ACL_PCIE_PROGRAM_PR only PR is attempted (no JTAG fallback); otherwise PR is
+// tried first and JTAG is used as the fallback when PR is not possible.
+int ACL_PCIE_DEVICE::reprogram(void *data, size_t data_size, int program_mode) {
+  int reprogram_failed = 1;          // assume failure
+  int rbf_or_hash_not_provided = 1;  // assume no rbf or hash are provided in fpga.bin
+  int hash_mismatch = 1;             // assume base revision and import revision hashes do not match
+  unsigned use_jtag_programming = 0; // assume no need for jtag programming
+  int quartus_compile_version_mismatch = 1;
+  size_t quartus_version_len;
+
+  const char *SOFNAME = "reprogram_temp.sof";
+  size_t sof_len = 0;
+
+  ACL_PCIE_DEBUG_MSG(":: [%s] Starting to program device...\n", m_name);
+
+  struct acl_pkg_file *pkg = acl_pkg_open_file_from_memory((char *)data, data_size, ACL_PKG_SHOW_ERROR);
+  ACL_PCIE_ERROR_IF(pkg == NULL, return reprogram_failed, "cannot open file from memory using pkg editor.\n");
+
+  // set the being_programmed flag
+  m_being_programmed = true;
+
+  // the new reprogram flow: first try PR, if failed falls back to the old reprogram flow
+  int try_pr_failed = 0;
+  // if choose to try reprogram with preserving memory
+  if (program_mode == ACL_PCIE_PROGRAM_PR) {
+    // only try PR, no fall back to JTAG
+    ACL_PCIE_DEBUG_MSG("[%s] Trying Partial Reconfiguration\n", m_name);
+    reprogram_failed = pr_reprogram(pkg,
+                                    SOFNAME,
+                                    &rbf_or_hash_not_provided,
+                                    &hash_mismatch,
+                                    &use_jtag_programming,
+                                    &quartus_compile_version_mismatch);
+    // clean up
+    if (reprogram_failed || use_jtag_programming || rbf_or_hash_not_provided || hash_mismatch ||
+        (quartus_compile_version_mismatch && !m_skip_quartus_version_check)) {
+      // try PR failed
+      try_pr_failed = 1;
+    }
+    if (pkg) acl_pkg_close_file(pkg);
+    m_being_programmed = false;
+    return try_pr_failed;
+  }
+
+  // the old reprogram flow. Try PR and then Try JTAG
+  // 1. Default to PR reprogramming
+  ACL_PCIE_DEBUG_MSG("[%s] Reprogram the device with data saving and restoring\n", m_name);
+  ACL_PCIE_DEBUG_MSG("[%s] Trying Partial Reconfiguration\n", m_name);
+  reprogram_failed = pr_reprogram(pkg,
+                                  SOFNAME,
+                                  &rbf_or_hash_not_provided,
+                                  &hash_mismatch,
+                                  &use_jtag_programming,
+                                  &quartus_compile_version_mismatch);
+
+  // Autodetect JTAG cable & device index
+  // Cable and Index value should't overflow
+  char ad_cable[AD_CABLE_SIZE];
+  char ad_device_index[AD_CABLE_SIZE];
+
+  // 2. Fallback programming method: JTAG full-chip programming
+  // (taken whenever PR was skipped or could not be attempted)
+  if (use_jtag_programming || rbf_or_hash_not_provided || hash_mismatch ||
+      (quartus_compile_version_mismatch && !m_skip_quartus_version_check)) {
+    ACL_PCIE_DEBUG_MSG("[%s] Trying Full-Chip Reconfiguration (JTAG)\n", m_name);
+
+    // checking that sof section exist in fpga.bin
+    if (acl_pkg_section_exists(pkg, ACL_PKG_SECTION_SOF, &sof_len)) {
+      // check if aocx is fast-compiled or not - if so, then sof is a base revision,
+      // and does not necessarily contain the desired kernel. Requires sof with
+      // matching pr_base.id to be programmed (base.sof) followed by PR programming
+      // with the given .rbf
+      size_t fast_compile_len = 0;
+      char *fast_compile_contents = NULL;
+      int fast_compile = 0;
+      if (acl_pkg_section_exists(pkg, ACL_PKG_SECTION_FAST_COMPILE, &fast_compile_len) &&
+          acl_pkg_read_section_transient(pkg, ACL_PKG_SECTION_FAST_COMPILE, &fast_compile_contents)) {
+        fast_compile = 1;
+        ACL_PCIE_DEBUG_MSG(":: [%s] Fast-compile fpga.bin detected.\n", m_name);
+      }
+      // Find jtag cable for the board
+      // Returns 0 for both ad_cable,ad_device_index if not found
+      // or if Autodetect is disabled
+      this->find_jtag_cable(ad_cable, ad_device_index);
+
+      // write out a SOF file (quartus_pgm needs an on-disk .sof)
+      const int wrote_sof = acl_pkg_read_section_into_file(pkg, ACL_PKG_SECTION_SOF, SOFNAME);
+      ACL_PCIE_ERROR_IF(!wrote_sof, goto cleanup, "could not write %s.\n", SOFNAME);
+
+      // disable interrupt and save control registers
+      const int failed_int_disable = this->disable_interrupts();
+      ACL_PCIE_ERROR_IF(failed_int_disable, goto cleanup, "could not disable interrupt.\n");
+
+      // Do this last before programming
+      const int failed_cont_reg_save = m_config->save_pci_control_regs();
+      ACL_PCIE_ERROR_IF(failed_cont_reg_save, goto cleanup, "could not save control regs\n");
+
+      // JTAG programming the device
+      ACL_PCIE_DEBUG_MSG(":: [%s] Starting JTAG programming of the device...\n", m_name);
+      reprogram_failed = m_config->program_with_SOF_file(SOFNAME, ad_cable, ad_device_index);
+
+#if defined(LINUX)
+      m_config->load_pci_control_regs();
+#endif
+
+      ACL_PCIE_ERROR_IF(reprogram_failed, goto cleanup, "Failed to JTAG program\n");
+
+      // Mirror the package's Quartus version string into the on-chip version
+      // RAM so later PR attempts can validate against it.
+      if (!m_skip_quartus_version_check &&
+          acl_pkg_section_exists(pkg, ACL_PKG_SECTION_QVERSION, &quartus_version_len)) {
+        char *quartus_compile_version_str = (char *)malloc(quartus_version_len + 1);
+        if (quartus_compile_version_str) {
+          int quartus_compile_version_ok =
+              acl_pkg_read_section(pkg, ACL_PKG_SECTION_QVERSION, quartus_compile_version_str, quartus_version_len + 1);
+          if (quartus_compile_version_ok) {
+            // Remove Linux and Windows new-line ending in .acl.qversion
+            if ((quartus_version_len > 0) && (quartus_compile_version_str[quartus_version_len - 1] == '\n' ||
+                                             quartus_compile_version_str[quartus_version_len - 1] == '\r')) {
+              quartus_compile_version_str[quartus_version_len - 1] = '\0';
+            }
+            if ((quartus_version_len > 1) && (quartus_compile_version_str[quartus_version_len - 2] == '\r')) {
+              quartus_compile_version_str[quartus_version_len - 2] = '\0';
+            }
+            // Last character is NULL added by acl_pkg_read_section
+            m_io->quartus_ver->write_block(0, quartus_version_len + 1, quartus_compile_version_str);
+          }
+          free(quartus_compile_version_str);
+          quartus_compile_version_str = NULL;
+        }
+      }
+
+      if (version_id_test()) {
+        ACL_PCIE_DEBUG_MSG(":: [%s] version_id_test() failed.\n", m_name);
+        reprogram_failed = 1;
+      } else if (wait_for_uniphy()) {
+        ACL_PCIE_DEBUG_MSG(":: [%s] Uniphy failed to calibrate.\n", m_name);
+        reprogram_failed = 1;
+      }
+      if (strcmp(ACL_BSP_TYPE, "Stratix10") == 0) {
+        // S10 PR
+        if (deassert_pr_reset()) {
+          ACL_PCIE_DEBUG_MSG(":: [%s] PR region controller reset source deasserted.\n", m_name);
+        }
+      };
+      if (fast_compile) {
+        // need to rerun pr_reprogram because design should be loaded now
+        hash_mismatch = 0;
+        rbf_or_hash_not_provided = 0;
+        reprogram_failed = pr_reprogram(pkg,
+                                        SOFNAME,
+                                        &rbf_or_hash_not_provided,
+                                        &hash_mismatch,
+                                        &use_jtag_programming,
+                                        &quartus_compile_version_mismatch);
+      }
+      if (!(reprogram_failed)) {
+        ACL_PCIE_DEBUG_MSG(":: [%s] JTAG programming passed.\n", m_name);
+      }
+
+  } else {
+    ACL_PCIE_DEBUG_MSG(":: [%s] Could not read SOF file from fpga.bin.\n", m_name);
+    reprogram_failed = 1;
+  }
+  }
+
+cleanup:
+  // Clean up
+  if (pkg) acl_pkg_close_file(pkg);
+  m_being_programmed = false;
+
+  return reprogram_failed;
+}
+#endif
+
+// Perform a simple version id read to test the basic PCIe read functionality
+// Return 0 on success
+// Success means the value read from the version register falls within
+// [ACL_VERSIONID_MIN, ACL_VERSIONID]; -1 means the read returned out-of-range
+// data (bad link or incompatible image).
+int ACL_PCIE_DEVICE::version_id_test() {
+  unsigned int version = ACL_VERSIONID ^ 1;  // make sure it's not what we hope to find.
+  unsigned int iattempt;
+  unsigned int max_attempts = 1;
+  unsigned int usleep_per_attempt = 20;  // 20 ms per.
+
+  ACL_PCIE_DEBUG_MSG(":: [%s] Doing PCIe-to-fabric read test ...\n", m_name);
+  for (iattempt = 0; iattempt < max_attempts; iattempt++) {
+    m_io->version->read32(0, &version);
+    if ((version >= (unsigned int)ACL_VERSIONID_MIN) && (version <= (unsigned int)ACL_VERSIONID)) {
+      ACL_PCIE_DEBUG_MSG(":: [%s] PCIe-to-fabric read test passed\n", m_name);
+      return 0;
+    }
+    // Sleep between attempts (with max_attempts == 1 there is no actual retry).
+#if defined(WINDOWS)
+    Sleep(usleep_per_attempt);
+#endif  // WINDOWS
+#if defined(LINUX)
+    usleep(usleep_per_attempt * 1000);
+#endif  // LINUX
+  }
+
+  // Kernel read command succeed, but got bad data. (version id doesn't match)
+  ACL_PCIE_INFO("[%s] PCIe-to-fabric read test failed, read 0x%0x after %u attempts\n", m_name, version, iattempt);
+  return -1;
+}
+
+// Perform a read of the kernel region status IP
+// Return 0 on success (PR region is unfrozen and ready to use)
+// On Linux this queries the kernel driver for the PR region status; -1 means
+// the driver read itself failed. On other platforms it always reports success.
+int ACL_PCIE_DEVICE::check_kernel_region_status() {
+#if defined(LINUX)
+  unsigned int value;
+  struct acl_cmd driver_cmd = {ACLPCI_CMD_BAR, ACLPCI_CMD_GET_PR_REGION_STATUS, NULL, &value, sizeof(value)};
+  if (read(m_device, &driver_cmd, sizeof(driver_cmd)) == -1) {
+    return -1;
+  } else {
+    return value;
+  }
+#endif  // Linux
+  return 0;
+}
+
+// Performs a write to PR region controller to deassert reset to PR region
+// Return 0 on success
+// Writes 0 to the freeze-control register to release the PR region
+// (used after JTAG programming on Stratix 10 — see reprogram()).
+// Always reports success.
+int ACL_PCIE_DEVICE::deassert_pr_reset() {
+  ACL_PCIE_DEBUG_MSG(":: [%s] Deasserting PR region controller reset ...\n", m_name);
+  m_io->pr_region_ctrl->write32(FREEZE_CTRL_OFFSET, 0);
+
+  return 0;  // success
+}
+
+// Quartus Compile Version check
+// Return 0 on success
+// Compares the Quartus version string stored in the FPGA's version RAM with
+// the one recorded in the fpga.bin package (pkg_qversion_str). Returns 1 when
+// the hardware lacks the version RAM, allocation fails, pkg_qversion_str is
+// NULL, or the strings differ.
+int ACL_PCIE_DEVICE::quartus_ver_test(char *pkg_qversion_str) {
+  char *fpga_qversion_str;
+  unsigned int version;
+
+  // Check version ID to ensure feature supported in HW
+  m_io->version->read32(0, &version);
+  if (version < (unsigned int)ACL_QUARTUSVER_VERSIONID) {
+    ACL_PCIE_DEBUG_MSG(":: [%s] Programming on board without Quartus Version RAM\n", m_name);
+    return 1;
+  }
+
+  // Allocate buffer for Quartus version read from FPGA with
+  // largest expected size + 1 for NULL
+  fpga_qversion_str = reinterpret_cast<char*>(malloc(ACL_QUARTUSVER_ROM_SIZE + 1));
+  if (NULL == fpga_qversion_str) {
+    ACL_PCIE_DEBUG_MSG(":: Memory allocation failed, allocating %d bytes\n", ACL_QUARTUSVER_ROM_SIZE + 1);
+    free(fpga_qversion_str);  // pointer is NULL here; free(NULL) is a no-op
+    return 1;
+  }
+  // Make sure it's not what we hope to find
+  memset(fpga_qversion_str, 0, ACL_QUARTUSVER_ROM_SIZE + 1);
+
+  m_io->quartus_ver->read_block(0, ACL_QUARTUSVER_ROM_SIZE, fpga_qversion_str);
+
+  size_t fpga_qversion_len = 0;
+  fpga_qversion_len = strnlen(fpga_qversion_str, MAX_LEN);
+
+  size_t pkg_qversion_len = 0;
+  if (pkg_qversion_str) {
+    pkg_qversion_len = strnlen(pkg_qversion_str, MAX_LEN);
+
+    // Cheap length comparison first; mismatched lengths cannot match.
+    if (fpga_qversion_len != pkg_qversion_len) {
+      // Kernel read command succeed, but got bad data. (Quartus Version doesn't match)
+      ACL_PCIE_DEBUG_MSG("[%s] Quartus versions for base and import compile do not match\n", m_name);
+      ACL_PCIE_DEBUG_MSG("[%s] Board is currently programmed with sof from Quartus %s\n", m_name, fpga_qversion_str);
+      ACL_PCIE_DEBUG_MSG("[%s] PR import was compiled with Quartus %s\n", m_name, pkg_qversion_str);
+      free(fpga_qversion_str);
+      return 1;
+    }
+
+    if (strncmp(pkg_qversion_str, fpga_qversion_str, fpga_qversion_len) == 0) {
+      ACL_PCIE_DEBUG_MSG(":: [%s] Quartus versions for base and import compile match\n", m_name);
+      ACL_PCIE_DEBUG_MSG(":: [%s] Board is currently programmed with sof from Quartus %s\n", m_name, fpga_qversion_str);
+      ACL_PCIE_DEBUG_MSG(":: [%s] PR import was compiled with Quartus %s\n", m_name, pkg_qversion_str);
+      free(fpga_qversion_str);
+      return 0;
+    }
+
+    // Kernel read command succeed, but got bad data. (Quartus Version doesn't match)
+    ACL_PCIE_DEBUG_MSG("[%s] Quartus versions for base and import compile do not match\n", m_name);
+    ACL_PCIE_DEBUG_MSG("[%s] Board is currently programmed with sof from Quartus %s\n", m_name, fpga_qversion_str);
+    ACL_PCIE_DEBUG_MSG("[%s] PR import was compiled with Quartus %s\n", m_name, pkg_qversion_str);
+  }
+  free(fpga_qversion_str);
+  return 1;
+}
+
+// Read the PR base ID from the static region and compare it against the ID
+// the PR import was compiled for.
+// Return 0 when the IDs match, -1 otherwise.
+int ACL_PCIE_DEVICE::pr_base_id_test(unsigned int pr_import_version) {
+  unsigned int pr_base_version = 0;  // deliberately not the value we expect
+
+  ACL_PCIE_DEBUG_MSG(":: [%s] Reading PR base ID from fabric ...\n", m_name);
+  m_io->pr_base_id->read32(0, &pr_base_version);
+
+  if (pr_base_version != pr_import_version) {
+    // Kernel read command succeed, but got bad data. (version id doesn't match)
+    ACL_PCIE_DEBUG_MSG("[%s] PR base and import compile IDs do not match\n", m_name);
+    ACL_PCIE_DEBUG_MSG("[%s] PR base ID currently configured is 0x%0x\n", m_name, pr_base_version);
+    ACL_PCIE_DEBUG_MSG("[%s] PR import compile expects ID to be 0x%0x\n", m_name, pr_import_version);
+    return -1;
+  }
+
+  ACL_PCIE_DEBUG_MSG(":: [%s] PR base and import compile IDs match\n", m_name);
+  ACL_PCIE_DEBUG_MSG(":: [%s] PR base ID currently configured is 0x%0x\n", m_name, pr_base_version);
+  ACL_PCIE_DEBUG_MSG(":: [%s] PR import compile ID is 0x%0x\n", m_name, pr_import_version);
+  return 0;
+}
+
+// 1. Write a random value to cade_id register, do a read to confirm the write
+// 2. Use the random value to find the JTAG cable for that board
+// 3. Return "0" on ad_cable,ad_device_index if cable not found
+// ad_cable and ad_device_index must each hold at least AD_CABLE_SIZE bytes.
+void ACL_PCIE_DEVICE::find_jtag_cable(char *ad_cable, char *ad_device_index) {
+  bool jtag_ad_disabled = false;
+  bool jtag_ad_cable_found = false;
+  unsigned int version = 0;
+
+  // Check if Autodetect is disabled (user supplied a cable/index explicitly)
+  const char *cable = getenv("ACL_PCIE_JTAG_CABLE");
+  const char *device_index = getenv("ACL_PCIE_JTAG_DEVICE_INDEX");
+  if (cable || device_index) {
+    jtag_ad_disabled = true;
+    ACL_PCIE_DEBUG_MSG(":: [%s] JTAG cable autodetect disabled!!!\n", m_name);
+  }
+
+  // Check version ID to ensure feature supported in HW
+  m_io->version->read32(0, &version);
+  if (version < (unsigned int)ACL_CADEID_VERSIONID) {
+    jtag_ad_disabled = true;
+    ACL_PCIE_DEBUG_MSG(":: [%s] JTAG cable autodetect disabled due to old HW version!!!\n", m_name);
+  }
+
+  // If JTAG autodetect is enabled, program the CADEID register
+  // and look for the value using in system sources and probes
+  if (!jtag_ad_disabled) {
+    // Only use random device here because we only want one value. Normally use mersenne twister for more values
+    std::random_device rd;
+    std::uniform_int_distribution<unsigned int> dist(0u, 0xFFFFFFFFu);
+    unsigned int cade_id_write = dist(rd) & 0xFFFFFFFF;
+    cade_id_write = cade_id_write | 0x80000000;  // Write a full 32 bit value
+    unsigned int cade_id_read = 0x0;
+
+    ACL_PCIE_DEBUG_MSG(":: [%s] Writing Cade ID to fabric ...\n", m_name);
+    m_io->cade_id->write32(0, cade_id_write);
+
+    // Read back to confirm the register took the value before searching for it
+    ACL_PCIE_DEBUG_MSG(":: [%s] Reading Cade ID from fabric ...\n", m_name);
+    m_io->cade_id->read32(0, &cade_id_read);
+
+    if (cade_id_write == cade_id_read) {
+      ACL_PCIE_DEBUG_MSG(":: [%s] Cade ID write/read success ...\n", m_name);
+      ACL_PCIE_DEBUG_MSG(
+          ":: [%s] Cade ID cade_id_write 0x%0x, cade_id_read 0x%0x\n", m_name, cade_id_write, cade_id_read);
+
+      // Returns NULL on ad_cable,ad_device_index if no cable found
+      jtag_ad_cable_found = m_config->find_cable_with_ISSP(cade_id_write, ad_cable, ad_device_index);
+
+      if (!jtag_ad_cable_found) {
+        ACL_PCIE_DEBUG_MSG(":: [%s] Using default cable 1 ...\n", m_name);
+      } else {
+        ACL_PCIE_DEBUG_MSG(":: [%s] Found Cable ...\n", m_name);
+      }
+    } else {
+      ACL_PCIE_DEBUG_MSG(":: [%s] Cade ID write/read failed. Check BSP version or PCIE link...\n", m_name);
+      ACL_PCIE_DEBUG_MSG(
+          ":: [%s] Cade ID cade_id_write 0x%0x, cade_id_read 0x%0x\n", m_name, cade_id_write, cade_id_read);
+    }
+  }
+
+  // Fall back to "0"/"0" whenever autodetect was disabled or unsuccessful
+  if (jtag_ad_disabled || !jtag_ad_cable_found) {
+    snprintf(ad_cable, AD_CABLE_SIZE, "%s", "0");
+    snprintf(ad_device_index, AD_CABLE_SIZE, "%s", "0");
+  }
+}
+
+// Wait until the uniphy calibrated
+// Return 0 on success
+// Polls the uniphy status register up to 8 times, issuing a reset and waiting
+// 400 ms between attempts; returns -1 and logs a per-core diagnosis if the
+// memory interfaces never report calibrated.
+int ACL_PCIE_DEVICE::wait_for_uniphy() {
+  const unsigned int ACL_UNIPHYSTATUS = 0;  // all-clear value of the status register
+  unsigned int status = 1, retries = 0;
+
+  while (retries++ < 8) {
+    m_io->uniphy_status->read32(0, &status);
+
+    if (status == ACL_UNIPHYSTATUS) {
+      ACL_PCIE_DEBUG_MSG(":: [%s] Uniphys are calibrated\n", m_name);
+      return 0;  // success
+    }
+
+    ACL_PCIE_DEBUG_MSG(":: [%s] Uniphy status read was %x\n", m_name, status);
+    ACL_PCIE_DEBUG_MSG(":: [%s] Resetting Uniphy try %d\n", m_name, retries);
+    m_io->uniphy_reset->write32(0, 1);
+
+#if defined(WINDOWS)
+    Sleep(400);
+#endif  // WINDOWS
+#if defined(LINUX)
+    usleep(400 * 1000);
+#endif  // LINUX
+  }
+
+  ACL_PCIE_INFO("[%s] uniphy(s) did not calibrate. Expected 0 but read %x\n", m_name, status);
+
+  // Failure! Was it communication error or actual calibration failure?
+  if (ACL_PCIE_READ_BIT(status, 3))  // This bit is hardcoded to 0
+    ACL_PCIE_INFO(
+        "      Uniphy calibration status is corrupt. This is likely a communication error with the board "
+        "and/or uniphy_status module.\n");
+  else {
+    // This is a 32-bit interface with the first 4 bits aggregating the
+    // various calibration signals. The remaining 28-bits would indicate
+    // failure for their respective memory core. Tell users which ones
+    // failed
+    for (int i = 0; i < 32 - 4; i++) {
+      if (ACL_PCIE_READ_BIT(status, 4 + i)) ACL_PCIE_INFO("      Uniphy core %d failed to calibrate\n", i);
+    }
+    ACL_PCIE_INFO("      If there are more failures than Uniphy controllers connected, \n");
+    ACL_PCIE_INFO("      ensure the uniphy_status core is correctly parameterized.\n");
+  }
+
+  return -1;  // failure
+}
diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_device.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_device.h
new file mode 100644
index 0000000..29f5128
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_device.h
@@ -0,0 +1,209 @@
+#ifndef ACL_PCIE_DEVICE_H
+#define ACL_PCIE_DEVICE_H
+
+/* (c) 1992-2021 Intel Corporation. */
+/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */
+/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */
+/* and/or other countries. Other marks and brands may be claimed as the property */
+/* of others. See Trademarks on intel.com for full list of Intel trademarks or */
+/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */
+/* Your use of Intel Corporation's design tools, logic functions and other */
+/* software and tools, and its AMPP partner logic functions, and any output */
+/* files any of the foregoing (including device programming or simulation */
+/* files), and any associated documentation or information are expressly subject */
+/* to the terms and conditions of the Altera Program License Subscription */
+/* Agreement, Intel MegaCore Function License Agreement, or other applicable */
+/* license agreement, including, without limitation, that your use is for the */
+/* sole purpose of programming logic devices manufactured by Intel and sold by */
+/* Intel or its authorized distributors. Please refer to the applicable */
+/* agreement for further details. */
+
+/* ===- acl_pcie_device.h -------------------------------------------- C++ -*-=== */
+/* */
+/* Intel(R) OpenCL MMD Driver */
+/* */
+/* ===-------------------------------------------------------------------------=== */
+/* */
+/* This file declares the class to handle operations on a single device. */
+/* The actual implementation of the class lives in the acl_pcie_device.cpp */
+/* */
+/* ===-------------------------------------------------------------------------=== */
+
+// Forward declaration for classes used by ACL_PCIE_DEVICE
+class ACL_PCIE_DMA;
+class ACL_PCIE_CONFIG;
+class ACL_PCIE_MM_IO_MGR;
+class ACL_PCIE_HOSTCH;
+
+#if defined(LINUX)
+typedef int fpga_handle;
+#else
+#include <opae/fpga.h>
+#endif // LINUX
+
+#ifdef DLA_MMD
+// CoreDLA runtime assumes host/device transfers are thread safe
+#include <mutex>
+// don't assume opencl has been installed
+typedef int cl_int;
+#endif
+
+// Encapsulates the functionality of an ACL device connected to the host
+// through a PCI express bus.
+// Non-copyable: owns the device handle and the I/O/DMA/config helper objects.
+class ACL_PCIE_DEVICE {
+ public:
+ ACL_PCIE_DEVICE(int dev_num, const char *name, int handle, int user_signal_number);
+ ~ACL_PCIE_DEVICE();
+ ACL_PCIE_DEVICE(const ACL_PCIE_DEVICE&) = delete;
+ ACL_PCIE_DEVICE& operator= (const ACL_PCIE_DEVICE&) = delete;
+
+ // Cheap state queries used by the MMD entry points
+ bool is_valid() { return m_device != INVALID_HANDLE_VALUE; };
+ bool is_initialized() { return m_initialized; };
+ bool is_being_programmed() { return m_being_programmed; };
+
+ // Perform operations required when an interrupt is received for this device
+ void service_interrupt(unsigned int irq_type_flag = 0);
+ // This function can be used for triggering a fake device exception for testing
+ void test_trigger_device_interrupt();
+
+ // The callback function set by "set_status_handler"
+ // It's used to notify/update the host whenever an event is finished
+ void event_update_fn(aocl_mmd_op_t op, int status);
+
+ // Called by the host program when there are spare cycles
+ int yield();
+
+ // Memory I/O
+ // return 0 on success
+ int write_block(aocl_mmd_op_t e, aocl_mmd_interface_t mmd_interface, void *host_addr, size_t dev_addr, size_t size);
+ int read_block(aocl_mmd_op_t e, aocl_mmd_interface_t mmd_interface, void *host_addr, size_t dev_addr, size_t size);
+ int copy_block(aocl_mmd_op_t e, aocl_mmd_interface_t mmd_interface, size_t src, size_t dst, size_t size);
+
+ // Create channel. return handle to channel on success, negative otherwise
+ int create_hostchannel(char *name, size_t queue_depth, int direction);
+
+ // return 0 on success
+ int destroy_channel(int channel);
+
+ // return pointer that user can write to for write channel, and read from for read channel
+ void *hostchannel_get_buffer(size_t *buffer_size, int channel, int *status);
+
+ // return the size in bytes of the amount of buffer that was acknowledged to channel
+ size_t hostchannel_ack_buffer(size_t send_size, int channel, int *status);
+
+ // Set kernel, device interrupts and event update callbacks
+ // return 0 on success
+ int set_kernel_interrupt(aocl_mmd_interrupt_handler_fn fn, void *user_data);
+ int set_device_interrupt(aocl_mmd_device_interrupt_handler_fn fn, void *user_data);
+ int set_status_handler(aocl_mmd_status_handler_fn fn, void *user_data);
+
+ // Query PCIe information of the device
+ char *get_dev_pcie_info() { return m_info.pcie_info_str; };
+
+ // Query on-die temperature sensor, if available
+ bool get_ondie_temp_slow_call(cl_int *temp);
+
+ // Shared memory manipulation functions
+ void *shared_mem_alloc(size_t size, unsigned long long *device_ptr_out);
+ void shared_mem_free(void *host_ptr, size_t size);
+
+ // Reprogram the device with given binary file
+ // return 0 on success
+#ifdef DLA_MMD
+ int pause_and_save_pcie();
+ int restore_and_resume_pcie();
+ int reprogram_sof(const char *sof_filename, const bool skipSaveRestore = false);
+#else
+ int reprogram(void *data, size_t data_size, int program_mode);
+#endif
+
+ private:
+ // Helper routines for interrupts
+ // return 0 on success, negative on error
+ int mask_irqs();
+ int unmask_irqs();
+ int unmask_kernel_irq();
+ int disable_interrupts();
+ int enable_interrupts(int user_signal_number);
+ int get_interrupt_type(unsigned int *kernel_update, unsigned int *dma_update, unsigned int irq_type_flag);
+#if defined(WINDOWS)
+ void enable_msi(bool enable);
+#endif // WINDOWS
+
+ // Helper routines for read or write operations
+ // return 0 on success, negative on error (except for the "incr_ptrs" routine)
+ int read_write_block(aocl_mmd_op_t e, void *host_addr, size_t dev_addr, size_t size, bool reading);
+ int read_write_block_bar(void *host_addr, size_t dev_addr, size_t size, bool reading);
+ int read_write_small_size(void *host_addr, size_t dev_addr, size_t size, bool reading);
+ int set_segment(size_t addr);
+ void incr_ptrs(void **host, size_t *dev, size_t *counter, size_t incr);
+ int does_base_periph_match_new_periph(struct acl_pkg_file *pkg, const char *dev_name);
+
+ // Helper routines for simple functionality test
+ // return 0 on success, negative on error
+ int version_id_test();
+ int wait_for_uniphy();
+ int pr_base_id_test(unsigned int pr_import_version);
+ int deassert_pr_reset();
+ int quartus_ver_test(char *pkg_qversion_str);
+ int check_kernel_region_status();
+
+ // Write a random value to cade_id register, do a read to confirm the write
+ // Use the random value to find the JTAG cable for that board
+ // Writes "0" into ad_cable and ad_device_index if the cable is not found
+ void find_jtag_cable(char *ad_cable, char *ad_device_index);
+
+#ifndef DLA_MMD
+ // Performs PR reprogramming if possible, and returns different statuses on
+ // PR Hash, JTAG programming, RBF or Hash Presence
+ // Returns 0 on success, 1 on reprogram fail
+ int pr_reprogram(struct acl_pkg_file *pkg,
+ const char *SOFNAME,
+ int *rbf_or_hash_not_provided,
+ int *hash_mismatch,
+ unsigned *use_jtag_programming,
+ int *quartus_compile_version_mismatch);
+#endif
+
+ // Kernel interrupt handler and event update callbacks
+ aocl_mmd_interrupt_handler_fn kernel_interrupt;
+ void *kernel_interrupt_user_data;
+ aocl_mmd_device_interrupt_handler_fn device_interrupt;
+ void *device_interrupt_user_data;
+ aocl_mmd_status_handler_fn event_update;
+ void *event_update_user_data;
+ int m_user_signal_number;
+
+ // Owned helper objects for MMIO, DMA, host channels and PCIe config space
+ ACL_PCIE_MM_IO_MGR *m_io;
+ ACL_PCIE_DMA *m_dma;
+ ACL_PCIE_HOSTCH *m_hostch;
+ ACL_PCIE_CONFIG *m_config;
+
+ static const int MAX_NAME_LENGTH = 32;
+ int m_handle;
+ char m_name[MAX_NAME_LENGTH];
+ fpga_handle m_device;
+ ACL_PCIE_DEVICE_DESCRIPTION m_info;
+
+ bool m_use_dma_for_big_transfers;
+ bool m_mmd_irq_handler_enable;
+ bool m_initialized;
+ bool m_being_programmed;
+ bool m_skip_quartus_version_check;
+
+ // IRQ acknowledgement commands in the KMD
+ static const unsigned int NUM_ACK_CMDS = 3;
+#if defined(WINDOWS)
+ fpga_event_handle *dev_event_handle;
+#endif // WINDOWS
+
+ // For the host, memory is segmented. This stores the last used segment
+ // ID so we don't needlessly update it in hardware
+ UINT64 m_segment;
+
+#ifdef DLA_MMD
+ // Serializes host/device transfers; CoreDLA runtime assumes they are thread safe
+ std::mutex m_dma_mutex;
+#endif
+};
+
+#endif // ACL_PCIE_DEVICE_H
diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_dma.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_dma.h
new file mode 100644
index 0000000..ec9fdb1
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_dma.h
@@ -0,0 +1,37 @@
+#ifndef ACL_PCIE_DMA_H
+#define ACL_PCIE_DMA_H
+
+/* (c) 1992-2021 Intel Corporation. */
+/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */
+/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */
+/* and/or other countries. Other marks and brands may be claimed as the property */
+/* of others. See Trademarks on intel.com for full list of Intel trademarks or */
+/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */
+/* Your use of Intel Corporation's design tools, logic functions and other */
+/* software and tools, and its AMPP partner logic functions, and any output */
+/* files any of the foregoing (including device programming or simulation */
+/* files), and any associated documentation or information are expressly subject */
+/* to the terms and conditions of the Altera Program License Subscription */
+/* Agreement, Intel MegaCore Function License Agreement, or other applicable */
+/* license agreement, including, without limitation, that your use is for the */
+/* sole purpose of programming logic devices manufactured by Intel and sold by */
+/* Intel or its authorized distributors. Please refer to the applicable */
+/* agreement for further details. */
+
+/* ===- acl_pcie_dma.h ----------------------------------------------- C++ -*-=== */
+/* */
+/* Intel(R) OpenCL MMD Driver */
+/* */
+/* ===-------------------------------------------------------------------------=== */
+/* */
+/* */
+/* ===-------------------------------------------------------------------------=== */
+
+#if defined(WINDOWS)
+#include "acl_pcie_dma_windows.h"
+#endif // WINDOWS
+#if defined(LINUX)
+#include "acl_pcie_dma_linux.h"
+#endif // LINUX
+
+#endif // ACL_PCIE_DMA_H
diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_dma_linux.cpp b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_dma_linux.cpp
new file mode 100644
index 0000000..a83b0dd
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_dma_linux.cpp
@@ -0,0 +1,141 @@
+// (c) 1992-2021 Intel Corporation.
+// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words
+// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S.
+// and/or other countries. Other marks and brands may be claimed as the property
+// of others. See Trademarks on intel.com for full list of Intel trademarks or
+// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera)
+// Your use of Intel Corporation's design tools, logic functions and other
+// software and tools, and its AMPP partner logic functions, and any output
+// files any of the foregoing (including device programming or simulation
+// files), and any associated documentation or information are expressly subject
+// to the terms and conditions of the Altera Program License Subscription
+// Agreement, Intel MegaCore Function License Agreement, or other applicable
+// license agreement, including, without limitation, that your use is for the
+// sole purpose of programming logic devices manufactured by Intel and sold by
+// Intel or its authorized distributors. Please refer to the applicable
+// agreement for further details.
+
+/* ===- acl_pcie_dma_linux.cpp --------------------------------------- C++ -*-=== */
+/* */
+/* Intel(R) OpenCL MMD Driver */
+/* */
+/* ===-------------------------------------------------------------------------=== */
+/* */
+/* This file implements the class to handle Linux-specific DMA operations. */
+/* The declaration of the class lives in the acl_pcie_dma_linux.h */
+/* The actual implementation of DMA operation is inside the Linux kernel driver. */
+/* */
+/* ===-------------------------------------------------------------------------=== */
+
+#if defined(LINUX)
+
+// common and its own header files
+#include "acl_pcie_dma_linux.h"
+#include "acl_pcie.h"
+
+// other header files inside MMD driver
+#include "acl_pcie_device.h"
+#include "acl_pcie_mm_io.h"
+
+// other standard header files
+#include <stdio.h>
+#include <sys/time.h>
+#include <unistd.h>
+
+// Construct the Linux DMA helper: validate the arguments and record the
+// device handle, owning device object and MMIO manager. The actual DMA
+// machinery lives in the kernel driver, so no further setup is needed here.
+ACL_PCIE_DMA::ACL_PCIE_DMA(fpga_handle dev, ACL_PCIE_MM_IO_MGR *io, ACL_PCIE_DEVICE *pcie) {
+ ACL_PCIE_ASSERT(dev != INVALID_DEVICE, "passed in an invalid device when creating dma object.\n");
+ ACL_PCIE_ASSERT(io != NULL, "passed in an empty pointer for io when creating dma object.\n");
+ ACL_PCIE_ASSERT(pcie != NULL, "passed in an empty pointer for pcie when creating dma object.\n");
+
+ m_handle = dev;
+ m_pcie = pcie;
+ m_io = io;
+ m_event = NULL; // no transfer in flight yet
+}
+
+// Destructor: ask the kernel driver to stop any ongoing DMA. Driver commands
+// are delivered through read() on the device handle with an acl_cmd payload.
+ACL_PCIE_DMA::~ACL_PCIE_DMA() {
+ struct acl_cmd driver_cmd = {ACLPCI_CMD_BAR, ACLPCI_CMD_DMA_STOP, NULL, NULL};
+ int bytes_read = read(m_handle, &driver_cmd, sizeof(driver_cmd));
+ ACL_PCIE_ASSERT(bytes_read != -1, "failed to read driver command \n");
+}
+
+// Query the kernel driver for the DMA idle status.
+// Returns true only if the driver query succeeded AND it reported a
+// non-zero (idle) status; a failed read() is conservatively treated as busy.
+bool ACL_PCIE_DMA::is_idle() {
+ unsigned int result = 0;
+ int bytes_read;
+ struct acl_cmd driver_cmd;
+ driver_cmd.bar_id = ACLPCI_CMD_BAR;
+ driver_cmd.command = ACLPCI_CMD_GET_DMA_IDLE_STATUS;
+ driver_cmd.device_addr = NULL;
+ driver_cmd.user_addr = &result; // driver writes the idle flag here
+ driver_cmd.size = sizeof(result);
+ bytes_read = read(m_handle, &driver_cmd, sizeof(driver_cmd));
+
+ return (bytes_read != -1 && result != 0);
+}
+
+// Perform operations required when a DMA interrupt comes
+// For Linux,
+// All of the DMA related interrupts are handled inside the kernel driver,
+// so when MMD gets a signal from the kernel driver indicating DMA is finished,
+// it only needs to call the event_update_fn when it's needed.
+// No-op when there is no pending event (m_event == NULL).
+void ACL_PCIE_DMA::service_interrupt() {
+ if (m_event) {
+ // Use a temporary variable to save the event data and reset m_event
+ // before calling event_update_fn to avoid race condition that the main
+ // thread may start a new DMA transfer before this work-thread is able to
+ // reset the m_event.
+ // therefore, an assertion is implemented here, as defensively preventing
+ // sending interrupt signals incorrectly.
+ ACL_PCIE_ASSERT(
+ this->is_idle(),
+ "The dma is still in running, cannot service an interrupt to invoke another read/write operation\n");
+ aocl_mmd_op_t temp_event = m_event;
+ m_event = NULL;
+
+ // Report completion (status 0) for the saved event
+ m_pcie->event_update_fn(temp_event, 0);
+ }
+}
+
+// relinquish the CPU to let any other thread to run
+// return 0 since there is no useful work to be performed here
+// (usleep(0) is used purely as a scheduling hint)
+int ACL_PCIE_DMA::yield() {
+ usleep(0);
+ return 0;
+}
+
+// Transfer data between host and device
+// This function returns right after the transfer is scheduled
+// Return 0 on success
+// host_addr: host-side buffer; dev_addr: device offset; bytes: transfer size
+// e: completion event (may be NULL for a blocking transfer); reading: direction
+int ACL_PCIE_DMA::read_write(void *host_addr, size_t dev_addr, size_t bytes, aocl_mmd_op_t e, bool reading) {
+ // Currently dma cannot operate multiple read/write the same time.
+ // This means the read/write should be executed if and only if the dma is idle.
+ // Otherwise, it would cause assertion failure in the kernel space of the OS,
+ // which result in hanging, and even kernel panic and machine frozen as worst case.
+ // An assertion is implemented here, as defensively preventing race condition or incorrect sending of signal.
+ ACL_PCIE_ASSERT(this->is_idle(),
+ "The dma is still in running, cannot perform another %s operation concurrently.\n",
+ reading ? "read" : "write");
+
+ m_event = e;
+
+ // There are two scenarios of the read/write operation
+ // 1. the referred event is NULL, MMD would be stalled and keep polling the DMA until it is idle.
+ // 2. the referred event is valid, MMD would return immediately, runtime will wait for
+ // the DMA service interrupt signal to update the status of the read/write operation.
+ //
+ // Therefore, the dma service interrupt is expected only when the event is valid.
+ struct acl_cmd driver_cmd {};
+ driver_cmd.bar_id = ACLPCI_DMA_BAR;
+ driver_cmd.command = m_event ? ACLPCI_CMD_DMA_SERVICE_SIGNAL : ACLPCI_CMD_DMA_NO_SIGNAL;
+ driver_cmd.device_addr = reinterpret_cast<void *>(dev_addr);
+ driver_cmd.user_addr = host_addr;
+ driver_cmd.size = bytes;
+ // Direction selects the syscall: read() pulls from device, write() pushes to it
+ if (reading) {
+ if (read(m_handle, &driver_cmd, sizeof(driver_cmd)) == -1) return -1; // reading failed
+ } else {
+ if (write(m_handle, &driver_cmd, sizeof(driver_cmd)) == -1) return -1;
+ }
+ return 0; // success
+}
+
+#endif // LINUX
diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_dma_linux.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_dma_linux.h
new file mode 100644
index 0000000..2ad1762
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_dma_linux.h
@@ -0,0 +1,75 @@
+#ifndef ACL_PCIE_DMA_LINUX_H
+#define ACL_PCIE_DMA_LINUX_H
+
+/* (c) 1992-2021 Intel Corporation. */
+/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */
+/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */
+/* and/or other countries. Other marks and brands may be claimed as the property */
+/* of others. See Trademarks on intel.com for full list of Intel trademarks or */
+/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */
+/* Your use of Intel Corporation's design tools, logic functions and other */
+/* software and tools, and its AMPP partner logic functions, and any output */
+/* files any of the foregoing (including device programming or simulation */
+/* files), and any associated documentation or information are expressly subject */
+/* to the terms and conditions of the Altera Program License Subscription */
+/* Agreement, Intel MegaCore Function License Agreement, or other applicable */
+/* license agreement, including, without limitation, that your use is for the */
+/* sole purpose of programming logic devices manufactured by Intel and sold by */
+/* Intel or its authorized distributors. Please refer to the applicable */
+/* agreement for further details. */
+
+/* ===- acl_pcie_dma_linux.h ----------------------------------------- C++ -*-=== */
+/* */
+/* Intel(R) OpenCL MMD Driver */
+/* */
+/* ===-------------------------------------------------------------------------=== */
+/* */
+/* This file declares the class to handle Linux-specific DMA operations. */
+/* The actual implementation of the class lives in the acl_pcie_dma_linux.cpp */
+/* */
+/* ===-------------------------------------------------------------------------=== */
+
+#if defined(LINUX)
+
+#ifdef DLA_MMD
+#include <cstddef> //size_t
+#include "aocl_mmd.h"
+typedef int fpga_handle;
+#endif
+
+class ACL_PCIE_DEVICE;
+class ACL_PCIE_MM_IO_MGR;
+
+// Linux-side DMA helper. Schedules transfers through the kernel driver and
+// tracks the single in-flight completion event (see acl_pcie_dma_linux.cpp).
+class ACL_PCIE_DMA {
+ public:
+ ACL_PCIE_DMA(fpga_handle dev, ACL_PCIE_MM_IO_MGR *io, ACL_PCIE_DEVICE *pcie);
+ ~ACL_PCIE_DMA();
+
+ // True when the kernel driver reports the DMA engine is idle
+ bool is_idle();
+ // Busy-wait (with cooperative yields) until the DMA engine is idle
+ void stall_until_idle() {
+ while (!is_idle()) yield();
+ };
+
+ // Perform operations required when a DMA interrupt comes
+ void service_interrupt();
+
+ // Relinquish the CPU to let any other thread to run
+ // Return 0 since there is no useful work to be performed here
+ int yield();
+
+ // Transfer data between host and device
+ // This function returns right after the transfer is scheduled
+ // Return 0 on success
+ int read_write(void *host_addr, size_t dev_addr, size_t bytes, aocl_mmd_op_t e, bool reading);
+
+ private:
+ // Event to signal when the scheduled transfer completes; NULL when none pending
+ aocl_mmd_op_t m_event;
+
+ // Non-owning references supplied at construction
+ fpga_handle m_handle;
+ ACL_PCIE_DEVICE *m_pcie;
+ ACL_PCIE_MM_IO_MGR *m_io;
+};
+
+#endif // LINUX
+
+#endif // ACL_PCIE_DMA_LINUX_H
diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_dma_windows.cpp b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_dma_windows.cpp
new file mode 100644
index 0000000..ab5e7b2
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_dma_windows.cpp
@@ -0,0 +1,1381 @@
+// (c) 1992-2021 Intel Corporation.
+// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words
+// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S.
+// and/or other countries. Other marks and brands may be claimed as the property
+// of others. See Trademarks on intel.com for full list of Intel trademarks or
+// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera)
+// Your use of Intel Corporation's design tools, logic functions and other
+// software and tools, and its AMPP partner logic functions, and any output
+// files any of the foregoing (including device programming or simulation
+// files), and any associated documentation or information are expressly subject
+// to the terms and conditions of the Altera Program License Subscription
+// Agreement, Intel MegaCore Function License Agreement, or other applicable
+// license agreement, including, without limitation, that your use is for the
+// sole purpose of programming logic devices manufactured by Intel and sold by
+// Intel or its authorized distributors. Please refer to the applicable
+// agreement for further details.
+
+/* ===- acl_pcie_dma_windows.cpp ------------------------------------- C++ -*-=== */
+/* */
+/* Intel(R) OpenCL MMD Driver */
+/* */
+/* ===-------------------------------------------------------------------------=== */
+/* */
+/* This file implements the class to handle Windows-specific DMA operations. */
+/* The declaration of the class lives in the acl_pcie_dma_windows.h */
+/* */
+/* ===-------------------------------------------------------------------------=== */
+
+#if defined(WINDOWS)
+
+// common and its own header files
+#include "acl_pcie.h"
+#include "acl_pcie_dma_windows.h"
+#include "hw_pcie_constants.h"
+
+// other header files inside MMD driver
+#include "acl_pcie_device.h"
+#include "acl_pcie_mm_io.h"
+#include "acl_pcie_timer.h"
+#include "acl_pcie_debug.h"
+#include <iostream>
+#include <stdlib.h>
+
+#define ACL_PCIE_DMA_DEBUG(m, ...) ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_BLOCKTX, m, __VA_ARGS__)
+
+// The callback function to be scheduled inside the interrupt handler
+// It will release the semaphore to allow new work to be scheduled and
+// perform the dma update function
+void CALLBACK myWorkCallback(PTP_CALLBACK_INSTANCE instance, void *context, PTP_WORK work) {
+ ACL_PCIE_DMA *m_dma = (ACL_PCIE_DMA *)context;
+
+ ReleaseSemaphore(m_dma->m_workqueue_semaphore, 1, NULL);
+
+ m_dma->update(true);
+}
+
+// Threadpool work callback that unpins previously-pinned host memory
+// (runs on the dedicated unpin threadpool; context is the owning ACL_PCIE_DMA)
+void CALLBACK myWorkUnpinCallback(PTP_CALLBACK_INSTANCE instance, void *context, PTP_WORK work) {
+ ACL_PCIE_DMA *m_dma = (ACL_PCIE_DMA *)context;
+
+ m_dma->unpin_from_queue();
+}
+
+// Threadpool work callback that pre-pins host memory ahead of a transfer
+// (runs on the dedicated pin threadpool; context is the owning ACL_PCIE_DMA)
+void CALLBACK myWorkPinCallback(PTP_CALLBACK_INSTANCE instance, void *context, PTP_WORK work) {
+ ACL_PCIE_DMA *m_dma = (ACL_PCIE_DMA *)context;
+
+ m_dma->prepin_memory();
+}
+
+// Construct the Windows DMA helper: set up three single-thread threadpools
+// (interrupt work, memory unpin, memory pre-pin) and lock the contiguous
+// DMA descriptor table plus the host-channel push/pull page tables via the
+// OPAE-style fpgaPrepareBuffer/fpgaGetPhysicalAddress API.
+ACL_PCIE_DMA::ACL_PCIE_DMA(fpga_handle handle, ACL_PCIE_MM_IO_MGR *io, ACL_PCIE_DEVICE *pcie)
+ : hostch_data(),
+ m_table_virt_addr(NULL),
+ m_table_dma_addr(),
+ m_table_dma_phys_addr(0),
+ m_active_descriptor(NULL),
+ m_last_pinned_size(0),
+ m_last_pinned_addr(NULL),
+ m_prepinned(0),
+ m_last_id(0),
+ m_event(NULL),
+ m_dev_addr(0),
+ m_host_addr(NULL),
+ m_bytes(0),
+ m_bytes_sent(0),
+ m_bytes_rem(0),
+ m_read(0),
+ m_idle(0),
+ m_interrupt_disabled(0),
+ m_pcie(NULL),
+ m_io(NULL),
+ m_timer(NULL),
+ m_callback_env(),
+ m_work(NULL),
+ m_workqueue_semaphore(NULL),
+ m_dma_unpin_pending(),
+ m_unpin_callback_env(),
+ m_unpin_threadpool(NULL),
+ m_unpin_work(NULL),
+ m_pin_callback_env(),
+ m_pin_threadpool(NULL),
+ m_pin_work(NULL) {
+ ACL_PCIE_ASSERT(handle != INVALID_HANDLE_VALUE, "passed in an invalid device when creating dma object.\n");
+ ACL_PCIE_ASSERT(io != NULL, "passed in an empty pointer for io when creating dma object.\n");
+ ACL_PCIE_ASSERT(pcie != NULL, "passed in an empty pointer for pcie when creating dma object.\n");
+
+ m_handle = handle;
+ m_io = io;
+ m_pcie = pcie;
+
+ HOSTCH_DESC *h = &hostch_data;
+
+ // Default to polling mode; setting ACL_PCIE_DMA_USE_MSI in the environment
+ // switches completion detection to MSI interrupts instead.
+ const char *use_msi = getenv("ACL_PCIE_DMA_USE_MSI");
+ if (use_msi)
+ m_use_polling = 0;
+ else
+ m_use_polling = 1;
+
+ SecureZeroMemory(&m_active_mem, sizeof(PINNED_MEM));
+ SecureZeroMemory(&m_pre_pinned_mem, sizeof(PINNED_MEM));
+ SecureZeroMemory(&m_done_mem, sizeof(PINNED_MEM));
+
+ // Initialize Host Channel
+ SecureZeroMemory(&h->m_hostch_rd_mem, sizeof(PINNED_MEM));
+ SecureZeroMemory(&h->m_hostch_wr_mem, sizeof(PINNED_MEM));
+ SecureZeroMemory(&h->m_hostch_rd_pointer, sizeof(PINNED_MEM));
+ SecureZeroMemory(&h->m_hostch_wr_pointer, sizeof(PINNED_MEM));
+ SecureZeroMemory(&h->m_sync_thread_pointer, sizeof(PINNED_MEM));
+ h->push_valid = 0;
+ h->pull_valid = 0;
+
+ m_timer = new ACL_PCIE_TIMER();
+
+ // create the threadpool to perform work for the interrupt
+ m_threadpool = CreateThreadpool(NULL);
+ ACL_PCIE_ERROR_IF(m_threadpool == NULL, return, "failed to create threadpool.\n");
+
+ // set the number of work threads to 1
+ // so that no scheduled work will be running in parallel between them
+ SetThreadpoolThreadMaximum(m_threadpool, 1);
+ bool status = SetThreadpoolThreadMinimum(m_threadpool, 1);
+ ACL_PCIE_ERROR_IF(status == false, return, "failed to set # of work thread to 1.\n");
+
+ // create the work for threadpool and its semaphore
+ InitializeThreadpoolEnvironment(&m_callback_env);
+ SetThreadpoolCallbackPool(&m_callback_env, m_threadpool);
+
+ m_work = CreateThreadpoolWork(myWorkCallback, (void *)this, &m_callback_env);
+ ACL_PCIE_ERROR_IF(m_work == NULL, return, "failed to create work for threadpool.\n");
+
+ m_workqueue_semaphore = CreateSemaphore(NULL, 1, 1, NULL);
+ ACL_PCIE_ERROR_IF(m_workqueue_semaphore == NULL, return, "failed to create semaphore.\n");
+
+ ///////////////////////////////////////////////////////////////////////////////////////////
+ // Unpin thread
+ m_unpin_threadpool = CreateThreadpool(NULL);
+ ACL_PCIE_ERROR_IF(m_unpin_threadpool == NULL, return, "failed to create threadpool.\n");
+
+ // set the number of work threads to 1
+ // so that no scheduled work will be running in parallel between them
+ SetThreadpoolThreadMaximum(m_unpin_threadpool, 1);
+ status = SetThreadpoolThreadMinimum(m_unpin_threadpool, 1);
+ ACL_PCIE_ERROR_IF(status == false, return, "failed to set # of work thread to 1.\n");
+
+ // create the work for threadpool and its semaphore
+ InitializeThreadpoolEnvironment(&m_unpin_callback_env);
+ SetThreadpoolCallbackPool(&m_unpin_callback_env, m_unpin_threadpool);
+
+ m_unpin_work = CreateThreadpoolWork(myWorkUnpinCallback, (void *)this, &m_unpin_callback_env);
+ ACL_PCIE_ERROR_IF(m_unpin_work == NULL, return, "failed to create work for unpin threadpool.\n");
+
+ ///////////////////////////////////////////////////////////////////////////////////////////
+ // pin thread
+ m_pin_threadpool = CreateThreadpool(NULL);
+ ACL_PCIE_ERROR_IF(m_pin_threadpool == NULL, return, "failed to create threadpool.\n");
+
+ // set the number of work threads to 1
+ // so that no scheduled work will be running in parallel between them
+ SetThreadpoolThreadMaximum(m_pin_threadpool, 1);
+ status = SetThreadpoolThreadMinimum(m_pin_threadpool, 1);
+ ACL_PCIE_ERROR_IF(status == false, return, "failed to set # of work thread to 1.\n");
+
+ // create the work for threadpool and its semaphore
+ InitializeThreadpoolEnvironment(&m_pin_callback_env);
+ SetThreadpoolCallbackPool(&m_pin_callback_env, m_pin_threadpool);
+
+ m_pin_work = CreateThreadpoolWork(myWorkPinCallback, (void *)this, &m_pin_callback_env);
+ // NOTE(review): this error message says "unpin threadpool" but it is emitted
+ // for the PIN work item — looks like a copy-paste slip; confirm and fix the string.
+ ACL_PCIE_ERROR_IF(m_pin_work == NULL, return, "failed to create work for unpin threadpool.\n");
+
+ ///////////////////////////////////////////////////////////////////////////////////////////
+ // Contiguous DMA'able memory allocation for descriptor table
+
+ fpga_result FPGA_status;
+ size_t desc_table_size = sizeof(struct DMA_DESC_TABLE);
+ size_t page_table_size = sizeof(struct HOSTCH_TABLE);
+
+ // Lock DMA_DESC_TABLE using WsId
+ FPGA_status = fpgaPrepareBuffer(
+ m_handle, (UINT64)desc_table_size, (PVOID *)&m_table_virt_addr, &m_table_dma_addr.WsId, FPGA_BUF_QUIET);
+ ACL_PCIE_ASSERT(FPGA_status == FPGA_OK, "fpgaPrepareBuffer function failed.\n");
+
+ // IOCTL call to flush CPU buffers
+ FPGA_status = fpgaProcessDeviceCmd(
+ m_handle, GUID_TO_FPGA_GUID(GUID_PCI_OPENCL_SYNC_CPU_BUFFERS), &m_table_dma_addr.WsId, NULL, 0);
+ ACL_PCIE_ASSERT(FPGA_status == FPGA_OK, "fpgaProcessDeviceCmd function failed.\n");
+
+ // Obtain Physical address for the Page associated with the buffer
+ // (first call with NULL page list returns the page count only)
+ FPGA_status = fpgaGetPhysicalAddress(m_handle, m_table_dma_addr.WsId, (uint64_t *)&m_table_dma_addr.dwPages, NULL);
+ ACL_PCIE_ASSERT(FPGA_status == FPGA_OK, "fpgaGetPhysicalAddress function failed.\n");
+
+ // Allocate memory for SG List
+ m_table_dma_addr.Page = (sg_element *)malloc(m_table_dma_addr.dwPages * sizeof(sg_element));
+
+ // Throw an exception in case of malloc failure
+ if (m_table_dma_addr.Page == NULL) throw std::bad_alloc();
+
+ FPGA_status = fpgaGetPhysicalAddress(
+ m_handle, m_table_dma_addr.WsId, (uint64_t *)&m_table_dma_addr.dwPages, (void *)m_table_dma_addr.Page);
+ ACL_PCIE_ASSERT(FPGA_status == FPGA_OK, "fpgaGetPhysicalAddress function failed.\n");
+
+ ACL_PCIE_DMA_DEBUG(":::: [DMA] Successfully locked DMA descriptor table memory.\n");
+ ACL_PCIE_ASSERT(m_table_dma_addr.dwPages == 1, "fpgaPrepareBuffer function allocated more than 1 page.\n");
+
+ if (m_table_dma_addr.Page != NULL) m_table_dma_phys_addr = m_table_dma_addr.Page[0].phys_addr;
+
+ // Lock HOSTCH_TABLE push channel using WsId
+ FPGA_status = fpgaPrepareBuffer(m_handle,
+ (UINT64)page_table_size,
+ (PVOID *)&h->push_page_table,
+ &hostch_data.push_page_table_addr.WsId,
+ FPGA_BUF_QUIET);
+ ACL_PCIE_ASSERT(FPGA_status == FPGA_OK, "fpgaPrepareBuffer function failed.\n");
+
+ // IOCTL call to flush CPU buffers
+ FPGA_status = fpgaProcessDeviceCmd(m_handle,
+ GUID_TO_FPGA_GUID(GUID_PCI_OPENCL_SYNC_CPU_BUFFERS),
+ (PVOID)&hostch_data.push_page_table_addr.WsId,
+ NULL,
+ 0);
+ ACL_PCIE_ASSERT(FPGA_status == FPGA_OK, "fpgaProcessDeviceCmd function failed.\n");
+
+ // Obtain Physical address for the Page associated with the buffer
+ FPGA_status = fpgaGetPhysicalAddress(
+ m_handle, hostch_data.push_page_table_addr.WsId, (uint64_t *)&hostch_data.push_page_table_addr.dwPages, NULL);
+ ACL_PCIE_ASSERT(FPGA_status == FPGA_OK, "fpgaGetPhysicalAddress function failed.\n");
+
+ // Allocate memory for SG List
+ hostch_data.push_page_table_addr.Page =
+ (sg_element *)malloc(hostch_data.push_page_table_addr.dwPages * sizeof(sg_element));
+
+ // Throw an exception in case of malloc failure
+ if (hostch_data.push_page_table_addr.Page == NULL) throw std::bad_alloc();
+
+ FPGA_status = fpgaGetPhysicalAddress(m_handle,
+ hostch_data.push_page_table_addr.WsId,
+ (uint64_t *)&hostch_data.push_page_table_addr.dwPages,
+ (void *)hostch_data.push_page_table_addr.Page);
+ ACL_PCIE_ASSERT(FPGA_status == FPGA_OK, "fpgaGetPhysicalAddress function failed.\n");
+
+ ACL_PCIE_DMA_DEBUG(":::: [DMA] Successfully locked descriptor table for Hostchannel memory.\n");
+ ACL_PCIE_ASSERT(hostch_data.push_page_table_addr.dwPages == 1,
+ "fpgaPrepareBuffer function for HostChannel allocated more than 1 page.\n");
+
+ if (hostch_data.push_page_table_addr.Page != NULL)
+ hostch_data.push_page_table_bus_addr = hostch_data.push_page_table_addr.Page[0].phys_addr;
+
+ // Lock HOSTCH_TABLE pull channel
+ FPGA_status = fpgaPrepareBuffer(m_handle,
+ (UINT64)page_table_size,
+ (PVOID *)&h->pull_page_table,
+ &hostch_data.pull_page_table_addr.WsId,
+ FPGA_BUF_QUIET);
+ ACL_PCIE_ASSERT(FPGA_status == FPGA_OK, "fpgaPrepareBuffer function for Hostchannel failed. \n");
+
+ // IOCTL call to flush CPU buffers
+ FPGA_status = fpgaProcessDeviceCmd(m_handle,
+ GUID_TO_FPGA_GUID(GUID_PCI_OPENCL_SYNC_CPU_BUFFERS),
+ (PVOID)&hostch_data.pull_page_table_addr.WsId,
+ NULL,
+ 0);
+ ACL_PCIE_ASSERT(FPGA_status == FPGA_OK, "fpgaProcessDeviceCmd function failed.\n");
+
+ // Obtain Physical address for the Page associated with the buffer
+ FPGA_status = fpgaGetPhysicalAddress(
+ m_handle, hostch_data.pull_page_table_addr.WsId, (uint64_t *)&hostch_data.pull_page_table_addr.dwPages, NULL);
+ ACL_PCIE_ASSERT(FPGA_status == FPGA_OK, "fpgaGetPhysicalAddress function failed.\n");
+
+ // Allocate memory for SG List
+ hostch_data.pull_page_table_addr.Page =
+ (sg_element *)malloc(hostch_data.pull_page_table_addr.dwPages * sizeof(sg_element));
+
+ // Throw an exception in case of malloc failure
+ if (hostch_data.pull_page_table_addr.Page == NULL) throw std::bad_alloc();
+
+ FPGA_status = fpgaGetPhysicalAddress(m_handle,
+ hostch_data.pull_page_table_addr.WsId,
+ (uint64_t *)&hostch_data.pull_page_table_addr.dwPages,
+ (void *)hostch_data.pull_page_table_addr.Page);
+ ACL_PCIE_ASSERT(FPGA_status == FPGA_OK, "fpgaGetPhysicalAddress function failed.\n");
+
+ ACL_PCIE_DMA_DEBUG(":::: [DMA] Successfully locked descriptor table memory.\n");
+ ACL_PCIE_ASSERT(hostch_data.pull_page_table_addr.dwPages == 1,
+ "fpgaPrepareBuffer function for HostChannel allocated more than 1 page.\n");
+
+ if (hostch_data.pull_page_table_addr.Page != NULL)
+ hostch_data.pull_page_table_bus_addr = hostch_data.pull_page_table_addr.Page[0].phys_addr;
+
+ // set idle status to true when finish initialization
+ m_idle = true;
+}
+
+// Destructor: drain all in-flight DMA work, tear down host channels, release
+// the pinned descriptor tables allocated in the constructor, and close the
+// threadpool / semaphore / timer resources.
+// NOTE(review): FPGA_status is only consumed by ACL_PCIE_ASSERT; in builds
+// where the assert compiles away the value is intentionally unused.
+ACL_PCIE_DMA::~ACL_PCIE_DMA() {
+  fpga_result FPGA_status;
+  // Block until any outstanding DMA transaction has fully completed.
+  stall_until_idle();
+
+  // make sure no more work queued for threadpool
+  WaitForThreadpoolWorkCallbacks(m_work, FALSE);
+
+  // hostch_destroy is expected to be called by user but to make sure, call in the destructor
+  hostch_destroy(ACL_HOST_CHANNEL_0_ID);
+  hostch_destroy(ACL_HOST_CHANNEL_1_ID);
+
+  // Unlock all the previously allocated tables from the constructor
+  FPGA_status = fpgaReleaseBuffer(m_handle, m_table_dma_addr.WsId);
+  ACL_PCIE_ASSERT(FPGA_status == FPGA_OK, "fpgaReleaseBuffer was not successful\n");
+
+  // Free the scatter-gather list that shadowed the DMA descriptor table.
+  if (m_table_dma_addr.Page != NULL) {
+    free(m_table_dma_addr.Page);
+    m_table_dma_addr.Page = NULL;
+  }
+
+  FPGA_status = fpgaReleaseBuffer(m_handle, hostch_data.push_page_table_addr.WsId);
+  ACL_PCIE_ASSERT(FPGA_status == FPGA_OK, "fpgaReleaseBuffer was not successful\n");
+
+  if (hostch_data.push_page_table_addr.Page != NULL) {
+    free(hostch_data.push_page_table_addr.Page);
+    hostch_data.push_page_table_addr.Page = NULL;
+  }
+
+  FPGA_status = fpgaReleaseBuffer(m_handle, hostch_data.pull_page_table_addr.WsId);
+  ACL_PCIE_ASSERT(FPGA_status == FPGA_OK, "fpgaReleaseBuffer was not successful\n");
+
+  if (hostch_data.pull_page_table_addr.Page != NULL) {
+    free(hostch_data.pull_page_table_addr.Page);
+    hostch_data.pull_page_table_addr.Page = NULL;
+  }
+
+  // Close worker-pool resources only after all callbacks have drained above.
+  CloseHandle(m_workqueue_semaphore);
+  CloseThreadpoolWork(m_work);
+  CloseThreadpool(m_threadpool);
+
+  CloseThreadpoolWork(m_unpin_work);
+  CloseThreadpool(m_unpin_threadpool);
+
+  CloseThreadpoolWork(m_pin_work);
+  CloseThreadpool(m_pin_threadpool);
+
+  if (m_timer) {
+    delete m_timer;
+    m_timer = NULL;
+  }
+}
+
+// Report the "done" flag of the most recent descriptor when running in
+// interrupt mode. Returns 0 on success (*dma_update receives the flag, or is
+// left untouched in polling mode) and 1 when m_last_id is outside the valid
+// 1..ACL_PCIE_DMA_DESC_MAX_ENTRIES descriptor range.
+int ACL_PCIE_DMA::check_dma_interrupt(unsigned int *dma_update) {
+  // Polling mode services completions elsewhere; nothing to report here.
+  if (m_use_polling) return 0;
+
+  // m_last_id is 1-based; reject zero or anything beyond the table.
+  if (m_last_id == 0 || m_last_id > ACL_PCIE_DMA_DESC_MAX_ENTRIES) return 1;
+
+  *dma_update = (m_table_virt_addr->header.flags[m_last_id - 1]);
+  return 0;
+}
+
+// Unpin one entry from the pending-unpin queue: flush I/O buffers for the
+// workspace, release the pinned buffer, then free its scatter-gather list.
+// Runs on the unpin threadpool; assumes the queue is non-empty (asserted).
+void ACL_PCIE_DMA::unpin_from_queue() {
+  fpga_result result;
+  ACL_PCIE_ASSERT(!m_dma_unpin_pending.empty(), "m_dma_unpin_pending is empty but unpin mem thread was called\n");
+
+  QUEUE_STRUCT entry;
+
+  // Take the oldest pending entry off the queue.
+  entry = m_dma_unpin_pending.front();
+  m_dma_unpin_pending.pop();
+
+  // IOCTL call to flush IO buffers
+  result = fpgaProcessDeviceCmd(
+      m_handle, GUID_TO_FPGA_GUID(GUID_PCI_OPENCL_SYNC_IO_BUFFERS), (PVOID) & (entry.WsId), NULL, 0);
+  ACL_PCIE_ASSERT(result == FPGA_OK, "fpgaProcessDeviceCmd function failed.\n");
+
+  // Unlock the allocated tables associated with wsId
+  result = fpgaReleaseBuffer(m_handle, entry.WsId);
+  ACL_PCIE_ASSERT(result == FPGA_OK, "fpgaReleaseBuffer function failed.\n");
+  // Free the SG list that was allocated when this buffer was pinned.
+  if (entry.SGListPtr != NULL) free(entry.SGListPtr);
+}
+
+// Pin the next chunk of the host buffer ahead of need (prepin == true) into
+// m_pre_pinned_mem so the worker can swap it in without waiting.
+// NOTE(review): presumably scheduled via m_pin_work — confirm against the
+// threadpool callback registration (outside this view).
+void ACL_PCIE_DMA::prepin_memory() { pin_memory(&m_pre_pinned_mem, true); }
+
+// Busy-poll the last descriptor's "done" flag until the hardware sets it.
+// On completion: clear the header flags and, if a worker slot is free
+// (semaphore acquired without blocking), queue another threadpool pass.
+// Does not return until the transfer finishes; sleeps 1ms between polling
+// bursts of ACL_PCIE_DMA_TIMEOUT iterations to free the CPU.
+void ACL_PCIE_DMA::wait_finish() {
+  UINT32 wait_timer;
+
+  while (1) {
+    wait_timer = ACL_PCIE_DMA_TIMEOUT;
+    while (wait_timer > 0) {
+      wait_timer--;
+
+      // flags[m_last_id - 1] is written to 1 by the DMA's status descriptor.
+      if (m_table_virt_addr->header.flags[m_last_id - 1] == 1) {
+        ACL_PCIE_DMA_DEBUG(":::: [DMA] Wait done\n");
+        set_desc_table_header();
+        // 0L timeout: only submit if a work slot is immediately available.
+        if (WaitForSingleObject(m_workqueue_semaphore, 0L) == WAIT_OBJECT_0) {
+          SubmitThreadpoolWork(m_work);
+        }
+        return;
+      }
+    }
+
+    ACL_PCIE_DMA_DEBUG(":::: [DMA] Wait timed out. Sleeping for 1ms.\n");
+    Sleep(1);
+  }
+}
+
+#if defined(GEN3_x16)
+  // Add extra descriptor for DMA controller to report 'done status' in the DMA table
+// Appends one (reads) or two (writes) trailing descriptors after the last
+// data descriptor so the controller writes the done flag for descriptor
+// ID = m_last_id - 1 back into the host-resident table header.
+void ACL_PCIE_DMA::add_extra_dma_desc() {
+  /*
+  One extra descriptor is required to be fetched. Two if using interrupts.
+  For reads (Host <-- FPGA), the last descriptor sets the DMA done status.
+  For writes (Host --> FPGA), the last descriptor fetches the status
+  descriptor which then sets the DMA done status.
+  When using interrupts, there is an additional descriptor that sends the
+  interrupt, handled in the same way as the above.
+  */
+  // Clear done status flag.
+  m_table_virt_addr->header.flags[m_last_id - 1] = 0;  // ID = m_last_id - 1
+
+  if (m_read) {
+    // descriptor[m_last_id]: write 0x1ULL to flags[m_last_id-1] which is used to indicate DMA done.
+    set_immediate_desc(  // Set status bit
+        &(m_table_virt_addr->descriptors[m_last_id]),  // descriptor[m_last_id] location in user space
+        m_table_dma_phys_addr + 4*(m_last_id - 1),  // physical address for 0x1ULL to write (flags[m_last_id].. flag filed size is 4 byte)
+        0x1ULL,
+        255
+    );
+  } else {
+    // Need to fetch status desc into different destination.
+    // descriptor[m_last_id]: DMA Descriptor[m_last_id+1](32 byte) to WDP register set in DMA controller.
+    m_active_descriptor = &(m_table_virt_addr->descriptors[m_last_id]);
+    set_read_desc(m_table_dma_phys_addr + sizeof(DMA_DESC_HEADER) + (m_last_id + 1) * 32,  // src: set_immediate_desc descriptor location
+        WRITE_DESC_PRIO_OFFSET + DESC_OFFSET,  // des, location of WDP register set
+        32/4  // copy 32-byte, 8 word
+    );
+
+    // descriptor[m_last_id+1]: write 0x1ULL(4-byte) to status[m_last_id-1] which is used to indicate DMA done.
+    set_immediate_desc(  // Set status bit
+        &(m_table_virt_addr->descriptors[m_last_id + 1]),
+        m_table_dma_phys_addr + 4*(m_last_id - 1),  //4: size per status entry
+        0x1ULL,
+        255
+    );
+  }
+  // Ensure the descriptors are visible in memory before the engine is kicked.
+  MemoryBarrier();
+}
+#endif
+
+// Kick off the DMA transfer for the descriptor table already built in host
+// memory. On GEN3_x8 this programs the read/write engine's base addresses,
+// FIFO addresses, table size and interrupt control, then writes the last
+// descriptor pointer to start fetching. On GEN3_x16 it appends the status
+// descriptors (add_extra_dma_desc) and writes a single "fetch" descriptor
+// into the descriptor-controller queue registers.
+void ACL_PCIE_DMA::send_dma_desc() {
+  // Disabling interrupt is used in hostch_create function during polling
+#if defined(GEN3_x8)
+  if (m_read) {
+    m_io->dma->write32(ACL_PCIE_DMA_RC_WR_DESC_BASE_LOW, m_table_dma_phys_addr & 0xffffffffUL);
+    m_io->dma->write32(ACL_PCIE_DMA_RC_WR_DESC_BASE_HIGH, m_table_dma_phys_addr >> 32);
+    m_io->dma->write32(ACL_PCIE_DMA_EP_WR_FIFO_BASE_LOW, ACL_PCIE_DMA_ONCHIP_WR_FIFO_BASE_LO);
+    m_io->dma->write32(ACL_PCIE_DMA_EP_WR_FIFO_BASE_HIGH, ACL_PCIE_DMA_ONCHIP_WR_FIFO_BASE_HI);
+    m_io->dma->write32(ACL_PCIE_DMA_WR_TABLE_SIZE, ACL_PCIE_DMA_TABLE_SIZE - 1);
+    if (m_interrupt_disabled)
+      m_io->dma->write32(ACL_PCIE_DMA_WR_INT_CONTROL, ACL_PCIE_DMA_DISABLE_INT);
+    else
+      m_io->dma->write32(ACL_PCIE_DMA_WR_INT_CONTROL, ACL_PCIE_DMA_ENABLE_INT);
+    MemoryBarrier();
+    // Writing the last pointer starts descriptor fetch; must be ordered last.
+    m_io->dma->write32(ACL_PCIE_DMA_WR_LAST_PTR, m_last_id - 1);
+  } else {
+    m_io->dma->write32(ACL_PCIE_DMA_RC_RD_DESC_BASE_LOW, m_table_dma_phys_addr & 0xffffffffUL);
+    m_io->dma->write32(ACL_PCIE_DMA_RC_RD_DESC_BASE_HIGH, m_table_dma_phys_addr >> 32);
+    m_io->dma->write32(ACL_PCIE_DMA_EP_RD_FIFO_BASE_LOW, ACL_PCIE_DMA_ONCHIP_RD_FIFO_BASE_LO);
+    m_io->dma->write32(ACL_PCIE_DMA_EP_RD_FIFO_BASE_HIGH, ACL_PCIE_DMA_ONCHIP_RD_FIFO_BASE_HI);
+    m_io->dma->write32(ACL_PCIE_DMA_RD_TABLE_SIZE, ACL_PCIE_DMA_TABLE_SIZE - 1);
+    if (m_interrupt_disabled)
+      m_io->dma->write32(ACL_PCIE_DMA_RD_INT_CONTROL, ACL_PCIE_DMA_DISABLE_INT);
+    else
+      m_io->dma->write32(ACL_PCIE_DMA_RD_INT_CONTROL, ACL_PCIE_DMA_ENABLE_INT);
+    MemoryBarrier();
+    m_io->dma->write32(ACL_PCIE_DMA_RD_LAST_PTR, m_last_id - 1);
+  }
+#elif defined(GEN3_x16)
+  DMA_DESC_ENTRY dt_fetch_desc;
+  UINT32 ctrl, *pValue32;
+  UINT64 dt_fetch_queue_addr64;
+  int i;
+
+  add_extra_dma_desc();
+  // init a descriptor for start dma
+  dt_fetch_desc.src_addr = m_table_dma_phys_addr + sizeof(DMA_DESC_HEADER);  // physical addrees of first desciptor (assume dma always start from ID 0)
+  dt_fetch_desc.dst_addr = m_read ? WRITE_DESC_NORM_OFFSET : READ_DESC_NORM_OFFSET;
+  dt_fetch_desc.dst_addr += DESC_OFFSET;
+  ctrl = ((m_last_id - 1) + 2) * 8;  // interrupt is not enabled case ... (ID+3)*8 if interrupted is enabled (note: ID = m_last_id-1)
+  ctrl |= 1 << 20;   // Single destination
+  ctrl |= 0xFE << 24;  // Special descriptor ID
+  dt_fetch_desc.ctrl = ctrl;
+
+  dt_fetch_queue_addr64 = m_read ? READ_DESC_PRIO_OFFSET : READ_DESC_NORM_OFFSET;
+  pValue32 = (UINT32 *)(&dt_fetch_desc);
+  // Write the first 4 DWords of the fetch descriptor into the queue.
+  for (i = 0; i < 4; i++) {
+    m_io->dma->write32(DESC_CTRLLER_BASE + dt_fetch_queue_addr64 + i * 4, *(pValue32 + i));
+  }
+  // Most significant DWord must be written last.
+  MemoryBarrier();
+  m_io->dma->write32(DESC_CTRLLER_BASE + dt_fetch_queue_addr64 + 4 * 4,*(((uint32_t *)(&dt_fetch_desc)) + 4));
+  MemoryBarrier();
+#else
+  #error "Define a PCIe 3.0/4.0/5.0 slot with x1, x2, x4, x8, and x16 lanes option"
+#endif
+}
+
+// One-time programming of descriptor-table base / on-chip FIFO addresses and
+// table size for both the write and read engines. GEN3_x8 only: the x16
+// descriptor controller is programmed per-transfer in send_dma_desc instead,
+// so this is a no-op there.
+void ACL_PCIE_DMA::setup_dma_desc() {
+#if defined(GEN3_x8)
+  m_io->dma->write32(ACL_PCIE_DMA_RC_WR_DESC_BASE_LOW, m_table_dma_phys_addr & 0xffffffffUL);
+  m_io->dma->write32(ACL_PCIE_DMA_RC_WR_DESC_BASE_HIGH, m_table_dma_phys_addr >> 32);
+  m_io->dma->write32(ACL_PCIE_DMA_EP_WR_FIFO_BASE_LOW, ACL_PCIE_DMA_ONCHIP_WR_FIFO_BASE_LO);
+  m_io->dma->write32(ACL_PCIE_DMA_EP_WR_FIFO_BASE_HIGH, ACL_PCIE_DMA_ONCHIP_WR_FIFO_BASE_HI);
+  m_io->dma->write32(ACL_PCIE_DMA_WR_TABLE_SIZE, ACL_PCIE_DMA_TABLE_SIZE - 1);
+
+  m_io->dma->write32(ACL_PCIE_DMA_RC_RD_DESC_BASE_LOW, m_table_dma_phys_addr & 0xffffffffUL);
+  m_io->dma->write32(ACL_PCIE_DMA_RC_RD_DESC_BASE_HIGH, m_table_dma_phys_addr >> 32);
+  m_io->dma->write32(ACL_PCIE_DMA_EP_RD_FIFO_BASE_LOW, ACL_PCIE_DMA_ONCHIP_RD_FIFO_BASE_LO);
+  m_io->dma->write32(ACL_PCIE_DMA_EP_RD_FIFO_BASE_HIGH, ACL_PCIE_DMA_ONCHIP_RD_FIFO_BASE_HI);
+  m_io->dma->write32(ACL_PCIE_DMA_RD_TABLE_SIZE, ACL_PCIE_DMA_TABLE_SIZE - 1);
+#endif
+}
+
+// Fill m_active_descriptor with source/destination addresses and the
+// control word holding the transfer length (in DWords, per usage in
+// update()). The current descriptor ID (m_last_id) is packed into the high
+// control bits: bit 18 up on x8, bit 24 up on x16.
+void ACL_PCIE_DMA::set_read_desc(DMA_ADDR source, UINT64 dest, UINT32 ctl_dma_len) {
+#if defined(GEN3_x8)
+  m_active_descriptor->src_addr_ldw = (source & 0xffffffffUL);
+  m_active_descriptor->src_addr_udw = (source >> 32);
+  m_active_descriptor->dest_addr_ldw = (dest & 0xffffffffUL);
+  m_active_descriptor->dest_addr_udw = (dest >> 32);
+  m_active_descriptor->ctl_dma_len = (ctl_dma_len | (m_last_id << 18));
+  m_active_descriptor->reserved[0] = 0;
+  m_active_descriptor->reserved[1] = 0;
+  m_active_descriptor->reserved[2] = 0;
+#elif defined(GEN3_x16)
+  m_active_descriptor->src_addr = source;
+  m_active_descriptor->dst_addr = dest;
+  m_active_descriptor->ctrl = (ctl_dma_len | (m_last_id << 24));
+  m_active_descriptor->reserved[0] = 0;
+  m_active_descriptor->reserved[1] = 0;
+  m_active_descriptor->reserved[2] = 0;
+#else
+  #error "Define a PCIe 3.0/4.0/5.0 slot with x1, x2, x4, x8, and x16 lanes option"
+#endif
+}
+
+// Write-direction counterpart of set_read_desc (source and destination
+// swapped in type). On GEN3_x16 the descriptor layout is direction-agnostic,
+// so it simply delegates to set_read_desc.
+void ACL_PCIE_DMA::set_write_desc(UINT64 source, DMA_ADDR dest, UINT32 ctl_dma_len) {
+#if defined(GEN3_x8)
+  m_active_descriptor->src_addr_ldw = (source & 0xffffffffUL);
+  m_active_descriptor->src_addr_udw = (source >> 32);
+  m_active_descriptor->dest_addr_ldw = (dest & 0xffffffffUL);
+  m_active_descriptor->dest_addr_udw = (dest >> 32);
+  m_active_descriptor->ctl_dma_len = (ctl_dma_len | (m_last_id << 18));
+  m_active_descriptor->reserved[0] = 0;
+  m_active_descriptor->reserved[1] = 0;
+  m_active_descriptor->reserved[2] = 0;
+#elif defined(GEN3_x16)
+  set_read_desc(source, dest, ctl_dma_len);
+#else
+  #error "Define a PCIe 3.0/4.0/5.0 slot with x1, x2, x4, x8, and x16 lanes option"
+#endif
+}
+
+#if defined(GEN3_x16)
+// Build an "immediate" descriptor: instead of a memory-to-memory copy, the
+// controller writes the 32-bit `data` value directly to host address `addr`.
+// Used by add_extra_dma_desc to set the per-descriptor done flag.
+//   desc : descriptor slot to fill (host virtual address)
+//   addr : bus/physical address the value is written to
+//   data : the value written (1 DWord)
+//   id   : status descriptor ID packed into ctrl[31:24]
+void ACL_PCIE_DMA::set_immediate_desc(DMA_DESC_ENTRY *desc, UINT64 addr, UINT32 data, UINT32 id) {
+  uint32_t ctrl;
+
+  desc->src_addr = data;  // The data to write to given address
+  desc->dst_addr = addr;
+  ctrl = 1;         // 1 DW status
+  ctrl |= 1 << 18;  // Immediate access
+  ctrl |= id << 24;  // Status descriptor ID
+  desc->ctrl = ctrl;
+  desc->reserved[0] = 0x0;
+  desc->reserved[1] = 0x0;
+  desc->reserved[2] = 0x0;
+}
+#endif
+
+// Fill one host-channel page-table entry: the 64-bit physical page address
+// is split into low/high 32-bit words alongside the page index.
+// Note: reserved[2] is deliberately left at 1 (all other reserved words are
+// 0) — preserved from the original implementation.
+void ACL_PCIE_DMA::set_hostch_page_entry(HOSTCH_ENTRY *page_entry, UINT64 page_addr, UINT32 page_num) {
+  const UINT64 addr_low = page_addr & 0xffffffffUL;
+  const UINT64 addr_high = page_addr >> 32;
+
+  page_entry->page_num = page_num;
+  page_entry->page_addr_ldw = addr_low;
+  page_entry->page_addr_udw = addr_high;
+
+  page_entry->reserved[0] = 0;
+  page_entry->reserved[1] = 0;
+  page_entry->reserved[2] = 1;
+  page_entry->reserved[3] = 0;
+  page_entry->reserved[4] = 0;
+}
+
+// Reset every per-descriptor "done" flag in the descriptor-table header to 0
+// so the next transfer starts from a clean completion state.
+void ACL_PCIE_DMA::set_desc_table_header() {
+  int idx = 0;
+  while (idx < ACL_PCIE_DMA_DESC_MAX_ENTRIES) {
+    m_table_virt_addr->header.flags[idx] = 0;
+    ++idx;
+  }
+}
+
+// Perform operations required when a DMA interrupt comes
+// In interrupt mode, clears the descriptor-table done flags and queues one
+// worker pass — but only when a work slot is free (semaphore acquired with a
+// zero timeout), so at most one work item is ever queued at a time.
+// In polling mode this is a no-op: wait_finish handles completion instead.
+void ACL_PCIE_DMA::service_interrupt() {
+  if (!m_use_polling) {
+    // only submit a new work to the pool when there is not work in queued
+    if (WaitForSingleObject(m_workqueue_semaphore, 0L) == WAIT_OBJECT_0) {
+      set_desc_table_header();
+      SubmitThreadpoolWork(m_work);
+    }
+  }
+}
+
+// Busy-wait for approximately wait_ns nanoseconds without yielding the CPU,
+// re-reading the high-resolution timer until the requested span has elapsed.
+void ACL_PCIE_DMA::spin_loop_ns(UINT64 wait_ns) {
+  const cl_ulong begin = m_timer->get_time_ns();
+
+  while (m_timer->get_time_ns() - begin < wait_ns) {
+    // spin: intentionally empty
+  }
+}
+
+// Normalize *last_id for descriptor-table wrap-around: both the reset
+// sentinel (ACL_PCIE_DMA_RESET_ID + 1) and the end of the table
+// (ACL_PCIE_DMA_TABLE_SIZE) map back to slot 0. Any other value must already
+// be a valid in-table index (asserted).
+void ACL_PCIE_DMA::check_last_id(UINT32 *last_id) {
+  ACL_PCIE_ASSERT(*last_id <= (ACL_PCIE_DMA_RESET_ID + 1), "last id was greater than 255.\n");
+
+  const bool wraps = (*last_id == (ACL_PCIE_DMA_RESET_ID + 1)) || (*last_id == ACL_PCIE_DMA_TABLE_SIZE);
+  if (wraps) {
+    *last_id = 0;
+    return;
+  }
+
+  ACL_PCIE_ASSERT(*last_id < (ACL_PCIE_DMA_TABLE_SIZE), "last id was greater than 127.\n");
+}
+
+// Relinquish the CPU to let any other thread to run
+// Return 0 since there is no useful work to be performed here
+// Sleep(0) gives up the remainder of this thread's time slice on Windows.
+int ACL_PCIE_DMA::yield() {
+  Sleep(0);
+  return 0;
+}
+
+// Add a byte-offset to a void* pointer and return the resulting address.
+inline void *ACL_PCIE_DMA::compute_address(void *base, uintptr_t offset) {
+  return static_cast<void *>(static_cast<char *>(base) + offset);
+}
+
+// Pin an already-allocated user buffer (`addr`, `len` bytes) for host-channel
+// use and record the workspace id, user VA and scatter-gather page list in
+// *new_mem. Returns 0 on success; SG-list allocation failure throws
+// std::bad_alloc (consistent with the constructor's page-table allocations),
+// and driver-call failures trip ACL_PCIE_ASSERT.
+int ACL_PCIE_DMA::hostch_buffer_lock(void *addr, size_t len, PINNED_MEM *new_mem) {
+  fpga_result FPGA_status;
+  UINT64 wsid;
+
+  // No active segment of pinned memory - pin one
+
+  // Lock HOSTCH_TABLE using WsId
+  FPGA_status = fpgaPrepareBuffer(m_handle, (UINT64)len, (PVOID *)&addr, &wsid, FPGA_BUF_PREALLOCATED);
+  ACL_PCIE_ASSERT(FPGA_status == FPGA_OK, "HostCh : fpgaPrepareBuffer function for Hostchannel failed.\n");
+
+  // First query returns only the page count so the SG list can be sized
+  FPGA_status = fpgaGetPhysicalAddress(m_handle, wsid, (PUINT64)&new_mem->pages_rem, NULL);
+  ACL_PCIE_ASSERT(FPGA_status == FPGA_OK, "HostCh : fpgaGetPhysicalAddress function for Hostchannel failed.\n");
+
+  new_mem->dma_page = (sg_element *)malloc(new_mem->pages_rem * sizeof(sg_element));
+
+  // Fix: the malloc result was previously used unchecked by the second
+  // fpgaGetPhysicalAddress call below; fail loudly on allocation failure.
+  if (new_mem->dma_page == NULL) throw std::bad_alloc();
+
+  // Second query fills the SG list with per-page physical addresses
+  FPGA_status = fpgaGetPhysicalAddress(m_handle, wsid, (PUINT64)&new_mem->pages_rem, (void *)new_mem->dma_page);
+  ACL_PCIE_ASSERT(FPGA_status == FPGA_OK, "HostCh : fpgaGetPhysicalAddress function for Hostchannel failed.\n");
+
+  new_mem->WsId = wsid;
+  new_mem->UsrVa = (PVOID)addr;
+  new_mem->next_page = new_mem->dma_page;
+
+  // IOCTL call to flush CPU buffers
+  FPGA_status =
+      fpgaProcessDeviceCmd(m_handle, GUID_TO_FPGA_GUID(GUID_PCI_OPENCL_SYNC_CPU_BUFFERS), (PVOID)&wsid, NULL, 0);
+  ACL_PCIE_ASSERT(FPGA_status == FPGA_OK, "fpgaProcessDeviceCmd function failed.\n");
+  ACL_PCIE_DMA_DEBUG(":::: [DMA] HostCh Pinning 0x%zx bytes at 0x%p.\n", len, addr);
+
+  return 0;
+}
+
+// Only 1 pin_memory can be running at a time
+// Pin the next segment of the current transfer's host buffer into *new_mem.
+// prepin == true pins ahead of need (continuing from m_last_pinned_addr);
+// prepin == false pins the segment at the current m_bytes_sent offset.
+// The lock size is capped by ACL_PCIE_DMA_MAX_PINNED_MEM_SIZE (adjusted for
+// the current descriptor-table position) and trimmed to a page boundary to
+// minimize descriptor count. SG-list allocation failure throws
+// std::bad_alloc; driver-call failures trip ACL_PCIE_ASSERT.
+void ACL_PCIE_DMA::pin_memory(PINNED_MEM *new_mem, bool prepin) {
+  fpga_result result;
+  UINT64 wsid = 0x0;
+
+  // No active segment of pinned memory - pin one
+  m_bytes_rem = prepin ? (m_bytes_rem - m_last_pinned_size) : (m_bytes - m_bytes_sent);
+  UINT32 last_id = prepin ? 0 : m_last_id;
+  check_last_id(&last_id);
+  size_t last_id_size_offset = last_id * PAGE_SIZE;
+  size_t lock_size = (m_bytes_rem > ACL_PCIE_DMA_MAX_PINNED_MEM_SIZE - last_id_size_offset)
+                         ? ACL_PCIE_DMA_MAX_PINNED_MEM_SIZE - last_id_size_offset
+                         : m_bytes_rem;
+  void *lock_addr =
+      prepin ? compute_address(m_last_pinned_addr, m_last_pinned_size) : compute_address(m_host_addr, m_bytes_sent);
+  uintptr_t last_page_portion = (reinterpret_cast<uintptr_t>(lock_addr) + lock_size) & ACL_PCIE_DMA_PAGE_ADDR_MASK;
+
+  // If doing max pinning, check if will *end* on page boundary. If not, better
+  // to pin a bit less and end up on the boundary. This way, will have fewer
+  // descriptors to send.
+  if (lock_size == (ACL_PCIE_DMA_MAX_PINNED_MEM_SIZE - last_id_size_offset) && last_page_portion != 0) {
+    lock_size -= (size_t)last_page_portion;
+  }
+
+  assert(lock_size < MAXDWORD);
+
+  // Lock memory using WsId
+  result = fpgaPrepareBuffer(m_handle, (UINT64)lock_size, (PVOID *)&lock_addr, &wsid, FPGA_BUF_PREALLOCATED);
+  ACL_PCIE_ASSERT(result == FPGA_OK, "HostCh : fpgaPrepareBuffer function failed.\n");
+
+  // First query returns only the page count so the SG list can be sized
+  result = fpgaGetPhysicalAddress(m_handle, wsid, (PUINT64)&new_mem->pages_rem, NULL);
+  ACL_PCIE_ASSERT(result == FPGA_OK, "fpgaGetPhysicalAddress function failed.\n");
+
+  new_mem->dma_page = (sg_element *)malloc(new_mem->pages_rem * sizeof(sg_element));
+
+  // Fix: the malloc result was previously used unchecked by the second
+  // fpgaGetPhysicalAddress call below; fail loudly on allocation failure
+  // (consistent with the constructor and hostch_buffer_lock).
+  if (new_mem->dma_page == NULL) throw std::bad_alloc();
+
+  // Second query fills the SG list with per-page physical addresses
+  result = fpgaGetPhysicalAddress(m_handle, wsid, (PUINT64)&new_mem->pages_rem, (void *)new_mem->dma_page);
+  ACL_PCIE_ASSERT(result == FPGA_OK, "fpgaGetPhysicalAddress function failed.\n");
+
+  new_mem->WsId = wsid;
+  new_mem->UsrVa = (PVOID)lock_addr;
+  new_mem->next_page = new_mem->dma_page;
+
+  // IOCTL call to flush CPU buffers
+  result = fpgaProcessDeviceCmd(m_handle, GUID_TO_FPGA_GUID(GUID_PCI_OPENCL_SYNC_CPU_BUFFERS), (PVOID)&wsid, NULL, 0);
+  ACL_PCIE_ASSERT(result == FPGA_OK, "fpgaProcessDeviceCmd function failed.\n");
+
+  // Remember where this segment ended so a subsequent prepin continues here.
+  m_last_pinned_size = lock_size;
+  m_last_pinned_addr = lock_addr;
+
+  ACL_PCIE_DMA_DEBUG(":::: [DMA] Pinning 0x%zx bytes at 0x%p.\n", lock_size, lock_addr);
+}
+
+// Unpin Memory
+// Synchronously release a pinned segment: flush I/O buffers for the
+// workspace, release the buffer via the driver, free the SG list and reset
+// *old_mem to its empty state.
+void ACL_PCIE_DMA::unpin_memory(PINNED_MEM *old_mem) {
+  fpga_result result = FPGA_OK;
+  UINT64 wsId = old_mem->WsId;
+
+  // IOCTL call to flush I/O buffers
+  result = fpgaProcessDeviceCmd(m_handle, GUID_TO_FPGA_GUID(GUID_PCI_OPENCL_SYNC_IO_BUFFERS), (PVOID)&wsId, NULL, 0);
+  ACL_PCIE_ASSERT(result == FPGA_OK, "fpgaProcessDeviceCmd function failed.\n");
+
+  // UnLock previously locked memory using WsId
+  result = fpgaReleaseBuffer(m_handle, wsId);
+  ACL_PCIE_ASSERT(result == FPGA_OK, "fpgaReleaseBuffer function failed.\n");
+
+  if (old_mem->dma_page != NULL) free(old_mem->dma_page);
+
+  // Clear all fields so callers can test UsrVa/pages_rem to detect "empty".
+  old_mem->next_page = NULL;
+  old_mem->dma_page = NULL;
+  old_mem->pages_rem = 0;
+  old_mem->UsrVa = NULL;
+}
+
+// Check whether the user's 'ack' API moved the read-buffer end pointer of
+// the circular buffer; if so, forward the new value to the host-channel IP
+// and reset the idle loop counter. Returns 0 when an update was written,
+// 1 when the pointer has not moved (loop counter decays toward zero).
+int ACL_PCIE_DMA::hostch_push_update() {
+  HOSTCH_DESC *h = &hostch_data;
+
+  if (h->rd_buf_end_pointer == *h->user_rd_end_pointer) {
+    // No change: decay the counter (never below zero) and report idle.
+    if (h->loop_counter > 0) h->loop_counter = h->loop_counter - 1;
+    return 1;
+  }
+
+  h->rd_buf_end_pointer = *h->user_rd_end_pointer;
+  h->loop_counter = HOSTCH_LOOP_COUNTER;
+
+  m_io->dma->write32(ACL_HOST_CHANNEL_0_HOST_ENDP, (UINT32)h->rd_buf_end_pointer);
+
+  return 0;
+}
+
+// Check whether the user's 'ack' API moved the write-buffer front pointer of
+// the circular buffer; if so, forward the new value to the host-channel IP
+// and reset the idle loop counter. Returns 0 when an update was written,
+// 1 when the pointer has not moved (loop counter decays toward zero).
+int ACL_PCIE_DMA::hostch_pull_update() {
+  HOSTCH_DESC *h = &hostch_data;
+
+  if (h->wr_buf_front_pointer == *h->user_wr_front_pointer) {
+    // No change: decay the counter (never below zero) and report idle.
+    if (h->loop_counter > 0) h->loop_counter = h->loop_counter - 1;
+    return 1;
+  }
+
+  h->wr_buf_front_pointer = *h->user_wr_front_pointer;
+  h->loop_counter = HOSTCH_LOOP_COUNTER;
+
+  m_io->dma->write32(ACL_HOST_CHANNEL_1_HOST_FRONTP, (UINT32)h->wr_buf_front_pointer);
+  return 0;
+}
+
+// Transfer data between host and device
+// This function returns right after the transfer is scheduled
+// Return 0 on success
+//   host_addr : host-side buffer (destination for reads, source for writes)
+//   dev_addr  : device byte offset to transfer to/from
+//   bytes     : total transfer size in bytes
+//   e         : MMD event signalled via m_pcie->event_update_fn on completion
+//   reading   : true = device-to-host (read), false = host-to-device (write)
+int ACL_PCIE_DMA::read_write(void *host_addr, size_t dev_addr, size_t bytes, aocl_mmd_op_t e, bool reading) {
+  ACL_PCIE_ASSERT(m_event == NULL, "non-empty event before a new DMA read/write.\n");
+
+  // Copy the parameters over and mark the job as running
+  m_event = e;
+  m_read = reading;
+  m_bytes = bytes;
+  m_host_addr = host_addr;
+  m_dev_addr = dev_addr;
+
+  // Start processing the request
+  m_bytes_sent = 0;
+  m_last_id = ACL_PCIE_DMA_RESET_ID;
+  m_prepinned = 0;
+
+#if defined(GEN3_x8)
+  // Resume numbering from where the hardware's last-pointer register left off.
+  if (m_read) {
+    m_io->dma->read32(ACL_PCIE_DMA_WR_LAST_PTR, &m_last_id);
+    m_last_id++;
+  } else {
+    m_io->dma->read32(ACL_PCIE_DMA_RD_LAST_PTR, &m_last_id);
+    m_last_id++;
+  }
+
+#elif defined(GEN3_x16)
+  // The x16 descriptor controller always starts each table from ID 0.
+  m_last_id = 0;
+#else
+  #error "Define a PCIe 3.0/4.0/5.0 slot with x1, x2, x4, x8, and x16 lanes option"
+#endif
+  m_idle = false;
+
+  // setup the work inside the threadpool to perform the first DMA transaction
+  ACL_PCIE_ERROR_IF(WaitForSingleObject(m_workqueue_semaphore, 0L) != WAIT_OBJECT_0,
+                    return -1,
+                    "failed to schedule the first work for DMA read/write.\n");
+
+  SubmitThreadpoolWork(m_work);
+
+  return 0;  // success
+}
+
+// function to be scheduled to execute whenever an interrupt arrived
+// Worker body run from the threadpool. When idle, services host-channel
+// pointer updates; otherwise finishes or continues the current DMA
+// transaction: completes it (unpin, signal event), swaps in the next pinned
+// segment, builds up to a table's worth of descriptors, kicks the engine and
+// schedules the unpin/pre-pin helper threads. Returns false when there is
+// nothing to do, true while a transaction is in flight.
+bool ACL_PCIE_DMA::update(bool forced) {
+  cl_ulong start;
+  int status;
+  UINT32 max_transfer;
+  unsigned int i;
+  HOSTCH_DESC *h = &hostch_data;
+  size_t current_transfer_size = 0;
+
+  if (!forced) return false;
+
+  if (h->pull_valid && m_idle) {
+    // Check user memory to see if there was update to user buffer pointer for pull
+    status = hostch_pull_update();
+  }
+
+  if (h->push_valid && m_idle) {
+    // Check user memory to see if there was update to user buffer pointer for push
+    status = hostch_push_update();
+  }
+
+  // Idle host-channel servicing: keep the worker alive while the loop
+  // counter is positive, park it (clear the user sync word) when it expires.
+  if ((h->push_valid | h->pull_valid) && m_idle && (h->thread_sync_valid && h->loop_counter > 0)) {
+    // setup the work inside the threadpool to perform the first DMA transaction
+    ACL_PCIE_ERROR_IF(WaitForSingleObject(m_workqueue_semaphore, 0L) != WAIT_OBJECT_0,
+                      return false,
+                      "HostCh : failed to schedule the first work for DMA read/write.\n");
+    SubmitThreadpoolWork(m_work);
+    return false;
+
+  } else if (m_idle && (h->thread_sync_valid && h->loop_counter == 0)) {
+    *h->user_thread_sync = 0;
+    return false;
+
+  } else if (m_idle) {
+    return false;
+  }
+
+  ACL_PCIE_DMA_DEBUG(":::: [DMA] Bytes left %zu\n", m_bytes - m_bytes_sent);
+  // Process any descriptors that have completed
+  set_desc_table_header();
+  cl_ulong finish = 0;
+  if (ACL_PCIE_DEBUG >= VERBOSITY_BLOCKTX) finish = m_timer->get_time_ns();
+
+  // Check if the transaction is complete
+  if (m_bytes_sent == m_bytes) {
+    if (m_active_mem.UsrVa != NULL) unpin_memory(&m_active_mem);
+    ACL_PCIE_DMA_DEBUG(":::: [DMA] Transaction complete!\n");
+    ACL_PCIE_ASSERT(m_active_mem.UsrVa == NULL, "there is still active pinned memory after the DMA read/write.\n");
+    WaitForThreadpoolWorkCallbacks(m_unpin_work, false);
+    if (!m_dma_unpin_pending.empty()) {
+      ACL_PCIE_DMA_DEBUG(":::: [DMA] Done, but pinned memory still in queue. Wait until queue is empty.\n");
+      if (WaitForSingleObject(m_workqueue_semaphore, 0L) == WAIT_OBJECT_0) {
+        SubmitThreadpoolWork(m_work);
+      }
+
+      Sleep(0);
+      return true;
+    }
+
+    // Everything drained: mark the engine idle and signal the caller.
+    m_last_id = ACL_PCIE_DMA_RESET_ID;
+    m_idle = true;
+
+    if (m_event) {
+      // Use a temporary variable to save the event data and reset m_event before calling event_update_fn
+      // to avoid race condition that the main thread may start a new DMA transfer before this work-thread
+      // is able to reset the m_event.
+      aocl_mmd_op_t temp_event = m_event;
+      m_event = NULL;
+
+      m_pcie->event_update_fn(temp_event, 0);
+    }
+
+    if ((h->push_valid | h->pull_valid) && (h->thread_sync_valid && h->loop_counter > 0)) {
+      ACL_PCIE_ERROR_IF(WaitForSingleObject(m_workqueue_semaphore, 0L) != WAIT_OBJECT_0,
+                        return false,
+                        "HostCh : failed to schedule the first work for DMA read/write.\n");
+      SubmitThreadpoolWork(m_work);
+    }
+
+    return true;
+  }
+
+  // Check if we are done with previously pinned memory.
+  if (m_active_mem.UsrVa == NULL || m_active_mem.pages_rem == 0) {
+    m_done_mem = m_active_mem;
+
+    WaitForThreadpoolWorkCallbacks(m_pin_work, false);
+
+    // Get pre-pinned memory if there are any.
+    if (m_pre_pinned_mem.UsrVa != NULL) {
+      m_active_mem = m_pre_pinned_mem;
+      m_pre_pinned_mem.UsrVa = NULL;
+      m_prepinned = 0;
+    } else if (m_prepinned) {
+      // Pre-pin was requested but has not landed yet: requeue and back off.
+      if (WaitForSingleObject(m_workqueue_semaphore, 0L) == WAIT_OBJECT_0) {
+        SubmitThreadpoolWork(m_work);
+      }
+      Sleep(1);
+      return true;
+    } else {
+      pin_memory(&m_active_mem, false);
+    }
+  }
+
+  // Main DMA execution
+  // 1. Transfers up to 128 descriptors
+  //    - Each descriptor can transfer up to ACL_PCIE_DMA_MAX_TRANSFER_SIZE bytes
+  // 2. Launch a thread to unpin memory
+  // 3. Launch a thread to pre-pin next memory
+  if (m_active_mem.pages_rem > 0) {
+    // Calculate how many descriptors can be sent
+    check_last_id(&m_last_id);
+    ACL_PCIE_DMA_DEBUG(":::: [DMA] last id was %u\n", m_last_id);
+    max_transfer = ACL_PCIE_DMA_TABLE_SIZE - m_last_id;
+
+    ACL_PCIE_DMA_DEBUG(":::: [DMA] max_transfer %u\n", max_transfer);
+
+    // Build descriptor table
+    for (i = 0; i < max_transfer; i++) {
+      if (strcmp(ACL_BSP_TYPE, "Arria10") == 0) {
+        // A10 DMA
+        m_active_descriptor = &(m_table_virt_addr->descriptors[i]);
+      };
+      if (strcmp(ACL_BSP_TYPE, "Stratix10") == 0) {
+        // S10 DMA
+        m_active_descriptor = &(m_table_virt_addr->descriptors[m_last_id]);
+      };
+      if (m_read) {
+        // Oversized pages are split into MAX_TRANSFER_SIZE chunks, advancing
+        // the page's phys_addr/length in place until it fits one descriptor.
+        if (m_active_mem.next_page->length > ACL_PCIE_DMA_MAX_TRANSFER_SIZE) {
+          ACL_PCIE_DMA_DEBUG(":::: [DMA] page size is larger than %u for read. Page size is %u bytes\n",
+                             ACL_PCIE_DMA_MAX_TRANSFER_SIZE,
+                             m_active_mem.next_page->length);
+          set_write_desc(m_dev_addr, m_active_mem.next_page->phys_addr, ACL_PCIE_DMA_MAX_TRANSFER_SIZE / 4);
+          m_active_mem.next_page->length -= ACL_PCIE_DMA_MAX_TRANSFER_SIZE;
+          m_active_mem.next_page->phys_addr += ACL_PCIE_DMA_MAX_TRANSFER_SIZE;
+          m_dev_addr += ACL_PCIE_DMA_MAX_TRANSFER_SIZE;
+          m_bytes_sent += ACL_PCIE_DMA_MAX_TRANSFER_SIZE;
+          current_transfer_size += ACL_PCIE_DMA_MAX_TRANSFER_SIZE;
+        } else {
+          set_write_desc(m_dev_addr, m_active_mem.next_page->phys_addr, m_active_mem.next_page->length / 4);
+          m_dev_addr += m_active_mem.next_page->length;
+          m_bytes_sent += m_active_mem.next_page->length;
+          current_transfer_size += m_active_mem.next_page->length;
+          ++m_active_mem.next_page;
+          m_active_mem.pages_rem--;
+        }
+      } else {
+        if (m_active_mem.next_page->length > ACL_PCIE_DMA_MAX_TRANSFER_SIZE) {
+          ACL_PCIE_DMA_DEBUG(":::: [DMA] page size is larger than %u for write. Page size is %u bytes\n",
+                             ACL_PCIE_DMA_MAX_TRANSFER_SIZE,
+                             m_active_mem.next_page->length);
+          set_read_desc(m_active_mem.next_page->phys_addr, m_dev_addr, ACL_PCIE_DMA_MAX_TRANSFER_SIZE / 4);
+          m_active_mem.next_page->length -= ACL_PCIE_DMA_MAX_TRANSFER_SIZE;
+          m_active_mem.next_page->phys_addr += ACL_PCIE_DMA_MAX_TRANSFER_SIZE;
+          m_dev_addr += ACL_PCIE_DMA_MAX_TRANSFER_SIZE;
+          m_bytes_sent += ACL_PCIE_DMA_MAX_TRANSFER_SIZE;
+          current_transfer_size += ACL_PCIE_DMA_MAX_TRANSFER_SIZE;
+        } else {
+          set_read_desc(m_active_mem.next_page->phys_addr, m_dev_addr, m_active_mem.next_page->length / 4);
+          m_dev_addr += m_active_mem.next_page->length;
+          m_bytes_sent += m_active_mem.next_page->length;
+          current_transfer_size += m_active_mem.next_page->length;
+          ++m_active_mem.next_page;
+          m_active_mem.pages_rem--;
+        }
+      }
+      m_last_id++;
+      if (m_active_mem.pages_rem == 0) break;
+    }
+    ACL_PCIE_DMA_DEBUG(":::: [DMA] Transferring %zu bytes using %u descriptors\n", current_transfer_size, i);
+
+    MemoryBarrier();
+    // Send descriptor table to DMA
+    start = m_timer->get_time_ns();
+    m_interrupt_disabled = FALSE;
+    send_dma_desc();
+    int pinning = 0;
+    int unpinning = 0;
+    cl_ulong unpin_start = 0, unpin_finish = 0;
+
+    // Launch unpin thread
+    if (m_done_mem.UsrVa != NULL) {
+      unpin_start = m_timer->get_time_ns();
+      unpinning = 1;
+
+      // wait for previous unpin to finish
+      WaitForThreadpoolWorkCallbacks(m_unpin_work, false);
+
+      QUEUE_STRUCT entry;
+
+      entry.WsId = m_done_mem.WsId;
+      entry.SGListPtr = (PVOID)(m_done_mem.dma_page);
+
+      m_dma_unpin_pending.push(entry);
+
+      // Make sure Push into unpin queue comes before launching unpin thread
+      MemoryBarrier();
+
+      // Launch unpin thread
+      SubmitThreadpoolWork(m_unpin_work);
+
+      // Ownership of the SG list transferred to the unpin queue entry above;
+      // unpin_from_queue frees it, so only the local pointers are cleared.
+      m_done_mem.next_page = NULL;
+
+      // if (m_done_mem.dma_page != NULL)
+      //	free(m_done_mem.dma_page);
+
+      m_done_mem.dma_page = NULL;
+
+      m_done_mem.UsrVa = NULL;
+      unpin_finish = m_timer->get_time_ns();
+    }
+
+    // Launch pre-pin thread
+    cl_ulong pin_start = 0, pin_finish = 0;
+    if (((m_bytes_rem - m_last_pinned_size) > 0) && (m_prepinned == 0)) {
+      pin_start = m_timer->get_time_ns();
+      pinning = 1;
+      m_prepinned = 1;
+
+      // This wait should pass right through.
+      // There is another wait above, before switching active and prepin memory
+      WaitForThreadpoolWorkCallbacks(m_pin_work, false);
+      SubmitThreadpoolWork(m_pin_work);
+      pin_finish = m_timer->get_time_ns();
+    }
+
+    if (m_use_polling) {
+      wait_finish();
+      finish = m_timer->get_time_ns();
+      ACL_PCIE_DMA_DEBUG(
+          ":::: [DMA] Transfer (%zu bytes) completed in %.2f us - %.2f MB/s :: pinning %i in %.2f us :: unpinning %i "
+          "in %.2f us :: pages rem %li\n",
+          current_transfer_size,
+          (finish - start) / 1000.0,
+          1000000000.0 * current_transfer_size / (finish - start) / (1024.0 * 1024.0),
+          pinning,
+          (pin_finish - pin_start) / 1000.0,
+          unpinning,
+          (unpin_finish - unpin_start) / 1000.0,
+          m_active_mem.pages_rem);
+    }
+
+    return true;
+  }
+
+  ACL_PCIE_DMA_DEBUG(":::: [DMA] Nothing happened\n");
+  return true;
+}
+
+// Poll DMA transfer
+// Only used during host channel create
+// Used to transfer the page table of pinned down MMD circular buffer to host channel IP
+// The size of this transfer is known to be small
+// Unlike wait_finish, this re-enables the DMA interrupt (x8) once the
+// transfer completes, since hostch_create runs with interrupts disabled.
+void ACL_PCIE_DMA::poll_wait() {
+  UINT32 wait_timer;
+
+  while (1) {
+    wait_timer = ACL_PCIE_DMA_TIMEOUT;
+    while (wait_timer > 0) {
+      wait_timer--;
+
+      // flags[m_last_id - 1] is set to 1 by the DMA's status descriptor.
+      if (m_table_virt_addr->header.flags[m_last_id - 1] == 1) {
+        ACL_PCIE_DMA_DEBUG(":::: [DMA] HostCh : Wait done\n");
+        set_desc_table_header();
+#if defined(GEN3_x8)
+        if (m_read)
+          m_io->dma->write32(ACL_PCIE_DMA_WR_INT_CONTROL, ACL_PCIE_DMA_ENABLE_INT);
+        else
+          m_io->dma->write32(ACL_PCIE_DMA_RD_INT_CONTROL, ACL_PCIE_DMA_ENABLE_INT);
+#endif
+        m_interrupt_disabled = FALSE;
+
+        return;
+      }
+      // Delay the CPU from checking the memory for 1us. CPU is still running this thread.
+      // but reduces memory access from CPU
+      spin_loop_ns(1000);
+    }
+
+    // If DMA hasn't finished yet, free up the CPU for 1ms
+    ACL_PCIE_DMA_DEBUG(
+        ":::: [DMA] HostCh : Poll wait failed while transferring host channel page table to IP. Sleeping for 1ms.\n");
+    Sleep(1);
+  }
+}
+
+// Set IP's parameters for host channel.
+// Parameters are txs address to write updated front/end pointer to on host memory,
+// Address to DMA data to, to stream data into kernel
+// Programs the given channel's TXS pointer-writeback address, DMA target
+// address, circular-buffer size, initial pointer and logic-enable bit.
+void ACL_PCIE_DMA::hostch_start(int channel) {
+  HOSTCH_DESC *h = &hostch_data;
+
+  if (channel == (int)ACL_HOST_CHANNEL_0_ID) {
+    // TODO(review): original marker here said "Fix this Line" — the bus
+    // address is taken from dma_page[0] with no offset (unlike channel 1,
+    // which adds sizeof(size_t)); confirm the intended page/offset.
+    h->user_rd_front_pointer_bus_addr = h->m_hostch_rd_pointer.dma_page[0].phys_addr;
+
+    m_io->dma->write32(ACL_HOST_CHANNEL_0_TXS_ADDR_LOW, h->user_rd_front_pointer_bus_addr & 0xffffffffUL);
+    m_io->dma->write32(ACL_HOST_CHANNEL_0_TXS_ADDR_HIGH, (h->user_rd_front_pointer_bus_addr) >> 32);
+    m_io->dma->write32(ACL_HOST_CHANNEL_0_IP_ADDR_LOW, ACL_HOST_CHANNEL_0_DMA_ADDR & 0xffffffffUL);
+    m_io->dma->write32(ACL_HOST_CHANNEL_0_IP_ADDR_HIGH, ACL_HOST_CHANNEL_0_DMA_ADDR >> 32);
+    m_io->dma->write32(ACL_HOST_CHANNEL_0_BUF_SIZE, (UINT32)h->buffer_size);
+    m_io->dma->write32(ACL_HOST_CHANNEL_0_HOST_ENDP, 0);
+    m_io->dma->write32(ACL_HOST_CHANNEL_0_LOGIC_EN, 1);
+
+  } else if (channel == (int)ACL_HOST_CHANNEL_1_ID) {
+    // Channel 1's writeback word lives one size_t past the pinned page base.
+    h->user_wr_end_pointer_bus_addr = h->m_hostch_wr_pointer.dma_page[0].phys_addr + sizeof(size_t);
+
+    m_io->dma->write32(ACL_HOST_CHANNEL_1_TXS_ADDR_LOW, h->user_wr_end_pointer_bus_addr & 0xffffffffUL);
+    m_io->dma->write32(ACL_HOST_CHANNEL_1_TXS_ADDR_HIGH, (h->user_wr_end_pointer_bus_addr) >> 32);
+    m_io->dma->write32(ACL_HOST_CHANNEL_1_IP_ADDR_LOW, ACL_HOST_CHANNEL_1_DMA_ADDR & 0xffffffffUL);
+    m_io->dma->write32(ACL_HOST_CHANNEL_1_IP_ADDR_HIGH, ACL_HOST_CHANNEL_1_DMA_ADDR >> 32);
+    m_io->dma->write32(ACL_HOST_CHANNEL_1_BUF_SIZE, (UINT32)h->buffer_size);
+    m_io->dma->write32(ACL_HOST_CHANNEL_1_HOST_FRONTP, 0);
+    m_io->dma->write32(ACL_HOST_CHANNEL_1_LOGIC_EN, 1);
+  }
+}
+
+// Synchronize the user thread with the MMD worker thread.
+// Called with a non-NULL user_addr once (at channel create) to pin the shared
+// sync word; called with NULL afterwards to re-launch the worker when a host
+// channel is open, the DMA engine is idle, and the sync word reads 0.
+void ACL_PCIE_DMA::hostch_thread_sync(void *user_addr) {
+  int status;
+  HOSTCH_DESC *h = &hostch_data;
+
+  // NOTE(review): bitwise '&' / '|' are applied to 0/1 int flags below; the
+  // result matches logical '&&' / '||' here, but logical ops would be clearer.
+  if ((user_addr == NULL) & (h->thread_sync_valid)) {
+    if ((h->push_valid | h->pull_valid) && m_idle && (*h->user_thread_sync == 0)) {
+      h->loop_counter = HOSTCH_LOOP_COUNTER;
+      SubmitThreadpoolWork(m_work);
+      *h->user_thread_sync = 1;  // mark the worker as running again
+    }
+  } else {
+    // First call: pin the user's sync word so both sides can poll it.
+    // NOTE(review): 'status' from hostch_buffer_lock is never checked; a
+    // failed pin would leave user_thread_sync pointing at garbage -- confirm.
+    status = hostch_buffer_lock(user_addr, sizeof(size_t), &(h->m_sync_thread_pointer));
+    h->user_thread_sync = (size_t *)h->m_sync_thread_pointer.UsrVa;
+    h->loop_counter = HOSTCH_LOOP_COUNTER;
+    *h->user_thread_sync = 0;
+    h->thread_sync_valid = 1;
+  }
+}
+
+// Create a host channel (Windows path): pin the user's circular buffer and
+// its front/end pointer pair, build a table of 4KB page addresses describing
+// the buffer, DMA that table to the host channel IP, then reset and enable
+// the channel logic and kick off the worker thread if needed.
+//
+// user_addr   - user-space circular buffer of 'size' bytes
+// buf_pointer - user-space pair of size_t values (front pointer, end pointer)
+// size        - buffer size; assumed to be a multiple of PAGE_SIZE (the page
+//               loop iterates size / PAGE_SIZE times) -- TODO confirm callers
+// channel     - ACL_HOST_CHANNEL_0_ID (push) or ACL_HOST_CHANNEL_1_ID (pull)
+// Returns 0 on success, -1 on scheduling failure, ERROR_INVALID_CHANNEL for
+// an unknown or already-open channel.
+int ACL_PCIE_DMA::hostch_create(void *user_addr, void *buf_pointer, size_t size, int channel) {
+  int status;
+  uint32_t i;
+  HOSTCH_DESC *h = &hostch_data;
+
+  DMA_ADDR dma_address;
+  h->buffer_size = size;
+
+  setup_dma_desc();
+#if defined(GEN3_x8)
+  m_io->dma->read32(ACL_PCIE_DMA_RD_LAST_PTR, &m_last_id);
+  ACL_PCIE_DMA_DEBUG(":::: [DMA] HostCh: read dma_rd_last_id %u\n", (unsigned)m_last_id);
+
+  // Set variables before calling dma helper functions
+  m_last_id++;
+#endif
+  m_read = 0;
+
+  // Only create push channel if it's not already open
+  if ((int)ACL_HOST_CHANNEL_0_ID == channel && !h->push_valid) {
+    h->user_rd_buffer = user_addr;
+
+    // Pin push user buffer
+    // NOTE(review): 'status' is never checked after these locks; a failed pin
+    // is silently ignored -- confirm hostch_buffer_lock cannot fail here.
+    status = hostch_buffer_lock(user_addr, size, &(h->m_hostch_rd_mem));
+    status |= hostch_buffer_lock(buf_pointer, 2 * sizeof(size_t), &(h->m_hostch_rd_pointer));
+
+    // Map circular push buffer's end pointer so that the driver can poll on it for update from user space
+    h->user_rd_front_pointer = (size_t *)h->m_hostch_rd_pointer.UsrVa;
+    h->user_rd_end_pointer = h->user_rd_front_pointer + 1;
+
+    // Send the circular push buffer's pinned address to IP, so IP can initiate DMA transfer by itself.
+    for (i = 0; i < (size / PAGE_SIZE); i++) {
+      dma_address = h->m_hostch_rd_mem.next_page->phys_addr;
+      set_hostch_page_entry(&(h->push_page_table->page_entry[i]), (UINT64)dma_address, (UINT32)i);
+      ACL_PCIE_DMA_DEBUG(":::: [DMA] HostCh: push page entry[%u] = %#016llx size = %#016x\n",
+                         (unsigned)i,
+                         (UINT64)dma_address,
+                         h->m_hostch_rd_mem.next_page->length);
+
+      // Make 4KB pages from an array of pages of m_hostch_rd_mem
+      if (h->m_hostch_rd_mem.next_page->length == PAGE_SIZE) {
+        ++h->m_hostch_rd_mem.next_page;
+        h->m_hostch_rd_mem.pages_rem--;
+      } else {
+        // Larger-than-4KB sg element: carve one 4KB page off its front
+        h->m_hostch_rd_mem.next_page->length -= PAGE_SIZE;
+        h->m_hostch_rd_mem.next_page->phys_addr += PAGE_SIZE;
+      }
+    }
+
+    set_desc_table_header();
+    check_last_id(&m_last_id);
+
+#if defined(GEN3_x8)
+    // Set variable before calling dma helper functions
+    // NOTE(review): length (32 * size / PAGE_SIZE) / 4 suggests 32-byte page
+    // entries counted in 32-bit words -- confirm against HOSTCH_TABLE layout.
+    m_active_descriptor = &(m_table_virt_addr->descriptors[0]);
+    set_read_desc(
+        h->push_page_table_bus_addr, (UINT64)(ACL_PCIE_DMA_RD_FIFO_BASE), (UINT32)((32 * size / PAGE_SIZE) / 4));
+    m_last_id++;
+
+    // Read Interrupt will be disabled from send_dma_desc till poll_wait
+    m_interrupt_disabled = TRUE;
+    send_dma_desc();
+    poll_wait();
+#endif
+
+    // Reset and enable the push channel on IP
+    UINT32 data;
+    m_io->pcie_cra->write32(HOSTCH_CONTROL_ADDR_PUSH + HOSTCH_BASE, 0);
+    m_io->pcie_cra->read32(HOSTCH_CONTROL_ADDR_PUSH + HOSTCH_BASE, &data);
+    m_io->pcie_cra->write32(HOSTCH_CONTROL_ADDR_PUSH + HOSTCH_BASE, 1);
+    m_io->pcie_cra->read32(HOSTCH_CONTROL_ADDR_PUSH + HOSTCH_BASE, &data);
+
+    // Set IP's control registers for push channel
+    hostch_start((int)ACL_HOST_CHANNEL_0_ID);
+
+    h->push_valid = 1;
+
+    // Only launch queue if pull channel is not open and if there is no DMA transfer
+    if (!h->pull_valid && m_idle) {
+      ACL_PCIE_ERROR_IF(WaitForSingleObject(m_workqueue_semaphore, 0L) != WAIT_OBJECT_0,
+                        return -1,
+                        "HostCh : failed to schedule the first work for DMA read/write.\n");
+      SubmitThreadpoolWork(m_work);
+    }
+    return 0;
+
+  } else if ((int)ACL_HOST_CHANNEL_1_ID == channel && !h->pull_valid) {
+    h->user_wr_buffer = user_addr;
+
+    // Pin pull user buffer
+    // NOTE(review): as above, 'status' is never checked.
+    status = hostch_buffer_lock(user_addr, size, &(h->m_hostch_wr_mem));
+    status |= hostch_buffer_lock(buf_pointer, 2 * sizeof(size_t), &(h->m_hostch_wr_pointer));
+
+    // Map circular pull buffer's end pointer so that the driver can poll on it for update from user space
+    h->user_wr_front_pointer = (size_t *)h->m_hostch_wr_pointer.UsrVa;
+    h->user_wr_end_pointer = h->user_wr_front_pointer + 1;
+
+    // Send the circular pull buffer's pinned address to IP, so IP can initiate DMA transfer by itself.
+    for (i = 0; i < (size / PAGE_SIZE); i++) {
+      dma_address = h->m_hostch_wr_mem.next_page->phys_addr;
+      set_hostch_page_entry(&(h->pull_page_table->page_entry[i]), (UINT64)dma_address, (UINT32)i);
+      ACL_PCIE_DMA_DEBUG(":::: [DMA] HostCh: pull page entry[%u] = %#016llx size = %#016x\n",
+                         (unsigned)i,
+                         (UINT64)dma_address,
+                         h->m_hostch_wr_mem.next_page->length);
+
+      // Make 4KB pages from an array of pages of m_hostch_wr_mem
+      if (h->m_hostch_wr_mem.next_page->length == PAGE_SIZE) {
+        ++h->m_hostch_wr_mem.next_page;
+        h->m_hostch_wr_mem.pages_rem--;
+      } else {
+        h->m_hostch_wr_mem.next_page->length -= PAGE_SIZE;
+        h->m_hostch_wr_mem.next_page->phys_addr += PAGE_SIZE;
+      }
+    }
+
+    set_desc_table_header();
+    check_last_id(&m_last_id);
+
+#if defined(GEN3_x8)
+    // Set variable before calling dma helper functions
+    m_active_descriptor = &(m_table_virt_addr->descriptors[0]);
+    set_read_desc(
+        h->pull_page_table_bus_addr, (UINT64)(ACL_PCIE_DMA_WR_FIFO_BASE), (UINT32)((32 * size / PAGE_SIZE) / 4));
+    m_last_id++;
+
+    // Read Interrupt will be disabled from send_dma_desc till poll_wait
+    m_interrupt_disabled = TRUE;
+    send_dma_desc();
+    poll_wait();
+#endif
+
+    // Reset and enable the pull channel on IP
+    UINT32 temp;
+    m_io->pcie_cra->write32(HOSTCH_CONTROL_ADDR_PULL + HOSTCH_BASE, 0);
+    m_io->pcie_cra->read32(HOSTCH_CONTROL_ADDR_PULL + HOSTCH_BASE, &temp);
+    m_io->pcie_cra->write32(HOSTCH_CONTROL_ADDR_PULL + HOSTCH_BASE, 1);
+    m_io->pcie_cra->read32(HOSTCH_CONTROL_ADDR_PULL + HOSTCH_BASE, &temp);
+
+    // Set IP's control registers for pull channel
+    hostch_start((int)ACL_HOST_CHANNEL_1_ID);
+
+    h->pull_valid = 1;
+
+    // Only launch queue if push channel is not open and if there is no DMA transfer
+    if (!h->push_valid && m_idle) {
+      ACL_PCIE_ERROR_IF(WaitForSingleObject(m_workqueue_semaphore, 0L) != WAIT_OBJECT_0,
+                        return -1,
+                        "HostCh : failed to schedule the first work for DMA read/write.\n");
+      SubmitThreadpoolWork(m_work);
+    }
+    return 0;
+
+  } else {
+    return ERROR_INVALID_CHANNEL;
+  }
+}
+
+// Destroy channel call from user.
+// Unlock all buffers and reset IP
+// Tear down one host channel: disable the channel logic in the IP, unpin the
+// circular buffer and its pointer pair, and -- if this was the last open
+// channel -- release the thread-sync word and drain the worker callback.
+// Always returns 0 (unknown channel ids are silently ignored).
+int ACL_PCIE_DMA::hostch_destroy(int channel) {
+  HOSTCH_DESC *h = &hostch_data;
+
+  if ((int)ACL_HOST_CHANNEL_0_ID == channel) {
+    if (h->push_valid) {
+      ACL_PCIE_DMA_DEBUG(":::: [DMA] HostCh: destroying push host channel.");
+      // Quiesce the IP before unpinning host memory it may still DMA into
+      m_io->dma->write32(ACL_HOST_CHANNEL_0_LOGIC_EN, 0);
+      MemoryBarrier();
+      m_io->pcie_cra->write32(HOSTCH_CONTROL_ADDR_PUSH + HOSTCH_BASE, 0);
+      MemoryBarrier();
+
+      if (h->m_hostch_rd_mem.UsrVa != NULL) unpin_memory(&h->m_hostch_rd_mem);
+      if (h->m_hostch_rd_pointer.UsrVa != NULL) unpin_memory(&h->m_hostch_rd_pointer);
+      h->push_valid = 0;
+
+      // Last channel closed: retire the sync word and wait out the worker
+      if (!h->pull_valid) {
+        if (h->thread_sync_valid) {
+          h->thread_sync_valid = 0;
+          if (h->m_sync_thread_pointer.UsrVa != NULL) unpin_memory(&h->m_sync_thread_pointer);
+        }
+        if (m_idle) WaitForThreadpoolWorkCallbacks(m_work, false);
+      }
+    }
+  } else if ((int)ACL_HOST_CHANNEL_1_ID == channel) {
+    if (h->pull_valid) {
+      ACL_PCIE_DMA_DEBUG(":::: [DMA] HostCh: destroying pull host channel.");
+      m_io->dma->write32(ACL_HOST_CHANNEL_1_LOGIC_EN, 0);
+      MemoryBarrier();
+      m_io->pcie_cra->write32(HOSTCH_CONTROL_ADDR_PULL + HOSTCH_BASE, 0);
+      MemoryBarrier();
+
+      if (h->m_hostch_wr_mem.UsrVa != NULL) unpin_memory(&h->m_hostch_wr_mem);
+      if (h->m_hostch_wr_pointer.UsrVa != NULL) unpin_memory(&h->m_hostch_wr_pointer);
+      h->pull_valid = 0;
+
+      if (!h->push_valid) {
+        if (h->thread_sync_valid) {
+          h->thread_sync_valid = 0;
+          if (h->m_sync_thread_pointer.UsrVa != NULL) unpin_memory(&h->m_sync_thread_pointer);
+        }
+        if (m_idle) WaitForThreadpoolWorkCallbacks(m_work, false);
+      }
+    }
+  }
+
+  return 0;
+}
+
+#endif // WINDOWS
diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_dma_windows.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_dma_windows.h
new file mode 100644
index 0000000..311c634
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_dma_windows.h
@@ -0,0 +1,262 @@
+#ifndef ACL_PCIE_DMA_WINDOWS_H
+#define ACL_PCIE_DMA_WINDOWS_H
+
+/* (c) 1992-2021 Intel Corporation. */
+/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */
+/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */
+/* and/or other countries. Other marks and brands may be claimed as the property */
+/* of others. See Trademarks on intel.com for full list of Intel trademarks or */
+/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */
+/* Your use of Intel Corporation's design tools, logic functions and other */
+/* software and tools, and its AMPP partner logic functions, and any output */
+/* files any of the foregoing (including device programming or simulation */
+/* files), and any associated documentation or information are expressly subject */
+/* to the terms and conditions of the Altera Program License Subscription */
+/* Agreement, Intel MegaCore Function License Agreement, or other applicable */
+/* license agreement, including, without limitation, that your use is for the */
+/* sole purpose of programming logic devices manufactured by Intel and sold by */
+/* Intel or its authorized distributors. Please refer to the applicable */
+/* agreement for further details. */
+
+/* ===- acl_pcie_dma_windows.h --------------------------------------- C++ -*-=== */
+/* */
+/* Intel(R) OpenCL MMD Driver */
+/* */
+/* ===-------------------------------------------------------------------------=== */
+/* */
+/* This file declares the class to handle Windows-specific DMA operations. */
+/* The actual implementation of the class lives in the acl_pcie_dma_windows.cpp */
+/* */
+/* ===-------------------------------------------------------------------------=== */
+// TODO: update DMA related stuff and add wsid
+
+#if defined(WINDOWS)
+
+#include "hw_host_channel.h"
+#include "hw_pcie_dma.h"
+
+#include <windows.h>
+#include <queue>
+
+class ACL_PCIE_DEVICE;
+class ACL_PCIE_MM_IO_MGR;
+class ACL_PCIE_TIMER;
+
+// Physical address and byte count of a single pinned page.
+typedef struct _PAGE_INFO {
+  ULONG64 pPhysicalAddr;
+  UINT32 dwBytes;
+} PAGE_INFO, *PPAGE_INFO;
+
+// A pinned buffer described as a scatter/gather list plus the pinning handle.
+typedef struct _DMA_PAGE {
+  sg_element *Page;  // array of scatter/gather elements
+  DWORD dwPages;     // number of entries in Page
+  UINT64 WsId;       // id returned at pin time; used later to unpin
+} DMA_PAGE, *PPDMA_PAGE_UNUSED, *PDMA_PAGE;
+
+// One entry of the deferred-unpin queue (m_dma_unpin_pending).
+typedef struct _QUEUE_STRUCT {
+  UINT64 WsId;
+  PVOID SGListPtr;
+
+} QUEUE_STRUCT, *PQUEUE_STRUCT;
+
+// Windows implementation of the MMD DMA engine. Owns the DMA descriptor
+// table, the pin/unpin worker threadpools, and all host channel (hostch)
+// state, and moves data between pinned host memory and the device over PCIe.
+class ACL_PCIE_DMA {
+ public:
+  ACL_PCIE_DMA(fpga_handle Handle, ACL_PCIE_MM_IO_MGR *io, ACL_PCIE_DEVICE *pcie);
+  ~ACL_PCIE_DMA();
+
+  bool is_idle() { return m_idle; };
+  // Busy-wait (yielding the CPU each iteration) until the DMA engine is idle
+  void stall_until_idle() {
+    while (!is_idle()) yield();
+  };
+
+  // Called by acl_pcie_device to check dma interrupt status
+  int check_dma_interrupt(unsigned int *dma_update);
+
+  // Perform operations required when a DMA interrupt comes
+  void service_interrupt();
+
+  // Relinquish the CPU to let any other thread to run
+  // Return 0 since there is no useful work to be performed here
+  int yield();
+
+  // Transfer data between host and device
+  // This function returns right after the transfer is scheduled
+  // Return 0 on success
+  int read_write(void *host_addr, size_t dev_addr, size_t bytes, aocl_mmd_op_t e, bool reading);
+
+  // the callback function to be scheduled inside the interrupt handler
+  friend void CALLBACK myWorkCallback(PTP_CALLBACK_INSTANCE instance, void *context, PTP_WORK work);
+
+  // Seperate function to unpin memory
+  friend void CALLBACK myWorkUnpinCallback(PTP_CALLBACK_INSTANCE instance, void *context, PTP_WORK work);
+
+  // Seperate function to pin memory
+  friend void CALLBACK myWorkPinCallback(PTP_CALLBACK_INSTANCE instance, void *context, PTP_WORK work);
+
+  // Host channel functions
+  // NOTE(review): the last parameter of hostch_create/hostch_destroy is a
+  // channel id (the .cpp definitions name it 'channel'); 'reading' here is a
+  // misleading leftover name -- consider renaming in the declaration.
+  int hostch_create(void *user_addr, void *buf_pointer, size_t size, int reading);
+  int hostch_destroy(int reading);
+  void hostch_thread_sync(void *m_sync_thread);
+
+ private:
+  // Non-copyable: copy assignment and copy construction are private no-ops
+  ACL_PCIE_DMA &operator=(const ACL_PCIE_DMA &) { return *this; }
+
+  ACL_PCIE_DMA(const ACL_PCIE_DMA &src) {}
+
+  // A pinned region being consumed page-by-page during a transfer
+  struct PINNED_MEM {
+    sg_element *next_page;  // next scatter/gather element to consume
+    DWORD pages_rem;        // elements remaining in the list
+    sg_element *dma_page;   // Pointer to the original array
+    UINT64 WsId;
+    PVOID UsrVa;
+  };
+
+  // All state for the push/pull host channels (circular buffers, pinned
+  // pointer pairs, page tables, and the user/driver thread-sync word).
+  struct HOSTCH_DESC {
+    size_t buffer_size;
+    unsigned int loop_counter;
+
+    // Host channel valid
+    // If channel is open, equal to 1
+    int push_valid;
+    int pull_valid;
+
+    // User memory circular buffer
+    void *user_rd_buffer;
+    void *user_wr_buffer;
+
+    // Array of physical addresses of locked hostch pages
+    HOSTCH_TABLE *push_page_table;
+    HOSTCH_TABLE *pull_page_table;
+
+    DMA_PAGE push_page_table_addr;
+    DMA_PAGE pull_page_table_addr;
+
+    // Physical address of the page table
+    DMA_ADDR push_page_table_bus_addr;
+    DMA_ADDR pull_page_table_bus_addr;
+
+    PINNED_MEM m_hostch_rd_mem;
+    PINNED_MEM m_hostch_wr_mem;
+
+    // User memory circular buffer front and end pointers
+    size_t *user_rd_front_pointer;
+    size_t *user_rd_end_pointer;
+    size_t *user_wr_front_pointer;
+    size_t *user_wr_end_pointer;
+
+    DMA_ADDR user_rd_front_pointer_bus_addr;
+    DMA_ADDR user_wr_end_pointer_bus_addr;
+
+    PINNED_MEM m_hostch_rd_pointer;
+    PINNED_MEM m_hostch_wr_pointer;
+
+    // Keep track of push end pointer
+    size_t rd_buf_end_pointer;
+
+    // Keep track of pull front pointer
+    size_t wr_buf_front_pointer;
+
+    // User and driver thread synchronizer
+    int thread_sync_valid;
+    size_t *user_thread_sync;
+    DMA_ADDR user_thread_sync_bus_addr;
+    PINNED_MEM m_sync_thread_pointer;
+  };
+
+  // function to be scheduled to execute whenever an interrupt arrived
+  bool update(bool force_update = false);
+
+  // Helper functions
+  inline void *compute_address(void *base, uintptr_t offset);
+  void set_read_desc(DMA_ADDR source, UINT64 dest, UINT32 ctl_dma_len);
+  void set_write_desc(UINT64 source, DMA_ADDR dest, UINT32 ctl_dma_len);
+  void set_desc_table_header();
+  void send_dma_desc();
+  void check_last_id(UINT32 *last_id);
+  void pin_memory(PINNED_MEM *new_mem, bool prepin);
+  void unpin_memory(PINNED_MEM *old_mem);
+  void wait_finish();
+  void unpin_from_queue();
+  void prepin_memory();
+
+  void set_immediate_desc(DMA_DESC_ENTRY *desc, UINT64 addr, UINT32 data, UINT32 id);
+  void add_extra_dma_desc();
+  // Hostchannel helper function
+  void hostch_start(int channel);
+  int hostch_push_update();
+  int hostch_pull_update();
+  int hostch_buffer_lock(void *addr, size_t len, PINNED_MEM *new_mem);
+  void poll_wait();
+  void set_hostch_page_entry(HOSTCH_ENTRY *page_entry, UINT64 page_addr, UINT32 page_num);
+  void setup_dma_desc();
+  void spin_loop_ns(UINT64 wait_ns);
+
+  // From environment variable
+  int m_use_polling;
+
+  // The dma object we are currently building transactions for
+  PINNED_MEM m_active_mem;
+  PINNED_MEM m_pre_pinned_mem;
+  PINNED_MEM m_done_mem;
+
+  // Hostchannel Struct
+  HOSTCH_DESC hostch_data;
+
+  // The transaction we are currently working on
+  DMA_DESC_TABLE *m_table_virt_addr;
+  DMA_PAGE m_table_dma_addr;
+  DMA_ADDR m_table_dma_phys_addr;
+  DMA_DESC_ENTRY *m_active_descriptor;
+
+  size_t m_last_pinned_size;
+  void *m_last_pinned_addr;
+
+  // Signal to stop multiple pre-pinning from running
+  bool m_prepinned;
+
+  // Local copy of last transfer id. Read once when DMA transfer starts
+  UINT32 m_last_id;
+
+  // variables for the read/write request
+  aocl_mmd_op_t m_event;
+  size_t m_dev_addr;
+  void *m_host_addr;
+  size_t m_bytes;
+  size_t m_bytes_sent;
+  size_t m_bytes_rem;
+  bool m_read;
+  bool m_idle;
+  bool m_interrupt_disabled;
+
+  fpga_handle m_handle;
+  ACL_PCIE_DEVICE *m_pcie;
+  ACL_PCIE_MM_IO_MGR *m_io;
+  ACL_PCIE_TIMER *m_timer;
+
+  // variables needed for the threadpool and works that submitted to it
+  TP_CALLBACK_ENVIRON m_callback_env;
+  PTP_POOL m_threadpool;
+  PTP_WORK m_work;
+
+  // This variable is accessed by the callback function defined in acl_pcie_dma_windows.cpp
+  // This semaphore is intended to keep at most 1 work in queued (not running)
+  HANDLE m_workqueue_semaphore;
+
+  // Seperate thread to unpin
+
+  std::queue<QUEUE_STRUCT> m_dma_unpin_pending;
+
+  TP_CALLBACK_ENVIRON m_unpin_callback_env;
+  PTP_POOL m_unpin_threadpool;
+  PTP_WORK m_unpin_work;
+
+  // Separate thread to pre-pin
+
+  TP_CALLBACK_ENVIRON m_pin_callback_env;
+  PTP_POOL m_pin_threadpool;
+  PTP_WORK m_pin_work;
+};
+
+#endif // WINDOWS
+
+#endif // ACL_PCIE_DMA_WINDOWS_H
diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_hostch.cpp b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_hostch.cpp
new file mode 100644
index 0000000..0dc6d74
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_hostch.cpp
@@ -0,0 +1,764 @@
+// (c) 1992-2021 Intel Corporation.
+// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words
+// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S.
+// and/or other countries. Other marks and brands may be claimed as the property
+// of others. See Trademarks on intel.com for full list of Intel trademarks or
+// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera)
+// Your use of Intel Corporation's design tools, logic functions and other
+// software and tools, and its AMPP partner logic functions, and any output
+// files any of the foregoing (including device programming or simulation
+// files), and any associated documentation or information are expressly subject
+// to the terms and conditions of the Altera Program License Subscription
+// Agreement, Intel MegaCore Function License Agreement, or other applicable
+// license agreement, including, without limitation, that your use is for the
+// sole purpose of programming logic devices manufactured by Intel and sold by
+// Intel or its authorized distributors. Please refer to the applicable
+// agreement for further details.
+
+/* ===- acl_pcie_hostch.cpp ------------------------------------------ C++ -*-=== */
+/* */
+/* Intel(R) OpenCL MMD Driver */
+/* */
+/* ===-------------------------------------------------------------------------=== */
+/* */
+/* This file implements the class to handle host channel operations.           */
+/* The declaration of the class lives in the acl_pcie_hostch.h                  */
+/* On Linux the DMA work is done by the kernel driver; on Windows it is done    */
+/* by the ACL_PCIE_DMA class in acl_pcie_dma_windows.cpp.                       */
+/* */
+/* ===-------------------------------------------------------------------------=== */
+
+// common and its own header files
+#include "acl_pcie_hostch.h"
+#include "acl_pcie.h"
+
+// other header files inside MMD driver
+#include "acl_pcie_debug.h"
+#include "acl_pcie_device.h"
+#include "acl_pcie_mm_io.h"
+#include "acl_pcie_timer.h"
+#include "hw_host_channel.h"
+
+// other standard header files
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <iostream>
+
+#if defined(LINUX)
+#include <unistd.h>
+#endif // LINUX
+#if defined(WINDOWS)
+#include "acl_pcie_dma_windows.h"
+#endif // WINDOWS
+
+// Allocate 'size' bytes aligned to PAGE_SIZE into *result.
+// Linux: posix_memalign; a failure trips ACL_PCIE_ASSERT.
+// Windows: _aligned_malloc; on failure *result is NULL and the callers in
+// this file check for it.
+void acl_aligned_malloc(void **result, size_t size) {
+#if defined(LINUX)
+  int posix_success;
+  *result = NULL;
+  posix_success = posix_memalign(result, PAGE_SIZE, size);
+  ACL_PCIE_ASSERT(posix_success == 0, "posix_memalign has failed.\n");
+#endif  // LINUX
+#if defined(WINDOWS)
+  *result = _aligned_malloc(size, PAGE_SIZE);
+#endif  // WINDOWS
+}
+
+// Release memory obtained from acl_aligned_malloc (free vs _aligned_free --
+// the allocator and deallocator must match per platform).
+void acl_aligned_free(void *ptr) {
+#if defined(LINUX)
+  free(ptr);
+#endif  // LINUX
+#if defined(WINDOWS)
+  _aligned_free(ptr);
+#endif  // WINDOWS
+}
+
+// Construct the host channel manager: store the device/IO/DMA handles,
+// create the timer, and mark both channels and the sync-thread helper as not
+// yet created. Setting ACL_PCIE_DMA_TIMER in the environment enables timing.
+ACL_PCIE_HOSTCH::ACL_PCIE_HOSTCH(fpga_handle handle, ACL_PCIE_MM_IO_MGR *io, ACL_PCIE_DEVICE *pcie, ACL_PCIE_DMA *dma)
+    : m_push_queue(NULL),
+      m_push_queue_local_end_p(0),
+      m_push_queue_size(0),
+      m_pull_queue(NULL),
+      m_pull_queue_local_front_p(0),
+      m_pull_queue_size(0),
+      m_pull_queue_available(0),
+      m_pull_queue_pointer(NULL),
+      m_push_queue_pointer(NULL),
+      m_pull_queue_front_p(NULL),
+      m_pull_queue_end_p(NULL),
+      m_push_queue_front_p(NULL),
+      m_push_queue_end_p(NULL),
+      m_sync_thread(NULL) {
+  ACL_PCIE_ASSERT(handle != INVALID_HANDLE_VALUE, "passed in an invalid device when creating dma object.\n");
+  ACL_PCIE_ASSERT(io != NULL, "passed in an empty pointer for io when creating dma object.\n");
+  ACL_PCIE_ASSERT(pcie != NULL, "passed in an empty pointer for pcie when creating dma object.\n");
+  ACL_PCIE_ASSERT(dma != NULL, "passed in an empty pointer for dma when creating dma object.\n");
+
+  m_handle = handle;
+  m_pcie = pcie;
+  m_io = io;
+  m_dma = dma;
+  m_timer = new ACL_PCIE_TIMER();
+
+  // Set the valid for all the channels and helper function that checks status of driver thread
+  // to 0
+  m_hostch_push_valid = 0;
+  m_hostch_pull_valid = 0;
+  m_sync_thread_valid = 0;
+
+  // Timing instrumentation is opt-in via the environment
+  const char *dma_timer = getenv("ACL_PCIE_DMA_TIMER");
+  if (dma_timer)
+    m_use_timer = 1;
+  else
+    m_use_timer = 0;
+}
+
+// Tear down any still-open channel (Linux: ask the kernel driver; Windows:
+// call the user-mode DMA object), free the aligned buffers, and delete the
+// timer.
+ACL_PCIE_HOSTCH::~ACL_PCIE_HOSTCH() {
+  // If push channel (channel 0) is valid, reset its IP and unpin the MMD buffer
+  if (m_hostch_push_valid) {
+#if defined(LINUX)
+    struct acl_cmd driver_cmd;
+    int bytes_read;
+    // Save the device id for the selected board
+    driver_cmd.bar_id = ACLPCI_CMD_BAR;
+    driver_cmd.command = ACLPCI_CMD_HOSTCH_DESTROY_RD;
+    driver_cmd.device_addr = NULL;
+    driver_cmd.user_addr = NULL;
+    driver_cmd.size = 0;
+    bytes_read = read(m_handle, &driver_cmd, sizeof(driver_cmd));
+    ACL_PCIE_ASSERT(bytes_read != -1, "error reading driver command.\n");
+#endif  // LINUX
+#if defined(WINDOWS)
+    m_dma->hostch_destroy(ACL_HOST_CHANNEL_0_ID);
+#endif  // WINDOWS
+
+    // The buffers were unpinned by the destroy call above; now free them
+    if (m_push_queue) {
+      acl_aligned_free(m_push_queue);
+      m_push_queue = NULL;
+    }
+
+    if (m_push_queue_pointer) {
+      acl_aligned_free(m_push_queue_pointer);
+      m_push_queue_pointer = NULL;
+    }
+
+    m_hostch_push_valid = 0;
+  }
+
+  // If pull channel (channel 1) is valid, reset its IP and unpin the MMD buffer
+  if (m_hostch_pull_valid) {
+#if defined(LINUX)
+    struct acl_cmd driver_cmd;
+    int bytes_read;
+    // Save the device id for the selected board
+    driver_cmd.bar_id = ACLPCI_CMD_BAR;
+    driver_cmd.command = ACLPCI_CMD_HOSTCH_DESTROY_WR;
+    driver_cmd.device_addr = NULL;
+    driver_cmd.user_addr = NULL;
+    driver_cmd.size = 0;
+    bytes_read = read(m_handle, &driver_cmd, sizeof(driver_cmd));
+    ACL_PCIE_ASSERT(bytes_read != -1, "error reading driver command.\n");
+#endif  // LINUX
+#if defined(WINDOWS)
+    m_dma->hostch_destroy(ACL_HOST_CHANNEL_1_ID);
+#endif  // WINDOWS
+
+    if (m_pull_queue) {
+      acl_aligned_free(m_pull_queue);
+      m_pull_queue = NULL;
+    }
+
+    if (m_pull_queue_pointer) {
+      acl_aligned_free(m_pull_queue_pointer);
+      m_pull_queue_pointer = NULL;
+    }
+
+    m_hostch_pull_valid = 0;
+  }
+
+  if (m_timer) {
+    delete m_timer;
+    m_timer = NULL;
+  }
+}
+
+// Get host channel version of currently programmed device
+// Read the host channel version register of the currently programmed device.
+// Returns ACL_HOSTCH_ZERO_CHANNELS when host channels are compiled out.
+unsigned int ACL_PCIE_HOSTCH::get_hostch_version() {
+  // Make sure version is not what you expect
+  unsigned int version = ACL_VERSIONID ^ 1;
+  unsigned int hostch_version = ACL_HOSTCH_ZERO_CHANNELS ^ 1;
+
+  // Read device version
+  // NOTE(review): 'version' is read but never used afterwards -- presumably a
+  // device-alive probe; confirm or remove the read.
+  m_io->version->read32(0, &version);
+
+  if (!ACL_HOSTCH_ENABLE) {
+    return ACL_HOSTCH_ZERO_CHANNELS;
+  }
+
+  // Read hostchannel version
+  m_io->hostch_ver->read32(0, &hostch_version);
+
+  return hostch_version;
+}
+
+// Function to check that the driver thread that update host channel IP with
+// user's updates to MMD buffer's end and front index, is still running.
+// Ack call will call sync_thread() if driver thread has timed out.
+// Linux kernel space driver thread is set to timeout in 1ms
+// if there hasn't been any changes to circular buffer pointer from the host.
+// Allocate the shared sync word and register it with the driver thread.
+// Returns 0 on first successful launch, 1 if already launched, -1 if the
+// aligned allocation failed.
+int ACL_PCIE_HOSTCH::launch_sync_thread() {
+  if (m_sync_thread_valid == 0) {
+    acl_aligned_malloc((void **)&m_sync_thread, sizeof(size_t));
+
+    if (m_sync_thread == NULL) {
+      ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_BLOCKTX, ":::: [HOST CHANNEL] Internal buffer memory allocation failed.\n");
+      return -1;
+    }
+
+#if defined(LINUX)
+    // Save the device id for the selected board
+    struct acl_cmd driver_cmd;
+    int bytes_read;
+    driver_cmd.bar_id = ACLPCI_CMD_BAR;
+    driver_cmd.command = ACLPCI_CMD_HOSTCH_THREAD_SYNC;
+    driver_cmd.device_addr = NULL;
+    driver_cmd.user_addr = m_sync_thread;
+    driver_cmd.size = 0;
+    bytes_read = read(m_handle, &driver_cmd, sizeof(driver_cmd));
+    ACL_PCIE_ASSERT(bytes_read != -1, "error reading driver command.\n");
+#endif  // LINUX
+#if defined(WINDOWS)
+    m_dma->hostch_thread_sync(m_sync_thread);
+#endif  // WINDOWS
+
+    m_sync_thread_valid = 1;
+  } else {
+    return 1;
+  }
+  return 0;
+}
+
+// Re-wake the driver thread when it has gone to sleep (sync word == 0).
+// Returns 0 when a wake request was issued, 1 when nothing needed doing.
+int ACL_PCIE_HOSTCH::sync_thread() {
+  if (m_sync_thread_valid && (*m_sync_thread == 0)) {
+#if defined(LINUX)
+    // Save the device id for the selected board
+    struct acl_cmd driver_cmd;
+    int bytes_read;
+    driver_cmd.bar_id = ACLPCI_CMD_BAR;
+    driver_cmd.command = ACLPCI_CMD_HOSTCH_THREAD_SYNC;
+    driver_cmd.device_addr = NULL;
+    driver_cmd.user_addr = NULL;
+    driver_cmd.size = 0;
+    bytes_read = read(m_handle, &driver_cmd, sizeof(driver_cmd));
+    ACL_PCIE_ASSERT(bytes_read != -1, "error reading driver command.\n");
+#endif  // LINUX
+#if defined(WINDOWS)
+    m_dma->hostch_thread_sync(NULL);
+#endif  // WINDOWS
+
+    return 0;
+  }
+  return 1;
+}
+
+// This is called only when there aren't any host channels open
+// m_sync_thread is unpinned as part of destroy call to driver. Now free it.
+// Free the host-side sync word. Only called once no host channels remain
+// open; the buffer was already unpinned by the driver destroy call, so this
+// just releases the allocation and clears the bookkeeping.
+void ACL_PCIE_HOSTCH::destroy_sync_thread() {
+  if (m_sync_thread_valid) {
+    if (m_sync_thread != NULL) acl_aligned_free(m_sync_thread);
+
+    m_sync_thread_valid = 0;
+    m_sync_thread = NULL;
+  }
+}
+
+// Create host channel. Allocate circular buffer and pin it.
+// Then set channel to valid.
+// Open the named host channel: validate the name/direction against the
+// programmed device, allocate a page-aligned circular buffer (clamped to
+// [PAGE_SIZE, HOSTCH_MAX_BUF_SIZE] and rounded to a power of two), allocate
+// the front/end pointer pair, launch the sync thread, and hand everything to
+// the driver (Linux kernel driver via read(), or ACL_PCIE_DMA on Windows).
+// Returns the channel id on success, -1 on allocation failure, or one of the
+// ERROR_* codes for a bad name, direction, or an already-open channel.
+int ACL_PCIE_HOSTCH::create_hostchannel(char *name, size_t queue_depth, int direction) {
+  int status;
+  unsigned int hostch_version;
+
+  hostch_version = get_hostch_version();
+  ACL_PCIE_DEBUG_MSG_VERBOSE(
+      VERBOSITY_BLOCKTX, ":::: [HOST CHANNEL] Host Channel version read was %u\n", hostch_version);
+
+  // Check if channel name user wants to open exists
+  if ((strnlen(name, MAX_NAME_SIZE) == strnlen(ACL_HOST_CHANNEL_0_NAME, MAX_NAME_SIZE)) &&
+      (strncmp(ACL_HOST_CHANNEL_0_NAME, name, strnlen(ACL_HOST_CHANNEL_0_NAME, MAX_NAME_SIZE)) == 0)) {
+    int channel = ACL_HOST_CHANNEL_0_ID;
+    // Check if hostchannel version is one that has ACL_HOST_CHANNEL_0_ID
+    if (hostch_version != ACL_HOSTCH_TWO_CHANNELS) {
+      ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_BLOCKTX,
+                                 ":::: [HOST CHANNEL] Host Channel %s does not exist in currently programmed device.\n",
+                                 ACL_HOST_CHANNEL_0_NAME);
+      return ERROR_INVALID_CHANNEL;
+    }
+
+    // check if the direction for the channel is correct
+    if (direction != ACL_HOST_CHANNEL_0_WRITE) return ERROR_INCORRECT_DIRECTION;
+
+    // Check if channel was already opened previously
+    if (m_hostch_push_valid) {
+      ACL_PCIE_DEBUG_MSG_VERBOSE(
+          VERBOSITY_BLOCKTX, ":::: [HOST CHANNEL] Host Channel '%s' already open\n", ACL_HOST_CHANNEL_0_NAME);
+      return ERROR_CHANNEL_PREVIOUSLY_OPENED;
+    }
+
+    // Make sure the channel depth is at most 1MB, power-of-2, and divisible by page_size
+    size_t queue_depth_upper_pow2 = (size_t)pow(2, ceil(log((double)queue_depth) / log(2.)));
+    size_t channel_depth = (queue_depth_upper_pow2 >= HOSTCH_MAX_BUF_SIZE)
+                               ? HOSTCH_MAX_BUF_SIZE
+                               : queue_depth_upper_pow2 & (HOSTCH_MAX_BUF_SIZE - PAGE_SIZE);
+
+    // Make sure the channel depth is at least 4KB
+    if (!channel_depth) channel_depth = PAGE_SIZE;
+
+    // Create circular buffer for push
+    acl_aligned_malloc(&m_push_queue, channel_depth);
+
+    if (m_push_queue == NULL) {
+      ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_BLOCKTX, ":::: [HOST CHANNEL] Internal buffer memory allocation failed.\n");
+      return -1;
+    }
+
+    // Create buffer to hold front and end pointer for the circular buffer
+    acl_aligned_malloc((void **)&m_push_queue_pointer, sizeof(size_t) * 2);
+
+    if (m_push_queue_pointer == NULL) {
+      ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_BLOCKTX, ":::: [HOST CHANNEL] Internal buffer memory allocation failed.\n");
+      acl_aligned_free(m_push_queue);
+      return -1;
+    }
+
+    // Set parameters for the push channel
+    m_push_queue_size = channel_depth;
+    m_push_queue_local_end_p = 0;
+
+    m_push_queue_front_p = m_push_queue_pointer;
+    m_push_queue_end_p = (m_push_queue_pointer + 1);
+
+    *m_push_queue_front_p = 0;
+    *m_push_queue_end_p = 0;
+
+    // sync_thread() used to check if kernel thread is still running when user has additional data available.
+    status = launch_sync_thread();
+    if (status == -1) {
+      acl_aligned_free(m_push_queue);
+      acl_aligned_free(m_push_queue_pointer);
+      return -1;
+    }
+
+#if defined(LINUX)
+    struct acl_cmd driver_cmd;
+    int bytes_read;
+    // Send the pointers for the 2 buffers to driver, along with queue size
+    driver_cmd.bar_id = ACLPCI_CMD_BAR;
+    driver_cmd.command = ACLPCI_CMD_HOSTCH_CREATE_RD;
+    driver_cmd.device_addr = m_push_queue_pointer;
+    driver_cmd.user_addr = m_push_queue;
+    driver_cmd.size = channel_depth;
+    bytes_read = read(m_handle, &driver_cmd, sizeof(driver_cmd));
+    ACL_PCIE_ASSERT(bytes_read != -1, "error reading driver command.\n");
+#endif  // LINUX
+#if defined(WINDOWS)
+    // NOTE(review): the return value of hostch_create is ignored here, so a
+    // failed Windows-side create still reports success to the caller.
+    m_dma->hostch_create(m_push_queue, m_push_queue_pointer, channel_depth, channel);
+#endif  // WINDOWS
+
+    m_hostch_push_valid = 1;
+    return channel;
+  } else if ((strnlen(name, MAX_NAME_SIZE) == strnlen(ACL_HOST_CHANNEL_1_NAME, MAX_NAME_SIZE)) &&
+             (strncmp(ACL_HOST_CHANNEL_1_NAME, name, strnlen(ACL_HOST_CHANNEL_1_NAME, MAX_NAME_SIZE)) == 0)) {
+    int channel = ACL_HOST_CHANNEL_1_ID;
+
+    // Check if hostchannel version is one that has ACL_HOST_CHANNEL_1_ID
+    if (hostch_version != ACL_HOSTCH_TWO_CHANNELS) {
+      ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_BLOCKTX,
+                                 ":::: [HOST CHANNEL] Host Channel %s does not exist in currently programmed device.\n",
+                                 ACL_HOST_CHANNEL_1_NAME);
+      return ERROR_INVALID_CHANNEL;
+    }
+
+    // Check if direction is correct
+    if (direction != ACL_HOST_CHANNEL_1_WRITE) return ERROR_INCORRECT_DIRECTION;
+
+    // Make sure the channel depth is at most 1MB, power-of-2, and divisible by page_size
+    size_t queue_depth_upper_pow2 = (size_t)pow(2, ceil(log((double)queue_depth) / log(2.)));
+    size_t channel_depth = (queue_depth_upper_pow2 >= HOSTCH_MAX_BUF_SIZE)
+                               ? HOSTCH_MAX_BUF_SIZE
+                               : queue_depth_upper_pow2 & (HOSTCH_MAX_BUF_SIZE - PAGE_SIZE);
+
+    // Make sure the circular buffer is at least 4KB
+    if (!channel_depth) channel_depth = PAGE_SIZE;
+
+    // Check if pull channel was previously opened
+    // NOTE(review): this check happens after the depth computation, whereas
+    // the push path checks before it -- harmless, but inconsistent ordering.
+    if (m_hostch_pull_valid) {
+      ACL_PCIE_DEBUG_MSG_VERBOSE(
+          VERBOSITY_BLOCKTX, ":::: [HOST CHANNEL] Host Channel '%s' already open\n", ACL_HOST_CHANNEL_1_NAME);
+      return ERROR_CHANNEL_PREVIOUSLY_OPENED;
+    }
+
+    // Create circular buffer
+    acl_aligned_malloc(&m_pull_queue, channel_depth);
+
+    if (m_pull_queue == NULL) {
+      ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_BLOCKTX, ":::: [HOST CHANNEL] Internal buffer memory allocation failed.\n");
+      return -1;
+    }
+
+    // Create buffer to hold front and end pointer of the circular buffer
+    acl_aligned_malloc((void **)&m_pull_queue_pointer, sizeof(size_t) * 2);
+
+    if (m_pull_queue_pointer == NULL) {
+      ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_BLOCKTX, ":::: [HOST CHANNEL] Internal buffer memory allocation failed.\n");
+      acl_aligned_free(m_pull_queue);
+      return -1;
+    }
+
+    // Set pull channel parameters
+    m_pull_queue_size = channel_depth;
+    m_pull_queue_available = 0;
+    m_pull_queue_local_front_p = 0;
+
+    m_pull_queue_front_p = m_pull_queue_pointer;
+    m_pull_queue_end_p = (m_pull_queue_pointer + 1);
+
+    *m_pull_queue_front_p = 0;
+    *m_pull_queue_end_p = 0;
+
+    // sync_thread() used to check if kernel thread is dead or alive when user pulls data
+    status = launch_sync_thread();
+    if (status == -1) {
+      acl_aligned_free(m_pull_queue);
+      acl_aligned_free(m_pull_queue_pointer);
+      return -1;
+    }
+
+#if defined(LINUX)
+    // Send the pointers for the 2 buffers to driver, along with queue size, and initiate IP
+    struct acl_cmd driver_cmd;
+    int bytes_read;
+    driver_cmd.bar_id = ACLPCI_CMD_BAR;
+    driver_cmd.command = ACLPCI_CMD_HOSTCH_CREATE_WR;
+    driver_cmd.device_addr = m_pull_queue_pointer;
+    driver_cmd.user_addr = m_pull_queue;
+    driver_cmd.size = channel_depth;
+    bytes_read = read(m_handle, &driver_cmd, sizeof(driver_cmd));
+    ACL_PCIE_ASSERT(bytes_read != -1, "error reading driver command.\n");
+#endif  // LINUX
+#if defined(WINDOWS)
+    m_dma->hostch_create(m_pull_queue, m_pull_queue_pointer, channel_depth, channel);
+#endif  // WINDOWS
+
+    m_hostch_pull_valid = 1;
+    return channel;
+  } else {
+    ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_BLOCKTX, ":::: [HOST CHANNEL] Channel does not exist.\n");
+    return ERROR_INVALID_CHANNEL;
+  }
+}
+
+// Destroy a host channel previously returned by create_hostchannel().
+// Resets the channel IP (via a driver command on Linux, via the DMA engine on
+// Windows), frees the MMD circular buffer and its shared front/end pointer
+// pair, and tears down the sync thread once BOTH channels are closed.
+// Returns 0 on success, ERROR_CHANNEL_CLOSED if the channel was not open,
+// ERROR_INVALID_CHANNEL if the id is unknown.
+int ACL_PCIE_HOSTCH::destroy_hostchannel(int channel) {
+  if (channel == ACL_HOST_CHANNEL_0_ID) {
+    if (m_hostch_push_valid) {
+      // set pull IP to reset and unlock all buffers
+#if defined(LINUX)
+      struct acl_cmd driver_cmd;
+      int bytes_read;
+      driver_cmd.bar_id = ACLPCI_CMD_BAR;
+      driver_cmd.command = ACLPCI_CMD_HOSTCH_DESTROY_RD;
+      driver_cmd.device_addr = NULL;
+      driver_cmd.user_addr = NULL;
+      driver_cmd.size = 0;
+      bytes_read = read(m_handle, &driver_cmd, sizeof(driver_cmd));
+      ACL_PCIE_ASSERT(bytes_read != -1, "error reading driver command.\n");
+#endif // LINUX
+#if defined(WINDOWS)
+      m_dma->hostch_destroy(channel);
+#endif // WINDOWS
+
+      // Release the circular buffer and the front/end pointer block.
+      if (m_push_queue) {
+        acl_aligned_free(m_push_queue);
+        m_push_queue = NULL;
+      }
+      if (m_push_queue_pointer) {
+        acl_aligned_free(m_push_queue_pointer);
+        m_push_queue_pointer = NULL;
+      }
+
+      m_hostch_push_valid = 0;
+      // The sync thread is shared by both channels; stop it only when the
+      // other direction is closed too.
+      if (m_hostch_pull_valid == 0) {
+        destroy_sync_thread();
+      }
+      return 0;
+    } else {
+      ACL_PCIE_DEBUG_MSG_VERBOSE(
+          VERBOSITY_BLOCKTX, ":::: [HOST CHANNEL] Host Channel %s is not open.\n", ACL_HOST_CHANNEL_0_NAME);
+      return ERROR_CHANNEL_CLOSED;
+    }
+  } else if (channel == ACL_HOST_CHANNEL_1_ID) {
+    if (m_hostch_pull_valid) {
+#if defined(LINUX)
+      // set push IP to reset and unlock all buffers
+      struct acl_cmd driver_cmd;
+      int bytes_read;
+      driver_cmd.bar_id = ACLPCI_CMD_BAR;
+      driver_cmd.command = ACLPCI_CMD_HOSTCH_DESTROY_WR;
+      driver_cmd.device_addr = NULL;
+      driver_cmd.user_addr = NULL;
+      driver_cmd.size = 0;
+      bytes_read = read(m_handle, &driver_cmd, sizeof(driver_cmd));
+      ACL_PCIE_ASSERT(bytes_read != -1, "error reading driver command.\n");
+#endif // LINUX
+#if defined(WINDOWS)
+      m_dma->hostch_destroy(channel);
+#endif // WINDOWS
+
+      // Release the circular buffer and the front/end pointer block.
+      if (m_pull_queue) {
+        acl_aligned_free(m_pull_queue);
+        m_pull_queue = NULL;
+      }
+
+      if (m_pull_queue_pointer) {
+        acl_aligned_free(m_pull_queue_pointer);
+        m_pull_queue_pointer = NULL;
+      }
+
+      m_hostch_pull_valid = 0;
+
+      // Stop the shared sync thread only when both directions are closed.
+      if (m_hostch_push_valid == 0) {
+        destroy_sync_thread();
+      }
+
+      return 0;
+    } else {
+      ACL_PCIE_DEBUG_MSG_VERBOSE(
+          VERBOSITY_BLOCKTX, ":::: [HOST CHANNEL] Host Channel %s is not open.\n", ACL_HOST_CHANNEL_1_NAME);
+      return ERROR_CHANNEL_CLOSED;
+    }
+  } else {
+    ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_BLOCKTX, ":::: [HOST CHANNEL] Channel with ID %i does not exist.\n", channel);
+  }
+
+  return ERROR_INVALID_CHANNEL;
+}
+
+// Call for user to get pointer to location in circular buffer.
+// User can then write data (push channel 0) or read data (pull channel 1)
+// from the buffer, depending on direction.
+// On return *buffer_size holds the number of contiguous bytes usable at the
+// returned pointer (no wrap-around); NULL is returned with *buffer_size == 0
+// when nothing is available, or on error with *status set accordingly.
+void *ACL_PCIE_HOSTCH::get_buffer(size_t *buffer_size, int channel, int *status) {
+  // Check if channel exists
+  if (channel == ACL_HOST_CHANNEL_0_ID) {
+    // Check if channel was created
+    if (m_hostch_push_valid == 0) {
+      ACL_PCIE_DEBUG_MSG_VERBOSE(
+          VERBOSITY_BLOCKTX, ":::: [HOST CHANNEL] Host Channel %s is not open.\n", ACL_HOST_CHANNEL_0_NAME);
+      *status = ERROR_CHANNEL_CLOSED;
+      *buffer_size = 0;
+      return NULL;
+    }
+    *status = 0;
+
+    char *temp_input_queue = (char *)m_push_queue;
+
+    size_t push_queue_end, push_queue_front;
+
+    // m_push_queue_front_p is directly updated by host channel IP
+    // through write over Txs. Save value in local variable,
+    // so it doesn't get modified in middle of get_buffer call
+    push_queue_end = *m_push_queue_end_p;
+    push_queue_front = *m_push_queue_front_p;
+
+    // Calculate available free space in host to device push buffer
+    // NOTE(review): 32 bytes are always held back so a completely full buffer
+    // is distinguishable from an empty one (front == end) — presumably one IP
+    // word; confirm against the host channel IP spec.
+    size_t push_buf_avail;
+    if (push_queue_end > push_queue_front)
+      push_buf_avail = m_push_queue_size - push_queue_end + push_queue_front - 32;
+    else if (push_queue_end < push_queue_front)
+      push_buf_avail = push_queue_front - push_queue_end - 32;
+    else
+      push_buf_avail = m_push_queue_size - 32;
+
+    // Calculate how much of the free space is before loop around and after loop around
+    size_t cont_push = (m_push_queue_size > m_push_queue_local_end_p + push_buf_avail)
+                           ? push_buf_avail
+                           : m_push_queue_size - m_push_queue_local_end_p;
+    size_t loop_push = (m_push_queue_size > m_push_queue_local_end_p + push_buf_avail)
+                           ? 0
+                           : (m_push_queue_local_end_p + push_buf_avail - m_push_queue_size);
+
+    // Return to user the pointer to circular buffer for
+    // space that's available without loop around
+    if (cont_push > 0) {
+      *buffer_size = cont_push;
+      return temp_input_queue + m_push_queue_local_end_p;
+    } else if (loop_push > 0) {
+      *buffer_size = loop_push;
+      return temp_input_queue;
+    } else {
+      // No space free right now (status already 0: not an error, just empty).
+      *status = 0;
+      *buffer_size = 0;
+
+      // See if the driver thread is still running
+      sync_thread();
+
+      return NULL;
+    }
+  } else if (channel == ACL_HOST_CHANNEL_1_ID) {
+    if (m_hostch_pull_valid == 0) {
+      ACL_PCIE_DEBUG_MSG_VERBOSE(
+          VERBOSITY_BLOCKTX, ":::: [HOST CHANNEL] Host Channel %s is not open.\n", ACL_HOST_CHANNEL_1_NAME);
+      *status = ERROR_CHANNEL_CLOSED;
+      *buffer_size = 0;
+      return NULL;
+    }
+    *status = 0;
+
+    char *temp_output_queue = (char *)m_pull_queue;
+
+    size_t pull_queue_end, pull_queue_front;
+
+    // m_pull_queue_end_p is directly updated by host channel IP
+    // through write over Txs. Save value in local variable,
+    // so it doesn't get modified in middle of get_buffer call
+    pull_queue_end = *m_pull_queue_end_p;
+    pull_queue_front = *m_pull_queue_front_p;
+
+    // Calculate available new data in device to host pull buffer
+    if (pull_queue_end > pull_queue_front)
+      m_pull_queue_available = pull_queue_end - pull_queue_front;
+    else if (pull_queue_end < pull_queue_front)
+      m_pull_queue_available = m_pull_queue_size - pull_queue_front + pull_queue_end;
+    else
+      m_pull_queue_available = 0;
+
+    // Calculate how much of the data is before loop around and after loop around
+    size_t cont_pull = (m_pull_queue_size > m_pull_queue_local_front_p + m_pull_queue_available)
+                           ? m_pull_queue_available
+                           : (m_pull_queue_size - m_pull_queue_local_front_p);
+    size_t loop_pull = (m_pull_queue_size > m_pull_queue_local_front_p + m_pull_queue_available)
+                           ? 0
+                           : (m_pull_queue_local_front_p + m_pull_queue_available - m_pull_queue_size);
+
+    // Return to user the pointer to circular buffer for
+    // data that's available without loop around
+    if (cont_pull > 0) {
+      *buffer_size = cont_pull;
+      return temp_output_queue + m_pull_queue_local_front_p;
+    } else if (loop_pull > 0) {
+      *buffer_size = loop_pull;
+      return temp_output_queue;
+    } else {
+      // No data available right now (not an error).
+      *buffer_size = 0;
+
+      // See if the driver thread is still running
+      sync_thread();
+
+      return NULL;
+    }
+  } else {
+    ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_BLOCKTX, ":::: [HOST CHANNEL] Channel with ID %i does not exist.\n", channel);
+    *status = ERROR_INVALID_CHANNEL;
+    *buffer_size = 0;
+    return NULL;
+  }
+}
+
+// User has acknowledged the buffer, meaning data was written to or read from the buffer.
+// Hand off to API using end pointer if push channel, and front pointer if pull channel.
+// send_size is the byte count the user claims to have produced/consumed; the
+// return value is the number of bytes actually acknowledged (clamped to the
+// space/data available, and to the end of the circular buffer — no wrap).
+size_t ACL_PCIE_HOSTCH::ack_buffer(size_t send_size, int channel, int *status) {
+  if (channel == ACL_HOST_CHANNEL_0_ID) {
+    if (m_hostch_push_valid == 0) {
+      ACL_PCIE_DEBUG_MSG_VERBOSE(
+          VERBOSITY_BLOCKTX, ":::: [HOST CHANNEL] Host Channel %s is not open.\n", ACL_HOST_CHANNEL_0_NAME);
+      *status = ERROR_CHANNEL_CLOSED;
+      return 0;
+    }
+    *status = 0;
+
+    size_t push_queue_end, push_queue_front;
+
+    // Same calculations as get buffer call to see how much
+    // space is available in MMD circular buffer
+    push_queue_end = *m_push_queue_end_p;
+    push_queue_front = *m_push_queue_front_p;
+
+    size_t push_buf_avail;
+    if (push_queue_end > push_queue_front)
+      push_buf_avail = m_push_queue_size - push_queue_end + push_queue_front - 32;
+    else if (push_queue_end < push_queue_front)
+      push_buf_avail = push_queue_front - push_queue_end - 32;
+    else
+      push_buf_avail = m_push_queue_size - 32;
+
+    // Check to see if user wants to send more than the space available in buffer
+    // Chose lesser of the two to send
+    // NOTE(review): send_size is rounded down to a multiple of 32 — presumably
+    // the IP word size; confirm against the host channel IP spec.
+    size_t user_words = send_size / 32;
+    size_t current_push = ((user_words * 32) > push_buf_avail) ? push_buf_avail : (user_words * 32);
+
+    // User can't write back to beginning of MMD buffer, since they can't loop around from the pointer
+    // they got from get_buffer. Only send up to the end of MMD circular buffer to host channel IP
+    size_t cont_push = (m_push_queue_size > m_push_queue_local_end_p + current_push)
+                           ? current_push
+                           : (m_push_queue_size - m_push_queue_local_end_p);
+
+    // Update the end index that the driver thread will read, to write the update to host channel IP
+    // and loop around
+    m_push_queue_local_end_p =
+        (m_push_queue_local_end_p + current_push >= m_push_queue_size) ? 0 : m_push_queue_local_end_p + current_push;
+    *m_push_queue_end_p = m_push_queue_local_end_p;
+
+    // See if the driver thread is still running
+    sync_thread();
+
+    return cont_push;
+  } else if (channel == ACL_HOST_CHANNEL_1_ID) {
+    if (m_hostch_pull_valid == 0) {
+      ACL_PCIE_DEBUG_MSG_VERBOSE(
+          VERBOSITY_BLOCKTX, ":::: [HOST CHANNEL] Host Channel %s is not open.\n", ACL_HOST_CHANNEL_1_NAME);
+      *status = ERROR_CHANNEL_CLOSED;
+      return 0;
+    }
+    *status = 0;
+
+    size_t driver_pulled;
+
+    size_t pull_queue_end, pull_queue_front;
+
+    // Same calculations as get buffer call to see how much
+    // data is available in MMD circular buffer
+    pull_queue_end = *m_pull_queue_end_p;
+    pull_queue_front = *m_pull_queue_front_p;
+
+    if (pull_queue_end > pull_queue_front)
+      m_pull_queue_available = pull_queue_end - pull_queue_front;
+    else if (pull_queue_end < pull_queue_front)
+      m_pull_queue_available = m_pull_queue_size - pull_queue_front + pull_queue_end;
+    else
+      m_pull_queue_available = 0;
+
+    // Check to see if user read more than the data available in buffer
+    // Chose lesser of the two to tell the user how much was actually
+    // freed up for host channel IP to write to.
+    driver_pulled = (send_size > m_pull_queue_available) ? m_pull_queue_available : send_size;
+
+    // User can't loop around and read from the beginning of MMD buffer
+    // Tell the host channel IP that the buffer is free, only up to the end of the circular buffer
+    size_t cont_pull = (m_pull_queue_size > m_pull_queue_local_front_p + driver_pulled)
+                           ? driver_pulled
+                           : (m_pull_queue_size - m_pull_queue_local_front_p);
+
+    // Update the front index that the driver thread will read, to write the update to host channel IP
+    // and loop around
+    m_pull_queue_local_front_p = (m_pull_queue_local_front_p + driver_pulled >= m_pull_queue_size)
+                                     ? 0
+                                     : m_pull_queue_local_front_p + driver_pulled;
+    *m_pull_queue_front_p = m_pull_queue_local_front_p;
+
+    // See if the driver thread is still running
+    sync_thread();
+
+    return cont_pull;
+  } else {
+    ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_BLOCKTX, ":::: [HOST CHANNEL] Channel with ID %i does not exist.\n", channel);
+    *status = ERROR_INVALID_CHANNEL;
+    return 0;
+  }
+}
diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_hostch.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_hostch.h
new file mode 100644
index 0000000..e86fa61
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_hostch.h
@@ -0,0 +1,136 @@
+#ifndef ACL_PCIE_HOSTCH_H
+#define ACL_PCIE_HOSTCH_H
+
+/* (c) 1992-2021 Intel Corporation. */
+/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */
+/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */
+/* and/or other countries. Other marks and brands may be claimed as the property */
+/* of others. See Trademarks on intel.com for full list of Intel trademarks or */
+/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */
+/* Your use of Intel Corporation's design tools, logic functions and other */
+/* software and tools, and its AMPP partner logic functions, and any output */
+/* files any of the foregoing (including device programming or simulation */
+/* files), and any associated documentation or information are expressly subject */
+/* to the terms and conditions of the Altera Program License Subscription */
+/* Agreement, Intel MegaCore Function License Agreement, or other applicable */
+/* license agreement, including, without limitation, that your use is for the */
+/* sole purpose of programming logic devices manufactured by Intel and sold by */
+/* Intel or its authorized distributors. Please refer to the applicable */
+/* agreement for further details. */
+
+/* ===- acl_pcie_hostch.h -------------------------------------------- C++ -*-=== */
+/* */
+/* Intel(R) OpenCL MMD Driver */
+/* */
+/* ===-------------------------------------------------------------------------=== */
+/* */
+/*  This file declares the class to handle OpenCL host channels over PCIe.       */
+/*  The actual implementation of the class lives in acl_pcie_hostch.cpp          */
+/* */
+/* ===-------------------------------------------------------------------------=== */
+
+#ifdef DLA_MMD
+#include <cstddef> //size_t
+#if defined(LINUX)
+typedef int fpga_handle;
+#else
+#include <opae/fpga.h>
+#endif
+#endif
+
+class ACL_PCIE_DEVICE;
+class ACL_PCIE_MM_IO_MGR;
+class ACL_PCIE_TIMER;
+class ACL_PCIE_DMA;
+
+// Manages the host-channel circular buffers shared between user space, the
+// kernel driver's sync thread, and the on-device host channel IP.
+class ACL_PCIE_HOSTCH {
+ public:
+  ACL_PCIE_HOSTCH(fpga_handle handle, ACL_PCIE_MM_IO_MGR *io, ACL_PCIE_DEVICE *pcie, ACL_PCIE_DMA *dma);
+
+  ~ACL_PCIE_HOSTCH();
+
+  // Initialize host channel specified by name, and return handle to it
+  int create_hostchannel(char *name, size_t queue_depth, int direction);
+
+  // Destroy host channel specified by channel handle
+  // return 0 on success and negative otherwise
+  int destroy_hostchannel(int channel);
+
+  // Provide pointer to user with pointer to write and read to host channel
+  // IP with. Pointer is pointer to MMD circular buffer, that's pre-pinned.
+  // Address of this pre-pinned memory is transferred to IP during create
+  void *get_buffer(size_t *buffer_size, int channel, int *status);
+
+  // Acknowledge from user that send_size bytes of data has be written to
+  // or read from host channel MMD buffer, that's provided by the channel
+  // handle. This will move end index for push channel, and front index for
+  // pull channel
+  size_t ack_buffer(size_t send_size, int channel, int *status);
+
+ private:
+  // Copying is intentionally disabled: declared private with no-op bodies
+  // (pre-C++11 non-copyable idiom), so callers can never duplicate the
+  // pinned-buffer ownership this class holds.
+  ACL_PCIE_HOSTCH &operator=(const ACL_PCIE_HOSTCH &) { return *this; }
+
+  ACL_PCIE_HOSTCH(const ACL_PCIE_HOSTCH &src) {}
+
+  // Host Channel version of programmed device
+  unsigned int get_hostch_version();
+
+  // Helper functions to see if the thread that updates
+  // host channel IP with user's buffer updates, is still running
+  int launch_sync_thread();
+  int sync_thread();
+  void destroy_sync_thread();
+
+  fpga_handle m_handle;
+  ACL_PCIE_DEVICE *m_pcie;
+  ACL_PCIE_MM_IO_MGR *m_io;
+  ACL_PCIE_DMA *m_dma;
+
+  ACL_PCIE_TIMER *m_timer;
+  int m_use_timer;
+
+  // Host Channel valid
+  // If channel is open, equal to 1
+  int m_hostch_push_valid;
+  int m_hostch_pull_valid;
+
+  // Input Queue
+  // Write data into circular buffer in MMD, that host channel
+  // can read from
+  void *m_push_queue;
+  size_t m_push_queue_local_end_p;
+  size_t m_push_queue_size;
+
+  // Information to track input queue
+  void *m_pull_queue;
+  size_t m_pull_queue_local_front_p;
+  size_t m_pull_queue_size;
+  size_t m_pull_queue_available;
+
+  // Shared front and end pointer with driver
+  // Circular buffer in MMD that the host channel IP can
+  // write into. Host will then read from it
+  size_t *m_pull_queue_pointer;
+  size_t *m_push_queue_pointer;
+
+  size_t *m_pull_queue_front_p;
+  size_t *m_pull_queue_end_p;
+  size_t *m_push_queue_front_p;
+  size_t *m_push_queue_end_p;
+
+  // User space memory that Linux kernel space has write
+  // access to. Since the MMD buffer is circular, whenever
+  // user writes to reads from it, the index for end and front
+  // changes, respectively. This needs to be sent to host channel IP
+  // and the thread in driver handles that. However, this thread will
+  // die after 1ms of inactivity to free up the CPU. When it does that,
+  // it will write to m_sync_thread with value of 0, so that MMD knows to
+  // launch it again, for subsequent get_buffer and ack_buffer calls.
+  int m_sync_thread_valid;
+  size_t *m_sync_thread;
+};
+
+// Page-aligned allocation helpers used for the pinned host-channel buffers.
+// acl_aligned_malloc stores the new block in *result (NULL left on failure —
+// callers check); acl_aligned_free releases a block obtained that way.
+void acl_aligned_malloc(void **result, size_t size);
+void acl_aligned_free(void *ptr);
+
+#endif // ACL_PCIE_HOSTCH_H
diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_mm_io.cpp b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_mm_io.cpp
new file mode 100644
index 0000000..92c9cf0
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_mm_io.cpp
@@ -0,0 +1,556 @@
+// (c) 1992-2021 Intel Corporation.
+// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words
+// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S.
+// and/or other countries. Other marks and brands may be claimed as the property
+// of others. See Trademarks on intel.com for full list of Intel trademarks or
+// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera)
+// Your use of Intel Corporation's design tools, logic functions and other
+// software and tools, and its AMPP partner logic functions, and any output
+// files any of the foregoing (including device programming or simulation
+// files), and any associated documentation or information are expressly subject
+// to the terms and conditions of the Altera Program License Subscription
+// Agreement, Intel MegaCore Function License Agreement, or other applicable
+// license agreement, including, without limitation, that your use is for the
+// sole purpose of programming logic devices manufactured by Intel and sold by
+// Intel or its authorized distributors. Please refer to the applicable
+// agreement for further details.
+
+/* ===- acl_pcie_mm_io.cpp ------------------------------------------- C++ -*-=== */
+/* */
+/* Intel(R) OpenCL MMD Driver */
+/* */
+/* ===-------------------------------------------------------------------------=== */
+/* */
+/* This file implements the class to handle memory mapped IO over PCIe. */
+/* The declaration of the class lives in the acl_pcie_mm_io.h. */
+/* */
+/* ===-------------------------------------------------------------------------=== */
+
+// common and its own header files
+#include "acl_pcie_mm_io.h"
+#include "acl_pcie.h"
+
+// other header files inside MMD driver
+#include "acl_pcie_debug.h"
+
+// other standard header files
+#include <string.h>
+
+#if defined(LINUX)
+#include <unistd.h> // template
+#endif // LINUX
+
+// Construct a memory-mapped IO window over one PCIe BAR region.
+//   handle        - open device handle (must be valid)
+//   bar           - PCIe BAR id this window lives in
+//   device_offset - byte offset of the window within the BAR
+//   name          - short label used in debug/error messages (truncated to fit)
+//   diff_endian   - true when host and device memory differ in endianness
+ACL_PCIE_MM_IO_DEVICE::ACL_PCIE_MM_IO_DEVICE(
+    fpga_handle handle, DWORD bar, KPTR device_offset, const char *name, bool diff_endian) {
+  ACL_PCIE_ASSERT(handle != INVALID_HANDLE_VALUE, "passed in an invalid handle when creating mm_io object.\n");
+
+#if defined(WINDOWS)
+  strncpy_s(m_name, MAX_NAME_LENGTH - 1, name, (MAX_NAME_LENGTH - 1));
+#else
+  strncpy(m_name, name, (MAX_NAME_LENGTH - 1));
+#endif
+  // Guarantee NUL termination even when the source name was truncated.
+  m_name[(MAX_NAME_LENGTH - 1)] = '\0';
+
+  m_handle = handle;
+  m_bar = bar;
+  m_offset = device_offset;
+  m_diff_endian = diff_endian;
+
+  // Fixed format specifier: the value carries a "0x" prefix, so print it in
+  // hex (%zx) instead of decimal (%zu) to avoid misleading debug output.
+  ACL_PCIE_DEBUG_MSG(":: [%s] Init: Bar " DWORD_FMT_U ", Total offset 0x%zx, diff_endian is %d \n",
+                     m_name,
+                     m_bar,
+                     (size_t)m_offset,
+                     m_diff_endian ? 1 : 0);
+}
+
+ACL_PCIE_MM_IO_DEVICE::~ACL_PCIE_MM_IO_DEVICE() {}
+
+#if defined(LINUX)
+// Helper functions to implement all other read/write functions
+// Read sizeof(T) bytes from the device through the kernel driver.
+// Returns whatever the driver's read() returns (-1 on failure).
+template <typename T>
+DWORD linux_read(fpga_handle device, DWORD bar, KPTR address, T *data) {
+  // Zero-initialize so any acl_cmd fields this helper does not set are
+  // well-defined when handed to the driver (consistent with write_block's
+  // "driver_cmd {}" initialization).
+  struct acl_cmd driver_cmd {};
+  driver_cmd.bar_id = bar;
+  driver_cmd.command = ACLPCI_CMD_DEFAULT;
+  driver_cmd.device_addr = reinterpret_cast<void *>(address);
+  driver_cmd.user_addr = data;
+  driver_cmd.size = sizeof(*data);
+  // function invoke linux_read will not write to global memory.
+  // So is_diff_endian is always false
+  driver_cmd.is_diff_endian = 0;
+
+  return read(device, &driver_cmd, sizeof(driver_cmd));
+}
+
+// Write sizeof(T) bytes to the device through the kernel driver.
+// Returns whatever the driver's write() returns (-1 on failure).
+template <typename T>
+DWORD linux_write(fpga_handle device, DWORD bar, KPTR address, T data) {
+  // Zero-initialize so any acl_cmd fields this helper does not set are
+  // well-defined when handed to the driver (consistent with write_block's
+  // "driver_cmd {}" initialization).
+  struct acl_cmd driver_cmd {};
+  driver_cmd.bar_id = bar;
+  driver_cmd.command = ACLPCI_CMD_DEFAULT;
+  driver_cmd.device_addr = reinterpret_cast<void *>(address);
+  driver_cmd.user_addr = &data;
+  driver_cmd.size = sizeof(data);
+  // function invoke linux_write will not write to global memory.
+  // So is_diff_endian is always false
+  driver_cmd.is_diff_endian = 0;
+
+  return write(device, &driver_cmd, sizeof(driver_cmd));
+}
+#endif // LINUX
+
+// Read one byte from addr (offset relative to this window).
+// Returns 0 on success, -1 on error.
+int ACL_PCIE_MM_IO_DEVICE::read8(size_t addr, UINT8 *data) {
+  fpga_result status;
+  KPTR bar_addr = convert_to_bar_addr(addr);
+#if defined(WINDOWS)
+  status = fpgaReadMmio(m_handle, m_bar, bar_addr, (PVOID)data, sizeof(UINT8));
+#endif // WINDOWS
+#if defined(LINUX)
+  status = linux_read(m_handle, m_bar, bar_addr, data);
+#endif // LINUX
+
+  ACL_PCIE_ERROR_IF(status != FPGA_OK,
+                    return -1,
+                    "[%s] Read 8 bits from 0x" SIZE_FMT_X " (0x" SIZE_FMT_X " with offset)\n",
+                    m_name,
+                    addr,
+                    (size_t)bar_addr);
+  ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_PCIE,
+                             ":::::: [%s] Read 8 bits (0x%x) from 0x" SIZE_FMT_X " (0x" SIZE_FMT_X " with offset)\n",
+                             m_name,
+                             *data,
+                             addr,
+                             (size_t)bar_addr);
+
+  return 0; // success
+}
+
+// Write one byte to addr (offset relative to this window).
+// Returns 0 on success, -1 on error.
+int ACL_PCIE_MM_IO_DEVICE::write8(size_t addr, UINT8 data) {
+  fpga_result status;
+  KPTR bar_addr = convert_to_bar_addr(addr);
+#if defined(WINDOWS)
+  status = fpgaWriteMmio(m_handle, m_bar, bar_addr, (PVOID)&data, sizeof(UINT8));
+#endif // WINDOWS
+#if defined(LINUX)
+  status = linux_write(m_handle, m_bar, bar_addr, data);
+#endif // LINUX
+
+  ACL_PCIE_ERROR_IF(status != FPGA_OK,
+                    return -1,
+                    "[%s] Writing 8 bits to 0x" SIZE_FMT_X " (0x" SIZE_FMT_X " with offset)\n",
+                    m_name,
+                    addr,
+                    (size_t)bar_addr);
+  ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_PCIE,
+                             ":::::: [%s] Wrote 8 bits (0x%x) to 0x" SIZE_FMT_X " (0x" SIZE_FMT_X " with offset)\n",
+                             m_name,
+                             data,
+                             addr,
+                             (size_t)bar_addr);
+
+  return 0; // success
+}
+
+// Read a 16-bit value from addr (offset relative to this window).
+// Returns 0 on success, -1 on error.
+int ACL_PCIE_MM_IO_DEVICE::read16(size_t addr, UINT16 *data) {
+  fpga_result status;
+  KPTR bar_addr = convert_to_bar_addr(addr);
+#if defined(WINDOWS)
+  status = fpgaReadMmio(m_handle, m_bar, bar_addr, (PVOID)data, sizeof(UINT16));
+#endif // WINDOWS
+#if defined(LINUX)
+  status = linux_read(m_handle, m_bar, bar_addr, data);
+#endif // LINUX
+
+  ACL_PCIE_ERROR_IF(status != FPGA_OK,
+                    return -1,
+                    "[%s] Read 16 bits from 0x" SIZE_FMT_X " (0x" SIZE_FMT_X " with offset)\n",
+                    m_name,
+                    addr,
+                    (size_t)bar_addr);
+  ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_PCIE,
+                             ":::::: [%s] Read 16 bits (0x%x) from 0x" SIZE_FMT_X " (0x" SIZE_FMT_X " with offset)\n",
+                             m_name,
+                             *data,
+                             addr,
+                             (size_t)bar_addr);
+
+  return 0; // success
+}
+
+// Write a 16-bit value to addr (offset relative to this window).
+// Returns 0 on success, -1 on error.
+int ACL_PCIE_MM_IO_DEVICE::write16(size_t addr, UINT16 data) {
+  fpga_result status;
+  KPTR bar_addr = convert_to_bar_addr(addr);
+#if defined(WINDOWS)
+  status = fpgaWriteMmio(m_handle, m_bar, bar_addr, (PVOID)&data, sizeof(UINT16));
+#endif // WINDOWS
+#if defined(LINUX)
+  status = linux_write(m_handle, m_bar, bar_addr, data);
+#endif // LINUX
+
+  ACL_PCIE_ERROR_IF(status != FPGA_OK,
+                    return -1,
+                    "[%s] Writing 16 bits to 0x" SIZE_FMT_X " (0x" SIZE_FMT_X " with offset)\n",
+                    m_name,
+                    addr,
+                    (size_t)bar_addr);
+  ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_PCIE,
+                             ":::::: [%s] Wrote 16 bits (0x%x) to 0x" SIZE_FMT_X " (0x" SIZE_FMT_X " with offset)\n",
+                             m_name,
+                             data,
+                             addr,
+                             (size_t)bar_addr);
+
+  return 0; // success
+}
+
+// Read a 32-bit value from addr (offset relative to this window).
+// Returns 0 on success, -1 on error.
+int ACL_PCIE_MM_IO_DEVICE::read32(size_t addr, UINT32 *data) {
+  fpga_result status;
+  KPTR bar_addr = convert_to_bar_addr(addr);
+#if defined(WINDOWS)
+  status = fpgaReadMMIO32(m_handle, m_bar, bar_addr, data);
+#endif // WINDOWS
+#if defined(LINUX)
+  status = linux_read(m_handle, m_bar, bar_addr, data);
+#endif // LINUX
+
+  ACL_PCIE_ERROR_IF(status != FPGA_OK,
+                    return -1,
+                    "[%s] Read 32 bits from 0x" SIZE_FMT_X " (0x" SIZE_FMT_X " with offset)\n",
+                    m_name,
+                    addr,
+                    (size_t)bar_addr);
+  ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_PCIE,
+                             ":::::: [%s] Read 32 bits (0x%x) from 0x" SIZE_FMT_X " (0x" SIZE_FMT_X " with offset)\n",
+                             m_name,
+                             *data,
+                             addr,
+                             (size_t)bar_addr);
+
+  return 0; // success
+}
+
+// Write a 32-bit value to addr (offset relative to this window).
+// Returns 0 on success, -1 on error.
+int ACL_PCIE_MM_IO_DEVICE::write32(size_t addr, UINT32 data) {
+  fpga_result status;
+  KPTR bar_addr = convert_to_bar_addr(addr);
+#if defined(WINDOWS)
+  status = fpgaWriteMMIO32(m_handle, m_bar, bar_addr, data);
+#endif // WINDOWS
+#if defined(LINUX)
+  status = linux_write(m_handle, m_bar, bar_addr, data);
+#endif // LINUX
+
+  ACL_PCIE_ERROR_IF(status != FPGA_OK,
+                    return -1,
+                    "[%s] Writing 32 bits to 0x" SIZE_FMT_X " (0x" SIZE_FMT_X " with offset)\n",
+                    m_name,
+                    addr,
+                    (size_t)bar_addr);
+  ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_PCIE,
+                             ":::::: [%s] Wrote 32 bits (0x%x) to 0x" SIZE_FMT_X " (0x" SIZE_FMT_X " with offset)\n",
+                             m_name,
+                             data,
+                             addr,
+                             (size_t)bar_addr);
+
+  return 0; // success
+}
+
+// Read a 64-bit value from addr (offset relative to this window).
+// Returns 0 on success, -1 on error.
+int ACL_PCIE_MM_IO_DEVICE::read64(size_t addr, UINT64 *data) {
+  fpga_result status;
+  KPTR bar_addr = convert_to_bar_addr(addr);
+#if defined(WINDOWS)
+  // Original code had a 32-bit Read
+  status = fpgaReadMmio(m_handle, m_bar, bar_addr, data, 8);
+
+#endif // WINDOWS
+#if defined(LINUX)
+  status = linux_read(m_handle, m_bar, bar_addr, data);
+#endif // LINUX
+
+  ACL_PCIE_ERROR_IF(status != FPGA_OK,
+                    return -1,
+                    "[%s] Read 64 bits from 0x" SIZE_FMT_X " (0x" SIZE_FMT_X " with offset)\n",
+                    m_name,
+                    addr,
+                    (size_t)bar_addr);
+  ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_PCIE,
+                             ":::::: [%s] Read 64 bits (0x%llx) from 0x" SIZE_FMT_X " (0x" SIZE_FMT_X " with offset)\n",
+                             m_name,
+                             *data,
+                             addr,
+                             (size_t)bar_addr);
+
+  return 0; // success
+}
+
+// Write a 64-bit value to addr (offset relative to this window).
+// Returns 0 on success, -1 on error.
+int ACL_PCIE_MM_IO_DEVICE::write64(size_t addr, UINT64 data) {
+  fpga_result status;
+  KPTR bar_addr = convert_to_bar_addr(addr);
+#if defined(WINDOWS)
+  // Original code had a 32-bit Write
+  status = fpgaWriteMmio(m_handle, m_bar, bar_addr, (void *)&data, 8);
+
+#endif // WINDOWS
+#if defined(LINUX)
+  status = linux_write(m_handle, m_bar, bar_addr, data);
+#endif // LINUX
+
+  // Fixed: pass 'addr' (not 'bar_addr') as the first formatted value so the
+  // message matches every sibling read*/write* routine and the varargs type
+  // agrees with the SIZE_FMT_X specifier.
+  ACL_PCIE_ERROR_IF(status != FPGA_OK,
+                    return -1,
+                    "[%s] Writing 64 bits to 0x" SIZE_FMT_X " (0x" SIZE_FMT_X " with offset)\n",
+                    m_name,
+                    addr,
+                    (size_t)bar_addr);
+  ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_PCIE,
+                             ":::::: [%s] Wrote 64 bits (0x%llx) to 0x" SIZE_FMT_X " (0x" SIZE_FMT_X " with offset)\n",
+                             m_name,
+                             data,
+                             addr,
+                             (size_t)bar_addr);
+
+  return 0; // success
+}
+
+// Write 'size' bytes from src to addr (offset relative to this window).
+// Returns 0 on success, -1 on error.
+int ACL_PCIE_MM_IO_DEVICE::write_block(size_t addr, size_t size, void *src) {
+  fpga_result status;
+  KPTR bar_addr = convert_to_bar_addr(addr);
+
+  ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_PCIE,
+                             ":::::: [%s] Writing block (" SIZE_FMT_U " bytes) to 0x" SIZE_FMT_X " (0x" SIZE_FMT_X
+                             " with offset)\n",
+                             m_name,
+                             size,
+                             addr,
+                             (size_t)bar_addr);
+
+#if defined(WINDOWS)
+  // Windows MMIO writes in 32-bit units: transfer the 4-byte-aligned prefix
+  // first, then the 1-3 byte remainder separately.
+  DWORD FP_size = static_cast<DWORD>(size);
+  size_t alignment_size = size % 4;
+  DWORD FP_alignment_size = static_cast<DWORD>(alignment_size);
+  // 32-bit MMIO Write
+  status = fpgaWriteMmio(m_handle, m_bar, bar_addr, src, FP_size - FP_alignment_size);
+  if (alignment_size) {
+    void *alignment_addr = compute_address(src, size - alignment_size);
+    KPTR alignment_bar_addr = bar_addr + size - alignment_size;
+    status = fpgaWriteMmio(m_handle, m_bar, alignment_bar_addr, alignment_addr, FP_alignment_size);
+  }
+
+#endif // WINDOWS
+#if defined(LINUX)
+  // Can't use templated linux_write here because *src doesn't give you the size to read.
+  struct acl_cmd driver_cmd {};
+  driver_cmd.bar_id = m_bar;
+  driver_cmd.device_addr = reinterpret_cast<void *>(bar_addr);
+  driver_cmd.user_addr = src;
+  driver_cmd.size = size;
+  // Notify the driver if the host and device's memory have different endianess.
+  driver_cmd.is_diff_endian = m_diff_endian ? 1 : 0;
+  status = write(m_handle, &driver_cmd, sizeof(driver_cmd));
+#endif // LINUX
+
+  ACL_PCIE_ERROR_IF(status != FPGA_OK,
+                    return -1,
+                    "[%s] Writing block (" SIZE_FMT_U " bytes) to 0x" SIZE_FMT_X " (0x" SIZE_FMT_X " with offset)\n",
+                    m_name,
+                    size,
+                    addr,
+                    (size_t)bar_addr);
+  return 0; // success
+}
+
+// Return 'base' advanced by 'offset' bytes, as an untyped pointer.
+inline void *ACL_PCIE_MM_IO_DEVICE::compute_address(void *base, uintptr_t offset) {
+  char *byte_base = static_cast<char *>(base);
+  return static_cast<void *>(byte_base + offset);
+}
+
+// Read 'size' bytes from addr (offset relative to this window) into dst.
+// Returns 0 on success, -1 on error.
+int ACL_PCIE_MM_IO_DEVICE::read_block(size_t addr, size_t size, void *dst) {
+  DWORD status;
+  KPTR bar_addr = convert_to_bar_addr(addr);
+
+  ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_PCIE,
+                             ":::::: [%s] Reading block (" SIZE_FMT_U " bytes) from 0x" SIZE_FMT_X " (0x" SIZE_FMT_X
+                             " with offset)\n",
+                             m_name,
+                             size,
+                             addr,
+                             (size_t)bar_addr);
+
+#if defined(WINDOWS)
+  // Windows MMIO reads in 32-bit units: transfer the 4-byte-aligned prefix
+  // first, then the 1-3 byte remainder separately.
+  DWORD FP_size = static_cast<DWORD>(size);
+  size_t alignment_size = size % 4;
+  DWORD FP_alignment_size = static_cast<DWORD>(alignment_size);
+  // 32-bit MMIO Read
+  status = fpgaReadMmio(m_handle, m_bar, bar_addr, dst, FP_size - FP_alignment_size);
+  if (alignment_size) {
+    void *alignment_addr = compute_address(dst, size - alignment_size);
+    KPTR alignment_bar_addr = bar_addr + size - alignment_size;
+    status |= fpgaReadMmio(m_handle, m_bar, alignment_bar_addr, alignment_addr, FP_alignment_size);
+  }
+
+#endif // WINDOWS
+#if defined(LINUX)
+  // Can't use templated linux_read here because *dst doesn't give you the size to read.
+  struct acl_cmd driver_cmd;
+  driver_cmd.bar_id = m_bar;
+  driver_cmd.device_addr = reinterpret_cast<void *>(bar_addr);
+  driver_cmd.user_addr = dst;
+  driver_cmd.size = size;
+  // Notify the driver if the host and device's memory have different endianess.
+  driver_cmd.is_diff_endian = m_diff_endian ? 1 : 0;
+  status = read(m_handle, &driver_cmd, sizeof(driver_cmd));
+#endif // LINUX
+
+  ACL_PCIE_ERROR_IF(status != FPGA_OK,
+                    return -1,
+                    "[%s] Reading block (" SIZE_FMT_U " bytes) to 0x" SIZE_FMT_X " (0x" SIZE_FMT_X " with offset)\n",
+                    m_name,
+                    size,
+                    addr,
+                    (size_t)bar_addr);
+  return 0; // success
+}
+
+// Create one ACL_PCIE_MM_IO_DEVICE wrapper per memory-mapped interface
+// exposed by the board.  Every member pointer -- including dma, which the
+// original omitted from the init list -- is NULL-initialized first, so the
+// destructor can safely delete whatever was actually constructed.
+ACL_PCIE_MM_IO_MGR::ACL_PCIE_MM_IO_MGR(fpga_handle handle)
+    : mem(NULL),
+      pcie_cra(NULL),
+      dma(NULL),
+      window(NULL),
+      version(NULL),
+      pr_base_id(NULL),
+      pr_region_ctrl(NULL),
+      quartus_ver(NULL),
+      cade_id(NULL),
+      uniphy_status(NULL),
+      uniphy_reset(NULL),
+      kernel_if(NULL),
+      pll(NULL),
+      temp_sensor(NULL),
+      hostch_ver(NULL) {
+  ACL_PCIE_ASSERT(handle != INVALID_HANDLE_VALUE, "passed in an invalid device when creating mm_io_mgr.\n");
+
+  // This is the PCIe's interface for directly accessing memory (which is
+  // significantly slower than using DMA). This view of memory is segmented
+  // so that the size of this address space can be smaller than the amount of
+  // physical device memory. The window interface controls which region of
+  // physical memory this interface currently maps to.
+  // The last constructor argument indicates whether the host and the device
+  // have different endianness.
+#ifdef ACL_BIG_ENDIAN
+  mem = new ACL_PCIE_MM_IO_DEVICE(handle, ACL_PCI_GLOBAL_MEM_BAR, (KPTR)ACL_PCIE_MEMWINDOW_BASE, "GLOBAL-MEM", true);
+#else
+  mem = new ACL_PCIE_MM_IO_DEVICE(handle, ACL_PCI_GLOBAL_MEM_BAR, (KPTR)ACL_PCIE_MEMWINDOW_BASE, "GLOBAL-MEM", false);
+#endif
+
+  // This is the CRA port of our PCIe controller. Used for configuring
+  // interrupts and things like that.
+  pcie_cra = new ACL_PCIE_MM_IO_DEVICE(handle, ACL_PCI_CRA_BAR, ACL_PCI_CRA_OFFSET, "PCIE-CRA");
+
+  // This interface sets the high order address bits for the PCIe's direct
+  // memory accesses via "mem" (above).
+  window = new ACL_PCIE_MM_IO_DEVICE(handle, ACL_PCIE_MEMWINDOW_BAR, ACL_PCIE_MEMWINDOW_CRA, "MEMWINDOW");
+
+  // DMA interfaces
+  dma = new ACL_PCIE_MM_IO_DEVICE(handle, ACL_PCIE_DMA_INTERNAL_BAR, ACL_PCIE_DMA_INTERNAL_CTR_BASE, "DMA-CTR");
+
+  // Version ID check
+  version = new ACL_PCIE_MM_IO_DEVICE(handle, ACL_VERSIONID_BAR, ACL_VERSIONID_OFFSET, "VERSION");
+
+  // PR base ID check
+  pr_base_id = new ACL_PCIE_MM_IO_DEVICE(handle, ACL_PRBASEID_BAR, ACL_PRBASEID_OFFSET, "PRBASEID");
+
+  // PR region controller
+  pr_region_ctrl = new ACL_PCIE_MM_IO_DEVICE(handle, ACL_PRREGIONFREEZE_BAR, ACL_PRREGIONFREEZE_OFFSET, "PRREGIONCTRL");
+
+  // Quartus Version
+  quartus_ver = new ACL_PCIE_MM_IO_DEVICE(handle, ACL_QUARTUSVER_BAR, ACL_QUARTUSVER_OFFSET, "QUARTUS-VERSION");
+
+  // Host channel version
+  hostch_ver = new ACL_PCIE_MM_IO_DEVICE(handle, ACL_HOSTCH_VERSION_BAR, ACL_HOSTCH_VERSION_OFFSET, "HOSTCH-VERSION");
+
+  // Cable auto detect ID
+  cade_id = new ACL_PCIE_MM_IO_DEVICE(handle, ACL_CADEID_BAR, ACL_CADEID_OFFSET, "CADEID");
+
+  // Uniphy Status
+  uniphy_status = new ACL_PCIE_MM_IO_DEVICE(handle, ACL_UNIPHYSTATUS_BAR, ACL_UNIPHYSTATUS_OFFSET, "UNIPHYSTATUS");
+
+  // Uniphy Reset
+  uniphy_reset = new ACL_PCIE_MM_IO_DEVICE(handle, ACL_UNIPHYRESET_BAR, ACL_UNIPHYRESET_OFFSET, "UNIPHYRESET");
+
+  // Kernel interface
+  // The DLA BSP eliminates the kernel interface present in the original PR Terasic BSP
+  // We reuse the kernel_if object here to simplify the DLA-specific changes required
+#ifdef DLA_MMD
+  kernel_if = new ACL_PCIE_MM_IO_DEVICE(handle, ACL_KERNEL_CSR_BAR, ACL_DLA_CSR_OFFSET, "KERNEL");
+#else
+  kernel_if = new ACL_PCIE_MM_IO_DEVICE(handle, ACL_KERNEL_CSR_BAR, ACL_KERNEL_CSR_OFFSET, "KERNEL");
+#endif // DLA_MMD
+
+  // PLL interface
+  pll = new ACL_PCIE_MM_IO_DEVICE(handle, ACL_PCIE_KERNELPLL_RECONFIG_BAR, ACL_PCIE_KERNELPLL_RECONFIG_OFFSET, "PLL");
+
+  // temperature sensor (stays NULL when the board has none)
+#ifdef ACL_PCIE_HAS_TEMP_SENSOR
+  temp_sensor = new ACL_PCIE_MM_IO_DEVICE(handle, ACL_VERSIONID_BAR, ACL_PCIE_TEMP_SENSOR_ADDRESS, "TEMP-SENSOR");
+#endif
+}
+
+// Release every memory-mapped IO device owned by this manager.
+// Deleting a NULL pointer is a well-defined no-op in C++, so no guards are
+// needed; each pointer is still reset to NULL afterwards.
+ACL_PCIE_MM_IO_MGR::~ACL_PCIE_MM_IO_MGR() {
+  delete mem;
+  mem = NULL;
+  delete pcie_cra;
+  pcie_cra = NULL;
+  delete window;
+  window = NULL;
+  delete version;
+  version = NULL;
+  delete pr_base_id;
+  pr_base_id = NULL;
+  delete pr_region_ctrl;
+  pr_region_ctrl = NULL;
+  delete quartus_ver;
+  quartus_ver = NULL;
+  delete cade_id;
+  cade_id = NULL;
+  delete uniphy_status;
+  uniphy_status = NULL;
+  delete uniphy_reset;
+  uniphy_reset = NULL;
+  delete kernel_if;
+  kernel_if = NULL;
+  delete pll;
+  pll = NULL;
+  delete temp_sensor;
+  temp_sensor = NULL;
+  delete hostch_ver;
+  hostch_ver = NULL;
+  delete dma;
+  dma = NULL;
+}
diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_mm_io.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_mm_io.h
new file mode 100644
index 0000000..4db5599
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_mm_io.h
@@ -0,0 +1,109 @@
+#ifndef ACL_PCIE_MM_IO_H
+#define ACL_PCIE_MM_IO_H
+
+/* (c) 1992-2021 Intel Corporation. */
+/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */
+/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */
+/* and/or other countries. Other marks and brands may be claimed as the property */
+/* of others. See Trademarks on intel.com for full list of Intel trademarks or */
+/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */
+/* Your use of Intel Corporation's design tools, logic functions and other */
+/* software and tools, and its AMPP partner logic functions, and any output */
+/* files any of the foregoing (including device programming or simulation */
+/* files), and any associated documentation or information are expressly subject */
+/* to the terms and conditions of the Altera Program License Subscription */
+/* Agreement, Intel MegaCore Function License Agreement, or other applicable */
+/* license agreement, including, without limitation, that your use is for the */
+/* sole purpose of programming logic devices manufactured by Intel and sold by */
+/* Intel or its authorized distributors. Please refer to the applicable */
+/* agreement for further details. */
+
+/* ===- acl_pcie_mm_io.h --------------------------------------------- C++ -*-=== */
+/* */
+/* Intel(R) OpenCL MMD Driver */
+/* */
+/* ===-------------------------------------------------------------------------=== */
+/* */
+/* This file declares the class to handle memory mapped IO over PCIe. */
+/* The actual implementation of the class lives in the acl_pcie_mm_io.cpp, */
+/* */
+/* ===-------------------------------------------------------------------------=== */
+
+#if defined(LINUX)
+typedef int fpga_handle;
+#define FPGA_OK 0
+#endif // LINUX
+
+#ifdef DLA_MMD
+#include "acl_pcie.h"
+#define ACL_DLA_CSR_OFFSET 0x0000
+#endif
+/*
+ * Wrapper for one memory-mapped IO interface of the PCIe device: a BAR id
+ * plus a fixed offset within that BAR.  Offers fixed-width (8/16/32/64-bit)
+ * and block read/write helpers; all return 0 on success, negative on error.
+ */
+class ACL_PCIE_MM_IO_DEVICE {
+ public:
+ // diff_endian marks that the host and this device disagree on byte order,
+ // so transfers need byte swapping.
+ ACL_PCIE_MM_IO_DEVICE(fpga_handle handle, DWORD bar, KPTR device_offset, const char *name, bool diff_endian = false);
+ ~ACL_PCIE_MM_IO_DEVICE();
+
+ // BAR this device lives in.
+ DWORD bar_id() { return m_bar; };
+ // Translate a device-relative address to an absolute address within the BAR.
+ KPTR convert_to_bar_addr(size_t addr) { return addr + m_offset; };
+
+ // read/write functions to the memory-mapped io device
+ // return 0 on success, negative on error
+ int read8(size_t addr, UINT8 *data);
+ int write8(size_t addr, UINT8 data);
+ int read16(size_t addr, UINT16 *data);
+ int write16(size_t addr, UINT16 data);
+ int read32(size_t addr, UINT32 *data);
+ int write32(size_t addr, UINT32 data);
+ int read64(size_t addr, UINT64 *data);
+ int write64(size_t addr, UINT64 data);
+
+ int read_block(size_t addr, size_t size, void *dst);
+ int write_block(size_t addr, size_t size, void *src);
+
+ private:
+ static const int MAX_NAME_LENGTH = 32;
+
+ // Helper functions
+ // NOTE(review): declared inline here but presumably defined in the .cpp --
+ // confirm it is only called from that translation unit.
+ inline void *compute_address(void *base, uintptr_t offset);
+
+ char m_name[MAX_NAME_LENGTH];  // human-readable name used in log messages
+ fpga_handle m_handle;
+ DWORD m_bar;
+ KPTR m_offset;
+ bool m_diff_endian; // indicates if the host and this device have different endianness
+};
+
+/*
+ * Utility functions to clean up the various address translations for reads/writes
+ */
+class ACL_PCIE_MM_IO_MGR {
+ private:
+ // Copying a manager would double-delete the owned devices, so copy
+ // construction/assignment are private and unusable (pre-C++11 idiom).
+ ACL_PCIE_MM_IO_MGR &operator=(const ACL_PCIE_MM_IO_MGR &) { return *this; }
+
+ ACL_PCIE_MM_IO_MGR(const ACL_PCIE_MM_IO_MGR &src) {}
+
+ public:
+ // The constructor creates one ACL_PCIE_MM_IO_DEVICE per board interface;
+ // the destructor deletes them all (this class owns every pointer below).
+ ACL_PCIE_MM_IO_MGR(fpga_handle handle);
+ ~ACL_PCIE_MM_IO_MGR();
+
+ ACL_PCIE_MM_IO_DEVICE *mem;             // windowed direct access to global memory
+ ACL_PCIE_MM_IO_DEVICE *pcie_cra;        // PCIe controller CRA port (interrupt config, etc.)
+ ACL_PCIE_MM_IO_DEVICE *dma;             // DMA controller
+ ACL_PCIE_MM_IO_DEVICE *window;          // high-order address bits for 'mem'
+ ACL_PCIE_MM_IO_DEVICE *version;         // version ID register
+ ACL_PCIE_MM_IO_DEVICE *pr_base_id;      // PR base ID register
+ ACL_PCIE_MM_IO_DEVICE *pr_region_ctrl;  // PR region freeze controller
+ ACL_PCIE_MM_IO_DEVICE *quartus_ver;     // Quartus version register
+ ACL_PCIE_MM_IO_DEVICE *cade_id;         // cable auto-detect ID
+ ACL_PCIE_MM_IO_DEVICE *uniphy_status;
+ ACL_PCIE_MM_IO_DEVICE *uniphy_reset;
+ ACL_PCIE_MM_IO_DEVICE *kernel_if;       // kernel (or DLA CSR) interface
+ ACL_PCIE_MM_IO_DEVICE *pll;             // kernel PLL reconfig interface
+ ACL_PCIE_MM_IO_DEVICE *temp_sensor;     // NULL unless ACL_PCIE_HAS_TEMP_SENSOR
+ ACL_PCIE_MM_IO_DEVICE *hostch_ver;      // host channel version register
+};
+
+#endif // ACL_PCIE_MM_IO_H
diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_timer.cpp b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_timer.cpp
new file mode 100644
index 0000000..855d6ba
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_timer.cpp
@@ -0,0 +1,67 @@
+// (c) 1992-2021 Intel Corporation.
+// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words
+// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S.
+// and/or other countries. Other marks and brands may be claimed as the property
+// of others. See Trademarks on intel.com for full list of Intel trademarks or
+// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera)
+// Your use of Intel Corporation's design tools, logic functions and other
+// software and tools, and its AMPP partner logic functions, and any output
+// files any of the foregoing (including device programming or simulation
+// files), and any associated documentation or information are expressly subject
+// to the terms and conditions of the Altera Program License Subscription
+// Agreement, Intel MegaCore Function License Agreement, or other applicable
+// license agreement, including, without limitation, that your use is for the
+// sole purpose of programming logic devices manufactured by Intel and sold by
+// Intel or its authorized distributors. Please refer to the applicable
+// agreement for further details.
+
+/* ===- acl_pcie_timer.cpp ------------------------------------------- C++ -*-=== */
+/* */
+/* Intel(R) OpenCL MMD Driver */
+/* */
+/* ===-------------------------------------------------------------------------=== */
+/* */
+/* This file implements the class to query the host's system timer. */
+/* The declaration of the class lives in the acl_pcie_timer.h */
+/* */
+/* ===-------------------------------------------------------------------------=== */
+
+// common and its own header files
+#include "acl_pcie_timer.h"
+#include "acl_pcie.h"
+
+// other standard header files
+#include <fstream>
+
+// Constructor: on Windows, cache the QueryPerformanceCounter frequency used
+// to convert ticks to seconds in get_time_ns().  On Linux the member stays 0
+// and is unused, since clock_gettime reports nanoseconds directly.
+ACL_PCIE_TIMER::ACL_PCIE_TIMER() : m_ticks_per_second(0) {
+#if defined(WINDOWS)
+ // Cache the performance counter frequency
+ LARGE_INTEGER li;
+ QueryPerformanceFrequency(&li);
+ m_ticks_per_second = li.QuadPart;
+
+ ACL_PCIE_ASSERT(m_ticks_per_second != 0, "m_ticks_per_second == 0!\n");
+#endif // WINDOWS
+}
+
+// Destructor: nothing to release.
+ACL_PCIE_TIMER::~ACL_PCIE_TIMER() {}
+
+// Return the current wall-clock time in nanoseconds.
+// Windows: QueryPerformanceCounter ticks scaled by the cached frequency,
+// rounded to the nearest nanosecond (the + 0.5 before truncation).
+// Linux: clock_gettime(CLOCK_REALTIME), seconds and nanoseconds combined.
+// NOTE(review): CLOCK_REALTIME can jump when the system clock is adjusted;
+// confirm callers do not require a monotonic timebase.
+cl_ulong ACL_PCIE_TIMER::get_time_ns() {
+#if defined(WINDOWS)
+ const INT64 NS_PER_S = 1000000000;
+ LARGE_INTEGER li;
+
+ QueryPerformanceCounter(&li);
+ INT64 ticks = li.QuadPart;
+ double seconds = ticks / (double)m_ticks_per_second;
+
+ return static_cast<cl_ulong>(seconds * NS_PER_S + 0.5);
+#endif // WINDOWS
+#if defined(LINUX)
+ struct timespec a;
+ const cl_ulong NS_PER_S = 1000000000;
+ clock_gettime(CLOCK_REALTIME, &a);
+
+ // tv_sec is promoted to cl_ulong by the multiplication with NS_PER_S,
+ // so the sum is computed in 64-bit unsigned arithmetic.
+ return static_cast<cl_ulong>(a.tv_nsec) + static_cast<cl_ulong>(a.tv_sec * NS_PER_S);
+#endif // LINUX
+}
diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_timer.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_timer.h
new file mode 100644
index 0000000..646d681
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_timer.h
@@ -0,0 +1,50 @@
+#ifndef ACL_PCIE_TIMER_H
+#define ACL_PCIE_TIMER_H
+
+/* (c) 1992-2021 Intel Corporation. */
+/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */
+/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */
+/* and/or other countries. Other marks and brands may be claimed as the property */
+/* of others. See Trademarks on intel.com for full list of Intel trademarks or */
+/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */
+/* Your use of Intel Corporation's design tools, logic functions and other */
+/* software and tools, and its AMPP partner logic functions, and any output */
+/* files any of the foregoing (including device programming or simulation */
+/* files), and any associated documentation or information are expressly subject */
+/* to the terms and conditions of the Altera Program License Subscription */
+/* Agreement, Intel MegaCore Function License Agreement, or other applicable */
+/* license agreement, including, without limitation, that your use is for the */
+/* sole purpose of programming logic devices manufactured by Intel and sold by */
+/* Intel or its authorized distributors. Please refer to the applicable */
+/* agreement for further details. */
+
+/* ===- acl_pcie_timer.h --------------------------------------------- C++ -*-=== */
+/* */
+/* Intel(R) OpenCL MMD Driver */
+/* */
+/* ===-------------------------------------------------------------------------=== */
+/* */
+/* This file declares the class to query the host's system timer. */
+/* The actual implementation of the class lives in the acl_pcie_timer.cpp */
+/* */
+/* ===-------------------------------------------------------------------------=== */
+
+#ifdef DLA_MMD
+// don't assume opencl has been installed
+#include "acl_pcie.h"
+typedef UINT64 cl_ulong;
+#endif
+
+// Thin wrapper over the host's high-resolution clock
+// (QueryPerformanceCounter on Windows, clock_gettime on Linux).
+class ACL_PCIE_TIMER {
+ public:
+ ACL_PCIE_TIMER();
+ ~ACL_PCIE_TIMER();
+
+ // function to query the host's system timer
+ cl_ulong get_time_ns();
+
+ private:
+ INT64 m_ticks_per_second;  // QPC frequency; set on Windows only, 0 (unused) on Linux
+};
+
+#endif // ACL_PCIE_TIMER_H
diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/version.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/version.h
new file mode 100644
index 0000000..ffecc32
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/version.h
@@ -0,0 +1 @@
+#define ACL_DRIVER_VERSION "20.4.d41d8cd98f00b204e9800998ecf8427e"
diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/aocl_mmd.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/aocl_mmd.h
new file mode 100644
index 0000000..6d5c85e
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/aocl_mmd.h
@@ -0,0 +1,640 @@
+#ifndef AOCL_MMD_H
+#define AOCL_MMD_H
+
+/* (c) 1992-2021 Intel Corporation. */
+/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */
+/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */
+/* and/or other countries. Other marks and brands may be claimed as the property */
+/* of others. See Trademarks on intel.com for full list of Intel trademarks or */
+/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */
+/* Your use of Intel Corporation's design tools, logic functions and other */
+/* software and tools, and its AMPP partner logic functions, and any output */
+/* files any of the foregoing (including device programming or simulation */
+/* files), and any associated documentation or information are expressly subject */
+/* to the terms and conditions of the Altera Program License Subscription */
+/* Agreement, Intel MegaCore Function License Agreement, or other applicable */
+/* license agreement, including, without limitation, that your use is for the */
+/* sole purpose of programming logic devices manufactured by Intel and sold by */
+/* Intel or its authorized distributors. Please refer to the applicable */
+/* agreement for further details. */
+
+#ifdef DLA_MMD
+#include <cstddef> //size_t
+#include <cstdint> //uint32_t
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Support for memory mapped ACL devices.
+ *
+ * Typical API lifecycle, from the perspective of the caller.
+ *
+ * 1. aocl_mmd_open must be called first, to provide a handle for further
+ * operations.
+ *
+ * 2. The interrupt and status handlers must be set.
+ *
+ * 3. Read and write operations are performed.
+ *
+ * 4. aocl_mmd_close may be called to shut down the device. No further
+ * operations are permitted until a subsequent aocl_mmd_open call.
+ *
+ * aocl_mmd_get_offline_info can be called anytime including before
+ * open. aocl_mmd_get_info can be called anytime between open and close.
+ */
+
+#ifndef AOCL_MMD_CALL
+#if defined(_WIN32)
+#define AOCL_MMD_CALL __declspec(dllimport)
+#else
+#define AOCL_MMD_CALL __attribute__((visibility ("default")))
+#endif
+#endif
+
+#ifndef WEAK
+#if defined(_WIN32)
+#define WEAK
+#else
+/* This normally comes with "__attribute__((weak))" but for reasons not presently
+ * understood, the shared library is not properly loaded on Ubuntu18 when the functions
+ * are weak.
+ */
+#define WEAK
+#endif
+#endif
+
+/* The MMD API's version - the runtime expects this string when
+ * AOCL_MMD_VERSION is queried. This changes only if the API has changed */
+#define AOCL_MMD_VERSION_STRING "20.3"
+
+/* Memory types that can be supported - bitfield. Other than physical memory
+ * these types closely align with the OpenCL SVM types.
+ *
+ * AOCL_MMD_PHYSICAL_MEMORY - The vendor interface includes IP to communicate
+ * directly with physical memory such as DDR, QDR, etc.
+ *
+ * AOCL_MMD_SVM_COARSE_GRAIN_BUFFER - The vendor interface includes support for
+ * caching SVM pointer data and requires explicit function calls from the user
+ * to synchronize the cache between the host processor and the FPGA. This level
+ * of SVM is not currently supported by Altera except as a subset of
+ * SVM_FINE_GRAIN_SYSTEM support.
+ *
+ * AOCL_MMD_SVM_FINE_GRAIN_BUFFER - The vendor interface includes support for
+ * caching SVM pointer data and requires additional information from the user
+ * and/or host runtime that can be collected during pointer allocation in order
+ * to synchronize the cache between the host processor and the FPGA. Once this
+ * additional data is provided for an SVM pointer, the vendor interface handles
+ * cache synchronization between the host processor & the FPGA automatically.
+ * This level of SVM is not currently supported by Altera except as a subset
+ * of SVM_FINE_GRAIN_SYSTEM support.
+ *
+ * AOCL_MMD_SVM_FINE_GRAIN_SYSTEM - The vendor interface includes support for
+ * caching SVM pointer data and does not require any additional information to
+ * synchronize the cache between the host processor and the FPGA. The vendor
+ * interface handles cache synchronization between the host processor & the
+ * FPGA automatically for all SVM pointers. This level of SVM support is
+ * currently under development by Altera and some features may not be fully
+ * supported.
+ */
+#define AOCL_MMD_PHYSICAL_MEMORY (1 << 0)
+#define AOCL_MMD_SVM_COARSE_GRAIN_BUFFER (1 << 1)
+#define AOCL_MMD_SVM_FINE_GRAIN_BUFFER (1 << 2)
+#define AOCL_MMD_SVM_FINE_GRAIN_SYSTEM (1 << 3)
+
+/* program modes - bitfield
+ *
+ * AOCL_MMD_PROGRAM_PRESERVE_GLOBAL_MEM - preserve contents of global memory
+ * when this bit is set to 1. If programming can't occur without preserving
+ * global memory contents, the program function must fail, in which case the
+ * runtime may re-invoke program with this bit set to 0, allowing programming
+ * to occur even if doing so destroys global memory contents.
+ *
+ * more modes are reserved for stacking on in the future
+ */
+#define AOCL_MMD_PROGRAM_PRESERVE_GLOBAL_MEM (1 << 0)
+typedef int aocl_mmd_program_mode_t;
+
+typedef void* aocl_mmd_op_t;
+
+typedef struct {
+ unsigned lo; /* 32 least significant bits of time value. */
+ unsigned hi; /* 32 most significant bits of time value. */
+} aocl_mmd_timestamp_t;
+
+/* Defines the set of characteristics that can be probed about the board before
+ * opening a device. The type of data returned by each is specified in
+ * parentheses in the adjacent comment.
+ *
+ * AOCL_MMD_NUM_BOARDS and AOCL_MMD_BOARD_NAMES
+ * These two fields can be used to implement multi-device support. The MMD
+ * layer may have a list of devices it is capable of interacting with, each
+ * identified with a unique name. The length of the list should be returned
+ * in AOCL_MMD_NUM_BOARDS, and the names of these devices returned in
+ * AOCL_MMD_BOARD_NAMES. The OpenCL runtime will try to call aocl_mmd_open
+ * for each board name returned in AOCL_MMD_BOARD_NAMES.
+ */
+typedef enum {
+ AOCL_MMD_VERSION = 0, /* Version of MMD (char*)*/
+ AOCL_MMD_NUM_BOARDS = 1, /* Number of candidate boards (int)*/
+ AOCL_MMD_BOARD_NAMES = 2, /* Names of boards available delimiter=; (char*)*/
+ AOCL_MMD_VENDOR_NAME = 3, /* Name of vendor (char*) */
+ AOCL_MMD_VENDOR_ID = 4, /* An integer ID for the vendor (int) */
+ AOCL_MMD_USES_YIELD = 5, /* 1 if yield must be called to poll hw (int) */
+ /* The following can be combined in a bit field:
+ * AOCL_MMD_PHYSICAL_MEMORY, AOCL_MMD_SVM_COARSE_GRAIN_BUFFER, AOCL_MMD_SVM_FINE_GRAIN_BUFFER,
+ * AOCL_MMD_SVM_FINE_GRAIN_SYSTEM. Prior to 14.1, all existing devices supported physical memory and no types of SVM
+ * memory, so this is the default when this operation returns '0' for board MMDs with a version prior to 14.1
+ */
+ AOCL_MMD_MEM_TYPES_SUPPORTED = 6,
+} aocl_mmd_offline_info_t;
+
+/** Possible capabilities to return from AOCL_MMD_*_MEM_CAPABILITIES query */
+/**
+ * If not set allocation function is not supported, even if other capabilities are set.
+ */
+#define AOCL_MMD_MEM_CAPABILITY_SUPPORTED (1 << 0)
+/**
+ * Supports atomic access to the memory by either the host or device.
+ */
+#define AOCL_MMD_MEM_CAPABILITY_ATOMIC (1 << 1)
+/**
+ * Supports concurrent access to the memory either by host or device if the
+ * accesses are not on the same block. Block granularity is defined by
+ * AOCL_MMD_*_MEM_CONCURRENT_GRANULARITY; blocks are aligned to this
+ * granularity
+ */
+#define AOCL_MMD_MEM_CAPABILITY_CONCURRENT (1 << 2)
+/**
+ * Memory can be accessed by multiple devices at the same time.
+ */
+#define AOCL_MMD_MEM_CAPABILITY_P2P (1 << 3)
+
+/* Defines the set of characteristics that can be probed about the board after
+ * opening a device. This can involve communication to the device
+ *
+ * AOCL_MMD_NUM_KERNEL_INTERFACES - The number of kernel interfaces, usually 1
+ *
+ * AOCL_MMD_KERNEL_INTERFACES - the handle for each kernel interface.
+ * param_value will have size AOCL_MMD_NUM_KERNEL_INTERFACES * sizeof int
+ *
+ * AOCL_MMD_PLL_INTERFACES - the handle for each pll associated with each
+ * kernel interface. If a kernel interface is not clocked by acl_kernel_clk
+ * then return -1
+ *
+ * */
+typedef enum {
+ AOCL_MMD_NUM_KERNEL_INTERFACES = 1, /* Number of Kernel interfaces (int) */
+ AOCL_MMD_KERNEL_INTERFACES = 2, /* Kernel interface (int*) */
+ AOCL_MMD_PLL_INTERFACES = 3, /* Kernel clk handles (int*) */
+ AOCL_MMD_MEMORY_INTERFACE = 4, /* Global memory handle (int) */
+ AOCL_MMD_TEMPERATURE = 5, /* Temperature measurement (float) */
+ AOCL_MMD_PCIE_INFO = 6, /* PCIe information (char*) */
+ AOCL_MMD_BOARD_NAME = 7, /* Name of board (char*) */
+ AOCL_MMD_BOARD_UNIQUE_ID = 8, /* Unique ID of board (int) */
+ AOCL_MMD_CONCURRENT_READS = 9, /* # of parallel reads; 1 is serial*/
+ AOCL_MMD_CONCURRENT_WRITES = 10, /* # of parallel writes; 1 is serial*/
+ AOCL_MMD_CONCURRENT_READS_OR_WRITES = 11, /* total # of concurrent operations read + writes*/
+ AOCL_MMD_MIN_HOST_MEMORY_ALIGNMENT = 12, /* Min alignment that the BSP supports for host allocations (size_t) */
+ AOCL_MMD_HOST_MEM_CAPABILITIES = 13, /* Capabilities of aocl_mmd_host_alloc() (unsigned int)*/
+ AOCL_MMD_SHARED_MEM_CAPABILITIES = 14, /* Capabilities of aocl_mmd_shared_alloc (unsigned int)*/
+ AOCL_MMD_DEVICE_MEM_CAPABILITIES = 15, /* Capabilities of aocl_mmd_device_alloc (unsigned int)*/
+ AOCL_MMD_HOST_MEM_CONCURRENT_GRANULARITY = 16, /*(size_t)*/
+ AOCL_MMD_SHARED_MEM_CONCURRENT_GRANULARITY = 17, /*(size_t)*/
+ AOCL_MMD_DEVICE_MEM_CONCURRENT_GRANULARITY = 18, /*(size_t)*/
+} aocl_mmd_info_t;
+
+typedef struct {
+ unsigned long long int exception_type;
+ void* user_private_info;
+ size_t user_cb;
+} aocl_mmd_interrupt_info;
+
+typedef void (*aocl_mmd_interrupt_handler_fn)(int handle, void* user_data);
+typedef void (*aocl_mmd_device_interrupt_handler_fn)(int handle, aocl_mmd_interrupt_info* data_in, void* user_data);
+typedef void (*aocl_mmd_status_handler_fn)(int handle, void* user_data, aocl_mmd_op_t op, int status);
+
+/* Get information about the board using the enum aocl_mmd_offline_info_t for
+ * offline info (called without a handle), and the enum aocl_mmd_info_t for
+ * info specific to a certain board.
+ * Arguments:
+ *
+ * requested_info_id - a value from the aocl_mmd_offline_info_t enum
+ *
+ * param_value_size - size of the param_value field in bytes. This should
+ * match the size of the return type expected as indicated in the enum
+ * definition. For example, the AOCL_MMD_TEMPERATURE returns a float, so
+ * the param_value_size should be set to sizeof(float) and you should
+ * expect the same number of bytes returned in param_size_ret.
+ *
+ * param_value - pointer to the variable that will receive the returned info
+ *
+ * param_size_ret - receives the number of bytes of data actually returned
+ *
+ * Returns: a negative value to indicate error.
+ */
+AOCL_MMD_CALL int aocl_mmd_get_offline_info(aocl_mmd_offline_info_t requested_info_id,
+ size_t param_value_size,
+ void* param_value,
+ size_t* param_size_ret) WEAK;
+
+AOCL_MMD_CALL int aocl_mmd_get_info(int handle,
+ aocl_mmd_info_t requested_info_id,
+ size_t param_value_size,
+ void* param_value,
+ size_t* param_size_ret) WEAK;
+
+/* Open and initialize the named device.
+ *
+ * The name is typically one specified by the AOCL_MMD_BOARD_NAMES offline
+ * info.
+ *
+ * Arguments:
+ * name - open the board with this name (provided as a C-style string,
+ * i.e. NUL terminated ASCII.)
+ *
+ * Returns: the non-negative integer handle for the board, otherwise a
+ * negative value to indicate error. Upon receiving the error, the OpenCL
+ * runtime will proceed to open other known devices, hence the MMD mustn't
+ * exit the application if an open call fails.
+ */
+AOCL_MMD_CALL int aocl_mmd_open(const char* name) WEAK;
+
+/* Close an opened device, by its handle.
+ * Returns: 0 on success, negative values on error.
+ */
+AOCL_MMD_CALL int aocl_mmd_close(int handle) WEAK;
+
+/* Set the interrupt handler for the opened device.
+ * The interrupt handler is called whenever the client needs to be notified
+ * of an asynchronous event signaled by the device internals.
+ * For example, the kernel has completed or is stalled.
+ *
+ * Important: Interrupts from the kernel must be ignored until this handler is
+ * set
+ *
+ * Arguments:
+ * fn - the callback function to invoke when a kernel interrupt occurs
+ * user_data - the data that should be passed to fn when it is called.
+ *
+ * Returns: 0 if successful, negative on error
+ */
+AOCL_MMD_CALL int aocl_mmd_set_interrupt_handler(int handle, aocl_mmd_interrupt_handler_fn fn, void* user_data) WEAK;
+
+/* Set the device interrupt handler for the opened device.
+ * The device interrupt handler is called whenever the client needs to be notified
+ * of a device event signaled by the device internals.
+ * For example, an ECC error has been reported.
+ *
+ * Important: Interrupts from the device must be ignored until this handler is
+ * set
+ *
+ * Arguments:
+ * fn - the callback function to invoke when a device interrupt occurs
+ * user_data - the data that should be passed to fn when it is called.
+ *
+ * Returns: 0 if successful, negative on error
+ */
+AOCL_MMD_CALL int aocl_mmd_set_device_interrupt_handler(int handle,
+ aocl_mmd_device_interrupt_handler_fn fn,
+ void* user_data) WEAK;
+
+/* Set the operation status handler for the opened device.
+ * The operation status handler is called with
+ * status 0 when the operation has completed successfully.
+ * status negative when the operation completed with errors.
+ *
+ * Arguments:
+ * fn - the callback function to invoke when a status update is to be
+ * performed.
+ * user_data - the data that should be passed to fn when it is called.
+ *
+ * Returns: 0 if successful, negative on error
+ */
+AOCL_MMD_CALL int aocl_mmd_set_status_handler(int handle, aocl_mmd_status_handler_fn fn, void* user_data) WEAK;
+
+/* If AOCL_MMD_USES_YIELD is 1, this function is called when the host is idle
+ * and hence possibly waiting for events to be processed by the device.
+ * If AOCL_MMD_USES_YIELD is 0, this function is never called and the MMD is
+ * assumed to provide status/event updates via some other execution thread
+ * such as through an interrupt handler.
+ *
+ * Returns: non-zero if the yield function performed useful work such as
+ * processing DMA transactions, 0 if there is no useful work to be performed
+ *
+ * NOTE: yield may be called continuously as long as it reports that it has useful work
+ */
+AOCL_MMD_CALL int aocl_mmd_yield(int handle) WEAK;
+
+/* Read, write and copy operations on a single interface.
+ * If op is NULL
+ * - Then these calls must block until the operation is complete.
+ * - The status handler is not called for this operation.
+ *
+ * If op is non-NULL, then:
+ * - These may be non-blocking calls
+ * - The status handler must be called upon completion, with status 0
+ * for success, and a negative value for failure.
+ *
+ * Arguments:
+ * op - the operation object used to track this operations progress
+ *
+ * len - the size in bytes to transfer
+ *
+ * src - the host buffer being read from
+ *
+ * dst - the host buffer being written to
+ *
+ * mmd_interface - the handle to the interface being accessed. E.g. To
+ * access global memory this handle will be whatever is returned by
+ * aocl_mmd_get_info when called with AOCL_MMD_MEMORY_INTERFACE.
+ *
+ * offset/src_offset/dst_offset - the byte offset within the interface that
+ * the transfer will begin at.
+ *
+ * The return value is 0 if the operation launch was successful, and
+ * negative otherwise.
+ */
+AOCL_MMD_CALL int aocl_mmd_read(
+ int handle, aocl_mmd_op_t op, size_t len, void* dst, int mmd_interface, size_t offset) WEAK;
+AOCL_MMD_CALL int aocl_mmd_write(
+ int handle, aocl_mmd_op_t op, size_t len, const void* src, int mmd_interface, size_t offset) WEAK;
+AOCL_MMD_CALL int aocl_mmd_copy(
+ int handle, aocl_mmd_op_t op, size_t len, int mmd_interface, size_t src_offset, size_t dst_offset) WEAK;
+
+/* Host Channel create operation
+ * Opens channel between host and kernel.
+ *
+ * Arguments:
+ * channel_name - name of channel to initialize. Same name as used in board_spec.xml
+ *
+ * queue_depth - the size in bytes of pinned memory queue in system memory
+ *
+ * direction - the direction of the channel
+ *
+ * The return value is negative if initialization was unsuccessful, and
+ * positive otherwise. Positive return value is handle to the channel to be used for
+ * subsequent calls for the channel.
+ */
+AOCL_MMD_CALL int aocl_mmd_hostchannel_create(int handle, char* channel_name, size_t queue_depth, int direction) WEAK;
+
+/* Host Channel destroy operation
+ * Closes channel between host and kernel.
+ *
+ * Arguments:
+ * channel - the handle to the channel to close, that was obtained with
+ * create channel
+ *
+ * The return value is 0 if the destroy was successful, and negative
+ * otherwise.
+ */
+AOCL_MMD_CALL int aocl_mmd_hostchannel_destroy(int handle, int channel) WEAK;
+
+/* Host Channel get buffer operation
+ * Provide host with pointer to buffer they can access to write or
+ * read from kernel, along with space or data available in the buffer
+ * in bytes.
+ *
+ * Arguments:
+ * channel - the handle to the channel to get the buffer for
+ *
+ * buffer_size - the address that this call will write the amount of
+ * space or data that's available in the buffer,
+ * depending on direction of the channel, in bytes
+ *
+ * status - the address that this call will write to for result of this
+ * call. Value will be 0 for success, and negative otherwise
+ *
+ * The return value is the pointer to the buffer that host can write
+ * to or read from. NULL if the status is negative.
+ */
+AOCL_MMD_CALL void* aocl_mmd_hostchannel_get_buffer(int handle, int channel, size_t* buffer_size, int* status) WEAK;
+
+/* Host Channel acknowledge buffer operation
+ * Acknowledge to the channel that the user has written or read data from
+ * it. This will make the data or additional buffer space available to
+ * write to or read from kernel.
+ *
+ * Arguments:
+ * channel - the handle to the channel that user is acknowledging
+ *
+ * send_size - the size in bytes that the user is acknowledging
+ *
+ * status - the address that this call will write to for result of this
+ * call. Value will be 0 for success, and negative otherwise
+ *
+ * The return value is equal to send_size if send_size was less than or
+ * equal to the buffer_size from get buffer call. If send_size was
+ * greater, then return value is the amount that was actually sent.
+ */
+AOCL_MMD_CALL size_t aocl_mmd_hostchannel_ack_buffer(int handle, int channel, size_t send_size, int* status) WEAK;
+
+/* Program the device
+ *
+ * The host will guarantee that no operations are currently executing on the
+ * device. That means the kernels will be idle and no read/write/copy
+ * commands are active. Interrupts should be disabled and the FPGA should
+ * be reprogrammed with the data from user_data which has size size. The host
+ * will then call aocl_mmd_set_status_handler and aocl_mmd_set_interrupt_handler
+ * again. At this point interrupts can be enabled.
+ *
+ * The new handle to the board after reprogram does not have to be the same as
+ * the one before.
+ *
+ * Arguments:
+ * user_data - The binary contents of the fpga.bin file created during
+ * Quartus II compilation.
+ * size - the size in bytes of user_data
+ * program_mode - bit field for programming attributes. See
+ * aocl_mmd_program_mode_t definition
+ *
+ * Returns: the new non-negative integer handle for the board, otherwise a
+ * negative value to indicate error.
+ */
+
+#ifdef DLA_MMD
+AOCL_MMD_CALL int aocl_mmd_save_pcie(int handle) WEAK;
+AOCL_MMD_CALL int aocl_mmd_restore_pcie(int handle) WEAK;
+// CoreDLA BSP has removed some stuff that MMD tries to handshake with, so provide a "raw access" function to
+// reprogram the FPGA directly from the sof. Can't call quartus_pgm directly since the MMD still needs to mask
+// the PCIe surprise down error (when full-chip programming the FPGA, the CPU thinks a PCIe device has disappeared).
+// BEWARE: reprogramming will invalidate the handle
+AOCL_MMD_CALL int aocl_mmd_program_sof(int handle, const char* sof_filename, const bool skipSaveRestore = false) WEAK;
+#else
+AOCL_MMD_CALL int aocl_mmd_program(int handle, void* user_data, size_t size, aocl_mmd_program_mode_t program_mode) WEAK;
+#endif
+
+/** Error values*/
+#define AOCL_MMD_ERROR_SUCCESS 0
+#define AOCL_MMD_ERROR_INVALID_HANDLE -1
+#define AOCL_MMD_ERROR_OUT_OF_MEMORY -2
+#define AOCL_MMD_ERROR_UNSUPPORTED_ALIGNMENT -3
+#define AOCL_MMD_ERROR_UNSUPPORTED_PROPERTY -4
+#define AOCL_MMD_ERROR_INVALID_POINTER -5
+#define AOCL_MMD_ERROR_INVALID_MIGRATION_SIZE -6
+
+/** Memory properties*/
+typedef enum {
+ /**
+ * Specifies the name of a global memory that can be found in the
+ * board_spec.xml file for the BSP. Allocations will be allocated to this
+ * global memory interface.
+ */
+ AOCL_MMD_MEM_PROPERTIES_GLOBAL_MEMORY = 1,
+ /**
+ * Specifies the index of a bank inside the global memory interface that can be found in
+ * the board_spec.xml file for the BSP. Allocations will be allocated to this
+ * memory bank. It is invalid to specify this property without also specifying
+ * AOCL_MMD_MEM_PROPERTIES_GLOBAL_MEMORY.
+ */
+ AOCL_MMD_MEM_PROPERTIES_MEMORY_BANK
+} aocl_mmd_mem_properties_t;
+
+/**
+ * Host allocations provide memory that is allocated on the host. Host
+ * allocations are accessible by the host and one or more devices.
+ * The same pointer to a host allocation may be used on the host and all
+ * supported devices; they have address equivalence. This memory must be
+ * deallocated with aocl_mmd_free();
+ *
+ * Once the device has signaled completion through
+ * aocl_mmd_interrupt_handler_fn() the host can assume it has access to the
+ * latest contents of the memory, allocated by this call.
+ *
+ * @param handles Handles for devices that will need access to this memory
+ * @param num_devices Number of devices in the handles
+ * @param size The size of the memory region
+ * @param alignment The alignment in bytes of the allocation
+ * @param properties Specifies additional information about the allocated
+ * memory, described by a property type name and its corresponding value.
+ * Each property type name is immediately followed by the corresponding
+ * desired value. The list is terminated with 0. Supported values are
+ * described above. Example: [<property1>, <value1>, <property2>, <value2>, 0]
+ * @param error The error code defined by AOCL_MMD_ERROR*
+ * @return valid pointer, on error NULL
+ */
+AOCL_MMD_CALL void* aocl_mmd_host_alloc(int* handles,
+ size_t num_devices,
+ size_t size,
+ size_t alignment,
+ aocl_mmd_mem_properties_t* properties,
+ int* error) WEAK;
+
+/**
+ * Frees memory that has been allocated by MMD
+ *
+ * @param mem The pointer to the memory region. Must be a pointer that is
+ * allocated by the MMD.
+ * @return AOCL_MMD_ERROR_SUCCESS if success, else error code
+ */
+AOCL_MMD_CALL int aocl_mmd_free(void* mem) WEAK;
+
+/**
+ * Allocate memory that is owned by the device. This pointer can only be
+ * accessed by the kernel; can't be accessed by the host. The host is able to
+ * manipulate the pointer (e.g. increment it) just not access the underlying
+ * data. This memory must be deallocated by aocl_mmd_free();
+ *
+ * @param handle Device that will have access to this memory
+ * @param size The size of the memory region
+ * @param alignment The alignment in bytes of the memory region
+ * @param properties Specifies additional information about the allocated
+ * memory, described by a property type name and its corresponding value.
+ * Each property type name is immediately followed by the corresponding
+ * desired value. The list is terminated with 0. Supported values are
+ * described above. Example: [<property1>, <value1>, <property2>, <value2>, 0]
+ * @param error The error code defined by AOCL_MMD_ERROR*
+ * @return Pointer that can be passed into the kernel. NULL on failure.
+ */
+AOCL_MMD_CALL void* aocl_mmd_device_alloc(
+ int handle, size_t size, size_t alignment, aocl_mmd_mem_properties_t* properties, int* error) WEAK;
+
+/**
+ * Shared allocations may migrate between the host and one or more associated
+ * device. The same pointer to a shared allocation may be used on the host and
+ * the supported device; they have address equivalence.
+ *
+ * If the device does not support concurrent access to memory allocated by
+ * aocl_mmd_shared_alloc() then a call must be made to
+ * aocl_mmd_shared_mem_migrate() to indicate that the shared allocation should
+ * be migrated to the device before the device accesses this memory. For
+ * example, a call to aocl_mmd_shared_mem_migrate() should be made before a
+ * kernel accessing this memory is launched). Conversely,
+ * aocl_mmd_shared_mem_migrate() should be called again to indicate that the
+ * shared allocation should be migrated to the host before the host accesses
+ * this memory again. If the device supports concurrent access to memory
+ * allocated with aocl_mmd_shared_alloc(), then the call to
+ * aocl_mmd_shared_mem_migrate() is not necessary, but may still be made. In
+ * the case of concurrent access, it is the responsibility of the MMD to ensure
+ * both the device and host can access aocl_mmd_shared_alloc() allocations at
+ * all times.
+ *
+ * Memory allocated by aocl_mmd_shared_alloc() must be deallocated with
+ * aocl_mmd_free().
+ *
+ * @param handle Device that will have access to this memory
+ * @param size The size of the memory region
+ * @param alignment The alignment in bytes of the memory region
+ * @param properties Specifies additional information about the allocated
+ * memory, described by a property type name and its corresponding value.
+ * Each property type name is immediately followed by the corresponding
+ * desired value. The list is terminated with 0. Supported properties are
+ * listed above and have the prefix AOCL_MMD_MEM_PROPERTIES_.
+ * Example: [<property1>, <value1>, <property2>, <value2>, 0]
+ * @param error The error code defined by AOCL_MMD_ERROR*
+ * @return valid pointer, on error NULL
+ */
+AOCL_MMD_CALL void* aocl_mmd_shared_alloc(
+ int handle, size_t size, size_t alignment, aocl_mmd_mem_properties_t* properties, int* error) WEAK;
+
+typedef enum { AOCL_MMD_MIGRATE_TO_HOST = 0, AOCL_MMD_MIGRATE_TO_DEVICE = 1 } aocl_mmd_migrate_t;
+
+/**
+ * A call to aocl_mmd_shared_migrate() must be made for non-concurrent shared
+ * allocations any time the accessor of the allocation changes. For example,
+ * aocl_mmd_shared_migrate() should be called indicating that the allocation
+ * should be migrated to the device before a kernel accessing the allocation
+ * is launched on the device. Similarly, aocl_mmd_shared_migrate() should be
+ * called indicating that the allocation is migrated to the host before the
+ * host accesses the memory after kernel completion.
+ *
+ * For concurrent allocations this call may be used as a performance hint, but
+ * is not strictly required for functionality.
+ *
+ * @param handle Device that will have access to this memory
+ * @param shared_ptr Pointer allocated by aocl_mmd_shared_alloc()
+ * @param size In bytes, the size of the migration. Must be of multiple of a
+ * page boundary that the BSP supports.
+ * @param destination The destination of migration
+ * @return The error code defined by AOCL_MMD_ERROR*
+ */
+AOCL_MMD_CALL int aocl_mmd_shared_migrate(int handle,
+ void* shared_ptr,
+ size_t size,
+ aocl_mmd_migrate_t destination) WEAK;
+
+// CoreDLA modifications
+// To support multiple different FPGA boards, anything board specific must be implemented in a
+// board-specific MMD instead of the CoreDLA runtime layer.
+#ifdef DLA_MMD
+// Query functions to get board-specific values
+AOCL_MMD_CALL int dla_mmd_get_max_num_instances() WEAK;
+AOCL_MMD_CALL uint64_t dla_mmd_get_ddr_size_per_instance() WEAK;
+AOCL_MMD_CALL double dla_mmd_get_ddr_clock_freq() WEAK;
+
+// Wrappers around CSR and DDR reads and writes to abstract away board-specific offsets
+AOCL_MMD_CALL int dla_mmd_csr_write(int handle, int instance, uint64_t addr, const uint32_t* data) WEAK;
+AOCL_MMD_CALL int dla_mmd_csr_read(int handle, int instance, uint64_t addr, uint32_t* data) WEAK;
+AOCL_MMD_CALL int dla_mmd_ddr_write(int handle, int instance, uint64_t addr, uint64_t length, const void* data) WEAK;
+AOCL_MMD_CALL int dla_mmd_ddr_read(int handle, int instance, uint64_t addr, uint64_t length, void* data) WEAK;
+
+// Get the PLL clock frequency in MHz, returns a negative value if there is an error
+AOCL_MMD_CALL double dla_mmd_get_coredla_clock_freq(int handle) WEAK;
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/access.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/access.h
new file mode 100644
index 0000000..dc3eae2
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/access.h
@@ -0,0 +1,100 @@
+// Copyright(c) 2017 - 2019, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Intel Corporation nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+
+/**
+ * @file access.h
+ * @brief Functions to acquire, release, and reset OPAE FPGA resources
+ */
+
+#ifndef __FPGA_ACCESS_H__
+#define __FPGA_ACCESS_H__
+
+#include <opae/types.h>
+
+BEGIN_C_DECL
+
+/**
+ * Open an FPGA object
+ *
+ * Acquires ownership of the FPGA resource referred to by 'token'.
+ *
+ * Most often this will be used to open an accelerator object to directly interact
+ * with an accelerator function, or to open an FPGA object to perform
+ * management functions.
+ *
+ * @param[in] token Pointer to token identifying resource to acquire
+ * ownership of
+ * @param[out] handle Pointer to preallocated memory to place a handle in.
+ * This handle will be used in subsequent API calls.
+ * @param[in] flags One of the following flags:
+ * * FPGA_OPEN_SHARED allows the resource to be opened
+ * multiple times (not supported in ASE)
+ * @returns FPGA_OK on success. FPGA_NOT_FOUND if the resource for
+ * 'token' could not be found. FPGA_INVALID_PARAM if
+ * 'token' does not refer to a resource that can be
+ * opened, or if either argument is NULL or invalid.
+ * FPGA_EXCEPTION if an internal exception occurred while
+ * creating the handle. FPGA_NO_DRIVER if the driver is
+ * not loaded. FPGA_BUSY if trying to open a resource that
+ * has already been opened in exclusive mode.
+ * FPGA_NO_ACCESS if the current process' privileges are
+ * not sufficient to open the resource.
+ */
+ __FPGA_API__ fpga_result fpgaOpen(fpga_token token, fpga_handle *handle,
+ int flags);
+
+/**
+ * Close a previously opened FPGA object
+ *
+ * Relinquishes ownership of a previously fpgaOpen()ed resource. This enables
+ * others to acquire ownership if the resource was opened exclusively.
+ * Also deallocates / unmaps MMIO and UMsg memory areas.
+ *
+ * @param[in] handle Handle to previously opened FPGA object
+ * @returns FPGA_OK on success. FPGA_INVALID_PARAM if handle does
+ * not refer to an acquired resource, or if handle is NULL.
+ * FPGA_EXCEPTION if an internal error occurred while
+ * accessing the handle.
+ */
+__FPGA_API__ fpga_result fpgaClose(fpga_handle handle);
+
+/**
+ * Reset an FPGA object
+ *
+ * Performs an accelerator reset.
+ *
+ * @param[in] handle Handle to previously opened FPGA object
+ * @returns FPGA_OK on success. FPGA_INVALID_PARAM if handle does
+ * not refer to an acquired resource or to a resource that
+ * cannot be reset. FPGA_EXCEPTION if an internal error
+ * occurred while trying to access the handle or resetting
+ * the resource.
+ */
+__FPGA_API__ fpga_result fpgaReset(fpga_handle handle);
+
+END_C_DECL
+
+#endif // __FPGA_ACCESS_H__
diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/buffer.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/buffer.h
new file mode 100644
index 0000000..e848182
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/buffer.h
@@ -0,0 +1,154 @@
+// Copyright(c) 2017 - 2019, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Intel Corporation nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+
+/**
+ * @file buffer.h
+ * @brief Functions for allocating and sharing system memory with an FPGA
+ * accelerator
+ *
+ * To share memory between a software application and an FPGA accelerator,
+ * these functions set up system components (e.g. an IOMMU) to allow
+ * accelerator access to a provided memory region.
+ *
+ * There are a number of restrictions on what memory can be shared, depending
+ * on platform capabilities. Usually, FPGA accelerators do not have access to
+ * virtual address mappings of the CPU, so they can only access physical
+ * addresses. To support this, the OPAE C library on Linux uses hugepages to
+ * allocate large, contiguous pages of physical memory that can be shared with
+ * an accelerator. It also supports sharing memory that has already been
+ * allocated by an application, as long as that memory satisfies the
+ * requirements of being physically contiguous and page-aligned.
+ */
+
+#ifndef __FPGA_BUFFER_H__
+#define __FPGA_BUFFER_H__
+
+#include <opae/types.h>
+
+BEGIN_C_DECL
+
+/**
+ * Prepare a shared memory buffer
+ *
+ * Prepares a memory buffer for shared access between an accelerator and the calling
+ * process. This may either include allocation of physical memory, or
+ * preparation of already allocated memory for sharing. The latter case is
+ * indicated by supplying the FPGA_BUF_PREALLOCATED flag.
+ *
+ * This function will ask the driver to pin the indicated memory (make it
+ * non-swappable), and program the IOMMU to allow access from the accelerator. If the
+ * buffer was not pre-allocated (flag FPGA_BUF_PREALLOCATED), the function
+ * will also allocate physical memory of the requested size and map the
+ * memory into the caller's process' virtual address space. It returns in
+ * 'wsid' an fpga_buffer object that can be used to program address registers
+ * in the accelerator for shared access to the memory.
+ *
+ * When using FPGA_BUF_PREALLOCATED, the input len must be a non-zero multiple
+ * of the page size, else the function returns FPGA_INVALID_PARAM. When not
+ * using FPGA_BUF_PREALLOCATED, the input len is rounded up to the nearest
+ * multiple of page size.
+ *
+ * @param[in] handle Handle to previously opened accelerator resource
+ * @param[in] len Length of the buffer to allocate/prepare in bytes
+ * @param[inout] buf_addr Virtual address of buffer. Contents may be NULL (OS
+ * will choose mapping) or non-NULL (OS will take
+ * contents as a hint for the virtual address).
+ * @param[out] wsid Handle to the allocated/prepared buffer to be used
+ * with other functions
+ * @param[in] flags Flags. FPGA_BUF_PREALLOCATED indicates that memory
+ *                      pointed at in '*buf_addr' is already allocated and
+ * mapped into virtual memory.
+ * @returns FPGA_OK on success. FPGA_NO_MEMORY if the requested memory could
+ * not be allocated. FPGA_INVALID_PARAM if invalid parameters were provided, or
+ * if the parameter combination is not valid. FPGA_EXCEPTION if an internal
+ * exception occurred while trying to access the handle.
+ */
+__FPGA_API__ fpga_result fpgaPrepareBuffer(fpga_handle handle,
+ uint64_t len,
+ void **buf_addr, uint64_t *wsid, int flags);
+
+/**
+ * Release a shared memory buffer
+ *
+ * Releases a previously prepared shared buffer. If the buffer was allocated
+ * using fpgaPrepareBuffer (FPGA_BUF_PREALLOCATED was not specified), this call
+ * will deallocate/free that memory. Otherwise, it will only be returned to
+ * its previous state (pinned/unpinned, cached/non-cached).
+ *
+ * @param[in] handle Handle to previously opened accelerator resource
+ * @param[in] wsid Handle to the allocated/prepared buffer
+ * @returns FPGA_OK on success. FPGA_INVALID_PARAM if invalid parameters were
+ * provided, or if the parameter combination is not valid. FPGA_EXCEPTION if an
+ * internal exception occurred while trying to access the handle.
+ */
+__FPGA_API__ fpga_result fpgaReleaseBuffer(fpga_handle handle, uint64_t wsid);
+
+/**
+ * Retrieve base IO address for buffer
+ *
+ * This function is used to acquire the physical base address (on some platforms
+ * called IO Virtual Address or IOVA) for a shared buffer identified by wsid.
+ *
+ * @param[in] handle Handle to previously opened accelerator resource
+ * @param[in] wsid Buffer handle / workspace ID referring to the buffer for
+ * which the IO address is requested
+ * @param[out] ioaddr Pointer to memory where the IO address will be returned
+ * @returns FPGA_OK on success. FPGA_INVALID_PARAM if invalid parameters were
+ * provided, or if the parameter combination is not valid. FPGA_EXCEPTION if an
+ * internal exception occurred while trying to access the handle.
+ * FPGA_NOT_FOUND if `wsid` does not refer to a previously shared buffer.
+ */
+__FPGA_API__ fpga_result fpgaGetIOAddress(fpga_handle handle, uint64_t wsid,
+ uint64_t *ioaddr);
+
+/**
+ * Retrieve physical address for buffer
+ *
+ * This function is used to acquire the physical addresses in a scatter gather
+ * list form for a shared buffer identified by wsid.
+ *
+ * @param[in] handle Handle to previously opened accelerator resource
+ * @param[in] wsid Buffer handle / workspace ID referring to the buffer for
+ * which the physical address is requested
+ * @param[out] num_pages Number of physical pages
+ * @param[out] sglist SG list structure where physical addresses of pages and
+ * number of bytes in that page used will be returned.
+ *
+ * Note: Call this API with sg_list as NULL to update num_pages. Allocate up to
+ * (num_pages * sg_list) memory and call the API again with a pointer to this
+ * memory location as the last argument to retrieve the sg_list struct.
+ *
+ * @returns FPGA_OK on success. FPGA_INVALID_PARAM if invalid parameters were
+ * provided, or if the parameter combination is not valid. FPGA_EXCEPTION if an
+ * internal exception occurred while trying to access the handle.
+ * FPGA_NOT_FOUND if `wsid` does not refer to a previously shared buffer.
+ */
+__FPGA_API__ fpga_result fpgaGetPhysicalAddress(fpga_handle handle, uint64_t wsid, uint64_t *num_pages,
+ void *sglist);
+
+END_C_DECL
+
+#endif // __FPGA_BUFFER_H__
diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/dma.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/dma.h
new file mode 100644
index 0000000..8febd44
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/dma.h
@@ -0,0 +1,144 @@
+// Copyright(c) 2017 - 2019, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Intel Corporation nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+
+/**
+ * @file dma.h
+ * @brief Functions to acquire, release, and reset OPAE FPGA DMA resources
+ */
+
+#ifndef __DMA_ACCESS_H__
+#define __DMA_ACCESS_H__
+
+#include <opae/types.h>
+
+BEGIN_C_DECL
+
+/*
+* The DMA driver supports host to FPGA, FPGA to host
+* and FPGA to FPGA transfers. The FPGA interface can
+* be streaming or memory-mapped. Streaming interfaces
+* are not currently
+* supported.
+*/
+typedef enum {
+ HOST_TO_FPGA_MM = 0,
+ FPGA_TO_HOST_MM,
+ FPGA_TO_FPGA_MM,
+ FPGA_MAX_TRANSFER_TYPE,
+}fpga_dma_transfer;
+
+
+typedef enum
+{
+ DMA_OPEN = 1,
+ DMA_BUSY,
+ DMA_CLOSED
+}fpga_dma_status;
+
+/*
+ * Dma handle in user space that will be populated during fpgaDmaOpen call.
+ */
+typedef struct _fpga_dma_handle
+{
+ //
+ // Stores the handle to the fpga that was opened after fpgaOpen
+ //
+ fpga_handle fpga_h;
+
+ //
+ // Stores the current status of the DMA AFC
+ // Set to the following values:
+ // DMA_OPEN - After call to fpgaDmaOpen() and when fpgaDmaTransferSync() exits
+ // DMA_BUSY - When fpgaDmaTransferSync() is called
+ //
+ uint64_t dma_status;
+}dma_handle, *fpga_dma_handle;
+
+
+
+/**
+*
+* Opens a handle to DMA
+* Sets the status of DMA engine to DMA_OPEN
+* @param[in] handle Handle to previously opened FPGA object
+* @param[in] dma_h DMA handle allocated by the user
+* @returns FPGA_OK on success. FPGA_INVALID_PARAM if handle does
+* not refer to an acquired resource.
+*
+*/
+__FPGA_API__
+fpga_result
+fpgaDmaOpen(
+ fpga_handle handle,
+ fpga_dma_handle *dma_h
+);
+
+/**
+*
+* Closes a handle to DMA
+* Sets the status of DMA engine to DMA_CLOSED
+* @param[in] handle Handle to previously opened FPGA object
+* @param[in] dma_h DMA handle allocated by the user
+* @returns FPGA_OK on success. FPGA_INVALID_PARAM if handle does
+* not refer to an acquired resource.
+*
+*/
+__FPGA_API__
+fpga_result
+fpgaDmaClose(
+ fpga_dma_handle dma_h
+);
+
+
+/**
+*
+* Performs a synchronous DMA transfer between FPGA and host memory.
+*
+* @param[in] handle Handle to previously opened FPGA object
+* @param[in] dst Destination address for the data transfer
+* @param[in] src Source address for the data transfer
+* @param[in] count Length of data to be transferred from src to dst
+* @param[in] flag Flag to indicate nature of data transfer. Flag types =
+ HOST_TO_FPGA_MM and FPGA_TO_HOST_MM.
+* @returns FPGA_OK on success. FPGA_INVALID_PARAM if handle does
+* not refer to an acquired resource or to a resource that
+* cannot be reset. FPGA_EXCEPTION if an internal error
+* occurred while trying to access the handle or resetting
+* the resource.
+*/
+__FPGA_API__
+fpga_result
+fpgaDmaTransferSync(
+ fpga_dma_handle handle,
+ ULONG64 dst,
+ ULONG64 src,
+ ULONG64 count,
+ ULONG64 flag
+);
+
+END_C_DECL
+
+#endif // __DMA_ACCESS_H__
diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/enum.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/enum.h
new file mode 100644
index 0000000..ee3349b
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/enum.h
@@ -0,0 +1,129 @@
+// Copyright(c) 2017 - 2019, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Intel Corporation nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+
+/**
+ * @file enum.h
+ * @brief APIs for resource enumeration and managing tokens
+ *
+ * These APIs are the first step for any application using OPAE to discover
+ * resources that are present on the system. They allow selective enumeration
+ * (i.e. getting a list of resources that match a given list of criteria) and
+ * methods to manage the lifecycle of tokens generated by fpgaEnumerate().
+ */
+
+#ifndef __FPGA_ENUM_H__
+#define __FPGA_ENUM_H__
+
+#include <opae/types.h>
+
+BEGIN_C_DECL
+
+/**
+ * Enumerate FPGA resources present in the system
+ *
+ * This call allows the user to query the system for FPGA resources that match
+ * a certain set of criteria, e.g. all accelerators that are assigned to a host
+ * interface and available, all FPGAs of a specific type, etc.
+ *
+ * fpgaEnumerate() will create a number of `fpga_token`s to represent the
+ * matching resources and populate the array `tokens` with these tokens. The
+ * `max_tokens` argument can be used to limit the number of tokens
+ * allocated/returned by fpgaEnumerate(); i.e., the number of tokens in the
+ * returned `tokens` array will be either `max_tokens` or `num_matches` (the
+ * number of resources matching the filter), whichever is smaller. Use
+ * fpgaDestroyToken() to destroy tokens that are no longer needed.
+ *
+ * To query the number of matches for a particular set of filters (e.g. to
+ * allocate a `tokens` array of the appropriate size), call fpgaEnumerate()
+ * with the parameter `tokens` set to NULL; this will only return the number of
+ * matches in `num_matches`.
+ *
+ * @Note fpgaEnumerate() will allocate memory for the created tokens returned
+ * in `tokens`. It is the responsibility of the using application to free this
+ * memory after use by calling fpgaDestroyToken() for each of the returned
+ * tokens.
+ *
+ * @param[in] filters Array of `fpga_properties` objects describing the
+ * properties of the objects that should be returned. A
+ * resource is considered matching if its properties
+ * match any one of the supplied filters. Passing NULL
+ * will match all FPGA resources present in the system.
+ * @param[in] num_filters Number of entries in the `filters` array.
+ * @param[out] tokens Pointer to an array of fpga_token variables to be
+ * populated. If NULL is supplied, fpgaEnumerate() will
+ * not create any tokens, but it will return the
+ *                         number of possible matches in `num_matches`.
+ * @param[in] max_tokens Maximum number of tokens that fpgaEnumerate() shall
+ * return (length of `tokens` array). There may be more
+ * or fewer matches than this number; `num_matches` is
+ * set to the number of actual matches.
+ * @param[out] num_matches Number of resources matching the `filter` criteria.
+ * This number can be higher than the number of tokens
+ * returned in the `tokens` array (depending on the
+ * value of `max_tokens`).
+ * @returns FPGA_OK on success.
+ * FPGA_INVALID_PARAM if invalid pointers or objects
+ * are passed into the function.
+ * FPGA_NO_DRIVER if OPAE can't find the respective
+ * enumeration data structures usually provided by the
+ * driver.
+ * FPGA_NO_MEMORY if there was not enough memory to
+ * create tokens.
+ */
+__FPGA_API__ fpga_result fpgaEnumerate(const fpga_properties *filters,
+ uint32_t num_filters, fpga_token *tokens,
+ uint32_t max_tokens ,uint32_t *num_matches);
+
+/**
+ * Clone a fpga_token object
+ *
+ * Creates a copy of an fpga_token object.
+ *
+ * @Note This call creates a new token object and allocates memory for it. It
+ * is the responsibility of the using application to free this memory after use
+ * by calling fpgaDestroyToken() for the cloned token.
+ *
+ * @param[in] src fpga_token object to copy
+ * @param[out] dst New fpga_token object cloned from 'src'
+ * @returns FPGA_OK on success
+ */
+__FPGA_API__ fpga_result fpgaCloneToken(fpga_token src, fpga_token *dst);
+
+/**
+ * Destroy a Token
+ *
+ * This function destroys a token created by fpgaEnumerate() and frees the
+ * associated memory.
+ *
+ * @param[in] token fpga_token to destroy
+ * @returns FPGA_OK on success
+ */
+__FPGA_API__ fpga_result fpgaDestroyToken(fpga_token *token);
+
+END_C_DECL
+
+#endif // __FPGA_ENUM_H__
+
diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/event.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/event.h
new file mode 100644
index 0000000..3d53554
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/event.h
@@ -0,0 +1,151 @@
+// Copyright(c) 2017 - 2019, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Intel Corporation nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+
+/**
+ * @file event.h
+ * @brief Functions for registering events and managing the lifecycle for
+ * `fpga_event_handle`s.
+ *
+ * OPAE provides an interface to asynchronous events that can be generated by
+ * different FPGA resources. The event API provides functions to register for
+ * these events; associated with every event a process has registered for is an
+ * fpga_event_handle, which encapsulates the OS-specific data structure for
+ * event objects. On Linux, an fpga_event_handle can be used as a file
+ * descriptor and passed to select(), poll(), epoll() and similar functions to
+ * wait for asynchronous events.
+ */
+
+#ifndef __FPGA_EVENT_H__
+#define __FPGA_EVENT_H__
+
+#include <opae/types.h>
+
+BEGIN_C_DECL
+
+/**
+ * Initialize an event_handle
+ *
+ * Platform independent way to initialize an event_handle used for
+ * notifications from the driver to application. For Linux, this function
+ * creates an eventfd and returns the eventfd file descriptor in
+ * `*event_handle`.
+ *
+ * @param[out] event_handle Pointer to event handle variable.
+ *
+ * @returns FPGA_OK on success. FPGA_INVALID_PARAM if `event_handle` is NULL.
+ * FPGA_NOT_SUPPORTED if platform does not support events.
+ */
+__FPGA_API__ fpga_result fpgaCreateEventHandle(fpga_event_handle *event_handle);
+
+/**
+ * Destroy an event_handle
+ *
+ * Destroy handle and free resources. On Linux this corresponds
+ * to closing the file descriptor pointed to by handle
+ *
+ * @param[in] event_handle Pointer to handle to be destroyed
+ *
+ * @returns FPGA_OK on success. FPGA_INVALID_PARAM if `event_handle` is NULL.
+ */
+__FPGA_API__ fpga_result fpgaDestroyEventHandle(fpga_event_handle *event_handle);
+
+/**
+ * Register an FPGA event
+ *
+ * This function tells the driver that the caller is interested in notification
+ * for the event specified by the type and flags pair.
+ *
+ * The event_handle points to an OS specific mechanism for event notification.
+ * An event_handle is associated with only a single event.
+ *
+ * @todo define if calling fpgaRegisterEvent multiple times with the
+ * same event_handle is an error condition or if it is silently ignored.
+ *
+ * @note This function is currently not supported.
+ *
+ * @param[in] handle Handle to previously opened FPGA resource.
+ * @param[in] event_type Type of event
+ * @param[in] event_handle Handle to previously opened resource for event
+ * notification.
+ * @param[in] flags Optional argument for specifying additional
+ * information about event. For example irq number
+ * for interrupt events.
+ * @returns FPGA_OK on success. FPGA_INVALID_PARAM if handle does not refer to
+ * a resource supporting the requested event, or if event_handle is not valid.
+ * FPGA_EXCEPTION if an internal exception occurred while accessing the handle
+ * or the event_handle. On Linux: FPGA_NO_DAEMON if the driver does not support the
+ * requested event and there is no FPGA Daemon (fpgad) running to proxy it.
+ */
+__FPGA_API__ fpga_result fpgaRegisterEvent(fpga_handle handle,
+ fpga_event_type event_type,
+ fpga_event_handle event_handle,
+ uint32_t flags);
+
+/**
+ * Unregister an FPGA event
+ *
+ * This function tells the driver that the caller is no longer interested in
+ * notification for the event associated with the event_handle
+ *
+ * The event_handle points to an OS specific mechanism for event notification.
+ * An event_handle is associated with only a single event.
+ *
+ * @todo define if calling fpgaUnregisterEvent multiple times with the
+ * same event_handle is an error condition or if it is silently ignored.
+ *
+ * @note This function is currently not supported.
+ *
+ * @param[in] handle Handle to previously opened FPGA resource.
+ * @param[in] event_type Type of event.
+ * @param[in] event_handle Handle to previously opened resource for event
+ * notification.
+ * @returns FPGA_OK on success. FPGA_INVALID_PARAM if handle does
+ * not refer to a resource supporting the requested event,
+ * or if event_handle is not valid. FPGA_EXCEPTION if an
+ * internal error occurred accessing the handle or the
+ * event_handle.
+ */
+__FPGA_API__ fpga_result fpgaUnregisterEvent(fpga_handle handle, fpga_event_type event_type,
+ fpga_event_handle event_handle);
+
+/**
+* Get OS object from event handle
+*
+* Check validity of event handle, and get the OS object used to
+* subscribe and unsubscribe to events. On Linux, the object corresponds
+* to a file descriptor.
+*
+* @param[in] event_handle Event handle to get the descriptor value from
+* @param[out] fd integer to store the descriptor value
+*
+* @returns FPGA_OK on success. FPGA_INVALID_PARAM if `event_handle` is invalid.
+*/
+__FPGA_API__ fpga_result fpgaGetOSObjectFromEventHandle(const fpga_event_handle event_handle,
+ int *fd);
+
+END_C_DECL
+
+#endif // __FPGA_EVENT_H__
diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/flash.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/flash.h
new file mode 100644
index 0000000..f7a2c5c
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/flash.h
@@ -0,0 +1,87 @@
+// Copyright(c) 2017 - 2019, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Intel Corporation nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+
+/**
+ * @file flash.h
+ * @brief Functions to erase the flash memory and reconfigure a slot with a new bitstream.
+ */
+
+#ifndef __FLASH_H__
+#define __FLASH_H__
+
+BEGIN_C_DECL
+
+/**
+*
+* Erase flash memory
+*
+* This function erases the flash memory of the FPGA device
+*
+* Arguments:
+* @param[in] fpga_handle handle to previously opened FPGA_DEVICE resource
+*
+* Return Value:
+* FPGA_OK on success.
+* FPGA_INVALID_PARAM if the handle does not refer to an owned resource.
+* FPGA_NOT_FOUND if this host interface number is not found.
+* FPGA_NOT_SUPPORTED if functionality not supported
+*
+**/
+__FPGA_API__ fpga_result
+fpgaEraseFlash(
+ fpga_handle fpga_handle
+ );
+
+
+/**
+* Writes flash memory
+*
+* This function programs the flash chip on the FPGA with the provided bitstream.
+*
+* Arguments:
+* @param[in] handle handle to an FPGA_DEVICE resource
+* @param[in] flashBitstream pointer to memory holding the flash bitstream
+* @param[in] flashBitstreamLen length of the bitstream in bytes
+* @param[in] offset offset in flash controller to begin writing from
+*
+* Return Value:
+* FPGA_OK on success.
+* FPGA_INVALID_PARAM if the handle does not refer to an owned resource.
+* FPGA_NOT_FOUND if this host interface number is not found.
+* FPGA_NOT_SUPPORTED if functionality not supported.
+*/
+
+__FPGA_API__ fpga_result
+fpgaWriteFlash(
+ fpga_handle handle,
+ PUINT8 flashBitstream,
+ UINT64 flashBitstreamLen,
+ UINT64 offset
+);
+
+END_C_DECL
+
+#endif // __FLASH_H__
diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/fpga.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/fpga.h
new file mode 100644
index 0000000..e6668e8
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/fpga.h
@@ -0,0 +1,60 @@
+// Copyright(c) 2017 - 2019, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Intel Corporation nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+
+/**
+ * \file fpga.h
+ * \brief FPGA API
+ *
+ * This conveniently includes all APIs that are part of the OPAE release (base and
+ * extensions).
+ */
+
+#ifndef __FPGA_FPGA_H__
+#define __FPGA_FPGA_H__
+
+#define FPGA_API_VERSION_MAJOR 0
+#define FPGA_API_VERSION_MINOR 1
+
+#ifdef _WIN32
+#include <Windows.h>
+#endif
+
+#include <opae/types.h>
+#include <opae/access.h>
+#include <opae/buffer.h>
+#include <opae/dma.h>
+#include <opae/enum.h>
+#include <opae/event.h>
+#include <opae/flash.h>
+#include <opae/manage.h>
+#include <opae/mmio.h>
+#include <opae/properties.h>
+#include <opae/umsg.h>
+#include <opae/utils.h>
+#include <opae/version.h>
+
+#endif // __FPGA_FPGA_H__
+
diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/macrodefs.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/macrodefs.h
new file mode 100644
index 0000000..365cdaf
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/macrodefs.h
@@ -0,0 +1,70 @@
+// Copyright(c) 2017 - 2019, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Intel Corporation nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+
+/**
+ * @file macrodefs.h
+ * @brief Definitions of convenience macros for the OPAE C API
+ *
+ * This file defines convenience macros for the OPAE C API functions.
+ */
+
+#ifndef __FPGA_MACRODEFS_H__
+#define __FPGA_MACRODEFS_H__
+
+// Check for conflicting definitions
+#ifdef BEGIN_C_DECL
+#error BEGIN_C_DECL already defined, but used by the OPAE library
+#endif
+
+#ifdef END_C_DECL
+#error END_C_DECL already defined, but used by the OPAE library
+#endif
+
+#ifdef __FPGA_API__
+#error __FPGA_API__ already defined, but used by the OPAE library
+#endif
+
+// Macro for symbol visibility
+#ifdef _WIN32
+#ifdef FpgaLib_EXPORTS
+#define __FPGA_API__ __declspec(dllexport)
+#else
+#define __FPGA_API__ __declspec(dllimport)
+#endif
+#else
+#define __FPGA_API__ __attribute__((visibility("default")))
+#endif
+
+// Macro for disabling name mangling
+#ifdef __cplusplus
+#define BEGIN_C_DECL extern "C" {
+#define END_C_DECL }
+#else
+#define BEGIN_C_DECL
+#define END_C_DECL
+#endif
+
+#endif // __FPGA_MACRODEFS_H__
diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/manage.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/manage.h
new file mode 100644
index 0000000..f93a1b1
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/manage.h
@@ -0,0 +1,176 @@
+// Copyright(c) 2017 - 2019, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Intel Corporation nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+
+/**
+ * @file manage.h
+ * @brief Functions for managing FPGA configurations
+ *
+ * FPGA accelerators can be reprogrammed at run time by providing new partial
+ * bitstreams ("green bitstreams"). This file defines API functions for
+ * programming green bitstreams as well as for assigning accelerators to host
+ * interfaces for more complex deployment setups, such as virtualized systems.
+ */
+
+#ifndef __FPGA_MANAGE_H__
+#define __FPGA_MANAGE_H__
+
+#include <opae/types.h>
+
+BEGIN_C_DECL
+
+/**
+* Assign Port to a host interface.
+*
+* This function assigns a Port to a host interface for subsequent use. Only
+* Ports that have been assigned to a host interface can be opened by
+* fpgaOpen().
+*
+* @param[in] fpga Handle to an FPGA object previously opened that
+* both the host interface and the slot belong to
+* @param[in] interface_num Host interface number
+* @param[in] slot_num Slot number
+* @param[in] flags Flags (to be defined)
+* @returns FPGA_OK on success
+* FPGA_INVALID_PARAM if input parameter combination
+* is not valid.
+* FPGA_EXCEPTION if an exception occurred accessing
+* the `fpga` handle.
+* FPGA_NOT_SUPPORTED if driver does not support
+* assignment.
+*/
+__FPGA_API__ fpga_result fpgaAssignPortToInterface(fpga_handle fpga,
+ uint32_t interface_num,
+ uint32_t slot_num,
+ int flags);
+
+/**
+ * Assign an accelerator to a host interface
+ *
+ * This function assigns an accelerator to a host interface for subsequent use. Only
+ * accelerators that have been assigned to a host interface can be opened by
+ * fpgaOpen().
+ *
+ * @note This function is currently not supported.
+ *
+ * @param[in] fpga Handle to an FPGA object previously opened that
+ * both the host interface and the accelerator belong to
+ * @param[in] afc Accelerator to assign
+ * @param[in] host_interface Host interface to assign accelerator to
+ * @param[in] flags Flags (to be defined)
+ * @returns FPGA_OK on success
+ */
+__FPGA_API__ fpga_result fpgaAssignToInterface(fpga_handle fpga,
+ fpga_token afc,
+ uint32_t host_interface,
+ int flags);
+
+/**
+ * Unassign a previously assigned accelerator
+ *
+ * This function removes the assignment of an accelerator to a host interface (e.g. to
+ * be later assigned to a different host interface). As a consequence, the accelerator
+ * referred to by token 'accelerator' will be reset during the course of this function.
+ *
+ * @note This function is currently not supported.
+ *
+ * @param[in] fpga Handle to an FPGA object previously opened that
+ * both the host interface and the accelerator belong to
+ * @param[in] afc Accelerator to unassign/release
+ * @returns FPGA_OK on success
+ */
+__FPGA_API__ fpga_result fpgaReleaseFromInterface(fpga_handle fpga,
+ fpga_token afc);
+
+/**
+ * Reconfigure a slot
+ *
+ * Sends a green bitstream file to an FPGA to reconfigure a specific slot. This
+ * call, if successful, will overwrite the currently programmed AFU in that
+ * slot with the AFU in the provided bitstream.
+ *
+ * As part of the reconfiguration flow, all accelerators associated with this slot will
+ * be unassigned and reset.
+ *
+ * @param[in] fpga Handle to an FPGA object previously opened
+ * @param[in] slot Token identifying the slot to reconfigure
+ * @param[in] bitstream Pointer to memory holding the bitstream
+ * @param[in] bitstream_len Length of the bitstream in bytes
+ * @param[in] flags Flags (to be defined)
+ * @returns FPGA_OK on success. FPGA_INVALID_PARAM if the provided parameters
+ * are not valid. FPGA_EXCEPTION if an internal error occurred accessing the
+ * handle or while sending the bitstream data to the driver. FPGA_RECONF_ERROR
+ * on errors reported by the driver (such as CRC or protocol errors).
+ */
+__FPGA_API__ fpga_result fpgaReconfigureSlot(fpga_handle fpga,
+ uint32_t slot,
+ const uint8_t *bitstream,
+ size_t bitstream_len, int flags);
+
+/**
+ * Process device specific commands
+ *
+ * Sends a device specific command to the driver and driver performs that action
+ * and returns if needed with the data.
+ *
+ * @param[in] fpga Handle to an FPGA object previously opened
+ * @param[in] cmd GUID identifying the command to process
+ * @param[in] buffer Pointer to memory where data will be returned.
+ * @param[in] buffer_len Length of the buffer passed.
+ * @returns FPGA_OK on success. FPGA_INVALID_PARAM if the provided parameters
+ * are not valid. FPGA_EXCEPTION if an internal error occurred accessing the
+ * handle or while sending the data to the driver.
+ */
+__FPGA_API__ fpga_result fpgaProcessDeviceCmd(fpga_handle fpga,
+ fpga_guid cmd,
+ void *arg,
+ void *buffer,
+ size_t buffer_len);
+
+/**
+ * Enumerate all the commands supported by the device.
+ *
+ * To enumerate all the commands supported by a specific device, call this
+ * function by passing NULL to buffer arg and it returns the number of bytes
+ * that needs to be allocated to get all the commands.
+ *
+ * Then allocate buffer for that size and call this function to get the list
+ * of all device supported CMDs.
+ *
+ * @param[in] fpga Handle to an FPGA object previously opened
+ * @param[in] cmds Pointer to memory where cmds will be returned.
+ * @param[in] num_cmds Pointer to memory where num cmds will be returned.
+ * @returns FPGA_OK on success. FPGA_INVALID_PARAM if the provided parameters
+ * are not valid. FPGA_EXCEPTION if an internal error occurred accessing the
+ * handle or while sending the data to the driver.
+ */
+__FPGA_API__ fpga_result fpgaGetSupportedCommands(fpga_handle fpga,
+ fpga_guid *cmds,
+ uint32_t *num_cmds);
+
+END_C_DECL
+
+#endif // __FPGA_MANAGE_H__
+
diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/mmio.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/mmio.h
new file mode 100644
index 0000000..7c26d3f
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/mmio.h
@@ -0,0 +1,342 @@
+// Copyright(c) 2017 - 2019, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Intel Corporation nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+
+/**
+ * @file mmio.h
+ * @brief Functions for mapping and accessing MMIO space
+ *
+ * Most FPGA accelerators provide access to control registers through
+ * memory-mappable address spaces, commonly referred to as "MMIO spaces". This
+ * file provides functions to map, unmap, read, and write MMIO spaces.
+ *
+ * Note that an accelerator may have multiple MMIO spaces, denoted by the
+ * `mmio_num` argument of the APIs below. The meaning and properties of each
+ * MMIO space are up to the accelerator designer.
+ */
+
+#ifndef __FPGA_MMIO_H__
+#define __FPGA_MMIO_H__
+
+#include <opae/types.h>
+
+BEGIN_C_DECL
+
+/**
+ * Write 64 bit value to MMIO space
+ *
+ * This function will write to MMIO space of the target object at a specified
+ * offset.
+ *
+ * In order to access a resource's MMIO space using this function, it has to be
+ * mapped to the application's address space using fpgaMapMMIO().
+ *
+ * @param[in] handle Handle to previously opened accelerator resource
+ * @param[in] mmio_num Number of MMIO space to access
+ * @param[in] offset Byte offset into MMIO space
+ * @param[in] value Value to write (64 bit)
+ * @returns FPGA_OK on success. FPGA_INVALID_PARAM if any of the supplied
+ * parameters is invalid. FPGA_EXCEPTION if an internal exception occurred
+ * while trying to access the handle. FPGA_NOT_FOUND if the MMIO space
+ * `mmio_num` was not mapped using fpgaMapMMIO() before calling this function.
+ */
+__FPGA_API__ fpga_result fpgaWriteMMIO64(fpga_handle handle,
+ uint32_t mmio_num, uint64_t offset,
+ uint64_t value);
+
+/**
+ * Read 64 bit value from MMIO space
+ *
+ * This function will read from MMIO space of the target object at a specified
+ * offset.
+ *
+ * In order to access a resource's MMIO space using this function, it has to be
+ * mapped to the application's address space using fpgaMapMMIO().
+ *
+ * @param[in] handle Handle to previously opened accelerator resource
+ * @param[in] mmio_num Number of MMIO space to access
+ * @param[in] offset Byte offset into MMIO space
+ * @param[out] value Pointer to memory where read value is returned (64 bit)
+ * @returns FPGA_OK on success. FPGA_INVALID_PARAM if any of the supplied
+ * parameters is invalid. FPGA_EXCEPTION if an internal exception occurred
+ * while trying to access the handle. FPGA_NOT_FOUND if the MMIO space
+ * `mmio_num` was not mapped using fpgaMapMMIO() before calling this function.
+ */
+__FPGA_API__ fpga_result fpgaReadMMIO64(fpga_handle handle,
+ uint32_t mmio_num,
+ uint64_t offset, uint64_t *value);
+
+/**
+ * Write 32 bit value to MMIO space
+ *
+ * This function will write to MMIO space of the target object at a specified
+ * offset.
+ *
+ * In order to access a resource's MMIO space using this function, it has to be
+ * mapped to the application's address space using fpgaMapMMIO().
+ *
+ * @param[in] handle Handle to previously opened accelerator resource
+ * @param[in] mmio_num Number of MMIO space to access
+ * @param[in] offset Byte offset into MMIO space
+ * @param[in] value Value to write (32 bit)
+ * @returns FPGA_OK on success. FPGA_INVALID_PARAM if any of the supplied
+ * parameters is invalid. FPGA_EXCEPTION if an internal exception occurred
+ * while trying to access the handle. FPGA_NOT_FOUND if the MMIO space
+ * `mmio_num` was not mapped using fpgaMapMMIO() before calling this function.
+ */
+__FPGA_API__ fpga_result fpgaWriteMMIO32(fpga_handle handle,
+ uint32_t mmio_num, uint64_t offset,
+ uint32_t value);
+
+/**
+ * Read 32 bit value from MMIO space
+ *
+ * This function will read from MMIO space of the target object at a specified
+ * offset.
+ *
+ * In order to access a resource's MMIO space using this function, it has to be
+ * mapped to the application's address space using fpgaMapMMIO().
+ *
+ * @param[in] handle Handle to previously opened accelerator resource
+ * @param[in] mmio_num Number of MMIO space to access
+ * @param[in] offset Byte offset into MMIO space
+ * @param[out] value Pointer to memory where read value is returned (32 bit)
+ * @returns FPGA_OK on success. FPGA_INVALID_PARAM if any of the supplied
+ * parameters is invalid. FPGA_EXCEPTION if an internal exception occurred
+ * while trying to access the handle. FPGA_NOT_FOUND if the MMIO space
+ * `mmio_num` was not mapped using fpgaMapMMIO() before calling this function.
+ */
+__FPGA_API__ fpga_result fpgaReadMMIO32(fpga_handle handle,
+ uint32_t mmio_num,
+ uint64_t offset, uint32_t *value);
+
+/**
+ * Map MMIO space
+ *
+ * This function will return a pointer to the specified MMIO space of the
+ * target object in process virtual memory. Some MMIO spaces may be restricted
+ * to privileged processes, depending on the used handle and type.
+ *
+ * After mapping the respective MMIO space, you can access it either through
+ * direct pointer operations (observing supported access sizes and alignments
+ * of the target platform and accelerator), or by using fpgaReadMMIO32(),
+ * fpgaWriteMMIO32(), fpgaReadMMIO64(), fpgaWriteMMIO64(), fpgaReadMmio()
+ * and fpgaWriteMmio().
+ *
+ * @note This call only supports returning an actual mmio_ptr for hardware
+ * targets, not for ASE simulation. Use fpgaReadMMIO32(), fpgaWriteMMIO32(),
+ * fpgaReadMMIO64(), and fpgaWriteMMIO64() if you need ASE simulation
+ * capabilities. You will still need to call fpgaMapMMIO() before using these
+ * functions, though.
+ *
+ * If the caller passes in NULL for mmio_ptr, no virtual address will be
+ * returned. This implies that all accesses will be performed through
+ * fpgaReadMMIO32(), fpgaWriteMMIO32(), fpgaReadMMIO64(), fpgaWriteMMIO64(),
+ * fpgaReadMmio() and fpgaWriteMmio(). This is the only supported case for ASE.
+ *
+ * The number of available MMIO spaces can be retrieved through the num_mmio
+ * property (fpgaPropertyGetNumMMIO()).
+ *
+ * @param[in] handle Handle to previously opened resource
+ * @param[in] mmio_num Number of MMIO space to access
+ * @param[out] mmio_ptr Pointer to memory where a pointer to the MMIO space
+ * will be returned. May be NULL, in which case no pointer
+ * is returned.
+ * @returns FPGA_OK on success. FPGA_INVALID_PARAM if any of the supplied
+ * parameters is invalid. FPGA_EXCEPTION if an internal exception occurred
+ * while trying to access the handle. FPGA_NO_ACCESS if the process'
+ * permissions are not sufficient to map the requested MMIO space.
+ */
+__FPGA_API__ fpga_result fpgaMapMMIO(fpga_handle handle,
+ uint32_t mmio_num, uint64_t **mmio_ptr);
+
+/**
+ * Unmap MMIO space
+ *
+ * This function will unmap a previously mapped MMIO space of the target object,
+ * rendering any pointers to it invalid.
+ *
+ * @note This call is only supported by hardware targets, not by ASE
+ * simulation.
+ *
+ * @param[in] handle Handle to previously opened resource
+ * @param[in] mmio_num Number of MMIO space to access
+ * @returns FPGA_OK on success. FPGA_INVALID_PARAM if any of the supplied
+ * parameters is invalid. FPGA_EXCEPTION if an internal exception occurred
+ * while trying to access the handle.
+ */
+__FPGA_API__ fpga_result fpgaUnmapMMIO(fpga_handle handle,
+ uint32_t mmio_num);
+
+/**
+* Reads the value from MMIO space.
+*
+* This function will read from MMIO space of the target object at a specified
+* offset and length.
+*
+* In order to access a resource's MMIO space using this function, it has to be
+* mapped to the application's address space using fpgaMapMMIO().
+*
+* @param[in] handle Handle to previously opened accelerator resource
+* @param[in] mmio_num Number of MMIO space to access
+* @param[in] offset Byte offset into MMIO space
+* @param[out] buffer Pointer to memory where read value is returned
+* @param[in] length Length of the MMIO to read.
+* @param[in] accessType Read MMIO as 8/16/32/64-bit reads.
+* @returns FPGA_OK on success. FPGA_INVALID_PARAM if any of the supplied
+* parameters is invalid. FPGA_EXCEPTION if an internal exception occurred
+* while trying to access the handle. FPGA_NOT_FOUND if the MMIO space
+* `mmio_num` was not mapped using fpgaMapMMIO() before calling this function.
+*/
+__FPGA_API__ fpga_result fpgaReadMmioType(fpga_handle handle,
+ uint32_t mmio_num,
+ uint64_t offset,
+ void* buffer,
+ uint32_t length,
+ uint32_t accessType);
+
+/**
+* Write the value to MMIO space.
+*
+* This function will write to MMIO space of the target object at a specified
+* offset and length.
+*
+* In order to access a resource's MMIO space using this function, it has to be
+* mapped to the application's address space using fpgaMapMMIO().
+*
+* @param[in] handle Handle to previously opened accelerator resource
+* @param[in] mmio_num Number of MMIO space to access
+* @param[in] offset Byte offset into MMIO space
+* @param[in] buffer Pointer to memory from where data to be written.
+* @param[in] length Length of the MMIO to write.
+* @param[in] accessType Write MMIO as 8/16/32/64-bit writes.
+* @returns FPGA_OK on success. FPGA_INVALID_PARAM if any of the supplied
+* parameters is invalid. FPGA_EXCEPTION if an internal exception occurred
+* while trying to access the handle. FPGA_NOT_FOUND if the MMIO space
+* `mmio_num` was not mapped using fpgaMapMMIO() before calling this function.
+*/
+__FPGA_API__ fpga_result fpgaWriteMmioType(fpga_handle handle,
+ uint32_t mmio_num,
+ uint64_t offset,
+ void* buffer,
+ uint32_t length,
+ uint32_t accessType);
+
+
+/**
+* Reads the value from MMIO space.
+*
+* This function will read from MMIO space of the target object at a specified
+* offset and length.
+*
+* In order to access a resource's MMIO space using this function, it has to be
+* mapped to the application's address space using fpgaMapMMIO().
+*
+* @param[in] handle Handle to previously opened accelerator resource
+* @param[in] mmio_num Number of MMIO space to access
+* @param[in] offset Byte offset into MMIO space
+* @param[out] buffer Pointer to memory where read value is returned
+* @param[in] length Length of the MMIO to read.
+* @returns FPGA_OK on success. FPGA_INVALID_PARAM if any of the supplied
+* parameters is invalid. FPGA_EXCEPTION if an internal exception occurred
+* while trying to access the handle. FPGA_NOT_FOUND if the MMIO space
+* `mmio_num` was not mapped using fpgaMapMMIO() before calling this function.
+*/
+__FPGA_API__ fpga_result fpgaReadMmio(fpga_handle handle,
+ uint32_t mmio_num,
+ uint64_t offset,
+ void *buffer,
+ uint32_t length);
+
+/**
+* Write the value to MMIO space.
+*
+* This function will write to MMIO space of the target object at a specified
+* offset and length.
+*
+* In order to access a resource's MMIO space using this function, it has to be
+* mapped to the application's address space using fpgaMapMMIO().
+*
+* @param[in] handle Handle to previously opened accelerator resource
+* @param[in] mmio_num Number of MMIO space to access
+* @param[in] offset Byte offset into MMIO space
+* @param[in] buffer Pointer to memory from where data to be written.
+* @param[in] length Length of the MMIO to write.
+* @returns FPGA_OK on success. FPGA_INVALID_PARAM if any of the supplied
+* parameters is invalid. FPGA_EXCEPTION if an internal exception occurred
+* while trying to access the handle. FPGA_NOT_FOUND if the MMIO space
+* `mmio_num` was not mapped using fpgaMapMMIO() before calling this function.
+*/
+__FPGA_API__ fpga_result fpgaWriteMmio(fpga_handle handle,
+ uint32_t mmio_num,
+ uint64_t offset,
+ void *buffer,
+ uint32_t length);
+
+/**
+* Read the config space of the device.
+*
+* This function will read the configuration space of the FPGA device
+*
+* @note This call is only supported by PCIe hardware targets, not by ASE
+* simulation.
+*
+* @param[in] handle Handle to previously opened resource
+* @param[in] offset Offset within the config space of the device.
+* @param[in] buffer Pointer to the buffer where data read will be returned.
+* @param[in] length Number of bytes to read.
+* @returns FPGA_OK on success. FPGA_INVALID_PARAM if any of the supplied
+* parameters is invalid. FPGA_EXCEPTION if an internal exception occurred
+* while trying to access the handle.
+*/
+__FPGA_API__ fpga_result fpgaReadPciConfigSpace(fpga_handle handle,
+ uint32_t offset,
+ void* buffer,
+ uint32_t length);
+
+/**
+* Write to config space of the device.
+*
+* This function will write to configuration space of the FPGA device
+*
+* @note This call is only supported by PCIe hardware targets, not by ASE
+* simulation.
+*
+* @param[in] handle Handle to previously opened resource
+* @param[in] offset Offset within the config space of the device.
+* @param[in] buffer Pointer to the buffer containing the data to be written.
+* @param[in] length Number of bytes to write.
+* @returns FPGA_OK on success. FPGA_INVALID_PARAM if any of the supplied
+* parameters is invalid. FPGA_EXCEPTION if an internal exception occurred
+* while trying to access the handle.
+*/
+__FPGA_API__ fpga_result fpgaWritePciConfigSpace(fpga_handle handle,
+ uint32_t offset,
+ void* buffer,
+ uint32_t length);
+
+END_C_DECL
+
+#endif // __FPGA_MMIO_H__
diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/properties.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/properties.h
new file mode 100644
index 0000000..03e5e79
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/properties.h
@@ -0,0 +1,689 @@
+// Copyright(c) 2017 - 2019, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Intel Corporation nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+
+/**
+ * @file properties.h
+ * @brief Functions for examining and manipulating `fpga_properties` objects
+ *
+ * In OPAE, `fpga_properties` objects are used both for obtaining information
+ * about resources and for selectively enumerating resources based on their
+ * properties. This file provides accessor functions (get/set) to allow reading
+ * and writing individual items of an `fpga_properties` object. Generally, not
+ * all object types supported by OPAE carry all properties. If you call a
+ * property accessor method on a `fpga_properties` object that does not support
+ * this particular property, it will return FPGA_INVALID_PARAM.
+ *
+ * # Accessor Return Values
+ * In addition to the return values specified in the documentation below, all
+ * accessor functions return FPGA_OK on success, FPGA_INVALID_PARAM if you pass
+ * NULL or invalid parameters (i.e. non-initialized properties objects),
+ * FPGA_EXCEPTION if an internal exception occurred trying to access the
+ * properties object, FPGA_NOT_FOUND if the requested property is not part of
+ * the supplied properties object.
+ */
+
+#ifndef __FPGA_PROPERTIES_H__
+#define __FPGA_PROPERTIES_H__
+
+#include <opae/types.h>
+
+BEGIN_C_DECL
+
+/**
+ * Create a fpga_properties object
+ *
+ * Initializes the memory pointed at by `prop` to represent a properties
+ * object, and populates it with the properties of the resource referred to by
+ * `token`. Individual properties can then be queried using fpgaPropertiesGet*()
+ * accessor functions.
+ *
+ * If `token` is NULL, an "empty" properties object is created to be used as a
+ * filter for fpgaEnumerate(). All individual fields are set to "don't care",
+ * which implies that the fpga_properties object would match all FPGA resources
+ * if used for an fpgaEnumerate() query. The matching criteria can be further
+ * refined by using fpgaSet* functions on the properties object, or the
+ * object can be populated with the actual properties of a resource by using
+ * fpgaUpdateProperties().
+ *
+ * @note fpgaGetProperties() will allocate memory for the created properties
+ * object returned in `prop`. It is the responsibility of the using application
+ * to free this memory after use by calling fpgaDestroyProperties().
+ *
+ * @param[in] token Token to get properties for. Can be NULL, which will
+ * create an empty properties object to be used as a
+ * filter for fpgaEnumerate().
+ * @param[out] prop Pointer to a variable of type fpga_properties
+ * @returns FPGA_OK on success. FPGA_NO_MEMORY if no memory could be allocated
+ * to create the `fpga_properties` object. FPGA_EXCEPTION if an exception
+ * happened while initializing the `fpga_properties` object.
+ */
+__FPGA_API__ fpga_result fpgaGetProperties(fpga_token token, fpga_properties *prop);
+
+/**
+ * Update a fpga_properties object
+ *
+ * Populates the properties object 'prop' with properties of the resource
+ * referred to by 'token'. Unlike fpgaGetProperties(), this call will not create
+ * a new properties object or allocate memory for it, but use a previously
+ * created properties object.
+ *
+ * @param[in] token Token to retrieve properties for
+ * @param[in] prop fpga_properties object to update
+ * @returns FPGA_OK on success. FPGA_INVALID_PARAM if `token` or `prop` are not
+ * valid objects. FPGA_NOT_FOUND if the resource referred to by `token` was
+ * not found. FPGA_NO_DRIVER if no driver is loaded. FPGA_EXCEPTION if an
+ * internal exception occurred when trying to update `prop`.
+ */
+__FPGA_API__ fpga_result fpgaUpdateProperties(fpga_token token, fpga_properties prop);
+
+/**
+ * Clear a fpga_properties object
+ *
+ * Sets all fields of the properties object pointed at by 'prop' to 'don't
+ * care', which implies that the fpga_properties object would match all FPGA
+ * resources if used for an fpgaEnumerate() query. The matching criteria can be
+ * further refined by using fpgaSet* functions on the properties object.
+ *
+ * Instead of creating a new fpga_properties object every time, this function
+ * can be used to re-use fpga_properties objects from previous queries.
+ *
+ * @param[in] prop fpga_properties object to clear
+ * @returns FPGA_OK on success. FPGA_INVALID_PARAM if `prop` is not a valid
+ * object. FPGA_EXCEPTION if an internal exception occurred when trying to
+ * access `prop`.
+ */
+__FPGA_API__ fpga_result fpgaClearProperties(fpga_properties prop);
+
+/**
+ * Clone a fpga_properties object
+ *
+ * Creates a copy of an fpga_properties object.
+ *
+ * @note This call creates a new properties object and allocates memory for it.
+ * Both the 'src' and the newly created 'dst' objects will eventually need to be
+ * destroyed using fpgaDestroyProperties().
+ *
+ * @param[in] src fpga_properties object to copy
+ * @param[out] dst New fpga_properties object cloned from 'src'
+ * @returns FPGA_OK on success. FPGA_INVALID_PARAM if `src` is not a valid
+ * object, or if `dst` is NULL. FPGA_NO_MEMORY if there was not enough memory
+ * to allocate an `fpga_properties` object for `dst`. FPGA_EXCEPTION if an
+ * internal exception occurred either accessing `src` or updating `dst`.
+ */
+__FPGA_API__ fpga_result fpgaCloneProperties(fpga_properties src, fpga_properties *dst);
+
+/**
+ * Destroy a fpga_properties object
+ *
+ * Destroys an existing fpga_properties object that the caller has previously
+ * created using fpgaGetProperties() or fpgaCloneProperties().
+ *
+ * @param[inout] prop Pointer to the fpga_properties object to destroy
+ * @returns FPGA_OK on success. FPGA_INVALID_PARAM if `prop` is not a valid
+ * object. FPGA_EXCEPTION if an internal exception occurred while trying to
+ * access `prop`.
+ */
+__FPGA_API__ fpga_result fpgaDestroyProperties(fpga_properties *prop);
+
+/**
+ * Get the token of the parent object
+ *
+ * Returns the token of the parent of the queried resource in '*parent'.
+ *
+ * @param[in] prop Properties object to query
+ * @param[out] parent Pointer to a token variable of the resource 'prop' is
+ * associated with
+ * @returns FPGA_NOT_FOUND if resource does not have a
+ * parent (e.g. an FPGA_DEVICE resource does not have parents). Also see
+ * "Accessor Return Values" in [properties.h](#properties-h).
+ */
+__FPGA_API__ fpga_result fpgaPropertiesGetParent(const fpga_properties prop,
+ fpga_token *parent);
+
+/**
+ * Set the token of the parent object
+ *
+ * @param[in] prop Properties object to modify
+ * @param[in] parent Parent token of the resource 'prop' is
+ *                   associated with
+ * @returns See "Accessor Return Values" in [properties.h](#properties-h).
+ */
+__FPGA_API__ fpga_result fpgaPropertiesSetParent(fpga_properties prop,
+ fpga_token parent);
+
+/**
+ * Get the object type of a resource
+ *
+ * Returns the object type of the queried resource.
+ *
+ * @param[in] prop Properties object to query
+ * @param[out] objtype Pointer to an object type variable of the resource
+ * 'prop' is associated with
+ * @returns See "Accessor Return Values" in [properties.h](#properties-h).
+ */
+__FPGA_API__ fpga_result fpgaPropertiesGetObjectType(const fpga_properties prop,
+ fpga_objtype *objtype);
+
+/**
+ * Set the object type of a resource
+ *
+ * Sets the object type of the resource. Currently supported object types are
+ * FPGA_DEVICE and FPGA_ACCELERATOR.
+ *
+ * @param[in] prop Properties object to modify
+ * @param[in] objtype Object type to set (FPGA_DEVICE or FPGA_ACCELERATOR)
+ * @returns See "Accessor Return Values" in [properties.h](#properties-h).
+ */
+__FPGA_API__ fpga_result fpgaPropertiesSetObjectType(fpga_properties prop,
+ fpga_objtype objtype);
+
+/**
+ * Get the PCI bus number of a resource
+ *
+ * Returns the bus number of the queried resource.
+ *
+ * @param[in] prop Properties object to query
+ * @param[out] bus Pointer to a PCI bus variable of the resource 'prop'
+ * is associated with
+ * @returns See "Accessor Return Values" in [properties.h](#properties-h).
+ */
+__FPGA_API__ fpga_result fpgaPropertiesGetBus(const fpga_properties prop, uint8_t *bus);
+
+/**
+ * Set the PCI bus number of a resource
+ *
+ * @param[in] prop Properties object to modify
+ * @param[in] bus PCI bus number of the resource 'prop' is associated with
+ * @returns See "Accessor Return Values" in [properties.h](#properties-h).
+ */
+__FPGA_API__ fpga_result fpgaPropertiesSetBus(fpga_properties prop, uint8_t bus);
+
+/**
+ * Get the PCI device number of a resource
+ *
+ * Returns the device number of the queried resource.
+ *
+ * @param[in] prop Properties object to query
+ * @param[out] device Pointer to a PCI device variable of the resource 'prop'
+ * is associated with
+ * @returns See "Accessor Return Values" in [properties.h](#properties-h).
+ */
+__FPGA_API__ fpga_result fpgaPropertiesGetDevice(const fpga_properties prop,
+ uint8_t *device);
+
+/**
+ * Set the PCI device number of a resource
+ *
+ * Enforces the limitation on the number of devices as specified in the
+ * PCI spec.
+ *
+ * @param[in] prop Properties object to modify
+ * @param[in] device PCI device number of the resource 'prop' is associated
+ * with
+ * @returns See "Accessor Return Values" in [properties.h](#properties-h).
+ */
+__FPGA_API__ fpga_result fpgaPropertiesSetDevice(fpga_properties prop,
+ uint8_t device);
+
+/**
+ * Get the PCI function number of a resource
+ *
+ * Returns the function number of the queried resource.
+ *
+ * @param[in] prop Properties object to query
+ * @param[out] function Pointer to PCI function variable of the
+ * resource 'prop' is associated with
+ * @returns See "Accessor Return Values" in [properties.h](#properties-h).
+ */
+__FPGA_API__ fpga_result fpgaPropertiesGetFunction(const fpga_properties prop,
+ uint8_t *function);
+
+/**
+ * Set the PCI function number of a resource
+ *
+ * Enforces the limitation on the number of functions as specified in the
+ * PCI spec.
+ *
+ * @param[in] prop Properties object to modify
+ * @param[in] function PCI function number of the resource 'prop' is
+ * associated with
+ * @returns See "Accessor Return Values" in [properties.h](#properties-h).
+ */
+__FPGA_API__ fpga_result fpgaPropertiesSetFunction(fpga_properties prop,
+ uint8_t function);
+
+/**
+ * Get the socket id of a resource
+ *
+ * Returns the socket id of the queried resource.
+ *
+ * @param[in] prop Properties object to query
+ * @param[out] socket_id Pointer to a socket id variable of the
+ *                       resource 'prop' is associated with
+ * @returns See "Accessor Return Values" in [properties.h](#properties-h).
+ */
+__FPGA_API__ fpga_result fpgaPropertiesGetSocketID(const fpga_properties prop,
+ uint8_t *socket_id);
+
+/**
+ * Set the socket id of the resource
+ *
+ * @param[in] prop Properties object to modify
+ * @param[in] socket_id Socket id of the resource 'prop' is
+ * associated with
+ * @returns See "Accessor Return Values" in [properties.h](#properties-h).
+ */
+__FPGA_API__ fpga_result fpgaPropertiesSetSocketID(fpga_properties prop,
+ uint8_t socket_id);
+
+/**
+ * Get the device id of the resource
+ *
+ * @param[in] prop Properties object to query
+ * @param[out] device_id Pointer to a device id variable of the
+ * resource 'prop' is associated with
+ * @returns See "Accessor Return Values" in [properties.h](#properties-h).
+ */
+__FPGA_API__ fpga_result fpgaPropertiesGetDeviceID(const fpga_properties prop,
+ uint32_t *device_id);
+
+/**
+ * Set the device id of the resource
+ *
+ * @param[in] prop Properties object to modify
+ * @param[in] device_id Device id of the resource 'prop' is associated with
+ * @returns See "Accessor Return Values" in [properties.h](#properties-h).
+ */
+__FPGA_API__ fpga_result fpgaPropertiesSetDeviceID(fpga_properties prop,
+ uint32_t device_id);
+
+/**
+ * Get the number of slots of an FPGA resource property
+ *
+ * Returns the number of slots present in an FPGA.
+ *
+ * @param[in] prop Properties object to query - must be of type FPGA_DEVICE
+ * @param[out] num_slots Pointer to number of slots variable of the FPGA
+ * @returns FPGA_INVALID_PARAM if object type is not FPGA_DEVICE. See also
+ * "Accessor Return Values" in [properties.h](#properties-h).
+ */
+__FPGA_API__ fpga_result fpgaPropertiesGetNumSlots(const fpga_properties prop,
+ uint32_t *num_slots);
+
+/**
+ * Set the number of slots of an FPGA resource property
+ *
+ * @param[in] prop Properties object to modify - must be of type
+ * FPGA_DEVICE
+ * @param[in] num_slots Number of slots of the FPGA
+ * @returns FPGA_INVALID_PARAM if object type is not FPGA_DEVICE. See also
+ * "Accessor Return Values" in [properties.h](#properties-h).
+ */
+__FPGA_API__ fpga_result fpgaPropertiesSetNumSlots(fpga_properties prop,
+ uint32_t num_slots);
+
+/**
+ * Get the BBS ID of an FPGA resource property
+ *
+ * Returns the blue bitstream id of an FPGA.
+ *
+ * @param[in] prop Properties object to query - must be of type FPGA_DEVICE
+ * @param[out] bbs_id Pointer to a bbs id variable of the FPGA
+ * @returns FPGA_INVALID_PARAM if object type is not FPGA_DEVICE. See also
+ * "Accessor Return Values" in [properties.h](#properties-h).
+ */
+__FPGA_API__ fpga_result fpgaPropertiesGetBBSID(const fpga_properties prop,
+ uint64_t *bbs_id);
+
+/**
+ * Set the BBS ID of an FPGA resource property
+ *
+ * @param[in] prop Properties object to modify - must be of type
+ * FPGA_DEVICE
+ * @param[in] bbs_id Blue bitstream id of the FPGA resource
+ * @returns FPGA_INVALID_PARAM if object type is not FPGA_DEVICE. See also
+ * "Accessor Return Values" in [properties.h](#properties-h).
+ */
+__FPGA_API__ fpga_result fpgaPropertiesSetBBSID(fpga_properties prop,
+ uint64_t bbs_id);
+
+/**
+ * Get the BBS Version of an FPGA resource property
+ *
+ * Returns the blue bitstream version of an FPGA.
+ *
+ * @param[in] prop Properties object to query - must be of type
+ * FPGA_DEVICE
+ * @param[out] bbs_version Pointer to a bbs version variable of the FPGA
+ * @returns FPGA_INVALID_PARAM if object type is not FPGA_DEVICE. See also
+ * "Accessor Return Values" in [properties.h](#properties-h).
+ */
+__FPGA_API__ fpga_result fpgaPropertiesGetBBSVersion(const fpga_properties prop,
+ fpga_version *bbs_version);
+
+/**
+ * Set the BBS Version of an FPGA resource property
+ *
+ * @param[in] prop Properties object to modify - must be of type
+ * FPGA_DEVICE
+ * @param[in] version Blue bitstream version of the FPGA resource
+ * @returns FPGA_INVALID_PARAM if object type is not FPGA_DEVICE. See also
+ * "Accessor Return Values" in [properties.h](#properties-h).
+ */
+__FPGA_API__ fpga_result fpgaPropertiesSetBBSVersion(fpga_properties prop,
+ fpga_version version);
+
+/**
+ * Get the vendor id of an FPGA resource property
+ *
+ * Returns the vendor id of an FPGA.
+ *
+ * @param[in] prop Properties object to query - must be of type FPGA_DEVICE
+ * @param[out] vendor_id Pointer to a vendor id variable of the FPGA
+ * @returns FPGA_INVALID_PARAM if object type is not FPGA_DEVICE. See also
+ * "Accessor Return Values" in [properties.h](#properties-h).
+ *
+ * @note This API is not currently supported.
+ */
+__FPGA_API__ fpga_result fpgaPropertiesGetVendorID(const fpga_properties prop,
+ uint16_t *vendor_id);
+
+/**
+ * Set the vendor id of an FPGA resource property
+ *
+ * @param[in] prop Properties object to modify - must be of type FPGA_DEVICE
+ * @param[in] vendor_id Vendor id of the FPGA resource
+ * @returns FPGA_INVALID_PARAM if object type is not FPGA_DEVICE. See also
+ * "Accessor Return Values" in [properties.h](#properties-h).
+ *
+ * @note This API is not currently supported.
+ */
+__FPGA_API__ fpga_result fpgaPropertiesSetVendorID(fpga_properties prop,
+ uint16_t vendor_id);
+
+/**
+ * Get the model of an FPGA resource property
+ *
+ * Returns the model of an FPGA.
+ *
+ * @param[in] prop Properties object to query - must be of type FPGA_DEVICE
+ * @param[out] model       Model of the FPGA resource (string of minimum
+ *                         FPGA_MODEL_LENGTH length)
+ * @returns FPGA_INVALID_PARAM if object type is not FPGA_DEVICE. See also
+ * "Accessor Return Values" in [properties.h](#properties-h).
+ *
+ * @note This API is not currently supported.
+ */
+__FPGA_API__ fpga_result fpgaPropertiesGetModel(const fpga_properties prop,
+ char *model);
+
+/**
+ * Set the model of an FPGA resource property
+ *
+ * @param[in] prop Properties object to modify - must be of type FPGA_DEVICE
+ * @param[in]  model        Model of the FPGA resource (string of maximum
+ *                          FPGA_MODEL_LENGTH length)
+ * @returns FPGA_INVALID_PARAM if object type is not FPGA_DEVICE. See also
+ * "Accessor Return Values" in [properties.h](#properties-h).
+ *
+ * @note This API is not currently supported.
+ */
+__FPGA_API__ fpga_result fpgaPropertiesSetModel(fpga_properties prop,
+ char *model);
+
+/**
+ * Get the local memory size of an FPGA resource property
+ *
+ * Returns the local memory size of an FPGA.
+ *
+ * @param[in] prop Properties object to query - must be of type FPGA_DEVICE
+ * @param[out] lms Pointer to a memory size variable of the FPGA
+ * @returns FPGA_INVALID_PARAM if object type is not FPGA_DEVICE. See also
+ * "Accessor Return Values" in [properties.h](#properties-h).
+ *
+ * @note This API is not currently supported.
+ */
+__FPGA_API__ fpga_result fpgaPropertiesGetLocalMemorySize(const fpga_properties prop,
+ uint64_t *lms);
+
+/**
+ * Set the local memory size of an FPGA resource property
+ *
+ * @param[in] prop Properties object to modify - must be of type FPGA_DEVICE
+ * @param[in] lms Local memory size of the FPGA resource
+ * @returns FPGA_INVALID_PARAM if object type is not FPGA_DEVICE. See also
+ * "Accessor Return Values" in [properties.h](#properties-h).
+ *
+ * @note This API is not currently supported.
+ */
+__FPGA_API__ fpga_result fpgaPropertiesSetLocalMemorySize(fpga_properties prop,
+ uint64_t lms);
+
+/**
+ * Get the capabilities FPGA resource property
+ *
+ * Returns the capabilities of an FPGA.
+ * Capabilities is a bitfield value
+ *
+ * @param[in] prop Properties object to query - must be of type
+ * FPGA_DEVICE
+ * @param[out] capabilities Pointer to a capabilities variable of the FPGA
+ * @returns FPGA_INVALID_PARAM if object type is not FPGA_DEVICE. See also
+ * "Accessor Return Values" in [properties.h](#properties-h).
+ *
+ * @note This API is not currently supported.
+ */
+__FPGA_API__ fpga_result fpgaPropertiesGetCapabilities(const fpga_properties prop,
+ uint64_t *capabilities);
+
+/**
+ * Set the capabilities of an FPGA resource property
+ *
+ * Capabilities is a bitfield value
+ *
+ * @param[in] prop Properties object to modify - must be of type
+ * FPGA_DEVICE
+ * @param[in] capabilities Capabilities of the FPGA resource
+ * @returns FPGA_INVALID_PARAM if object type is not FPGA_DEVICE. See also
+ * "Accessor Return Values" in [properties.h](#properties-h).
+ *
+ * @note This API is not currently supported.
+ */
+__FPGA_API__ fpga_result fpgaPropertiesSetCapabilities(fpga_properties prop,
+ uint64_t capabilities);
+
+/**
+ * Get the GUID of a resource
+ *
+ * Returns the GUID of an FPGA or accelerator object.
+ *
+ * For an accelerator, the GUID uniquely identifies a specific accelerator context type,
+ * i.e. different accelerators will have different GUIDs. For an FPGA, the GUID
+ * is used to identify a certain instance of an FPGA, e.g. to determine whether
+ * a given bitstream would be compatible.
+ *
+ * @param[in] prop Properties object to query
+ * @param[out] guid Pointer to a GUID of the slot variable
+ * @returns See "Accessor Return Values" in [properties.h](#properties-h).
+ */
+__FPGA_API__ fpga_result fpgaPropertiesGetGUID(const fpga_properties prop,
+ fpga_guid *guid);
+
+/**
+ * Set the GUID of a resource
+ *
+ * Sets the GUID of an FPGA or accelerator object.
+ *
+ * For an accelerator, the GUID uniquely identifies a specific accelerator context type,
+ * i.e. different accelerators will have different GUIDs. For an FPGA, the GUID
+ * is used to identify a certain instance of an FPGA, e.g. to determine whether
+ * a given bitstream would be compatible.
+ *
+ * @param[in] prop Properties object to modify
+ * @param[in]  guid            GUID to set for the resource
+ * @returns See "Accessor Return Values" in [properties.h](#properties-h).
+ */
+__FPGA_API__ fpga_result fpgaPropertiesSetGUID(fpga_properties prop, fpga_guid guid);
+
+/**
+ * Get the number of mmio spaces
+ *
+ * Returns the number of mmio spaces of an AFU properties structure.
+ *
+ * @param[in] prop Properties object to query - must be of type FPGA_ACCELERATOR
+ * @param[out] mmio_spaces Pointer to a variable for number of mmio spaces
+ * @returns FPGA_INVALID_PARAM if object type is not FPGA_ACCELERATOR. See also
+ * "Accessor Return Values" in [properties.h](#properties-h).
+ */
+__FPGA_API__ fpga_result fpgaPropertiesGetNumMMIO(const fpga_properties prop,
+ uint32_t *mmio_spaces);
+
+/**
+ * Set the number of mmio spaces
+ *
+ * Sets the number of mmio spaces of an AFU properties structure.
+ *
+ * @param[in] prop Properties object to modify - must be of type FPGA_ACCELERATOR
+ * @param[in] mmio_spaces Number of MMIO spaces of the accelerator
+ * @returns FPGA_INVALID_PARAM if object type is not FPGA_ACCELERATOR. See also
+ * "Accessor Return Values" in [properties.h](#properties-h).
+ */
+__FPGA_API__ fpga_result fpgaPropertiesSetNumMMIO(fpga_properties prop,
+ uint32_t mmio_spaces);
+
+/**
+ * Get the number of interrupts
+ *
+ * Returns the number of interrupts of an accelerator properties structure.
+ *
+ * @param[in] prop Properties object to query - must be of type FPGA_ACCELERATOR
+ * @param[out] num_interrupts Pointer to a variable for number of interrupts
+ * @returns FPGA_INVALID_PARAM if object type is not FPGA_ACCELERATOR. See also
+ * "Accessor Return Values" in [properties.h](#properties-h).
+ */
+__FPGA_API__ fpga_result fpgaPropertiesGetNumInterrupts(const fpga_properties prop,
+ uint32_t *num_interrupts);
+
+/**
+ * Set the number of interrupts
+ *
+ * Sets the number of interrupts of an accelerator properties structure.
+ *
+ * @param[in] prop Properties object to modify - must be of type FPGA_ACCELERATOR
+ * @param[in]  num_interrupts Number of interrupts of the accelerator
+ * @returns FPGA_INVALID_PARAM if object type is not FPGA_ACCELERATOR. See also
+ * "Accessor Return Values" in [properties.h](#properties-h).
+ */
+__FPGA_API__ fpga_result fpgaPropertiesSetNumInterrupts(fpga_properties prop,
+ uint32_t num_interrupts);
+
+/**
+ * Get the state of an accelerator resource property
+ *
+ * Returns the accelerator state of an accelerator.
+ *
+ * @param[in]  prop   Properties object to query - must be of type FPGA_ACCELERATOR
+ * @param[out] state  Pointer to an accelerator state variable of the accelerator
+ * @returns FPGA_INVALID_PARAM if object type is not FPGA_ACCELERATOR. See also
+ * "Accessor Return Values" in [properties.h](#properties-h).
+ */
+__FPGA_API__ fpga_result fpgaPropertiesGetAcceleratorState(const fpga_properties prop,
+ fpga_accelerator_state *state);
+
+
+/**
+ * Set the state of an accelerator resource property
+ *
+ * @param[in] prop Properties object to modify - must be of type FPGA_ACCELERATOR
+ * @param[in]  state  Accelerator state of the accelerator resource
+ * @returns FPGA_INVALID_PARAM if object type is not FPGA_ACCELERATOR. See also
+ * "Accessor Return Values" in [properties.h](#properties-h).
+ */
+__FPGA_API__ fpga_result fpgaPropertiesSetAcceleratorState(fpga_properties prop,
+ fpga_accelerator_state state);
+
+/**
+* Get the object ID of a resource
+*
+* Returns the object ID of a resource. The object ID is a 64 bit identifier
+* that is unique within a single node or system. It represents a similar
+* concept as the token, but can be used across processes (e.g. passed on the
+* command line).
+*
+* @param[in] prop Properties object to query
+* @param[out] object_id Pointer to a 64bit memory location to store the object
+* ID in
+* @returns See "Accessor Return Values" in [properties.h](#properties-h).
+*/
+__FPGA_API__ fpga_result fpgaPropertiesGetObjectID(fpga_properties prop,
+ uint64_t *object_id);
+
+
+/**
+* Set the object ID of a resource
+*
+* Sets the object ID of a resource. The object ID is a 64 bit identifier
+* that is unique within a single node or system. It represents a similar
+* concept as the token, but can be used across processes (e.g. passed on the
+* command line).
+*
+* @param[in] prop Properties object to query
+* @param[in] object_id A 64bit value to use as the object ID
+* @returns See "Accessor Return Values" in [properties.h](#properties-h).
+*/
+__FPGA_API__ fpga_result fpgaPropertiesSetObjectID(fpga_properties prop,
+ uint64_t object_id);
+
+/**
+* Create a fpga_properties object
+*
+* Initializes the memory pointed at by `prop` to represent a properties
+* object, and populates it with the properties of the resource referred to by
+* `handle`. Individual properties can then be queried using fpgaPropertiesGet*()
+* accessor functions.
+*
+* @note fpgaGetPropertiesFromHandle() will allocate memory for the created properties
+* object returned in `prop`. It is the responsibility of the caller
+* to free this memory after use by calling fpgaDestroyProperties().
+*
+* @param[in] handle Open handle to get properties for.
+* @param[out] prop Pointer to a variable of type fpga_properties
+* @returns FPGA_OK on success. FPGA_NO_MEMORY if no memory could be allocated
+* to create the `fpga_properties` object. FPGA_EXCEPTION if an exception
+* happened while initializing the `fpga_properties` object.
+**/
+__FPGA_API__
+fpga_result
+fpgaGetPropertiesFromHandle(
+ fpga_handle handle,
+ fpga_properties *prop
+ );
+
+END_C_DECL
+
+#endif // __FPGA_PROPERTIES_H__
+
diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/types.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/types.h
new file mode 100644
index 0000000..481e6ae
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/types.h
@@ -0,0 +1,173 @@
+// Copyright(c) 2017 - 2019, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Intel Corporation nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+
+/**
+ * @file types.h
+ * @brief Type definitions for FPGA API
+ *
+ * OPAE uses the three opaque types fpga_properties, fpga_token, and
+ * fpga_handle to create a hierarchy of objects that can be used to enumerate,
+ * reference, acquire, and query FPGA resources. This object model is designed
+ * to be extensible to account for different FPGA architectures and platforms.
+ *
+ * Initialization
+ * --------------
+ * OPAEs management of the opaque types `fpga_properties`,
+ * `fpga_token`, and `fpga_handle` relies on the proper initialization of
+ * variables of these types. In other words, before doing anything with a
+ * variable of one of these opaque types, you need to first initialize them.
+ *
+ * The respective functions that initialize opaque types are:
+ *
+ * * fpgaGetProperties() and fpgaCloneProperties() for `fpga_properties`
+ * * fpgaEnumerate() and fpgaCloneToken() for `fpga_token`
+ * * fpgaOpen() for `fpga_handle`
+ *
+ * This should intuitively make sense - fpgaGetProperties() creates
+ * `fpga_properties` objects, fpgaEnumerate() creates `fpga_token` objects,
+ * fpgaOpen() creates `fpga_handle` objects, and fpgaCloneProperties() and
+ * fpgaCloneToken() clone (create) `fpga_properties` and `fpga_token` objects,
+ * respectively.
+ *
+ * Since these opaque types are interpreted as pointers (they are typedef'd to
+ * a `void *`), passing an uninitialized opaque type into any function except
+ * the respective initialization function will result in undefined behaviour,
+ * because OPAE will try to follow an invalid pointer. Undefined behaviour in
+ * this case may include an unexpected error code, or an application crash.
+ *
+ */
+
+#ifndef __FPGA_TYPES_H__
+#define __FPGA_TYPES_H__
+
+#include <stdint.h>
+#include <stddef.h>
+#include <opae/types_enum.h>
+
+/**
+ * Object for expressing FPGA resource properties
+ *
+ * `fpga_properties` objects encapsulate all enumerable information about an
+ * FPGA resources. They can be used for two purposes: selective enumeration
+ * (discovery) and querying information about existing resources.
+ *
+ * For selective enumeration, usually an empty `fpga_properties` object is
+ * created (using fpgaGetProperties()) and then populated with the desired
+ * criteria for enumeration. An array of `fpga_properties` can then be passed
+ * to fpgaEnumerate(), which will return a list of `fpga_token` objects
+ * matching these criteria.
+ *
+ * For querying properties of existing FPGA resources, fpgaGetProperties() can
+ * also take an `fpga_token` and will return an `fpga_properties` object
+ * populated with information about the resource referenced by that token.
+ *
+ * After use, `fpga_properties` objects should be destroyed using
+ * fpgaDestroyProperties() to free backing memory used by the
+ * `fpga_properties` object.
+ */
+typedef void *fpga_properties;
+
+/**
+ * Token for referencing FPGA resources
+ *
+ * An `fpga_token` serves as a reference to a specific FPGA resource present in
+ * the system. Holding an `fpga_token` does not constitute ownership of the
+ * FPGA resource - it merely allows the user to query further information about
+ * a resource, or to use fpgaOpen() to acquire ownership.
+ *
+ * `fpga_token`s are usually returned by fpgaEnumerate() or
+ * fpgaPropertiesGetParent(), and used by fpgaOpen() to acquire ownership and
+ * yield a handle to the resource. Some API calls also take `fpga_token`s as
+ * arguments if they don't require ownership of the resource in question.
+ */
+typedef void *fpga_token;
+
+/**
+ * Handle to an FPGA resource
+ *
+ * A valid `fpga_handle` object, as populated by fpgaOpen(), denotes ownership
+ * of an FPGA resource. Note that ownership can be exclusive or shared,
+ * depending on the flags used in fpgaOpen(). Ownership can be released by
+ * calling fpgaClose(), which will render the underlying handle invalid.
+ *
+ * Many OPAE C API functions require a valid token (which is synonymous with
+ * ownership of the resource).
+ */
+typedef void *fpga_handle;
+
+/**
+ * Globally unique identifier (GUID)
+ *
+ * GUIDs are used widely within OPAE for helping identify FPGA resources. For
+ * example, every FPGA resource has a `guid` property, which can be (and in the
+ * case of FPGA_ACCELERATOR resource primarily is) used for enumerating a resource of a
+ * specific type.
+ *
+ * `fpga_guid` is compatible with libuuid's uuid_t, so users can use libuuid
+ * functions like uuid_parse() to create and work with GUIDs.
+ */
+typedef uint8_t fpga_guid[16];
+
+/**
+ * Semantic version
+ *
+ * Data structure for expressing version identifiers following the semantic
+ * versioning scheme. Used in various properties for tracking component
+ * versions.
+ */
+typedef struct {
+ uint8_t major; /**< Major version */
+ uint8_t minor; /**< Minor version */
+ uint16_t patch; /**< Revision or patchlevel */
+} fpga_version;
+
+/*
+ * Scatter Gather list in userspace that will be populated during fpgaGetPhysicalAddress call
+ */
+typedef struct _sg_element {
+ uint64_t phys_addr; /**< Starting physical address of this scatter/gather region */
+ uint32_t length; /**< length, in bytes, of a physically contiguous SG region */
+} sg_element, *psg_element;
+
+/** Handle to an event object
+ *
+ * OPAE provides an interface to asynchronous events that can be generated by
+ * different FPGA resources. The event API provides functions to register for
+ * these events; associated with every event a process has registered for is an
+ * `fpga_event_handle`, which encapsulates the OS-specific data structure for
+ * event objects.
+ *
+ * On Linux, an `fpga_event_handle` can be used as a file descriptor and passed
+ * to select(), poll(), epoll() and similar functions to wait for asynchronous
+ * events.
+ */
+#ifndef _WIN32
+typedef int fpga_event_handle;
+#else
+typedef HANDLE fpga_event_handle;
+#endif
+
+#endif // __FPGA_TYPES_H__
diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/types_enum.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/types_enum.h
new file mode 100644
index 0000000..6fc4de2
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/types_enum.h
@@ -0,0 +1,196 @@
+// Copyright(c) 2017 - 2019, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Intel Corporation nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+
+/**
+ * @file types_enum.h
+ * @brief Definitions of enumerated types for the OPAE C API
+ *
+ * This file defines return and error codes, event and object types, states,
+ * and flags as used or reported by OPAE C API functions.
+ */
+
+#ifndef __FPGA_TYPES_ENUM_H__
+#define __FPGA_TYPES_ENUM_H__
+
+#ifdef _WIN32
+#ifdef FpgaLib_EXPORTS
+#define __FPGA_API__ __declspec(dllexport)
+#else
+#define __FPGA_API__ __declspec(dllimport)
+#endif
+#else
+#define __FPGA_API__ __attribute__((visibility("default")))
+#endif
+
+#ifdef __cplusplus
+#define BEGIN_C_DECL extern "C" {
+#define END_C_DECL }
+#else
+#define BEGIN_C_DECL
+#define END_C_DECL
+#endif
+
+/**
+ * OPAE C API function return codes
+ *
+ * Every public API function exported by the OPAE C library will return one of
+ * these codes. Usually, FPGA_OK denotes successful completion of the requested
+ * operation, while any return code *other* than FPGA_OK indicates an error or
+ * other deviation from the expected behavior. Users of the OPAE C API should
+ * always check the return codes of the APIs they call, and not use output
+ * parameters of functions that did not execute successfully.
+
+ * The fpgaErrStr() function converts error codes into printable messages.
+ *
+ * OPAE also has a logging mechanism that allows a developer to get more
+ * information about why a particular call failed with a specific message. If
+ * enabled, any function that returns an error code different from FPGA_OK will
+ * also print out a message with further details. This mechanism can be enabled
+ * by setting the environment variable `LIBOPAE_LOG` to 1 before running the
+ * respective application.
+ */
+
+//
+// Minimum alignment requirement for DMA BBB
+//
+#define FPGA_DMA_ALIGN_BYTES 64
+
+//
+// Maximum size (in bytes) each descriptor of the SGDMA
+// block can transfer. For pre-alpha maximum transfer size is
+// One Meg minus some bytes.
+
+#define FPGA_DMA_BUF_SIZE (1020*1024)
+
+//
+// Number of DMA blocks supported by SGDMA.
+// Currently only one is supported by pre-alpha
+// bitstream
+//
+#define NDMA 1
+
+typedef enum {
+ FPGA_OK = 0, /**< Operation completed successfully */
+ FPGA_INVALID_PARAM, /**< Invalid parameter supplied */
+ FPGA_BUSY, /**< Resource is busy */
+ FPGA_EXCEPTION, /**< An exception occurred */
+ FPGA_NOT_FOUND, /**< A required resource was not found */
+ FPGA_NO_MEMORY, /**< Not enough memory to complete operation */
+ FPGA_NOT_SUPPORTED, /**< Requested operation is not supported */
+ FPGA_NO_DRIVER, /**< Driver is not loaded */
+ FPGA_NO_DAEMON, /**< FPGA Daemon (fpgad) is not running */
+ FPGA_NO_ACCESS, /**< Insufficient privileges or permissions */
+ FPGA_RECONF_ERROR /**< Error while reconfiguring FPGA */
+} fpga_result;
+
+ /*
+ * FPGA events
+ *
+ * OPAE currently defines the following event types that applications can
+ * register for. Note that not all FPGA resources and target platforms may
+ * support all event types.
+ */
+typedef enum
+{
+ FPGA_NO_EVENT = 0,
+ FPGA_EVENT_INTERRUPT, /**< Interrupt generated by an accelerator */
+ FPGA_EVENT_ERROR, /**< Infrastructure error event */
+ FPGA_EVENT_POWER_THERMAL, /**< Infrastructure thermal event */
+ FPGA_EVENT_PORT_ERROR,
+ FPGA_EVENT_FME_ERROR,
+ FPGA_LIFECYCLE_APPEAR_EVENT,
+ FPGA_LIFECYCLE_DISAPPEAR_EVENT,
+ FPGA_EVENT_AFC_INTERRUPT,
+ FPGA_EVENT_TYPE_MAX,
+ FPGA_EVENT_AP_EVENT,
+ FPGA_MAX_EVENT
+} fpga_event_type;
+
+/* TODO: consider adding lifecycle events in the future
+ * to help with orchestration. Need a complete specification
+ * before including them in the API. Proposed events:
+ * FPGA_EVENT_APPEAR
+ * FPGA_EVENT_DISAPPEAR
+ * FPGA_EVENT_CHANGE
+ */
+
+/** accelerator state */
+typedef enum {
+ FPGA_ACCELERATOR_ASSIGNED = 0, /**< accelerator is opened exclusively by another process */
+ FPGA_ACCELERATOR_UNASSIGNED, /**< accelerator is free to be opened */
+ FPGA_ACCELERATOR_STATE_MAX
+} fpga_accelerator_state;
+
+/**
+ * OPAE FPGA resources (objects)
+ *
+ * These are the FPGA resources currently supported by the OPAE object model.
+ */
+typedef enum {
+ /** FPGA_DEVICE objects represent FPGA devices and their management functionality.
+ * These objects can be opened (typically requires a certain privilege level or
+ * access permissions) and used for management functions like fpgaReconfigreSlot(). */
+ FPGA_DEVICE = 0,
+ /** FPGA_ACCELERATOR objects represent allocatable units for accessing
+ * accelerated functions on the FPGA. They are frequently opened for
+ * interacting via control registers (MMIO), shared memory, or other,
+ * possibly platform-specific functions. */
+ FPGA_ACCELERATOR,
+ FPGA_OBJTYPE_MAX
+} fpga_objtype;
+
+/**
+ * Buffer flags
+ *
+ * These flags can be passed to the fpgaPrepareBuffer() function.
+ */
+enum fpga_buffer_flags {
+ FPGA_BUF_PREALLOCATED = (1u << 0), /**< Use existing buffer */
+ FPGA_BUF_QUIET = (1u << 1), /**< Suppress error messages */
+ FPGA_BUF_NOCACHE = (1u << 2),
+ FPGA_BUF_LARGE_PAGE = (1u << 4) /*< For 2MB page support in VTP */
+};
+
+/**
+ * Open flags
+ *
+ * These flags can be passed to the fpgaOpen() function.
+ */
+enum fpga_open_flags {
+ FPGA_OPEN_SHARED = (1u << 0) /**< Open FPGA resource for shared access */
+};
+
+/**
+ * Reconfiguration flags
+ *
+ * These flags can be passed to the fpgaReconfigure() function.
+ */
+enum fpga_reconf_flags {
+ /** Reconfigure the slot without checking if it is in use */
+ FPGA_RECONF_FORCE = (1u << 0)
+};
+
+#endif // __FPGA_TYPES_ENUM_H__
diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/umsg.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/umsg.h
new file mode 100644
index 0000000..6e073ee
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/umsg.h
@@ -0,0 +1,112 @@
+// Copyright(c) 2017 - 2019, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Intel Corporation nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+
+/**
+ * \file umsg.h
+ * \brief FPGA UMsg API
+ */
+
+#ifndef __FPGA_UMSG_H__
+#define __FPGA_UMSG_H__
+
+#include <opae/types.h>
+
+BEGIN_C_DECL
+
+/**
+ * Get number of Umsgs
+ *
+ * Returns the number of UMsgs supported by the AFU.
+ *
+ *
+ * @param[in] handle Handle to previously opened accelerator resource
+ * @param[out] value  Returns number of UMsgs
+ * @returns FPGA_OK on success.
+ * FPGA_INVALID_PARAM if input parameter combination
+ * is not valid.
+ * FPGA_EXCEPTION if input parameter fpga handle is not
+ * valid.
+ */
+__FPGA_API__ fpga_result fpgaGetNumUmsg(fpga_handle handle, uint64_t *value);
+
+/**
+ * Sets Umsg hint
+ *
+ * Writes UMsg hint bit.
+ *
+ *
+ * @param[in] handle Handle to previously opened accelerator resource
+ * @param[in]  value  Value to use for UMsg hint; UMsg hint is an N-wide bitvector
+ *                    where N = number of UMsgs.
+ * @returns FPGA_OK on success.
+ * FPGA_INVALID_PARAM if input parameter combination
+ * is not valid.
+ * FPGA_EXCEPTION if input parameter fpga handle is not
+ * valid.
+ */
+__FPGA_API__ fpga_result fpgaSetUmsgAttributes(fpga_handle handle,
+ uint64_t value);
+
+/**
+ * Trigger Umsg
+ *
+ * Writes a 64-bit value to trigger low-latency accelerator notification mechanism
+ * (UMsgs).
+ *
+ * @param[in] handle Handle to previously opened accelerator resource
+ * @param[in]  value   Value to use for UMsg
+ * @returns FPGA_OK on success.
+ * FPGA_INVALID_PARAM if input parameter combination
+ * is not valid.
+ * FPGA_EXCEPTION if input parameter fpga handle is not
+ * valid.
+ */
+__FPGA_API__ fpga_result fpgaTriggerUmsg(fpga_handle handle, uint64_t value);
+
+/**
+ * Access UMsg memory directly
+ *
+ * This function will return a pointer to the memory allocated for low latency
+ * accelerator notifications (UMsgs).
+ * @note This call is only supported by hardware targets, not by ASE
+ * simulation. Use fpgaTriggerUmsg() if you need ASE simulation capabilities.
+ *
+ * @param[in] handle Handle to previously opened accelerator resource
+ * @param[out] umsg_ptr Pointer to memory where a pointer to the virtual
+ * address space will be returned
+ * @returns FPGA_OK on success.
+ * FPGA_INVALID_PARAM if input parameter combination
+ * is not valid.
+ * FPGA_EXCEPTION if input parameter fpga handle is not
+ * valid.
+ * FPGA_NO_MEMORY if memory allocation fails or system
+ * doesn't configure huge pages.
+ */
+__FPGA_API__ fpga_result fpgaGetUmsgPtr(fpga_handle handle, uint64_t **umsg_ptr);
+
+END_C_DECL
+
+#endif // __FPGA_UMSG_H__
diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/utils.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/utils.h
new file mode 100644
index 0000000..5b57cbd
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/utils.h
@@ -0,0 +1,54 @@
+// Copyright(c) 2017 - 2019, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Intel Corporation nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+
+/**
+ * \file utils.h
+ * \brief Utility functions and macros for the FPGA API
+ */
+
+#ifndef __FPGA_UTILS_H__
+#define __FPGA_UTILS_H__
+
+#include <opae/types.h>
+#include <stdio.h>
+
+BEGIN_C_DECL
+
+/**
+ * Return human-readable error message
+ *
+ * Returns a pointer to a human-readable error message corresponding to the
+ * provided fpga_error error code.
+ *
+ * @param[in] e Error code (as returned by another FPGA API function)
+ * @returns Pointer to a descriptive error message string
+ */
+__FPGA_API__ const char *fpgaErrStr(fpga_result e);
+
+END_C_DECL
+
+#endif // __FPGA_UTILS_H__
+
diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/version.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/version.h
new file mode 100644
index 0000000..66bd18b
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/version.h
@@ -0,0 +1,79 @@
+// Copyright(c) 2017 - 2019, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Intel Corporation nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef __FPGA_VERSION_H__
+#define __FPGA_VERSION_H__
+
+#include <opae/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Get version information about the OPAE library
+ *
+ * Retrieve major version, minor version, and revision information about the
+ * OPAE library.
+ *
+ * @param[out] version FPGA version
+ * @returns FPGA_INVALID_PARAM if any of the output parameters is NULL, FPGA_OK
+ * otherwise.
+ */
+__FPGA_API__ fpga_result fpgaGetOPAECVersion(fpga_version *version);
+
+/**
+ * Get version information about the OPAE library as a string
+ *
+ * Retrieve major version, minor version, and revision information about the
+ * OPAE library, encoded in a human-readable string (e.g. "1.0.0").
+ *
+ * @param[out] version_str String to copy version information into
+ * @param[in] len Length of `version_str`
+ * @returns FPGA_INVALID_PARAM if `version_str` is NULL, FPGA_EXCEPTION if the
+ * version string cannot be copied into `version_str`, FPGA_OK otherwise.
+ */
+__FPGA_API__ fpga_result fpgaGetOPAECVersionString(char *version_str, size_t len);
+#define FPGA_VERSION_STR_MAX 10
+
+/**
+ * Get build information about the OPAE library as a string
+ *
+ * Retrieve the build identifier of the OPAE library.
+ *
+ * @param[out] build_str String to copy build information into
+ * @param[in] len Length of `build_str`
+ * @returns FPGA_INVALID_PARAM if `build_str` is NULL, FPGA_EXCEPTION if the
+ * build string cannot be copied into `build_str`, FPGA_OK otherwise.
+ */
+__FPGA_API__ fpga_result fpgaGetOPAECBuildString(char *build_str, size_t len);
+#define FPGA_BUILD_STR_MAX 41
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // __FPGA_VERSION_H__
diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/pkg_editor.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/pkg_editor.h
new file mode 100644
index 0000000..27f4f1e
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/pkg_editor.h
@@ -0,0 +1,170 @@
+/* Editor for Altera OpenCL package files
+ *
+ * Dmitry Denisenko, June 2012.
+ *
+ * This provides higher-level functions for ELF work.
+ * The idea is to put content into sections, one "piece" of content
+ * per section, and use section names to identify the content.
+ * The interface enforces unique section names (not true for generic ELFs)
+ * and hides all the ugly ELF interface calls and structures.
+ */
+
+#ifndef PKG_FILE_EDITOR_H
+#define PKG_FILE_EDITOR_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_STRING_LENGTH 100000
+
+/* Modes for the acl_pkg_open_file() call.
+ * Exactly one of ACL_PKG_READ, ACL_PKG_READ_WRITE must be supplied.
+ * Other flags may be bitwise OR'd into the mode.
+ *
+ * You can combine other modes with ACL_PKG_SHOW_* to control messages.
+ */
+#define ACL_PKG_READ (1<<0) /* Only reading the package */
+#define ACL_PKG_READ_WRITE (1<<1) /* Expect to read and write the binary. File must already exist. */
+#define ACL_PKG_CREATE (1<<2) /* Also creating. Can only be used with ACL_PKG_READ_WRITE */
+
+#define ACL_PKG_SHOW_ERROR (1<<8) /*print errors to stderr*/
+#define ACL_PKG_SHOW_INFO (1<<9) /*print info messages to stdout*/
+
+#define ACL_PKG_SECTION_ACL_VERSION ".acl.version"
+#define ACL_PKG_SECTION_ACL_BUILD ".acl.build"
+#define ACL_PKG_SECTION_QVERSION ".acl.qversion"
+#define ACL_PKG_SECTION_HASH ".acl.hash"
+#define ACL_PKG_SECTION_BOARD ".acl.board"
+#define ACL_PKG_SECTION_COMPILEOPTIONS ".acl.compileoptions"
+#define ACL_PKG_SECTION_SOURCE ".acl.source"
+#define ACL_PKG_SECTION_LLVMIR ".acl.llvmir"
+#define ACL_PKG_SECTION_VERILOG ".acl.verilog"
+#define ACL_PKG_SECTION_PROFILE_BASE ".acl.profile_base"
+#define ACL_PKG_SECTION_AUTODISCOVERY ".acl.autodiscovery"
+#define ACL_PKG_SECTION_RBF ".acl.rbf"
+#define ACL_PKG_SECTION_CORE_RBF ".acl.core.rbf"
+#define ACL_PKG_SECTION_PERIPH_RBF ".acl.periph.rbf"
+#define ACL_PKG_SECTION_BASE_RBF ".acl.base_revision.rbf"
+#define ACL_PKG_SECTION_SOF ".acl.sof"
+#define ACL_PKG_SECTION_VFABRIC ".acl.vfabric"
+#define ACL_PKG_SECTION_PLL_CONFIG ".acl.pll_config"
+#define ACL_PKG_SECTION_FPGA_BIN ".acl.fpga.bin"
+#define ACL_PKG_SECTION_EMULATOR_OBJ_LINUX ".acl.emulator_object.linux"
+#define ACL_PKG_SECTION_EMULATOR_OBJ_WINDOWS ".acl.emulator_object.windows"
+#define ACL_PKG_SECTION_AUTODISCOVERY_XML ".acl.autodiscovery.xml"
+#define ACL_PKG_SECTION_BOARDSPEC_XML ".acl.board_spec.xml"
+#define ACL_PKG_SECTION_PERIPH_HASH ".acl.periph.hash"
+#define ACL_PKG_SECTION_PROFILER_XML ".acl.profiler.xml"
+#define ACL_PKG_SECTION_COMPILE_REV ".acl.compile_revision"
+#define ACL_PKG_SECTION_PCIE_DEV_ID ".acl.pcie.dev_id"
+#define ACL_PKG_SECTION_BASE_PERIPH_HASH ".acl.base_revision.periph.hash"
+#define ACL_PKG_SECTION_ADJUST_PLLS_OUTPUT ".acl.quartus_report"
+#define ACL_PKG_SECTION_KERNEL_ARG_INFO_XML ".acl.kernel_arg_info.xml"
+#define ACL_PKG_SECTION_FAST_COMPILE ".acl.fast_compile"
+
+/* Minimum alignment in memory. */
+#define ACL_PKG_MIN_SECTION_ALIGNMENT 128
+
+/* Open and close the pkg file */
+struct acl_pkg_file *acl_pkg_open_file (const char *fname, int mode);
+/* You can call close on a NULL pointer: it will do nothing.
+ * Closing the package file will also free its memory, so you better lose
+ * the pointer reference.
+ */
+int acl_pkg_close_file (struct acl_pkg_file *pkg);
+
+/* Set message output mode: show_mode is some combination of the bits
+ * in ACL_PKG_SHOW_INFO and ACL_PKG_SHOW_ERROR
+ */
+void acl_pkg_set_show_mode( struct acl_pkg_file* pkg, int show_mode );
+
+/* Open memory image of pkg file. Only good for reading!
+ * The show_mode argument is an OR combination of zero or more of
+ * ACL_PKG_SHOW_INFO,
+ * ACL_PKG_SHOW_ERROR.
+ */
+struct acl_pkg_file *acl_pkg_open_file_from_memory (char *pkg_image, size_t pkg_image_size, int show_mode);
+
+
+/* Does the given named section exist?
+ * Returns 1 for yes, 0 for no.
+ * If the section exists, and size_ret is not-NULL, then the size (in bytes) of the
+ * section is stored into *size_ret. The size does NOT include NULL terminator, just like strlen().
+ */
+int acl_pkg_section_exists (const struct acl_pkg_file *pkg, const char *sect_name, size_t* size_ret);
+
+/* Return list of ALL (useful) section names in the package.
+ * The buffer must be pre-allocated by the caller up to max_len bytes.
+ * Each section name is separated by '\n'
+ * Returns 1 on success, 0 on failure.
+ */
+int acl_pkg_section_names (const struct acl_pkg_file *pkg, char *buf, size_t max_len);
+
+
+/* Add a new section with specified content.
+ * If a section with such name already exists, nothing is done.
+ * Returns 0 on failure, non-zero on success.
+ */
+int acl_pkg_add_data_section (struct acl_pkg_file *pkg, const char *sect_name, const void* content, size_t len);
+int acl_pkg_add_data_section_from_file (struct acl_pkg_file *pkg, const char *sect_name, const char *in_file);
+
+/* Read content of an existing section.
+ * For read_section(), the buffer must be pre-allocated by caller to hold at least len bytes.
+ * This function will add '\0' at the end, therefore, the 'len' argument passed to this function
+ * must be one larger than the value returned by acl_pkg_section_exists.
+ * Returns 0 on failure, non-zero on success.
+ */
+int acl_pkg_read_section (const struct acl_pkg_file *pkg, const char *sect_name, char *buf, size_t len);
+int acl_pkg_read_section_into_file (struct acl_pkg_file *pkg, const char *sect_name, const char *out_file);
+
+/* Get a transient pointer to a section's data, via buf_ptr.
+ * The pointer is transient: It might move if you update the package in any way.
+ * This is a "fast" path in comparison to acl_pkg_read_section, so you
+ * don't have to allocate space to copy into.
+ * Returns 0 on failure, non-zero on success.
+ */
+int acl_pkg_read_section_transient(const struct acl_pkg_file *pkg, const char *sect_name, char** buf_ptr);
+
+/* Update content of an existing section.
+ * Old content is discarded. The section must already exist.
+ * Returns 0 on failure, non-zero on success.
+ */
+int acl_pkg_update_section (struct acl_pkg_file *pkg, const char *sect_name, const void *new_content, size_t new_len);
+int acl_pkg_update_section_from_file (struct acl_pkg_file *pkg, const char *sect_name, const char *in_file);
+
+/* List all pkg sections to stdout.
+ * Returns 0 on failure, non-zero on success.
+ */
+int acl_pkg_list_file_sections (struct acl_pkg_file *pkg);
+
+/* Read full content of file into a buffer.
+ * The buffer is allocated by this function but must be freed by the caller.
+ * File length is returned in the second argument */
+void *acl_pkg_read_file_into_buffer (const char *in_file, size_t *file_size_out);
+
+/* support for package/unpackage */
+
+/* Package the input files and directory trees (NULL terminated list in input_files_dirs)
+ * and put them into the output file (out_file).
+ * Returns 0 on failure, non-zero on success
+ */
+int acl_pkg_pack (const char* out_file, const char** input_files_dirs);
+
+/* Unpack the input file (or stdin if filename is ACL_PKG_UNPACKAGE_STDIN)
+ * created by acl_pkg_pack into directory out_dir.
+ * Returns 0 on failure, non-zero on success
+ */
+#define ACL_PKG_UNPACKAGE_STDIN "-"
+int acl_pkg_unpack (const char* in_file, const char* out_dir);
+
+/* Unpack the buffer created by acl_pkg_pack into directory out_dir.
+ * Returns 0 on failure, non-zero on success
+ */
+int acl_pkg_unpack_buffer (const char* buffer, size_t buffer_size, const char* out_dir);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* PKG_FILE_EDITOR_H */
diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/lib/win64/FpgaLib.lib b/python/openvino/runtime/coredla_device/mmd/de10_agilex/lib/win64/FpgaLib.lib
new file mode 100755
index 0000000..2f26b62
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/lib/win64/FpgaLib.lib
Binary files differ
diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/lib/win64/acl_check_sys_cmd.lib b/python/openvino/runtime/coredla_device/mmd/de10_agilex/lib/win64/acl_check_sys_cmd.lib
new file mode 100755
index 0000000..6c7f423
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/lib/win64/acl_check_sys_cmd.lib
Binary files differ
diff --git a/python/openvino/runtime/coredla_device/mmd/hps_platform/.gitignore b/python/openvino/runtime/coredla_device/mmd/hps_platform/.gitignore
new file mode 100644
index 0000000..0948b39
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/hps_platform/.gitignore
@@ -0,0 +1,20 @@
+*~
+*#
+*.marks
+release_build/
+build/
+example_designs/mem_bandwidth/bin/
+example_designs/mem_bandwidth/simulation.tar.gz
+example_designs/mem_bandwidth/temp_simulation/
+linux64/lib/
+linux64/libexec/diagnose
+linux64/libexec/program
+ase/mpf_src
+*.pyc
+*.swp
+*.kwlp
+*.kwps
+temp_simulation/
+simulation.tar.gz
+
+backup
diff --git a/python/openvino/runtime/coredla_device/mmd/hps_platform/CMakeLists.txt b/python/openvino/runtime/coredla_device/mmd/hps_platform/CMakeLists.txt
new file mode 100644
index 0000000..d8bf50d
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/hps_platform/CMakeLists.txt
@@ -0,0 +1,59 @@
+# (C) 2017 Intel Corporation. All rights reserved.
+# Your use of Intel Corporation's design tools, logic functions and other
+# software and tools, and its AMPP partner logic functions, and any output
+# files any of the foregoing (including device programming or simulation
+# files), and any associated documentation or information are expressly subject
+# to the terms and conditions of the Intel Program License Subscription
+# Agreement, Intel MegaCore Function License Agreement, or other applicable
+# license agreement, including, without limitation, that your use is for the
+# sole purpose of programming logic devices manufactured by Intel and sold by
+# Intel or its authorized distributors. Please refer to the applicable
+# agreement for further details.
+
+cmake_minimum_required(VERSION 2.8.12)
+project(mmd)
+
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules")
+
+# DLA specific modifications made to the MMD
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDLA_MMD")
+
+# Select PCIE Gen3 x8
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGEN3_x8")
+
+# from the opencl makefile
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DKERNEL_64BIT -DOPTION3=1 -DACL_USE_DMA=1 -DACL_COMPILER_IS_MSVC=0 -Wall -Wno-unknown-pragmas -DACL_HAS_STDLIB_STDIO")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fstack-protector -Wformat -Wformat-security -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -DACL_HOST_RUNTIME_IS_STATIC=0")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DACL_OPENCL_HOST_SYS=linux -DACL_OPENCL_HOST_BIT=64 -DACL_TARGET_SYS=linux -DACL_TARGET_BIT=64 -DLINUX -DACL_MAX_DEVICE=128")
+
+set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -D_FORTIFY_SOURCE=2 -O3")
+enable_language(C ASM)
+
+set(ASM_OPTIONS "-x assembler-with-cpp")
+if(${CMAKE_C_COMPILER_ID} STREQUAL "Clang")
+ set(ASM_OPTIONS "${ASM_OPTIONS} -no-integrated-as")
+endif()
+
+set(CMAKE_ASM_FLAGS "${CFLAGS} ${ASM_OPTIONS}")
+
+if(RUNTIME_POLLING)
+ add_definitions(-DRUNTIME_POLLING)
+endif(RUNTIME_POLLING)
+
+set(MMD_SRC
+ ./host/acl_hps.cpp
+ ./host/mmd_device.cpp
+ ./host/dma_device.cpp
+ ./host/uio_device.cpp
+)
+
+add_library(hps_platform_mmd SHARED ${MMD_SRC})
+
+target_include_directories(hps_platform_mmd PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include)
+
+target_link_libraries(hps_platform_mmd)
+
+install(TARGETS hps_platform_mmd
+ LIBRARY DESTINATION lib
+ COMPONENT hps_platform_mmd
+)
diff --git a/python/openvino/runtime/coredla_device/mmd/hps_platform/host/acl_hps.cpp b/python/openvino/runtime/coredla_device/mmd/hps_platform/host/acl_hps.cpp
new file mode 100644
index 0000000..53055ef
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/hps_platform/host/acl_hps.cpp
@@ -0,0 +1,473 @@
+// (c) 1992-2021 Intel Corporation.
+// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words
+// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S.
+// and/or other countries. Other marks and brands may be claimed as the property
+// of others. See Trademarks on intel.com for full list of Intel trademarks or
+// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera)
+// Your use of Intel Corporation's design tools, logic functions and other
+// software and tools, and its AMPP partner logic functions, and any output
+// files any of the foregoing (including device programming or simulation
+// files), and any associated documentation or information are expressly subject
+// to the terms and conditions of the Altera Program License Subscription
+// Agreement, Intel MegaCore Function License Agreement, or other applicable
+// license agreement, including, without limitation, that your use is for the
+// sole purpose of programming logic devices manufactured by Intel and sold by
+// Intel or its authorized distributors. Please refer to the applicable
+// agreement for further details.
+
+/* ===- HPS.cpp ------------------------------------------------- C++ -*-=== */
+/* */
+/* Intel(R) HPS MMD Driver */
+/* */
+/* ===-------------------------------------------------------------------------=== */
+/* */
+/* This file implements the functions that are defined in aocl_mmd.h */
+/* */
+/* ===-------------------------------------------------------------------------=== */
+
+// common and its own header files
+#include "acl_hps.h"
+
+// other standard header files
+#include <stdarg.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <memory>
+#include <map>
+#include <sstream>
+#include <string>
+#include <utility>
+
+#include "mmd_device.h"
+
+#ifdef DLA_MMD
+#include <chrono>
+#include <thread>
+#endif
+
+#if defined(LINUX)
+#include <fcntl.h>
+#include <semaphore.h>
+#include <signal.h>
+#include <unistd.h>
+#endif // LINUX
+
+#define MAX_HPS_FPGA_DEVICES (1)
+
+// MAX size of line read from pipe-ing the output of system call to MMD
+#define BUF_SIZE 1024
+// MAX size of command passed to system for invoking system call from MMD
+#define SYSTEM_CMD_SIZE 4 * 1024
+
+#ifndef DLA_MMD
+// static helper functions
+static bool blob_has_elf_signature(void *data, size_t data_size);
+#endif
+
+
+// Function to return the number of boards installed in the system
+unsigned int get_offline_num_boards() {
+ board_names names = mmd_get_devices(MAX_HPS_FPGA_DEVICES);
+ return (unsigned int)names.size();
+}
+
+// Get information about the board using the enum aocl_mmd_offline_info_t for
+// offline info (called without a handle), and the enum aocl_mmd_info_t for
+// info specific to a certain board.
+#define RESULT_INT(X) \
+ { \
+ *((int *)param_value) = X; \
+ if (param_size_ret) *param_size_ret = sizeof(int); \
+ }
+#define RESULT_UNSIGNED(X) \
+ { \
+ *((unsigned *)param_value) = X; \
+ if (param_size_ret) *param_size_ret = sizeof(unsigned); \
+ }
+#define RESULT_SIZE_T(X) \
+ { \
+ *((size_t *)param_value) = X; \
+ if (param_size_ret) *param_size_ret = sizeof(size_t); \
+ }
+#if defined(WINDOWS)
+#define RESULT_STR(X) \
+ do { \
+ size_t Xlen = strnlen(X, MAX_NAME_SIZE) + 1; \
+ memcpy_s((void *)param_value, param_value_size, X, (param_value_size <= Xlen) ? param_value_size : Xlen); \
+ if (param_size_ret) *param_size_ret = Xlen; \
+ } while (0)
+#else
+#define RESULT_STR(X) \
+ do { \
+ size_t Xlen = strnlen(X, MAX_NAME_SIZE) + 1; \
+ memcpy((void *)param_value, X, (param_value_size <= Xlen) ? param_value_size : Xlen); \
+ if (param_size_ret) *param_size_ret = Xlen; \
+ } while (0)
+#endif
+#define ACL_VENDOR_NAME "Intel"
+int aocl_mmd_get_offline_info(aocl_mmd_offline_info_t requested_info_id,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_size_ret) {
+ unsigned int num_boards;
+ switch (requested_info_id) {
+ case AOCL_MMD_VERSION:
+ RESULT_STR(MMD_VERSION);
+ break;
+ case AOCL_MMD_NUM_BOARDS: {
+ num_boards = MAX_HPS_FPGA_DEVICES;
+ RESULT_INT((int)num_boards);
+ break;
+ }
+ case AOCL_MMD_BOARD_NAMES: {
+ // Retrieve all the CoreDLA cores in the system
+ board_names names = mmd_get_devices(MAX_HPS_FPGA_DEVICES);
+ // Construct a list of all possible devices supported by this MMD layer
+ std::ostringstream board;
+ auto name = names.begin();
+ while(name != names.end() )
+ {
+ board << *name;
+ name++;
+ if( name != names.end() )
+ {
+ board << ";";
+ }
+ }
+
+ RESULT_STR(board.str().c_str());
+ break;
+ }
+ case AOCL_MMD_VENDOR_NAME: {
+ RESULT_STR(ACL_VENDOR_NAME);
+ break;
+ }
+ case AOCL_MMD_VENDOR_ID:
+ RESULT_INT(0);
+ break;
+ case AOCL_MMD_USES_YIELD:
+ RESULT_INT(0); /* TODO: Can we yield? */
+ break;
+ case AOCL_MMD_MEM_TYPES_SUPPORTED:
+ RESULT_INT(AOCL_MMD_PHYSICAL_MEMORY); /* TODO: Confirm this is the right memory type */
+ break;
+ }
+ return 0;
+}
+
+// If the MMD is loaded dynamically, destructors in the MMD will execute before the destructors in the runtime
+// upon program termination. The DeviceMapManager guards accesses to the device/handle maps to make sure
+// the runtime doesn't get to reference them after MMD destructors have been called.
+// Destructor makes sure that all devices are closed at program termination regardless of what the runtime does.
+// Implemented as a singleton.
+class DeviceMapManager final {
+public:
+ typedef std::map<int, mmd_device_ptr> map_handle_to_dev_t;
+ ~DeviceMapManager()
+ {
+ }
+
+ int add_device(const char *name)
+ {
+ int handle = idx++;
+
+ mmd_device_ptr spDevice = std::make_shared<mmd_device>(name, handle);
+ if( spDevice->bValid() )
+ {
+ auto it = handle_to_dev.find(handle);
+ HPS_ERROR_IF( it != handle_to_dev.end(), return FAILURE, "Error: Handle already used.\n" );
+ handle_to_dev.insert({handle, spDevice});
+ return handle;
+ }
+ return FAILURE;
+ }
+
+ mmd_device_ptr get_device(const int handle)
+ {
+ auto it = handle_to_dev.find(handle);
+ HPS_ERROR_IF( it == handle_to_dev.end(), return nullptr, "Error: Invalid handle.\n" );
+ return it->second;
+ }
+
+ bool remove_device(const int handle)
+ {
+ auto it = handle_to_dev.find(handle);
+ HPS_ERROR_IF( it == handle_to_dev.end(), return false, "Error: Handle does not exist.\n" );
+ handle_to_dev.erase(it);
+ return true;
+ }
+
+ DeviceMapManager()
+ {
+ }
+private:
+ map_handle_to_dev_t handle_to_dev = {};
+ int idx = {0};
+};
+static DeviceMapManager _gDeviceMapManager;
+
+int aocl_mmd_get_info(
+ int handle, aocl_mmd_info_t requested_info_id, size_t param_value_size, void *param_value, size_t *param_size_ret) {
+ HPS_ERROR_IF(true,
+ return FAILURE,
+ "aocl_mmd_get_info not supported on platform. \n");
+}
+
+#undef RESULT_INT
+#undef RESULT_STR
+
+
+// Open and initialize the named device.
+int AOCL_MMD_CALL aocl_mmd_open(const char *name) {
+ return _gDeviceMapManager.add_device(name);
+}
+
+// Close an opened device, by its handle.
+int AOCL_MMD_CALL aocl_mmd_close(int handle) {
+ if ( _gDeviceMapManager.remove_device(handle) )
+ return SUCCESS;
+ return FAILURE;
+}
+
+// Set the interrupt handler for the opened device.
+int AOCL_MMD_CALL aocl_mmd_set_interrupt_handler(int handle, aocl_mmd_interrupt_handler_fn fn, void *user_data) {
+ mmd_device_ptr spDevice = _gDeviceMapManager.get_device(handle);
+ if( nullptr == spDevice ) {
+ return FAILURE;
+ }
+ return spDevice->set_interrupt_handler(fn, user_data);
+}
+
+// Set the device interrupt handler for the opened device.
+int AOCL_MMD_CALL aocl_mmd_set_device_interrupt_handler(int handle,
+ aocl_mmd_device_interrupt_handler_fn fn,
+ void *user_data) {
+ printf("%s:%s:%d\n", __FILE__, __PRETTY_FUNCTION__, __LINE__);
+ return -1;
+}
+
+// Set the operation status handler for the opened device.
+int AOCL_MMD_CALL aocl_mmd_set_status_handler(int handle, aocl_mmd_status_handler_fn fn, void *user_data) {
+ printf("%s:%s:%d\n", __FILE__, __PRETTY_FUNCTION__, __LINE__);
+ return -1;
+}
+
+// Called when the host is idle and hence possibly waiting for events to be
+// processed by the device
+int AOCL_MMD_CALL aocl_mmd_yield(int handle)
+{
+ printf("%s:%s:%d\n", __FILE__, __PRETTY_FUNCTION__, __LINE__);
+ return -1;
+}
+
+// Read, write and copy operations on a single interface.
+int AOCL_MMD_CALL aocl_mmd_read(int handle, aocl_mmd_op_t op, size_t len, void *dst, int mmd_interface, size_t offset) {
+ mmd_device_ptr spDevice = _gDeviceMapManager.get_device(handle);
+ if( nullptr == spDevice ) {
+ return FAILURE;
+ }
+ return spDevice->read_block(op, mmd_interface, dst, offset, len);
+}
+
+int AOCL_MMD_CALL
+aocl_mmd_write(int handle, aocl_mmd_op_t op, size_t len, const void *src, int mmd_interface, size_t offset) {
+ mmd_device_ptr spDevice = _gDeviceMapManager.get_device(handle);
+ if( nullptr == spDevice ) {
+ return FAILURE;
+ }
+ return spDevice->write_block(op, mmd_interface, src, offset, len);
+}
+
+int AOCL_MMD_CALL
+aocl_mmd_copy(int handle, aocl_mmd_op_t op, size_t len, int mmd_interface, size_t src_offset, size_t dst_offset) {
+ printf("%s:%s:%d\n", __FILE__, __PRETTY_FUNCTION__, __LINE__);
+ /* Not called by CoreDLA, so not implementing */
+ return -1;
+}
+
+// Initialize host channel specified in channel_name
+int AOCL_MMD_CALL aocl_mmd_hostchannel_create(int handle, char *channel_name, size_t queue_depth, int direction) {
+ printf("%s:%s:%d\n", __FILE__, __PRETTY_FUNCTION__, __LINE__);
+ /* Not called by CoreDLA, so not implementing */
+ return -1;
+}
+
+// reset the host channel specified with channel handle
+int AOCL_MMD_CALL aocl_mmd_hostchannel_destroy(int handle, int channel) {
+ printf("%s:%s:%d\n", __FILE__, __PRETTY_FUNCTION__, __LINE__);
+ /* Not called by CoreDLA, so not implementing */
+ return -1;
+}
+
+// Get the pointer to buffer the user can write/read from the kernel with
+AOCL_MMD_CALL void *aocl_mmd_hostchannel_get_buffer(int handle, int channel, size_t *buffer_size, int *status) {
+ printf("%s:%s:%d\n", __FILE__, __PRETTY_FUNCTION__, __LINE__);
+ /* Not called by CoreDLA, so not implementing */
+ return NULL;
+}
+
+// Acknowledge from the user that they have written/read send_size amount of buffer obtained from get_buffer
+size_t AOCL_MMD_CALL aocl_mmd_hostchannel_ack_buffer(int handle, int channel, size_t send_size, int *status) {
+ printf("%s:%s:%d\n", __FILE__, __PRETTY_FUNCTION__, __LINE__);
+ /* Not called by CoreDLA, so not implementing */
+ return -1;
+}
+
+#ifdef DLA_MMD
+// Reprogram the device given the sof file name
+int AOCL_MMD_CALL aocl_mmd_program_sof(int handle, const char *sof_filename) {
+ printf("%s:%s:%d\n", __FILE__, __PRETTY_FUNCTION__, __LINE__);
+ /* We don't support reprogramming the SOF on a HPS device */
+ return -1;
+}
+#else
+// Reprogram the device based on the program mode
+int AOCL_MMD_CALL aocl_mmd_program(int handle, void *data, size_t data_size, aocl_mmd_program_mode_t program_mode) {
+ printf("%s:%s:%d\n", __FILE__, __PRETTY_FUNCTION__, __LINE__);
+ /* We don't support reprogramming the SOF on a HPS device */
+ return -1;
+}
+#endif
+// Shared memory allocator
+AOCL_MMD_CALL void *aocl_mmd_shared_mem_alloc(int handle, size_t size, unsigned long long *device_ptr_out) {
+ printf("%s:%s:%d\n", __FILE__, __PRETTY_FUNCTION__, __LINE__);
+ /* Not called by CoreDLA, so not implementing */
+ return NULL;
+}
+
+// Shared memory de-allocator
+AOCL_MMD_CALL void aocl_mmd_shared_mem_free(int handle, void *host_ptr, size_t size) {
+ printf("%s:%s:%d\n", __FILE__, __PRETTY_FUNCTION__, __LINE__);
+ /* Not called by CoreDLA, so not implementing */
+ return;
+}
+
+#ifndef DLA_MMD
+// This function checks if the input data has an ELF-formatted blob.
+// Return true when it does.
+static bool blob_has_elf_signature(void *data, size_t data_size) {
+ bool result = false;
+ printf("%s:%s:%d\n", __FILE__, __PRETTY_FUNCTION__, __LINE__);
+ if (data && data_size > 4) {
+ unsigned char *cdata = (unsigned char *)data;
+ const unsigned char elf_signature[4] = {0177, 'E', 'L', 'F'}; // Little endian
+ result = (cdata[0] == elf_signature[0]) && (cdata[1] == elf_signature[1]) && (cdata[2] == elf_signature[2]) &&
+ (cdata[3] == elf_signature[3]);
+ }
+ return result;
+}
+#endif
+
+// Return a positive number when single device open. Otherwise, return -1
+AOCL_MMD_CALL int get_open_handle() {
+ printf("%s:%s:%d\n", __FILE__, __PRETTY_FUNCTION__, __LINE__);
+ return -1;
+}
+
+AOCL_MMD_CALL void *aocl_mmd_host_alloc(int *handles,
+ size_t num_devices,
+ size_t size,
+ size_t alignment,
+ aocl_mmd_mem_properties_t *properties,
+ int *error) {
+ printf("%s:%s:%d\n", __FILE__, __PRETTY_FUNCTION__, __LINE__);
+ // Not supported on this BSP
+ return NULL;
+}
+
+AOCL_MMD_CALL int aocl_mmd_free(void *mem) {
+ printf("%s:%s:%d\n", __FILE__, __PRETTY_FUNCTION__, __LINE__);
+ // Not supported on this BSP
+ return 0;
+}
+
+AOCL_MMD_CALL void *aocl_mmd_device_alloc(
+ int handle, size_t size, size_t alignment, aocl_mmd_mem_properties_t *properties, int *error) {
+ printf("%s:%s:%d\n", __FILE__, __PRETTY_FUNCTION__, __LINE__);
+ // Not supported on this BSP
+ return NULL;
+}
+
+AOCL_MMD_CALL void *aocl_mmd_shared_alloc(
+ int handle, size_t size, size_t alignment, aocl_mmd_mem_properties_t *properties, int *error) {
+ printf("%s:%s:%d\n", __FILE__, __PRETTY_FUNCTION__, __LINE__);
+ // Not supported on this BSP
+ return NULL;
+}
+
+AOCL_MMD_CALL int aocl_mmd_shared_migrate(int handle, void *shared_ptr, size_t size, aocl_mmd_migrate_t destination) {
+ printf("%s:%s:%d\n", __FILE__, __PRETTY_FUNCTION__, __LINE__);
+ // Not supported on this BSP
+ return 0;
+}
+
+#ifdef DLA_MMD
+// Query functions to get board-specific values
+AOCL_MMD_CALL int dla_mmd_get_max_num_instances()
+{
+ return 1;
+}
+
+AOCL_MMD_CALL uint64_t dla_mmd_get_ddr_size_per_instance() {
+ return 1ULL << 29;
+}
+
+// AGX7 HPS board uses 333.3325 MHz (1333.33/4) for the DLA DDR Clock
+// All other boards use 266.666666 MHz (1066.66666/4)
+AOCL_MMD_CALL double dla_mmd_get_ddr_clock_freq() {
+#ifdef HPS_AGX7
+ return 333.332500;
+#else
+ return 266.666666;
+#endif
+} // MHz
+
+// Helper functions for the wrapper functions around CSR and DDR
+uint64_t dla_get_raw_csr_address(int instance, uint64_t addr) {
+ return (0x1000 * instance) + addr;
+}
+uint64_t dla_get_raw_ddr_address(int instance, uint64_t addr) {
+ return addr;
+}
+
+// Wrappers around CSR and DDR reads and writes to abstract away board-specific offsets
+AOCL_MMD_CALL int dla_mmd_csr_write(int handle, int instance, uint64_t addr, const uint32_t *data) {
+ return aocl_mmd_write(
+ handle, NULL, sizeof(uint32_t), data, HPS_MMD_COREDLA_CSR_HANDLE, dla_get_raw_csr_address(instance, addr));
+}
+AOCL_MMD_CALL int dla_mmd_csr_read(int handle, int instance, uint64_t addr, uint32_t *data) {
+ return aocl_mmd_read(
+ handle, NULL, sizeof(uint32_t), data, HPS_MMD_COREDLA_CSR_HANDLE, dla_get_raw_csr_address(instance, addr));
+}
+AOCL_MMD_CALL int dla_mmd_ddr_write(int handle, int instance, uint64_t addr, uint64_t length, const void *data) {
+ return aocl_mmd_write(handle, NULL, length, data, HPS_MMD_MEMORY_HANDLE, dla_get_raw_ddr_address(instance, addr));
+}
+AOCL_MMD_CALL int dla_mmd_ddr_read(int handle, int instance, uint64_t addr, uint64_t length, void *data) {
+ return aocl_mmd_read(handle, NULL, length, data, HPS_MMD_MEMORY_HANDLE, dla_get_raw_ddr_address(instance, addr));
+}
+
+#ifdef STREAM_CONTROLLER_ACCESS
+AOCL_MMD_CALL bool dla_is_stream_controller_valid(int handle, int instance) {
+ mmd_device_ptr spDevice = _gDeviceMapManager.get_device(handle);
+ if( nullptr == spDevice ) {
+ return FAILURE;
+ }
+ return spDevice->bStreamControllerValid();
+}
+
+AOCL_MMD_CALL int dla_mmd_stream_controller_write(int handle, int instance, uint64_t addr, uint64_t length, const void *data) {
+ return aocl_mmd_write(handle, NULL, length, data, HPS_MMD_STREAM_CONTROLLER_HANDLE, addr);
+}
+
+AOCL_MMD_CALL int dla_mmd_stream_controller_read(int handle, int instance, uint64_t addr, uint64_t length, void* data) {
+ return aocl_mmd_read(
+ handle, NULL, length, data, HPS_MMD_STREAM_CONTROLLER_HANDLE, addr);
+}
+#endif
+
+AOCL_MMD_CALL double dla_mmd_get_coredla_clock_freq(int handle) {
+ return 200;
+}
+
+#endif
diff --git a/python/openvino/runtime/coredla_device/mmd/hps_platform/host/acl_hps.h b/python/openvino/runtime/coredla_device/mmd/hps_platform/host/acl_hps.h
new file mode 100644
index 0000000..7c85a24
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/hps_platform/host/acl_hps.h
@@ -0,0 +1,111 @@
+#ifndef ACL_HPS_H
+#define ACL_HPS_H
+
+/* (c) 1992-2021 Intel Corporation. */
+/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */
+/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */
+/* and/or other countries. Other marks and brands may be claimed as the property */
+/* of others. See Trademarks on intel.com for full list of Intel trademarks or */
+/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */
+/* Your use of Intel Corporation's design tools, logic functions and other */
+/* software and tools, and its AMPP partner logic functions, and any output */
+/* files any of the foregoing (including device programming or simulation */
+/* files), and any associated documentation or information are expressly subject */
+/* to the terms and conditions of the Altera Program License Subscription */
+/* Agreement, Intel MegaCore Function License Agreement, or other applicable */
+/* license agreement, including, without limitation, that your use is for the */
+/* sole purpose of programming logic devices manufactured by Intel and sold by */
+/* Intel or its authorized distributors. Please refer to the applicable */
+/* agreement for further details. */
+
+/* ===- acl_hps.h --------------------------------------------------- C++ -*-=== */
+/* */
+/* Intel(R) HPS MMD Driver */
+/* */
+/* ===-------------------------------------------------------------------------=== */
+/* */
+/* This file defines macros and types that are used inside the MMD driver */
+/* */
+/* ===-------------------------------------------------------------------------=== */
+
+#ifndef ACL_HPS_EXPORT
+#define ACL_HPS_EXPORT __declspec(dllimport)
+#endif
+
+#define MMD_VERSION AOCL_MMD_VERSION_STRING
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdio.h>
+#ifdef DLA_MMD
+#include <cstdint>
+#endif
+#include "aocl_mmd.h"
+
+#include "hps_types.h"
+
+#if defined(WINDOWS)
+#error Currently not available for windows
+#endif
+
+#if defined(LINUX)
+typedef uintptr_t KPTR;
+typedef int fpga_handle;
+typedef unsigned int fpga_result;
+#define FPGA_OK 0
+
+typedef unsigned int DWORD;
+typedef unsigned long long QWORD;
+typedef char INT8;
+typedef unsigned char UINT8;
+typedef int16_t INT16;
+typedef uint16_t UINT16;
+typedef int INT32;
+typedef unsigned int UINT32;
+typedef long long INT64;
+typedef unsigned long long UINT64;
+
+#define INVALID_HANDLE_VALUE ((int)(-1))
+
+#define INVALID_DEVICE (-1)
+#define WD_STATUS_SUCCESS 0
+
+// define for the format string for DWORD type
+#define DWORD_FMT_U "%u"
+#define DWORD_FMT_X "%x"
+#define DWORD_FMT_4X "%04X"
+
+// define for the format string for size_t type
+#define SIZE_FMT_U "%zu"
+#define SIZE_FMT_X "%zx"
+
+#endif // LINUX
+
+#define MAX_NAME_SIZE (1204)
+
+#define HPS_ASSERT(COND, ...) \
+ do { \
+ if (!(COND)) { \
+ printf("\nMMD FATAL: %s:%d: ", __FILE__, __LINE__); \
+ printf(__VA_ARGS__); \
+ fflush(stdout); \
+ assert(0); \
+ } \
+ } while (0)
+
+#define HPS_ERROR_IF(COND, NEXT, ...) \
+ do { \
+ if (COND) { \
+ printf("\nMMD ERROR: " __VA_ARGS__); \
+ fflush(stdout); \
+ NEXT; \
+ } \
+ } while (0)
+
+#define HPS_INFO(...) \
+ do { \
+ printf("MMD INFO : " __VA_ARGS__); \
+ fflush(stdout); \
+ } while (0)
+
+#endif // ACL_HPS_H
diff --git a/python/openvino/runtime/coredla_device/mmd/hps_platform/host/dma_device.cpp b/python/openvino/runtime/coredla_device/mmd/hps_platform/host/dma_device.cpp
new file mode 100644
index 0000000..e403823
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/hps_platform/host/dma_device.cpp
@@ -0,0 +1,120 @@
+/* (c) 1992-2021 Intel Corporation. */
+/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */
+/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */
+/* and/or other countries. Other marks and brands may be claimed as the property */
+/* of others. See Trademarks on intel.com for full list of Intel trademarks or */
+/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */
+/* Your use of Intel Corporation's design tools, logic functions and other */
+/* software and tools, and its AMPP partner logic functions, and any output */
+/* files any of the foregoing (including device programming or simulation */
+/* files), and any associated documentation or information are expressly subject */
+/* to the terms and conditions of the Altera Program License Subscription */
+/* Agreement, Intel MegaCore Function License Agreement, or other applicable */
+/* license agreement, including, without limitation, that your use is for the */
+/* sole purpose of programming logic devices manufactured by Intel and sold by */
+/* Intel or its authorized distributors. Please refer to the applicable */
+/* agreement for further details. */
+
+/* ===- dma_device.h ------------------------------------------------- C++ -*-=== */
+/* */
+/* dma device access functions */
+/* */
+/* ===-------------------------------------------------------------------------=== */
+/* */
+/* This file implements the functions used to access the dma device objects    */
+/* */
+/* ===-------------------------------------------------------------------------=== */
+
+// common and its own header files
+#include "dma_device.h"
+#include <unistd.h>
+#include <glob.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+#include <stdio.h>
+
+#include <memory.h>
+
+// Copied from Linux driver: /drivers/dma/altera-msgdma.c
+#define MSGDMA_DESC_NUM 1024
+
+// Same page size as used in /meta-intel-fpga-coredla/recipes-drivers/msgdma-userio/files/msgdma_userio_chr.c
+#define PAGE_SIZE 4096
+
+//////////////////////////////////////////////////////
+
+#define ERR(format, ...) \
+printf("%s:%u() **ERROR** : " format, \
+ __func__, __LINE__, ##__VA_ARGS__)
+
+//////////////////////////////////////////////////////
+// Opens the msgdma character device 'name' (e.g. /dev/msgdma_coredla0) for
+// read/write. On failure _pFile stays null; callers must check bValid().
+dma_device::dma_device(std::string &name)
+{
+  _pFile = fopen(name.c_str(), "r+");
+  if( _pFile == nullptr )
+  {
+    ERR("dma_device::dma_device failed to open %s\n", name.c_str());
+    return;
+  }
+
+  // Turn off buffering - transfers must reach the driver immediately,
+  // not sit in a stdio buffer
+  setvbuf(_pFile, NULL, _IONBF, 0);
+}
+
+// Closes the msgdma device if it was successfully opened.
+dma_device::~dma_device()
+{
+  if( _pFile )
+  {
+    fclose(_pFile);
+    _pFile = NULL;
+  }
+}
+
+// Reads 'size' bytes from device memory at 'offset' into 'host_addr' via the
+// msgdma character device. Returns SUCCESS or FAILURE.
+int dma_device::read_block(void *host_addr, size_t offset, size_t size)
+{
+  // Use 32bit seek as DDR memory current < 32bits
+  // NOTE(review): offsets >= 4GB are silently truncated by this cast - revisit
+  // if device memory ever grows beyond 32 bits of addressing.
+  if( fseek(_pFile, (uint32_t)offset, SEEK_SET) != 0 ) {
+    return FAILURE;
+  }
+
+  size_t read_size = fread(host_addr, 1, size, _pFile);
+  return (read_size == size) ? SUCCESS : FAILURE;
+}
+
+// Writes 'size' bytes from 'host_addr' to device memory at 'offset' via the
+// msgdma character device, splitting the transfer into chunks so the kernel
+// driver never runs out of DMA descriptors. Returns SUCCESS or FAILURE.
+int dma_device::write_block(const void *host_addr, size_t offset, size_t size)
+{
+  // The MSGDMA driver only supports a maximum of 1024 x 4096 = 4MBytes in the worst case scenario,
+  // in the event that the virtual buffer is fully fragmented. As the buffer gets more fragmented it's
+  // possible to run out of DMA descriptors. To prevent this, slice the data into 4MB chunks.
+
+  // chunk_size is chosen based on the size of a page (12 bits) and default number of descriptors (1024).
+  // The descriptor count is reduced by 1 since if the host_addr is not aligned to a page then an extra page
+  // will be added at the end. This would then increase the descriptor count by 1.
+  size_t chunk_size = PAGE_SIZE * (MSGDMA_DESC_NUM - 1);
+  size_t write_size = 0;
+
+  // Use 32bit seek as DDR memory current < 32bits
+  if( fseek(_pFile, (uint32_t)offset, SEEK_SET) != 0 ) {
+    return FAILURE;
+  }
+
+  // Each fwrite() advances the file position, so successive chunks land at
+  // consecutive device offsets without re-seeking.
+  for (size_t host_addr_offset = 0; host_addr_offset < size; host_addr_offset += chunk_size) {
+    size_t current_size = chunk_size;
+
+    // If the current address is within one chunk_size from the end of the data, set current_size
+    // to the bytes left to send
+    if (size - host_addr_offset < chunk_size) {
+      current_size = size - host_addr_offset;
+    }
+
+    size_t current_write_size = fwrite((uint8_t *)host_addr + host_addr_offset, 1, current_size, _pFile);
+
+    if (current_write_size != current_size) {
+      return FAILURE;
+    }
+
+    write_size += current_write_size;
+  }
+
+  return (write_size == size) ? SUCCESS : FAILURE;
+}
diff --git a/python/openvino/runtime/coredla_device/mmd/hps_platform/host/dma_device.h b/python/openvino/runtime/coredla_device/mmd/hps_platform/host/dma_device.h
new file mode 100644
index 0000000..24f89e4
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/hps_platform/host/dma_device.h
@@ -0,0 +1,56 @@
+#ifndef DMA_DEVICE_H_
+#define DMA_DEVICE_H_
+
+/* (c) 1992-2021 Intel Corporation. */
+/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */
+/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */
+/* and/or other countries. Other marks and brands may be claimed as the property */
+/* of others. See Trademarks on intel.com for full list of Intel trademarks or */
+/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */
+/* Your use of Intel Corporation's design tools, logic functions and other */
+/* software and tools, and its AMPP partner logic functions, and any output */
+/* files any of the foregoing (including device programming or simulation */
+/* files), and any associated documentation or information are expressly subject */
+/* to the terms and conditions of the Altera Program License Subscription */
+/* Agreement, Intel MegaCore Function License Agreement, or other applicable */
+/* license agreement, including, without limitation, that your use is for the */
+/* sole purpose of programming logic devices manufactured by Intel and sold by */
+/* Intel or its authorized distributors. Please refer to the applicable */
+/* agreement for further details. */
+
+/* ===- dma_device.h ------------------------------------------------- C++ -*-=== */
+/* */
+/* dma device access functions */
+/* */
+/* ===-------------------------------------------------------------------------=== */
+/* */
+/* This file implements the functions used access the dma device objects */
+/* */
+/* ===-------------------------------------------------------------------------=== */
+#include <vector>
+#include <string>
+#include <memory>
+
+#include "hps_types.h"
+
+// RAII wrapper around the msgdma character device used for bulk host<->device
+// DDR transfers. Non-copyable; the device is opened in the constructor and
+// closed in the destructor.
+class dma_device
+{
+public:
+  // Opens device 'name'; check bValid() afterwards to see if it succeeded.
+  dma_device(std::string &name);
+  ~dma_device();
+
+  // Transfer 'size' bytes at device 'offset'; return SUCCESS or FAILURE.
+  int read_block(void *host_addr, size_t offset, size_t size);
+  int write_block(const void *host_addr, size_t offset, size_t size);
+
+  bool bValid() { return _pFile != nullptr; };
+private:
+
+  dma_device() = delete;
+  dma_device(dma_device const&) = delete;
+  void operator=(dma_device const &) = delete;
+
+  FILE *_pFile = {nullptr}; // File pointer to the msgdma device - non-null indicates the device is valid
+};
+typedef std::shared_ptr<dma_device> dma_device_ptr;
+
+#endif // DMA_DEVICE_H_
diff --git a/python/openvino/runtime/coredla_device/mmd/hps_platform/host/hps_types.h b/python/openvino/runtime/coredla_device/mmd/hps_platform/host/hps_types.h
new file mode 100644
index 0000000..3f11c4a
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/hps_platform/host/hps_types.h
@@ -0,0 +1,44 @@
+#ifndef HPS_TYPES_H_
+#define HPS_TYPES_H_
+
+/* (c) 1992-2021 Intel Corporation. */
+/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */
+/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */
+/* and/or other countries. Other marks and brands may be claimed as the property */
+/* of others. See Trademarks on intel.com for full list of Intel trademarks or */
+/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */
+/* Your use of Intel Corporation's design tools, logic functions and other */
+/* software and tools, and its AMPP partner logic functions, and any output */
+/* files any of the foregoing (including device programming or simulation */
+/* files), and any associated documentation or information are expressly subject */
+/* to the terms and conditions of the Altera Program License Subscription */
+/* Agreement, Intel MegaCore Function License Agreement, or other applicable */
+/* license agreement, including, without limitation, that your use is for the */
+/* sole purpose of programming logic devices manufactured by Intel and sold by */
+/* Intel or its authorized distributors. Please refer to the applicable */
+/* agreement for further details. */
+
+/* ===- hps_types.h -------------------------------------------------- C++ -*-=== */
+/* */
+/* Useful HPS Types */
+/* */
+/* ===-------------------------------------------------------------------------=== */
+/* */
+/* This file contains useful type definition */
+/* */
+/* ===-------------------------------------------------------------------------=== */
+#include <vector>
+#include <string>
+
+#define SUCCESS (0)
+#define FAILURE (1)
+
+typedef std::vector<std::string> board_names;
+
+typedef enum {
+ HPS_MMD_COREDLA_CSR_HANDLE = 1, // COREDLA CSR Interface
+ HPS_MMD_MEMORY_HANDLE = 2, // Device Memory transfers
+ HPS_MMD_STREAM_CONTROLLER_HANDLE = 3 // Stream Controller Interface
+} hps_mmd_interface_t;
+
+#endif // HPS_TYPES_H_
diff --git a/python/openvino/runtime/coredla_device/mmd/hps_platform/host/mmd_device.cpp b/python/openvino/runtime/coredla_device/mmd/hps_platform/host/mmd_device.cpp
new file mode 100644
index 0000000..b52c1d8
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/hps_platform/host/mmd_device.cpp
@@ -0,0 +1,129 @@
+/* (c) 1992-2021 Intel Corporation. */
+/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */
+/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */
+/* and/or other countries. Other marks and brands may be claimed as the property */
+/* of others. See Trademarks on intel.com for full list of Intel trademarks or */
+/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */
+/* Your use of Intel Corporation's design tools, logic functions and other */
+/* software and tools, and its AMPP partner logic functions, and any output */
+/* files any of the foregoing (including device programming or simulation */
+/* files), and any associated documentation or information are expressly subject */
+/* to the terms and conditions of the Altera Program License Subscription */
+/* Agreement, Intel MegaCore Function License Agreement, or other applicable */
+/* license agreement, including, without limitation, that your use is for the */
+/* sole purpose of programming logic devices manufactured by Intel and sold by */
+/* Intel or its authorized distributors. Please refer to the applicable */
+/* agreement for further details. */
+
+/* ===- mmd_device.h ------------------------------------------------- C++ -*-=== */
+/* */
+/* mmd device access functions */
+/* */
+/* ===-------------------------------------------------------------------------=== */
+/* */
+/* This file implements the functions used access the mmd device object */
+/* */
+/* ===-------------------------------------------------------------------------=== */
+
+#include "mmd_device.h"
+
+// Defined names of the UIO Nodes
+#define UIO_COREDLA_PREFIX "coredla"
+#define STREAM_CONTROLLER_PREFIX "stream_controller"
+
+// Defined name of the msgdma device
+#define DMA_DEVICE_PREFIX "/dev/msgdma_coredla"
+#define UIO_DEVICE_PREFIX "uio"
+
+// Enumerates up to 'max_fpga_devices' CoreDLA UIO devices on this system and
+// returns their device-node names (e.g. "uio0").
+board_names mmd_get_devices(const int max_fpga_devices)
+{
+  return uio_get_devices(UIO_COREDLA_PREFIX, max_fpga_devices);
+}
+
+
+/////////////////////////////////////////////////////////
+// Binds the three sub-devices for one CoreDLA board: the CoreDLA UIO CSR
+// device (with interrupts), the msgdma device for DDR transfers, and - if
+// present - the stream controller UIO device. On any failure the relevant
+// shared_ptr stays null; callers must check bValid() afterwards.
+mmd_device::mmd_device(std::string name, const int mmd_handle)
+: _name(name), _mmd_handle(mmd_handle) {
+  _spCoredlaDevice = std::make_shared<uio_device>(name, _mmd_handle, true);
+  int32_t index = extract_index(_name);
+  if( (index >= 0) && _spCoredlaDevice && _spCoredlaDevice->bValid() )
+  {
+    // The msgdma node shares the numeric suffix of the uio node
+    // (uioN <-> /dev/msgdma_coredlaN)
+    std::string dma_name(DMA_DEVICE_PREFIX);
+    dma_name += std::to_string(index);
+    _spDmaDevice = std::make_shared<dma_device>(dma_name);
+
+    if( (_spDmaDevice==nullptr) || (!_spDmaDevice->bValid()) ) {
+      _spDmaDevice = nullptr;
+      return;
+    }
+    std::string stream_controller_name = uio_get_device(STREAM_CONTROLLER_PREFIX, index);
+    if( !stream_controller_name.empty() ) {
+      // Create a uio_device but don't attach any interrupt support as the stream controller
+      // does not require interrupts
+      _spStreamControllerDevice = std::make_shared<uio_device>(stream_controller_name, _mmd_handle, false);
+      if( _spStreamControllerDevice && !_spStreamControllerDevice->bValid() ) {
+        // The stream controller does not exist
+        _spStreamControllerDevice = nullptr;
+      }
+    }
+  }
+}
+
+// Dispatches a read to the sub-device selected by 'mmd_interface'.
+// Asynchronous operations (non-null 'op') are not supported.
+// Returns SUCCESS or FAILURE. Assumes bValid() was checked by the caller,
+// so _spDmaDevice/_spCoredlaDevice are non-null here.
+int mmd_device::read_block(aocl_mmd_op_t op, int mmd_interface, void *host_addr, size_t offset, size_t size)
+{
+  if( op ) {
+    LOG_ERR("op not support : %s\n", __func__ );
+    return FAILURE;
+  }
+  if( mmd_interface == HPS_MMD_MEMORY_HANDLE ) {
+    return _spDmaDevice->read_block(host_addr, offset, size);
+  } else if( mmd_interface == HPS_MMD_COREDLA_CSR_HANDLE ) {
+    return _spCoredlaDevice->read_block(host_addr, offset, size);
+  } else if( mmd_interface == HPS_MMD_STREAM_CONTROLLER_HANDLE ) {
+    // Stream controller is optional hardware - fall through to FAILURE if absent
+    if ( _spStreamControllerDevice ) {
+      return _spStreamControllerDevice->read_block(host_addr, offset, size);
+    }
+  }
+
+  return FAILURE;
+}
+
+// Dispatches a write to the sub-device selected by 'mmd_interface'.
+// Asynchronous operations (non-null 'op') are not supported.
+// Returns SUCCESS or FAILURE. Assumes bValid() was checked by the caller.
+int mmd_device::write_block(aocl_mmd_op_t op, int mmd_interface, const void *host_addr, size_t offset, size_t size)
+{
+  if( op ) {
+    LOG_ERR("op not support : %s\n", __func__ );
+    return FAILURE;
+  }
+  if( mmd_interface == HPS_MMD_MEMORY_HANDLE ) {
+    return _spDmaDevice->write_block(host_addr, offset, size);
+  } else if ( mmd_interface == HPS_MMD_COREDLA_CSR_HANDLE ) {
+    return _spCoredlaDevice->write_block(host_addr, offset, size);
+  } else if ( mmd_interface == HPS_MMD_STREAM_CONTROLLER_HANDLE ) {
+    // Stream controller is optional hardware - fall through to FAILURE if absent
+    if( _spStreamControllerDevice ) {
+      return _spStreamControllerDevice->write_block(host_addr, offset, size);
+    }
+  }
+  return FAILURE;
+}
+
+// Registers the application's interrupt callback on the CoreDLA CSR device.
+// Returns SUCCESS if the device exists, FAILURE otherwise.
+int mmd_device::set_interrupt_handler(aocl_mmd_interrupt_handler_fn fn, void *user_data) {
+  if( _spCoredlaDevice ) {
+    return _spCoredlaDevice->set_interrupt_handler(fn, user_data);
+  }
+  return FAILURE;
+}
+
+// Returns the index of a uio device
+// If index cannot be found then returns -1
+// Returns the numeric index of a uio device name (e.g. "uio3" -> 3).
+// If the name does not start with the uio prefix followed by digits,
+// returns -1.
+int mmd_device::extract_index(const std::string name) {
+  std::string prefix(UIO_DEVICE_PREFIX);
+
+  // Bug fix: the original used '&&' combined with a truthy compare(), so the
+  // error path only fired when the name was BOTH too short AND mismatched.
+  // A valid name must be longer than the prefix AND start with it.
+  if (name.length() <= prefix.length() || name.compare(0, prefix.length(), prefix) != 0) {
+    LOG_ERR("Error parsing device name '%s'\n", name.c_str());
+    return -1;
+  }
+
+  std::string device_num_str = name.substr(prefix.length());
+  try {
+    return std::stoi(device_num_str, 0, 10);
+  } catch (const std::exception &) {
+    // std::stoi throws on non-numeric suffixes; report instead of aborting.
+    LOG_ERR("Error parsing device index from '%s'\n", name.c_str());
+    return -1;
+  }
+}
diff --git a/python/openvino/runtime/coredla_device/mmd/hps_platform/host/mmd_device.h b/python/openvino/runtime/coredla_device/mmd/hps_platform/host/mmd_device.h
new file mode 100644
index 0000000..9cb0c71
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/hps_platform/host/mmd_device.h
@@ -0,0 +1,75 @@
+#ifndef MMD_DEVICE_H_
+#define MMD_DEVICE_H_
+
+/* (c) 1992-2021 Intel Corporation. */
+/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */
+/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */
+/* and/or other countries. Other marks and brands may be claimed as the property */
+/* of others. See Trademarks on intel.com for full list of Intel trademarks or */
+/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */
+/* Your use of Intel Corporation's design tools, logic functions and other */
+/* software and tools, and its AMPP partner logic functions, and any output */
+/* files any of the foregoing (including device programming or simulation */
+/* files), and any associated documentation or information are expressly subject */
+/* to the terms and conditions of the Altera Program License Subscription */
+/* Agreement, Intel MegaCore Function License Agreement, or other applicable */
+/* license agreement, including, without limitation, that your use is for the */
+/* sole purpose of programming logic devices manufactured by Intel and sold by */
+/* Intel or its authorized distributors. Please refer to the applicable */
+/* agreement for further details. */
+
+/* ===- mmd_device.h ------------------------------------------------- C++ -*-=== */
+/* */
+/* mmd device access functions */
+/* */
+/* ===-------------------------------------------------------------------------=== */
+/* */
+/* This file implements the functions used to access the mmd device object     */
+/* */
+/* ===-------------------------------------------------------------------------=== */
+#include <memory>
+#include <string>
+
+#include "hps_types.h"
+#include "dma_device.h"
+#include "uio_device.h"
+
+#include "aocl_mmd.h"
+
+// LOG ERRORS
+#define MMD_ERR_LOGGING 1
+#ifdef MMD_ERR_LOGGING
+#define LOG_ERR(...) fprintf(stderr, __VA_ARGS__)
+#else
+#define LOG_ERR(...)
+#endif
+
+// Aggregates the per-board sub-devices (CoreDLA UIO CSR device, msgdma DMA
+// device, optional stream controller UIO device) and routes MMD read/write
+// calls to the right one. Non-copyable.
+class mmd_device {
+public:
+  // Opens all sub-devices for board 'name'; check bValid() afterwards.
+  mmd_device(std::string name, const int mmd_handle);
+
+  // True when both the CSR device and the DMA device opened successfully.
+  bool bValid() { return _spCoredlaDevice && _spCoredlaDevice->bValid() && _spDmaDevice && _spDmaDevice->bValid(); };
+  // True when the optional stream controller device is present and usable.
+  bool bStreamControllerValid() { return _spCoredlaDevice && _spStreamControllerDevice && _spStreamControllerDevice->bValid(); };
+  // Route a transfer to the sub-device selected by 'mmd_interface'.
+  int write_block(aocl_mmd_op_t op, int mmd_interface, const void *host_addr, size_t offset, size_t size);
+  int read_block(aocl_mmd_op_t op, int mmd_interface, void *host_addr, size_t offset, size_t size);
+
+  int set_interrupt_handler(aocl_mmd_interrupt_handler_fn fn, void *user_data);
+private:
+  int32_t extract_index(const std::string name);
+
+  mmd_device() = delete;
+  mmd_device(mmd_device const&) = delete;
+  void operator=(mmd_device const &) = delete;
+  std::string _name;
+
+  uio_device_ptr _spCoredlaDevice;
+  uio_device_ptr _spStreamControllerDevice;
+  dma_device_ptr _spDmaDevice;
+  int _mmd_handle;
+};
+
+typedef std::shared_ptr<mmd_device> mmd_device_ptr;
+
+extern board_names mmd_get_devices(const int max_fpga_devices);
+
+#endif // MMD_DEVICE_H_
diff --git a/python/openvino/runtime/coredla_device/mmd/hps_platform/host/uio_device.cpp b/python/openvino/runtime/coredla_device/mmd/hps_platform/host/uio_device.cpp
new file mode 100644
index 0000000..95a9567
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/hps_platform/host/uio_device.cpp
@@ -0,0 +1,469 @@
+// (c) 1992-2021 Intel Corporation.
+// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words
+// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S.
+// and/or other countries. Other marks and brands may be claimed as the property
+// of others. See Trademarks on intel.com for full list of Intel trademarks or
+// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera)
+// Your use of Intel Corporation's design tools, logic functions and other
+// software and tools, and its AMPP partner logic functions, and any output
+// files any of the foregoing (including device programming or simulation
+// files), and any associated documentation or information are expressly subject
+// to the terms and conditions of the Altera Program License Subscription
+// Agreement, Intel MegaCore Function License Agreement, or other applicable
+// license agreement, including, without limitation, that your use is for the
+// sole purpose of programming logic devices manufactured by Intel and sold by
+// Intel or its authorized distributors. Please refer to the applicable
+// agreement for further details.
+
+/* ===- uio_device.cpp ----------------------------------------------- C++ -*-=== */
+/* */
+/* uio device access functions */
+/* */
+/* ===-------------------------------------------------------------------------=== */
+/* */
+/* This file implements the functions used access the uio device objects */
+/* */
+/* ===-------------------------------------------------------------------------=== */
+
+// common and its own header files
+#include "uio_device.h"
+#include <unistd.h>
+#include <glob.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <poll.h>
+
+#include <cinttypes>
+#include <memory.h>
+
+//////////////////////////////////////////////////////
+#define UIO_BASE_NAME "uio*"
+#define UIO_BASE_PATH "/sys/class/uio/"
+#define UIO_BASE_SEARCH UIO_BASE_PATH UIO_BASE_NAME
+#define UIO_MAX_PATH (256)
+
+#define ERR(format, ...) \
+fprintf(stderr, "%s:%u **ERROR** : " format, \
+ __FILE__, __LINE__, ##__VA_ARGS__)
+
+//////////////////////////////////////////////////////
+#define MAX_NAME (20)
+// Reads a hexadecimal uint64 from the sysfs attribute 'sysfs_name' under
+// 'device_name' into 'value'. Returns true on success.
+bool uio_read_sysfs_uint64(const char *device_name, const char *sysfs_name, uint64_t &value)
+{
+  FILE *fp;
+  char param_path[UIO_MAX_PATH];
+
+  // Bug fix: snprintf() reports truncation by returning a value >= the buffer
+  // size, not a negative value; the original '< 0' check only caught encoding
+  // errors and let truncated paths through.
+  int len = snprintf(param_path, sizeof(param_path), "%s/%s", device_name, sysfs_name);
+  if( (len < 0) || (len >= (int)sizeof(param_path)) )
+  {
+    ERR("Path too long. %s, %s\n", device_name, sysfs_name);
+    return false;
+  }
+
+  fp = fopen(param_path, "r");
+  if( !fp )
+  {
+    ERR("Failed to fopen - %s\n", param_path);
+    return false;
+  }
+
+  if( fscanf(fp, "%" PRIx64, &value) != 1 )
+  {
+    ERR("Failed fscanf - %s\n", param_path);
+    fclose(fp);
+    return false;
+  }
+
+  fclose(fp);
+  return true;
+}
+
+// Reads the sysfs attribute 'sysfs_name' under 'uio_path' into 'result'.
+// The final character read (the sysfs trailing newline) is stripped.
+// Returns true on success.
+bool uio_read_sysfs_string(const char *uio_path, const char *sysfs_name, std::string &result)
+{
+  char uio_name[MAX_NAME];
+  FILE *fp;
+  char param_path[UIO_MAX_PATH];
+
+  // Bug fix: snprintf() reports truncation by returning a value >= the buffer
+  // size, not a negative value; the original '< 0' check missed truncation.
+  int len = snprintf(param_path, sizeof(param_path), "%s/%s", uio_path, sysfs_name);
+  if( (len < 0) || (len >= (int)sizeof(param_path)) )
+  {
+    ERR("Path too long. %s, %s\n", uio_path, sysfs_name);
+    return false;
+  }
+
+  fp = fopen(param_path, "r");
+  if( !fp )
+  {
+    ERR("Failed to fopen - %s\n", param_path);
+    return false;
+  }
+
+  int num_read = fread(uio_name, 1, MAX_NAME, fp);
+  if( num_read <= 0 )
+  {
+    ERR("Failed to read name - %s\n", param_path);
+    fclose(fp);
+    return false;
+  }
+
+  // Drop the trailing newline sysfs appends and terminate the buffer.
+  uio_name[num_read-1] = '\0';
+  result = std::string(uio_name);
+  fclose(fp);
+
+  return true;
+}
+
+// Searches /sys/class/uio for a device whose sysfs 'name' contains
+// prefix+index (e.g. "stream_controller0") and returns its node name
+// (e.g. "uio2"), or an empty string when not found.
+std::string uio_get_device(const std::string prefix, const int32_t index)
+{
+  glob_t globbuf = {0};
+  std::string uio_name;
+
+  int glob_res = glob(UIO_BASE_SEARCH, GLOB_NOSORT, NULL, &globbuf);
+  if( (glob_res == 0) && (globbuf.gl_pathc) )
+  {
+    std::string device_name = prefix + std::to_string(index);
+
+    for( size_t i=0; i<globbuf.gl_pathc; i++ )
+    {
+      std::string name;
+      uio_read_sysfs_string(globbuf.gl_pathv[i], "name", name);
+
+      if( name.find(device_name) != std::string::npos )
+      {
+        // Return just the device name without the UIO_BASE_PATH prefix.
+        // (The original shadowed 'name' with an inner declaration here.)
+        std::string path(globbuf.gl_pathv[i]);
+        uio_name = path.substr(sizeof(UIO_BASE_PATH)-1);
+      }
+    }
+    // Bug fix: glob() allocates gl_pathv; the original leaked it on every call.
+    globfree(&globbuf);
+  }
+  return uio_name;
+}
+
+// Searches /sys/class/uio for up to 'max_devices' devices whose sysfs 'name'
+// contains 'device_name' and returns their node names (e.g. "uio0", "uio1").
+board_names uio_get_devices(const std::string device_name, const int max_devices)
+{
+  board_names names;
+  int device = 0;
+
+  glob_t globbuf = {0};
+
+  int glob_res = glob(UIO_BASE_SEARCH, GLOB_NOSORT, NULL, &globbuf);
+  if( (glob_res == 0) && (globbuf.gl_pathc) )
+  {
+    for( size_t i=0; (i<globbuf.gl_pathc) && (device < max_devices); i++ )
+    {
+      std::string name;
+      uio_read_sysfs_string(globbuf.gl_pathv[i], "name", name);
+
+      if( name.find(device_name) != std::string::npos )
+      {
+        // Return just the device name without the UIO_BASE_PATH prefix.
+        // (The original shadowed 'name' with an inner declaration here.)
+        std::string path(globbuf.gl_pathv[i]);
+        names.push_back(path.substr(sizeof(UIO_BASE_PATH)-1));
+        device++;
+      }
+    }
+    // Bug fix: glob() allocates gl_pathv; the original leaked it on every call.
+    globfree(&globbuf);
+  }
+  return names;
+}
+
+//////////////////////////////////////////////////////////////
+// Opens and memory-maps UIO device 'name'. When bEnableIRQ is true (and the
+// build is not polling-based) an interrupt-servicing thread is also started.
+// Use bValid() to check whether construction succeeded.
+uio_device::uio_device(std::string &name, const int mmd_handle, const bool bEnableIRQ)
+: _mmd_handle(mmd_handle)
+{
+  // Map the first address space
+  if ( !map_region(name, 0) ) {
+    ERR("Failed to map region 0 on %s\n", name.c_str());
+    return;
+  }
+#ifndef RUNTIME_POLLING
+  if( bEnableIRQ ) {
+    _spInterrupt = std::make_shared<uio_interrupt>(_fd, _mmd_handle);
+    if( !_spInterrupt->initialized() ) {
+      _spInterrupt = nullptr; // If the uio_interrupt failed to initialize then delete
+    }
+    _bIrqEnabled = bEnableIRQ;
+  }
+#endif
+}
+
+// True when the device node is open and, if interrupts were requested,
+// the interrupt handler initialized successfully.
+bool uio_device::bValid() {
+  bool bValid = (_fd >=0);
+#ifndef RUNTIME_POLLING // If we're not polling check that the interrupt handling is working
+  if( _bIrqEnabled ) {
+    // Bug fix: the original used '|=', which reported the device as valid even
+    // when interrupt setup had failed. Both the fd AND the interrupt handler
+    // must be good for the device to be usable.
+    bValid &= (_spInterrupt != nullptr);
+  }
+#endif
+  return bValid;
+}
+
+// Stops the interrupt thread (if any) before unmapping and closing the device.
+uio_device::~uio_device()
+{
+#ifndef RUNTIME_POLLING
+  _spInterrupt = nullptr; // Shutdown the interrupt handler
+#endif
+  unmap_region();
+}
+
+// Single-register read. NOT YET IMPLEMENTED - always returns 0.
+uint32_t uio_device::read(const uint32_t reg)
+{
+  // NOT YET IMPLEMENTED
+  return 0;
+}
+
+// Single-register write. NOT YET IMPLEMENTED - silently does nothing.
+void uio_device::write(const uint32_t reg, const uint32_t value)
+{
+  // NOT YET IMPLEMENTED
+  return;
+}
+
+// Copies the block of data from the FPGA to the host
+// memcpy is not used as this can cause multiple transfers of the AXI bus depending
+// on the implementation of memcpy
+// Copies 'size' bytes from the mapped device region at 'offset' into
+// 'host_addr' one 32-bit word at a time. Both offset and size must be
+// 32-bit aligned. Returns SUCCESS or FAILURE.
+int uio_device::read_block(void *host_addr, size_t offset, size_t size)
+{
+  // Support for only 32bit aligned transfers
+  if( (offset % sizeof(uint32_t)) || (size % sizeof(uint32_t)) ){
+    return FAILURE;
+  }
+
+  // Transfer the data in 32bit chunks; volatile prevents the compiler from
+  // widening or reordering the device accesses
+  volatile const uint32_t *pDeviceMem32 = reinterpret_cast<volatile const uint32_t*>(reinterpret_cast<uint8_t*>(_pPtr) + offset);
+  uint32_t *host_addr32 = reinterpret_cast<uint32_t *>(host_addr);
+  while (size >= sizeof(uint32_t)) {
+    *host_addr32++ = *pDeviceMem32++;
+    size -= sizeof(uint32_t);
+  }
+
+  return SUCCESS;
+}
+
+// Copies the block of data from the host to the FPGA
+// memcpy is not used as this can cause multiple transfers of the AXI bus depending
+// on the implementation of memcpy
+// Copies 'size' bytes from 'host_addr' into the mapped device region at
+// 'offset' one 32-bit word at a time. Both offset and size must be
+// 32-bit aligned. Returns SUCCESS or FAILURE.
+int uio_device::write_block(const void *host_addr, size_t offset, size_t size)
+{
+  // Support for only 32bit aligned transfers
+  if( (offset % sizeof(uint32_t)) || (size % sizeof(uint32_t)) ){
+    return FAILURE;
+  }
+
+  // Transfer the data in 32bit chunks; volatile prevents the compiler from
+  // widening or reordering the device accesses
+  volatile uint32_t *pDeviceMem32 = reinterpret_cast<volatile uint32_t*>(reinterpret_cast<uint8_t*>(_pPtr) + offset);
+  const uint32_t *host_addr32 = reinterpret_cast<const uint32_t*>(host_addr);
+  while( size >= sizeof(uint32_t) ) {
+    *pDeviceMem32++ = *host_addr32++;
+    size -= sizeof(uint32_t);
+  }
+  return SUCCESS;
+}
+
+// Forwards the application callback to the interrupt thread.
+// Returns FAILURE when interrupts are disabled (polling build) or were
+// never successfully initialized.
+int uio_device::set_interrupt_handler(aocl_mmd_interrupt_handler_fn fn, void* user_data) {
+#ifndef RUNTIME_POLLING
+  if( _spInterrupt ) {
+    return _spInterrupt->set_interrupt_handler(fn, user_data);
+  }
+#endif
+  return FAILURE;
+}
+
+/////////////////////////////////////////////////////////////////
+// Releases the mmap'd region and closes the device fd; safe to call
+// multiple times (both members are reset after release).
+void uio_device::unmap_region()
+{
+  if( _pBase )
+  {
+    munmap(_pBase, _size);
+    _pBase = nullptr;
+  }
+
+  if( _fd >= 0 )
+  {
+    close(_fd);
+    _fd = -1;
+  }
+}
+
+// Reads the size/offset of map 'index' from the device's sysfs attributes,
+// opens /dev/<name> and mmaps the region into this process. On success _fd,
+// _pBase, _size, _offset and _pPtr are set. Returns true on success.
+bool uio_device::map_region( std::string &name, const uint32_t index)
+{
+  char map_path[UIO_MAX_PATH];
+
+  std::string uio_params_path(UIO_BASE_PATH);
+  uio_params_path += name;
+
+  // Bug fix: snprintf() reports truncation by returning a value >= the buffer
+  // size, not a negative value; the original '< 0' checks missed truncation.
+  int len = snprintf(map_path, sizeof(map_path), "maps/map%d/size", index );
+  if( (len < 0) || (len >= (int)sizeof(map_path)) )
+  {
+    ERR("Failed to make map addr name.\n");
+    return false;
+  }
+  if( !uio_read_sysfs_uint64(uio_params_path.c_str(), map_path, _size) )
+  {
+    ERR("Failed to read size\n");
+    return false;
+  }
+  // Make sure that the size doesn't exceed 32bits, as this will fail the mapping
+  // call on 32bit systems
+  if( _size > UINT32_MAX ) {
+    ERR("Invalid size value\n");
+    return false;
+  }
+
+  len = snprintf(map_path, sizeof(map_path), "maps/map%d/offset", index );
+  if( (len < 0) || (len >= (int)sizeof(map_path)) )
+  {
+    ERR("Failed to make map offset name.\n");
+    return false;
+  }
+  if( !uio_read_sysfs_uint64(uio_params_path.c_str(), map_path, _offset) )
+  {
+    ERR("Failed to read offset\n");
+    return false;
+  }
+
+  std::string uio_dev_path("/dev/");
+  uio_dev_path += name;
+
+  _fd = open(uio_dev_path.c_str(), O_RDWR );
+  if( _fd < 0 )
+  {
+    ERR("Failed to open - %s\n", uio_dev_path.c_str());
+    return false;
+  }
+  // Map the region into userspace.
+  // Per the UIO convention the mmap offset selects the map: map N is mapped
+  // by passing N * page_size as the offset.
+  uint32_t page_size = (uint32_t)sysconf(_SC_PAGESIZE);
+
+  _pBase = (uint8_t*)mmap(NULL, (size_t)_size, PROT_READ|PROT_WRITE, MAP_SHARED, _fd, (off_t) (index * page_size));
+  if( _pBase == MAP_FAILED )
+  {
+    ERR("Failed to map uio region.\n");
+    close(_fd);
+    _fd = -1;
+    return false;
+  }
+  // CSR base address is at _pBase + _offset
+  _pPtr = (uint32_t*)(_pBase + _offset);
+
+  return true;
+}
+
+#ifndef RUNTIME_POLLING
+///////////////////////////////////////////////////////////////////////////////////
+// Sets up interrupt servicing for the UIO fd: verifies the IRQ is wired up,
+// creates a shutdown eventfd, and starts the worker thread that polls for
+// interrupts. On any failure the object reports !initialized().
+uio_interrupt::uio_interrupt(const int fd, const int mmd_handle)
+: _device_fd(fd), _mmd_handle(mmd_handle) {
+  if( is_irq_available() ) {
+    // Create a eventfd_object to be used for shutting down the work_thread
+    _spShutdown_event = std::make_shared<eventfd_object>();
+    if( _spShutdown_event->initialized() ) {
+      _pThread = new std::thread(work_thread, std::ref(*this));
+    } else {
+      _spShutdown_event = nullptr;
+    }
+  } else {
+    ERR("No device interrupt found.\n");
+  }
+}
+
+// Signals the worker thread through the shutdown eventfd, joins it, then
+// releases the thread and eventfd resources.
+uio_interrupt::~uio_interrupt() {
+  // kill the thread
+  if (_pThread && _spShutdown_event) {
+    // send message to thread to end it
+    _spShutdown_event->notify(1);
+
+    // join with thread until it ends
+    _pThread->join();
+
+    delete _pThread;
+    _pThread = NULL;
+
+    _spShutdown_event = nullptr;
+  }
+}
+
+// Probes whether the device IRQ is usable by attempting a control write.
+bool uio_interrupt::is_irq_available() {
+  // Disable the interrupt handling, this will fail if the IRQ has not been setup correctly.
+  // For example devicetree is incorrect.
+  return disable_irq();
+}
+
+// Writes 1 to the UIO fd to (re-)enable the device interrupt.
+// Returns false if the write does not complete.
+bool uio_interrupt::enable_irq() {
+  // Enable interrupts from the device
+  uint32_t info = 1;
+  ssize_t nb = write(_device_fd, &info, sizeof(info));
+  if( nb != (ssize_t)sizeof(info) ) {
+    ERR( "Failed in enable CoreDLA Interrupt = %s\n", strerror(errno));
+    return false;
+  }
+  return true;
+}
+
+// Writes 0 to the UIO fd to disable the device interrupt.
+// Returns false if the write does not complete.
+bool uio_interrupt::disable_irq() {
+  // Disable interrupts from the device
+  uint32_t info = 0;
+  ssize_t nb = write(_device_fd, &info, sizeof(info));
+  if( nb != (ssize_t)sizeof(info) ) {
+    ERR( "Failed in disable CoreDLA Interrupt = %s\n", strerror(errno));
+    return false;
+  }
+  return true;
+}
+
+// Static thread entry point - forwards to the instance's run_thread().
+void uio_interrupt::work_thread(uio_interrupt& obj) {
+  obj.run_thread();
+}
+
+// Block indefinitely in poll() (no timeout)
+#define UIO_INTERRUPT_TIMEOUT (-1)
+// Worker loop: re-arms the UIO interrupt, then polls two fds - the shutdown
+// eventfd (index 0) and the UIO device (index 1). A shutdown event breaks the
+// loop; a device interrupt invokes the registered application callback.
+// Unrecoverable poll/read/irq failures terminate the process via exit(-1).
+void uio_interrupt::run_thread() {
+  while( true ) {
+    // Need to re-enable the UIO interrupt handling as UIO disables the IRQ each time it is fired
+    if ( !enable_irq() ) {
+      exit(-1);
+    }
+    // Poll for the shutdown_event and uio interrupt
+    struct pollfd pollfd_arr[2];
+    pollfd_arr[0].fd = _spShutdown_event->get_fd();
+    pollfd_arr[0].events = POLLIN;
+    pollfd_arr[0].revents = 0;
+    pollfd_arr[1].fd = _device_fd;
+    pollfd_arr[1].events = POLLIN;
+    pollfd_arr[1].revents = 0;
+
+    int res = poll(pollfd_arr, 2, UIO_INTERRUPT_TIMEOUT);
+    if (res < 0) {
+      ERR( "Poll error errno = %s\n", strerror(errno));
+      exit(-1);
+    } else if (res > 0 && pollfd_arr[0].revents == POLLIN) {
+      // Shutdown eventfd fired - drain the 8-byte counter and exit the loop
+      uint64_t count;
+      ssize_t bytes_read = read(pollfd_arr[0].fd, &count, sizeof(count));
+      if (bytes_read > 0) {
+        break; // We've been asked to shutdown
+      } else {
+        ERR( "Error: poll failed: %s\n", bytes_read < 0 ? strerror(errno) : "zero bytes read");
+        exit(-1);
+      }
+    } else if (res > 0 && pollfd_arr[1].revents == POLLIN) {
+      // Device interrupt - reading the UIO fd acknowledges it
+      uint32_t count;
+      ssize_t bytes_read = read(pollfd_arr[1].fd, &count, sizeof(count));
+      if (bytes_read > 0) {
+        if( _interrupt_fn ) { // Run the callback to the application
+          _interrupt_fn(get_mmd_handle(), _interrupt_fn_user_data );
+        }
+      } else {
+        ERR( "Error: poll failed: %s\n", bytes_read < 0 ? strerror(errno) : "zero bytes read");
+        exit(-1);
+      }
+    }
+  }
+  // Disable interrupt handling in UIO
+  if( !disable_irq() ){
+    exit(-1);
+  }
+}
+
+// Stores the application callback invoked from the interrupt thread.
+// NOTE(review): no synchronization with run_thread() - assumes the handler
+// is registered before interrupts start firing; confirm with callers.
+int uio_interrupt::set_interrupt_handler(aocl_mmd_interrupt_handler_fn fn, void* user_data) {
+  _interrupt_fn = fn;
+  _interrupt_fn_user_data = user_data;
+  return SUCCESS;
+}
+#endif
diff --git a/python/openvino/runtime/coredla_device/mmd/hps_platform/host/uio_device.h b/python/openvino/runtime/coredla_device/mmd/hps_platform/host/uio_device.h
new file mode 100644
index 0000000..c5f3ed5
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/hps_platform/host/uio_device.h
@@ -0,0 +1,162 @@
+#ifndef UIO_DEVICE_H_
+#define UIO_DEVICE_H_
+
+/* (c) 1992-2021 Intel Corporation. */
+/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */
+/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */
+/* and/or other countries. Other marks and brands may be claimed as the property */
+/* of others. See Trademarks on intel.com for full list of Intel trademarks or */
+/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */
+/* Your use of Intel Corporation's design tools, logic functions and other */
+/* software and tools, and its AMPP partner logic functions, and any output */
+/* files any of the foregoing (including device programming or simulation */
+/* files), and any associated documentation or information are expressly subject */
+/* to the terms and conditions of the Altera Program License Subscription */
+/* Agreement, Intel MegaCore Function License Agreement, or other applicable */
+/* license agreement, including, without limitation, that your use is for the */
+/* sole purpose of programming logic devices manufactured by Intel and sold by */
+/* Intel or its authorized distributors. Please refer to the applicable */
+/* agreement for further details. */
+
+/* ===- uio_device.h ------------------------------------------------- C++ -*-=== */
+/* */
+/* uio device access functions */
+/* */
+/* ===-------------------------------------------------------------------------=== */
+/* */
+/* This file implements the functions used to access the uio device objects */
+/* */
+/* ===-------------------------------------------------------------------------=== */
+#include <vector>
+#include <string>
+#include <string.h>
+#include <memory>
+#include <thread>
+#include <mutex>
+#include <sys/eventfd.h>
+#include <unistd.h>
+
+#include "aocl_mmd.h"
+#include "hps_types.h"
+
// simple wrapper class for managing eventfd objects
// RAII owner of a Linux eventfd: the constructor creates the fd, the destructor
// closes it. On creation failure the object is left in the "not initialized"
// state (query with initialized()) instead of throwing.
class eventfd_object final {
 public:
  eventfd_object() {
    m_initialized = false;
    // Note: EFD_SEMAPHORE and EFD_NONBLOCK are not set
    // The implementation of functions using eventfd assumes that
    m_fd = eventfd(0, 0);
    if (m_fd < 0) {
      fprintf(stderr, "eventfd : %s", strerror(errno));
      return;
    }

    m_initialized = true;
  }

  ~eventfd_object() {
    if (m_initialized) {
      if (close(m_fd) < 0) {
        fprintf(stderr, "eventfd : %s", strerror(errno));
      }
    }
  }

  // Add 'count' to the eventfd counter, waking any poll()/read() waiter.
  // Returns false (after logging to stderr) if the underlying write fails.
  bool notify(uint64_t count) {
    ssize_t res = write(m_fd, &count, sizeof(count));
    if (res < 0) {
      fprintf(stderr, "eventfd : %s", strerror(errno));
      return false;
    }
    return true;
  }

  int get_fd() const { return m_fd; }
  bool initialized() const { return m_initialized; }

 private:
  // non-copyable: this object owns the file descriptor
  // (modernized from the old "private declared, never defined" idiom, which the
  // rest of this header already replaces with '= delete')
  eventfd_object(const eventfd_object& other) = delete;
  eventfd_object& operator=(const eventfd_object& other) = delete;

  // member variables
  int m_fd{-1};
  bool m_initialized{false};  // was declared 'int' but only ever holds true/false
}; // class eventfd_object
typedef std::shared_ptr<eventfd_object> eventfd_object_ptr;
+
+#ifndef RUNTIME_POLLING
// Runs a dedicated thread that waits on a /dev/uio* fd and forwards each
// hardware interrupt to an application-registered callback. Shutdown is
// requested through an eventfd. Only declared here; definitions live in the
// corresponding .cpp.
class uio_interrupt final {
 public:
  // fd: open /dev/uio* file descriptor; mmd_handle: handle of the parent mmd
  // device, passed back to the application callback on every interrupt.
  uio_interrupt(const int fd, const int mmd_handle);
  ~uio_interrupt();
  bool initialized() { return _pThread != nullptr; }; // If the thread is not created then must be invalid
  // Registers the callback (and its user data) invoked on each interrupt.
  int set_interrupt_handler(aocl_mmd_interrupt_handler_fn fn, void* user_data);

 private:
  bool is_irq_available(); // Checks that the interrupt has been mapped into userspace
  bool enable_irq(); // Enables UIO Irq handling
  bool disable_irq(); // Disables UIO Irq handling

  // Thread entry point; forwards to obj.run_thread().
  static void work_thread(uio_interrupt &obj);
  void run_thread(); // Function which handles waiting for interrupts

  // Non-copyable, non-default-constructible: owns a running thread.
  uio_interrupt() = delete;
  uio_interrupt(uio_interrupt const&) = delete;
  void operator=(uio_interrupt const&) = delete;

  int get_mmd_handle() {return _mmd_handle; };

  std::thread *_pThread = {nullptr}; // Pointer to a thread object for waiting for interrupts
  int _device_fd = {-1}; // /dev/uio* device file descriptor
  int _mmd_handle = {-1}; // handle to the parent mmd_device
  eventfd_object_ptr _spShutdown_event = {nullptr}; // Shutdown thread event object

  aocl_mmd_interrupt_handler_fn _interrupt_fn = {nullptr}; // application callback, may be null
  void *_interrupt_fn_user_data = {nullptr}; // opaque context handed back to the callback
};
typedef std::shared_ptr<uio_interrupt> uio_interrupt_ptr;
+#endif
+
// Wrapper around a single /dev/uio* device: maps its register region into the
// process and provides 32-bit register and bulk block access, plus optional
// interrupt handling. Only declared here; definitions live in the .cpp.
class uio_device
{
public:
  // name: uio device node name; mmd_handle: handle of the parent mmd device;
  // bEnableIrq: when true, interrupt delivery is set up as well.
  uio_device(std::string &name, const int mmd_handle, const bool bEnableIrq=false);
  ~uio_device();

  // Single 32-bit register access within the mapped region.
  uint32_t read(const uint32_t reg);
  void write(const uint32_t reg, const uint32_t value);

  // Bulk transfers between host memory and the mapped region; return a status code.
  int read_block(void *host_addr, size_t offset, size_t size);
  int write_block(const void *host_addr, size_t offset, size_t size);
  // Forwards the application interrupt callback to the underlying uio_interrupt.
  int set_interrupt_handler(aocl_mmd_interrupt_handler_fn fn, void* user_data);

  // True when the device was opened and mapped successfully.
  bool bValid();

private:
  bool map_region( std::string &name, const uint32_t index ); // map the uio region into this process
  void unmap_region();

  // Non-copyable, non-default-constructible: owns the fd and the mapping.
  uio_device() = delete;
  uio_device(uio_device const&) = delete;
  void operator=(uio_device const &) = delete;

  int _mmd_handle; // Handle to the parent mmd device
  int _fd = {-1}; // File pointer to UIO - Used to indicate that the uio_device is valid
  uint64_t _size; // Size of the mmapped region
  uint64_t _offset; // Offset of the first register
  uint8_t *_pBase; // Base of the mmapped region

  uint32_t *_pPtr; // The first register
#ifndef RUNTIME_POLLING
  bool _bIrqEnabled; // Indicates that we tried to create with IRQ
  uio_interrupt_ptr _spInterrupt; // Object to handle UIO Interrupts
#endif
};
typedef std::shared_ptr<uio_device> uio_device_ptr;
+
+extern board_names uio_get_devices(const std::string name, const int max_devices);
+extern std::string uio_get_device(const std::string prefix, const int32_t index);
+
+#endif // UIO_DEVICE_H_
diff --git a/python/openvino/runtime/coredla_device/mmd/hps_platform/include/aocl_mmd.h b/python/openvino/runtime/coredla_device/mmd/hps_platform/include/aocl_mmd.h
new file mode 100644
index 0000000..7c1c73d
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/hps_platform/include/aocl_mmd.h
@@ -0,0 +1,645 @@
+#ifndef AOCL_MMD_H
+#define AOCL_MMD_H
+
+/* (c) 1992-2021 Intel Corporation. */
+/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */
+/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */
+/* and/or other countries. Other marks and brands may be claimed as the property */
+/* of others. See Trademarks on intel.com for full list of Intel trademarks or */
+/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */
+/* Your use of Intel Corporation's design tools, logic functions and other */
+/* software and tools, and its AMPP partner logic functions, and any output */
+/* files any of the foregoing (including device programming or simulation */
+/* files), and any associated documentation or information are expressly subject */
+/* to the terms and conditions of the Altera Program License Subscription */
+/* Agreement, Intel MegaCore Function License Agreement, or other applicable */
+/* license agreement, including, without limitation, that your use is for the */
+/* sole purpose of programming logic devices manufactured by Intel and sold by */
+/* Intel or its authorized distributors. Please refer to the applicable */
+/* agreement for further details. */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Support for memory mapped ACL devices.
+ *
+ * Typical API lifecycle, from the perspective of the caller.
+ *
+ * 1. aocl_mmd_open must be called first, to provide a handle for further
+ * operations.
+ *
+ * 2. The interrupt and status handlers must be set.
+ *
+ * 3. Read and write operations are performed.
+ *
+ * 4. aocl_mmd_close may be called to shut down the device. No further
+ * operations are permitted until a subsequent aocl_mmd_open call.
+ *
+ * aocl_mmd_get_offline_info can be called anytime including before
+ * open. aocl_mmd_get_info can be called anytime between open and close.
+ */
+
+#ifndef AOCL_MMD_CALL
+#if defined(_WIN32)
+#define AOCL_MMD_CALL __declspec(dllimport)
+#else
+#define AOCL_MMD_CALL __attribute__((visibility ("default")))
+#endif
+#endif
+
+#ifndef WEAK
+#if defined(_WIN32)
+#define WEAK
+#else
+/* This normally comes with "__attribute__((weak))" but for reasons not presently
+ * understood, the shared library is not properly loaded on Ubuntu18 when the functions
+ * are weak.
+ */
+#define WEAK
+#endif
+#endif
+
+#ifdef DLA_MMD
+#include <cstddef> //size_t
+#include <cstdint> //uint32_t
+#endif
+
+/* The MMD API's version - the runtime expects this string when
+ * AOCL_MMD_VERSION is queried. This changes only if the API has changed */
+#define AOCL_MMD_VERSION_STRING "20.3"
+
+/* Memory types that can be supported - bitfield. Other than physical memory
+ * these types closely align with the OpenCL SVM types.
+ *
+ * AOCL_MMD_PHYSICAL_MEMORY - The vendor interface includes IP to communicate
+ * directly with physical memory such as DDR, QDR, etc.
+ *
+ * AOCL_MMD_SVM_COARSE_GRAIN_BUFFER - The vendor interface includes support for
+ * caching SVM pointer data and requires explicit function calls from the user
+ * to synchronize the cache between the host processor and the FPGA. This level
+ * of SVM is not currently supported by Altera except as a subset of
+ * SVM_FINE_GRAIN_SYSTEM support.
+ *
+ * AOCL_MMD_SVM_FINE_GRAIN_BUFFER - The vendor interface includes support for
+ * caching SVM pointer data and requires additional information from the user
+ * and/or host runtime that can be collected during pointer allocation in order
+ * to synchronize the cache between the host processor and the FPGA. Once this
+ * additional data is provided for an SVM pointer, the vendor interface handles
+ * cache synchronization between the host processor & the FPGA automatically.
+ * This level of SVM is not currently supported by Altera except as a subset
+ * of SVM_FINE_GRAIN_SYSTEM support.
+ *
+ * AOCL_MMD_SVM_FINE_GRAIN_SYSTEM - The vendor interface includes support for
+ * caching SVM pointer data and does not require any additional information to
+ * synchronize the cache between the host processor and the FPGA. The vendor
+ * interface handles cache synchronization between the host processor & the
+ * FPGA automatically for all SVM pointers. This level of SVM support is
+ * currently under development by Altera and some features may not be fully
+ * supported.
+ */
+#define AOCL_MMD_PHYSICAL_MEMORY (1 << 0)
+#define AOCL_MMD_SVM_COARSE_GRAIN_BUFFER (1 << 1)
+#define AOCL_MMD_SVM_FINE_GRAIN_BUFFER (1 << 2)
+#define AOCL_MMD_SVM_FINE_GRAIN_SYSTEM (1 << 3)
+
+/* program modes - bitfield
+ *
+ * AOCL_MMD_PROGRAM_PRESERVE_GLOBAL_MEM - preserve contents of global memory
+ * when this bit is set to 1. If programming can't occur without preserving
+ * global memory contents, the program function must fail, in which case the
+ * runtime may re-invoke program with this bit set to 0, allowing programming
+ * to occur even if doing so destroys global memory contents.
+ *
+ * more modes are reserved for stacking on in the future
+ */
+#define AOCL_MMD_PROGRAM_PRESERVE_GLOBAL_MEM (1 << 0)
+typedef int aocl_mmd_program_mode_t;
+
+typedef void* aocl_mmd_op_t;
+
// 64-bit timestamp carried as two 32-bit halves.
typedef struct {
  unsigned lo; /* 32 least significant bits of time value. */
  unsigned hi; /* 32 most significant bits of time value. */
} aocl_mmd_timestamp_t;
+
+/* Defines the set of characteristics that can be probed about the board before
+ * opening a device. The type of data returned by each is specified in
+ * parentheses in the adjacent comment.
+ *
+ * AOCL_MMD_NUM_BOARDS and AOCL_MMD_BOARD_NAMES
+ * These two fields can be used to implement multi-device support. The MMD
+ * layer may have a list of devices it is capable of interacting with, each
+ * identified with a unique name. The length of the list should be returned
+ * in AOCL_MMD_NUM_BOARDS, and the names of these devices returned in
+ * AOCL_MMD_BOARD_NAMES. The OpenCL runtime will try to call aocl_mmd_open
+ * for each board name returned in AOCL_MMD_BOARD_NAMES.
+ */
// Offline board queries; passed to aocl_mmd_get_offline_info(), usable before
// any device is opened (see the comment block above for the multi-device use).
typedef enum {
  AOCL_MMD_VERSION = 0, /* Version of MMD (char*)*/
  AOCL_MMD_NUM_BOARDS = 1, /* Number of candidate boards (int)*/
  AOCL_MMD_BOARD_NAMES = 2, /* Names of boards available delimiter=; (char*)*/
  AOCL_MMD_VENDOR_NAME = 3, /* Name of vendor (char*) */
  AOCL_MMD_VENDOR_ID = 4, /* An integer ID for the vendor (int) */
  AOCL_MMD_USES_YIELD = 5, /* 1 if yield must be called to poll hw (int) */
  /* The following can be combined in a bit field:
   * AOCL_MMD_PHYSICAL_MEMORY, AOCL_MMD_SVM_COARSE_GRAIN_BUFFER, AOCL_MMD_SVM_FINE_GRAIN_BUFFER,
   * AOCL_MMD_SVM_FINE_GRAIN_SYSTEM. Prior to 14.1, all existing devices supported physical memory and no types of SVM
   * memory, so this is the default when this operation returns '0' for board MMDs with a version prior to 14.1
   */
  AOCL_MMD_MEM_TYPES_SUPPORTED = 6,
} aocl_mmd_offline_info_t;
+
+/** Possible capabilities to return from AOCL_MMD_*_MEM_CAPABILITIES query */
+/**
+ * If not set allocation function is not supported, even if other capabilities are set.
+ */
+#define AOCL_MMD_MEM_CAPABILITY_SUPPORTED (1 << 0)
+/**
+ * Supports atomic access to the memory by either the host or device.
+ */
+#define AOCL_MMD_MEM_CAPABILITY_ATOMIC (1 << 1)
+/**
+ * Supports concurrent access to the memory either by host or device if the
+ * accesses are not on the same block. Block granularity is defined by
+ * AOCL_MMD_*_MEM_CONCURRENT_GRANULARITY., blocks are aligned to this
+ * granularity
+ */
+#define AOCL_MMD_MEM_CAPABILITY_CONCURRENT (1 << 2)
+/**
+ * Memory can be accessed by multiple devices at the same time.
+ */
+#define AOCL_MMD_MEM_CAPABILITY_P2P (1 << 3)
+
+/* Defines the set of characteristics that can be probed about the board after
+ * opening a device. This can involve communication to the device
+ *
+ * AOCL_MMD_NUM_KERNEL_INTERFACES - The number of kernel interfaces, usually 1
+ *
+ * AOCL_MMD_KERNEL_INTERFACES - the handle for each kernel interface.
+ * param_value will have size AOCL_MMD_NUM_KERNEL_INTERFACES * sizeof int
+ *
+ * AOCL_MMD_PLL_INTERFACES - the handle for each pll associated with each
+ * kernel interface. If a kernel interface is not clocked by acl_kernel_clk
+ * then return -1
+ *
+ * */
// Per-device queries, valid between open and close; may involve communication
// with the device (see the comment block above for the multi-value entries).
typedef enum {
  AOCL_MMD_NUM_KERNEL_INTERFACES = 1, /* Number of Kernel interfaces (int) */
  AOCL_MMD_KERNEL_INTERFACES = 2, /* Kernel interface (int*) */
  AOCL_MMD_PLL_INTERFACES = 3, /* Kernel clk handles (int*) */
  AOCL_MMD_MEMORY_INTERFACE = 4, /* Global memory handle (int) */
  AOCL_MMD_TEMPERATURE = 5, /* Temperature measurement (float) */
  AOCL_MMD_PCIE_INFO = 6, /* PCIe information (char*) */
  AOCL_MMD_BOARD_NAME = 7, /* Name of board (char*) */
  AOCL_MMD_BOARD_UNIQUE_ID = 8, /* Unique ID of board (int) */
  AOCL_MMD_CONCURRENT_READS = 9, /* # of parallel reads; 1 is serial*/
  AOCL_MMD_CONCURRENT_WRITES = 10, /* # of parallel writes; 1 is serial*/
  AOCL_MMD_CONCURRENT_READS_OR_WRITES = 11, /* total # of concurrent operations read + writes*/
  AOCL_MMD_MIN_HOST_MEMORY_ALIGNMENT = 12, /* Min alignment that the BSP supports for host allocations (size_t) */
  AOCL_MMD_HOST_MEM_CAPABILITIES = 13, /* Capabilities of aocl_mmd_host_alloc() (unsigned int)*/
  AOCL_MMD_SHARED_MEM_CAPABILITIES = 14, /* Capabilities of aocl_mmd_shared_alloc (unsigned int)*/
  AOCL_MMD_DEVICE_MEM_CAPABILITIES = 15, /* Capabilities of aocl_mmd_device_alloc (unsigned int)*/
  AOCL_MMD_HOST_MEM_CONCURRENT_GRANULARITY = 16, /*(size_t)*/
  AOCL_MMD_SHARED_MEM_CONCURRENT_GRANULARITY = 17, /*(size_t)*/
  AOCL_MMD_DEVICE_MEM_CONCURRENT_GRANULARITY = 18, /*(size_t)*/
} aocl_mmd_info_t;
+
// Payload delivered to the device-interrupt callback (e.g. exception details).
typedef struct {
  unsigned long long int exception_type;
  void* user_private_info; // opaque data accompanying the exception
  size_t user_cb; // presumably the size in bytes of user_private_info — verify against producer
} aocl_mmd_interrupt_info;

// Kernel-interrupt callback; 'handle' identifies the device that fired.
typedef void (*aocl_mmd_interrupt_handler_fn)(int handle, void* user_data);
// Device-event callback carrying the extra aocl_mmd_interrupt_info payload.
typedef void (*aocl_mmd_device_interrupt_handler_fn)(int handle, aocl_mmd_interrupt_info* data_in, void* user_data);
// Operation-status callback: status is 0 on success, negative on error.
typedef void (*aocl_mmd_status_handler_fn)(int handle, void* user_data, aocl_mmd_op_t op, int status);
+
+/* Get information about the board using the enum aocl_mmd_offline_info_t for
+ * offline info (called without a handle), and the enum aocl_mmd_info_t for
+ * info specific to a certain board.
+ * Arguments:
+ *
+ * requested_info_id - a value from the aocl_mmd_offline_info_t enum
+ *
+ * param_value_size - size of the param_value field in bytes. This should
+ * match the size of the return type expected as indicated in the enum
+ * definition. For example, the AOCL_MMD_TEMPERATURE returns a float, so
+ * the param_value_size should be set to sizeof(float) and you should
+ * expect the same number of bytes returned in param_size_ret.
+ *
+ * param_value - pointer to the variable that will receive the returned info
+ *
+ * param_size_ret - receives the number of bytes of data actually returned
+ *
+ * Returns: a negative value to indicate error.
+ */
+AOCL_MMD_CALL int aocl_mmd_get_offline_info(aocl_mmd_offline_info_t requested_info_id,
+ size_t param_value_size,
+ void* param_value,
+ size_t* param_size_ret) WEAK;
+
+// AOCL_MMD_CALL int aocl_mmd_get_info(int handle,
+// aocl_mmd_info_t requested_info_id,
+// size_t param_value_size,
+// void* param_value,
+// size_t* param_size_ret) WEAK;
+
+/* Open and initialize the named device.
+ *
+ * The name is typically one specified by the AOCL_MMD_BOARD_NAMES offline
+ * info.
+ *
+ * Arguments:
+ * name - open the board with this name (provided as a C-style string,
+ * i.e. NUL terminated ASCII.)
+ *
+ * Returns: the non-negative integer handle for the board, otherwise a
+ * negative value to indicate error. Upon receiving the error, the OpenCL
+ * runtime will proceed to open other known devices, hence the MMD mustn't
+ * exit the application if an open call fails.
+ */
+AOCL_MMD_CALL int aocl_mmd_open(const char* name) WEAK;
+
+/* Close an opened device, by its handle.
+ * Returns: 0 on success, negative values on error.
+ */
+AOCL_MMD_CALL int aocl_mmd_close(int handle) WEAK;
+
+/* Set the interrupt handler for the opened device.
+ * The interrupt handler is called whenever the client needs to be notified
+ * of an asynchronous event signaled by the device internals.
+ * For example, the kernel has completed or is stalled.
+ *
+ * Important: Interrupts from the kernel must be ignored until this handler is
+ * set
+ *
+ * Arguments:
+ * fn - the callback function to invoke when a kernel interrupt occurs
+ * user_data - the data that should be passed to fn when it is called.
+ *
+ * Returns: 0 if successful, negative on error
+ */
+AOCL_MMD_CALL int aocl_mmd_set_interrupt_handler(int handle, aocl_mmd_interrupt_handler_fn fn, void* user_data) WEAK;
+
+/* Set the device interrupt handler for the opened device.
+ * The device interrupt handler is called whenever the client needs to be notified
+ * of a device event signaled by the device internals.
+ * For example, an ECC error has been reported.
+ *
+ * Important: Interrupts from the device must be ignored until this handler is
+ * set
+ *
+ * Arguments:
+ * fn - the callback function to invoke when a device interrupt occurs
+ * user_data - the data that should be passed to fn when it is called.
+ *
+ * Returns: 0 if successful, negative on error
+ */
+// AOCL_MMD_CALL int aocl_mmd_set_device_interrupt_handler(int handle,
+// aocl_mmd_device_interrupt_handler_fn fn,
+// void* user_data) WEAK;
+
+/* Set the operation status handler for the opened device.
+ * The operation status handler is called with
+ * status 0 when the operation has completed successfully.
+ * status negative when the operation completed with errors.
+ *
+ * Arguments:
+ * fn - the callback function to invoke when a status update is to be
+ * performed.
+ * user_data - the data that should be passed to fn when it is called.
+ *
+ * Returns: 0 if successful, negative on error
+ */
+//AOCL_MMD_CALL int aocl_mmd_set_status_handler(int handle, aocl_mmd_status_handler_fn fn, void* user_data) WEAK;
+
+/* If AOCL_MMD_USES_YIELD is 1, this function is called when the host is idle
+ * and hence possibly waiting for events to be processed by the device.
+ * If AOCL_MMD_USES_YIELD is 0, this function is never called and the MMD is
+ * assumed to provide status/event updates via some other execution thread
+ * such as through an interrupt handler.
+ *
+ * Returns: non-zero if the yield function performed useful work such as
+ * processing DMA transactions, 0 if there is no useful work to be performed
+ *
+ * NOTE: yield may be called continuously as long as it reports that it has useful work
+ */
+//AOCL_MMD_CALL int aocl_mmd_yield(int handle) WEAK;
+
+/* Read, write and copy operations on a single interface.
+ * If op is NULL
+ * - Then these calls must block until the operation is complete.
+ * - The status handler is not called for this operation.
+ *
+ * If op is non-NULL, then:
+ * - These may be non-blocking calls
+ * - The status handler must be called upon completion, with status 0
+ * for success, and a negative value for failure.
+ *
+ * Arguments:
+ * op - the operation object used to track this operations progress
+ *
+ * len - the size in bytes to transfer
+ *
+ * src - the host buffer being read from
+ *
+ * dst - the host buffer being written to
+ *
+ * mmd_interface - the handle to the interface being accessed. E.g. To
+ * access global memory this handle will be whatever is returned by
+ * aocl_mmd_get_info when called with AOCL_MMD_MEMORY_INTERFACE.
+ *
+ * offset/src_offset/dst_offset - the byte offset within the interface that
+ * the transfer will begin at.
+ *
+ * The return value is 0 if the operation launch was successful, and
+ * negative otherwise.
+ */
+AOCL_MMD_CALL int aocl_mmd_read(
+ int handle, aocl_mmd_op_t op, size_t len, void* dst, int mmd_interface, size_t offset) WEAK;
+AOCL_MMD_CALL int aocl_mmd_write(
+ int handle, aocl_mmd_op_t op, size_t len, const void* src, int mmd_interface, size_t offset) WEAK;
+// AOCL_MMD_CALL int aocl_mmd_copy(
+// int handle, aocl_mmd_op_t op, size_t len, int mmd_interface, size_t src_offset, size_t dst_offset) WEAK;
+
+/* Host Channel create operation
+ * Opens channel between host and kernel.
+ *
+ * Arguments:
+ * channel_name - name of channel to initialize. Same name as used in board_spec.xml
+ *
+ * queue_depth - the size in bytes of pinned memory queue in system memory
+ *
+ * direction - the direction of the channel
+ *
+ * The return value is negative if initialization was unsuccessful, and
+ * positive otherwise. Positive return value is handle to the channel to be used for
+ * subsequent calls for the channel.
+ */
+//AOCL_MMD_CALL int aocl_mmd_hostchannel_create(int handle, char* channel_name, size_t queue_depth, int direction) WEAK;
+
+/* Host Channel destroy operation
+ * Closes channel between host and kernel.
+ *
+ * Arguments:
+ * channel - the handle to the channel to close, that was obtained with
+ * create channel
+ *
+ * The return value is 0 if the destroy was successful, and negative
+ * otherwise.
+ */
+//AOCL_MMD_CALL int aocl_mmd_hostchannel_destroy(int handle, int channel) WEAK;
+
+/* Host Channel get buffer operation
+ * Provide host with pointer to buffer they can access to write or
+ * read from kernel, along with space or data available in the buffer
+ * in bytes.
+ *
+ * Arguments:
+ * channel - the handle to the channel to get the buffer for
+ *
+ * buffer_size - the address that this call will write the amount of
+ * space or data that's available in the buffer,
+ * depending on direction of the channel, in bytes
+ *
+ * status - the address that this call will write to for result of this
+ * call. Value will be 0 for success, and negative otherwise
+ *
+ * The return value is the pointer to the buffer that host can write
+ * to or read from. NULL if the status is negative.
+ */
+//AOCL_MMD_CALL void* aocl_mmd_hostchannel_get_buffer(int handle, int channel, size_t* buffer_size, int* status) WEAK;
+
+/* Host Channel acknowledge buffer operation
+ * Acknowledge to the channel that the user has written or read data from
+ * it. This will make the data or additional buffer space available to
+ * write to or read from kernel.
+ *
+ * Arguments:
+ * channel - the handle to the channel that user is acknowledging
+ *
+ * send_size - the size in bytes that the user is acknowledging
+ *
+ * status - the address that this call will write to for result of this
+ * call. Value will be 0 for success, and negative otherwise
+ *
+ * The return value is equal to send_size if send_size was less than or
+ * equal to the buffer_size from get buffer call. If send_size was
+ * greater, then return value is the amount that was actually sent.
+ */
+//AOCL_MMD_CALL size_t aocl_mmd_hostchannel_ack_buffer(int handle, int channel, size_t send_size, int* status) WEAK;
+
+/* Program the device
+ *
+ * The host will guarantee that no operations are currently executing on the
+ * device. That means the kernels will be idle and no read/write/copy
+ * commands are active. Interrupts should be disabled and the FPGA should
+ * be reprogrammed with the data from user_data which has size size. The host
+ * will then call aocl_mmd_set_status_handler and aocl_mmd_set_interrupt_handler
+ * again. At this point interrupts can be enabled.
+ *
+ * The new handle to the board after reprogram does not have to be the same as
+ * the one before.
+ *
+ * Arguments:
+ * user_data - The binary contents of the fpga.bin file created during
+ * Quartus II compilation.
+ * size - the size in bytes of user_data
+ * program_mode - bit field for programming attributes. See
+ * aocl_mmd_program_mode_t definition
+ *
+ * Returns: the new non-negative integer handle for the board, otherwise a
+ * negative value to indicate error.
+ */
+
+// #ifdef DLA_MMD
+// // CoreDLA BSP has removed some stuff that MMD tries to handshake with, so provide a "raw access" function to
+// // reprogram the FPGA directly from the sof. Can't call quartus_pgm directly since the MMD still needs to mask
+// // the PCIe surprise down error (when full-chip programming the FPGA, the CPU thinks a PCIe device has disappeared).
+// // BEWARE: reprogramming will invalidate the handle
+// AOCL_MMD_CALL int aocl_mmd_program_sof(int handle, const char* sof_filename) WEAK;
+// #else
+// AOCL_MMD_CALL int aocl_mmd_program(int handle, void* user_data, size_t size, aocl_mmd_program_mode_t program_mode) WEAK;
+// #endif
+
+/** Error values*/
+#define AOCL_MMD_ERROR_SUCCESS 0
+#define AOCL_MMD_ERROR_INVALID_HANDLE -1
+#define AOCL_MMD_ERROR_OUT_OF_MEMORY -2
+#define AOCL_MMD_ERROR_UNSUPPORTED_ALIGNMENT -3
+#define AOCL_MMD_ERROR_UNSUPPORTED_PROPERTY -4
+#define AOCL_MMD_ERROR_INVALID_POINTER -5
+#define AOCL_MMD_ERROR_INVALID_MIGRATION_SIZE -6
+
/** Memory properties. Values start at 1 because property lists passed to the
 * allocation functions are terminated with 0. */
typedef enum {
  /**
   * Specifies the name of a global memory that can be found in the
   * board_spec.xml file for the BSP. Allocations will be allocated to this
   * global memory interface.
   */
  AOCL_MMD_MEM_PROPERTIES_GLOBAL_MEMORY = 1,
  /**
   * Specifies the index of a bank inside the global memory interface that can be found in
   * the board_spec.xml file for the BSP. Allocations will be allocated to this
   * memory bank. It is invalid to specify this property without also specifying
   * AOCL_MMD_GLOBAL_MEMORY_INTERFACE.
   */
  AOCL_MMD_MEM_PROPERTIES_MEMORY_BANK
} aocl_mmd_mem_properties_t;
+
+/**
+ * Host allocations provide memory that is allocated on the host. Host
+ * allocations are accessible by the host and one or more devices.
+ * The same pointer to a host allocation may be used on the host and all
+ * supported devices; they have address equivalence. This memory must be
+ * deallocated with aocl_mmd_free();
+ *
+ * Once the device has signaled completion through
+ * aocl_mmd_interrupt_handler_fn() the host can assume it has access to the
+ * latest contents of the memory, allocated by this call.
+ *
+ * @param handles Handles for devices that will need access to this memory
+ * @param num_devices Number of devices in the handles
+ * @param size The size of the memory region
+ * @param alignment The alignment in bytes of the allocation
+ * @param properties Specifies additional information about the allocated
+ * memory, described by a property type name and its corresponding value.
+ * Each property type name is immediately followed by the corresponding
+ * desired value. The list is terminated with 0. Supported values are
+ * described above. Example: [<property1>, <value1>, <property2>, <value2>, 0]
+ * @param error The error code defined by AOCL_MMD_ERROR*
+ * @return valid pointer, on error NULL
+ */
+// AOCL_MMD_CALL void* aocl_mmd_host_alloc(int* handles,
+// size_t num_devices,
+// size_t size,
+// size_t alignment,
+// aocl_mmd_mem_properties_t* properties,
+// int* error) WEAK;
+
+/**
+ * Frees memory that has been allocated by MMD
+ *
+ * @param mem The pointer to the memory region. Must be a pointer that is
+ * allocated by the MMD.
+ * @return AOCL_MMD_ERROR_SUCCESS if success, else error code
+ */
+// AOCL_MMD_CALL int aocl_mmd_free(void* mem) WEAK;
+
+/**
+ * Allocate memory that is owned by the device. This pointer can only be
+ * accessed by the kernel; can't be accessed by the host. The host is able to
+ * manipulate the pointer (e.g. increment it) just not access the underlying
+ * data. This memory must be deallocated by aocl_mmd_free();
+ *
+ * @param handle Device that will have access to this memory
+ * @param size The size of the memory region
+ * @param alignment The alignment in bytes of the memory region
+ * @param properties Specifies additional information about the allocated
+ * memory, described by a property type name and its corresponding value.
+ * Each property type name is immediately followed by the corresponding
+ * desired value. The list is terminated with 0. Supported values are
+ * described above. Example: [<property1>, <value1>, <property2>, <value2>, 0]
+ * @param error The error code defined by AOCL_MMD_ERROR*
+ * @return Pointer that can be passed into the kernel. NULL on failure.
+ */
+// AOCL_MMD_CALL void* aocl_mmd_device_alloc(
+// int handle, size_t size, size_t alignment, aocl_mmd_mem_properties_t* properties, int* error) WEAK;
+
+/**
+ * Shared allocations may migrate between the host and one or more associated
+ * device. The same pointer to a shared allocation may be used on the host and
+ * the supported device; they have address equivalence.
+ *
+ * If the device does not support concurrent access to memory allocated by
+ * aocl_mmd_shared_alloc() then a call must be made to
+ * aocl_mmd_shared_mem_migrate() to indicate that the shared allocation should
+ * be migrated to the device before the device accesses this memory. For
+ * example, a call to aocl_mmd_shared_mem_migrate() should be made before a
+ * kernel accessing this memory is launched). Conversely,
+ * aocl_mmd_shared_mem_migrate() should be called again to indicate that the
+ * shared allocation should be migrated to the host before the host accesses
+ * this memory again. If the device supports concurrent access to memory
+ * allocated with aocl_mmd_shared_alloc(), then the call to
+ * aocl_mmd_shared_mem_migrate() is not necessary, but may still be made. In
+ * the case of concurrent access, it is the responsibility of the MMD to ensure
+ * both the device and host can access aocl_mmd_shared_alloc() allocations at
+ * all times.
+ *
+ * Memory allocated by aocl_mmd_shared_alloc() must be deallocated with
+ * aocl_mmd_free().
+ *
+ * @param handle Device that will have access to this memory
+ * @param size The size of the memory region
+ * @param alignment The alignment in bytes of the memory region
+ * @param properties Specifies additional information about the allocated
+ * memory, described by a property type name and its corresponding value.
+ * Each property type name is immediately followed by the corresponding
+ * desired value. The list is terminated with 0. Supported properties are
+ * listed above and have the prefix AOCL_MMD_MEM_PROPERTIES_.
+ * Example: [<property1>, <value1>, <property2>, <value2>, 0]
+ * @param error The error code defined by AOCL_MMD_ERROR*
+ * @return valid pointer, on error NULL
+ */
+// AOCL_MMD_CALL void* aocl_mmd_shared_alloc(
+// int handle, size_t size, size_t alignment, aocl_mmd_mem_properties_t* properties, int* error) WEAK;
+
+typedef enum { AOCL_MMD_MIGRATE_TO_HOST = 0, AOCL_MMD_MIGRATE_TO_DEVICE = 1 } aocl_mmd_migrate_t;
+
+/**
+ * A call to aocl_mmd_shared_migrate() must be made for non-concurrent shared
+ * allocations any time the accessor of the allocation changes. For example,
+ * aocl_mmd_shared_migrate() should be called indicating that the allocation
+ * should be migrated to the device before a kernel accessing the allocation
+ * is launched on the device. Similarly, aocl_mmd_shared_migrate() should be
+ * called indicating that the allocation is migrated to the host before the
+ * host accesses the memory after kernel completion.
+ *
+ * For concurrent allocations this call may be used as a performance hint, but
+ * is not strictly required for functionality.
+ *
+ * @param handle Device that will have access to this memory
+ * @param shared_ptr Pointer allocated by aocl_mmd_shared_alloc()
+ * @param size In bytes, the size of the migration. Must be of multiple of a
+ * page boundary that the BSP supports.
+ * @param destination The destination of migration
+ * @return The error code defined by AOCL_MMD_ERROR*
+ */
+// AOCL_MMD_CALL int aocl_mmd_shared_migrate(int handle,
+// void* shared_ptr,
+// size_t size,
+// aocl_mmd_migrate_t destination) WEAK;
+
+// CoreDLA modifications
+// To support multiple different FPGA boards, anything board specific must be implemented in a
+// board-specific MMD instead of the CoreDLA runtime layer.
+#ifdef DLA_MMD
+// Query functions to get board-specific values
+AOCL_MMD_CALL int dla_mmd_get_max_num_instances() WEAK;
+AOCL_MMD_CALL uint64_t dla_mmd_get_ddr_size_per_instance() WEAK;
+AOCL_MMD_CALL double dla_mmd_get_ddr_clock_freq() WEAK;
+
+// Wrappers around CSR and DDR reads and writes to abstract away board-specific offsets
+AOCL_MMD_CALL int dla_mmd_csr_write(int handle, int instance, uint64_t addr, const uint32_t* data) WEAK;
+AOCL_MMD_CALL int dla_mmd_csr_read(int handle, int instance, uint64_t addr, uint32_t* data) WEAK;
+AOCL_MMD_CALL int dla_mmd_ddr_write(int handle, int instance, uint64_t addr, uint64_t length, const void* data) WEAK;
+AOCL_MMD_CALL int dla_mmd_ddr_read(int handle, int instance, uint64_t addr, uint64_t length, void* data) WEAK;
+
+#define STREAM_CONTROLLER_ACCESS
+#ifdef STREAM_CONTROLLER_ACCESS
+AOCL_MMD_CALL bool dla_is_stream_controller_valid(int handle, int instance) WEAK;
+AOCL_MMD_CALL int dla_mmd_stream_controller_write(int handle, int instance, uint64_t addr, uint64_t length, const void* data) WEAK;
+AOCL_MMD_CALL int dla_mmd_stream_controller_read(int handle, int instance, uint64_t addr, uint64_t length, void* data) WEAK;
+#endif
+
+// Get the PLL clock frequency in MHz, returns a negative value if there is an error
+AOCL_MMD_CALL double dla_mmd_get_coredla_clock_freq(int handle) WEAK;
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/python/openvino/runtime/coredla_device/mmd/system_console/CMakeLists.txt b/python/openvino/runtime/coredla_device/mmd/system_console/CMakeLists.txt
new file mode 100644
index 0000000..d8be216
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/system_console/CMakeLists.txt
@@ -0,0 +1,2 @@
+
+add_library(system_console_mmd INTERFACE)
diff --git a/python/openvino/runtime/coredla_device/mmd/system_console/mmd_wrapper.cpp b/python/openvino/runtime/coredla_device/mmd/system_console/mmd_wrapper.cpp
new file mode 100644
index 0000000..64c6631
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/system_console/mmd_wrapper.cpp
@@ -0,0 +1,320 @@
+// Copyright 2020-2024 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#include "mmd_wrapper.h"
+#include "dla_dma_constants.h" // DLA_DMA_CSR_OFFSET_***
+
+#include <cassert> // assert
+#include <cstddef> // size_t
+#include <iostream> // std::cerr
+#include <stdexcept> // std::runtime_error
+#include <string> // std::string
+
+#include <boost/process.hpp>
+#include <boost/filesystem.hpp>
+#include <boost/format.hpp>
+#include <boost/filesystem/fstream.hpp>
+#include <boost/process/environment.hpp>
+#include <string>
+#include <iostream>
+#include <string>
+#include <cstdio>
+#include <sstream>
+#include <ostream>
+
+#define xstr(s) _str(s)
+#define _str(s) #s
+
+// All board variants must obey the CoreDLA CSR spec, which says that all access must be
+// - 32 bits in size
+// - address must be 4 byte aligned
+// - within the address range, CSR size is 2048 bytes
+constexpr uint64_t DLA_CSR_ALIGNMENT = 4;
+constexpr uint64_t DLA_CSR_SIZE = 2048;
+namespace bp = boost::process; //we will assume this for all further examples
+
+constexpr auto max_size = std::numeric_limits<std::streamsize>::max();
+
+static const boost::filesystem::path system_console_path("/home/pmclean/intelfpga_pro/23.4/qprogrammer/syscon/bin/system-console");
+static boost::filesystem::path temp_file_path;
+static boost::filesystem::path tcl_file_path;
+static boost::filesystem::path sof_file_path;
+static uint32_t enable_pmon;
+static bool preserve_temp_files;
+
+const uint32_t DLA_CSR_BASE_ADDRESS = 0x80000000;
+const uint32_t DLA_DDR_BASE_ADDRESS = 0x0;
+
+
+static bp::opstream in;
+static bp::ipstream out;
+static bp::child subprocess;
+
+// Drain the system-console subprocess's stdout into `capture` until the next
+// interactive prompt character ('%') is consumed. Returns 0 once the prompt
+// is reached, or 1 on EOF / unrecoverable stream error.
+// Reads in 4 KiB chunks: getline() sets failbit when the buffer fills without
+// seeing the delimiter, which the loop condition uses to keep reading.
+static int capture_till_prompt(bp::ipstream& out, std::ostream& capture)
+{
+ std::array<char, 4096> line_buffer;
+ if (out.fail()) {
+ std::cout << "EOF" << std::endl;
+ return 1;
+ }
+
+ do {
+ // Clear failbit from a previous full-buffer iteration before retrying.
+ out.clear();
+ out.getline(&line_buffer[0], (std::streamsize)line_buffer.size(), '%');
+ capture.write(&line_buffer[0], out.gcount());
+ // If out.getline fills the line buffer without encountering the delimiter
+ // then the failbit of out will be set, causing out.fail() to return true.
+ // bp::ipstream indirectly inherits std::ios_base::iostate, which defines failbit/badbit
+ // gcount() == size-1 distinguishes "buffer full, more data pending" from a
+ // genuine stream failure (EOF), which terminates the loop below.
+ } while (out.fail() && (static_cast<long unsigned int> (out.gcount()) == line_buffer.size()-1));
+
+ if (out.fail()) {
+ std::cout << "EOF" << std::endl;
+ return 1;
+ }
+ return 0;
+}
+
+// Convenience wrapper around capture_till_prompt that discards the captured
+// output to stdout. Returns 0 on success, 1 on EOF.
+static int wait_for_prompt(bp::ipstream& out)
+{
+ const int status = capture_till_prompt(out, std::cout);
+ return status;
+}
+
+// Return a copy of `input` with every non-alphanumeric character removed.
+// Used to trim prompt noise/control characters off values echoed by
+// system-console before parsing them.
+std::string remove_non_alphanumeric(const std::string& input) {
+ std::string result;
+ result.reserve(input.size());
+ for (const char ch : input) {
+ if (std::isalnum(static_cast<unsigned char>(ch))) {
+ result.push_back(ch);
+ }
+ }
+ return result;
+}
+
+// Send one Tcl command line to the system-console subprocess. std::endl
+// appends the newline and flushes, so the command executes immediately.
+static void send_command(bp::opstream& in, std::string command)
+{
+ in << command << std::endl;
+}
+
+// Write one 32-bit word to the CoreDLA CSR at byte offset `addr`. The CSR
+// window is mapped at DLA_CSR_BASE_ADDRESS on the JTAG master, so the offset
+// is rebased before issuing the master_write_32 Tcl command.
+static void write_to_csr(bp::opstream& in, bp::ipstream& out, uint32_t addr, uint32_t data) {
+ addr += DLA_CSR_BASE_ADDRESS;
+ send_command(in, "master_write_32 $::g_dla_csr_service " + str( boost::format("0x%|08x| 0x%|08x|") % addr % data));
+ if (0 != wait_for_prompt(out))
+ {
+ throw std::runtime_error("Unexpected EOF");
+ }
+}
+
+// Read one 32-bit word from the CoreDLA CSR at byte offset `addr` via a
+// master_read_32 Tcl command, then parse the hex value echoed back.
+// NOTE(review): two offsets are short-circuited with constants instead of a
+// real JTAG read -- the interrupt mask reads back as 3 (both interrupt bits
+// set) and the license flag as 1 (licensed). Presumably this satisfies the
+// instance-detection and license checks in CoreDlaDevice for this polling
+// MMD; confirm these stubs are intentional.
+static uint32_t read_from_csr(bp::opstream& in, bp::ipstream& out, uint32_t addr) {
+ if (addr == DLA_DMA_CSR_OFFSET_INTERRUPT_MASK)
+ {
+ return 3;
+ }
+ if (addr == DLA_DMA_CSR_OFFSET_LICENSE_FLAG)
+ {
+ return 1;
+ }
+ addr += DLA_CSR_BASE_ADDRESS;
+ send_command(in, "master_read_32 $::g_dla_csr_service " + str( boost::format("0x%|08x|") % addr ) + " 1");
+ std::basic_stringstream<char> s1;
+ std::string captured;
+ // Keep capturing until a chunk contains something other than whitespace /
+ // control characters -- that chunk holds the echoed register value.
+ do {
+ if (0 != capture_till_prompt(out, s1))
+ {
+ throw std::runtime_error("Unexpected EOF");
+ }
+ captured = s1.str();
+ } while (std::all_of(captured.begin(), captured.end(), [](unsigned char c){return (std::isspace(c) || std::iscntrl(c));}));
+ std::string trimmed = remove_non_alphanumeric(captured);
+
+ uint32_t data = std::stoul(trimmed, nullptr, 16);
+
+ return data;
+}
+
+// Read `length` bytes of device DDR starting at `addr` into `data`.
+// system-console can only transfer DDR contents through a host-side file, so
+// the data is staged via a uniquely-named temporary file under temp_file_path
+// (removed afterwards unless DLA_PRESERVE_TEMP_FILES is set).
+// Throws std::runtime_error on a null buffer, subprocess EOF, an unreadable
+// staging file, or a short read.
+static void read_from_ddr(bp::opstream& in, bp::ipstream& out, uint64_t addr, uint64_t length, void* data)
+{
+ if (data == nullptr)
+ {
+ throw std::runtime_error("null data");
+ }
+ boost::filesystem::path temp_file_name = boost::filesystem::unique_path();
+ boost::filesystem::path temppath = temp_file_path / temp_file_name;
+ send_command(in, "master_read_to_file $::g_emif_ddr_service " + temppath.generic_string() + str( boost::format(" 0x%|08x| 0x%|08x|") % addr % length ) );
+ if (0 != wait_for_prompt(out)) {
+ throw std::runtime_error("Unexpected EOF");
+ }
+ boost::filesystem::ifstream ifs(temppath, std::ios::in | std::ios::binary);
+ if (ifs.fail()) {
+ // Mirror the error handling in write_to_ddr: without this check a
+ // missing/unreadable staging file leaves the caller's buffer silently
+ // uninitialized.
+ throw std::runtime_error("Failed to access the temporary file " + temppath.generic_string());
+ }
+ ifs.read(static_cast<char *>(data), length);
+ // Detect a truncated transfer before the staging file is cleaned up.
+ const bool shortRead = static_cast<uint64_t>(ifs.gcount()) != length;
+ ifs.close();
+
+ if (!preserve_temp_files) {
+ try {
+ boost::filesystem::remove(temppath);
+ } catch (const boost::filesystem::filesystem_error& ex) {
+ std::cerr << "Error removing file: " << ex.what() << std::endl;
+ }
+ }
+
+ if (shortRead) {
+ throw std::runtime_error("Short read from the temporary file " + temppath.generic_string());
+ }
+}
+
+// Write `length` bytes from `data` into device DDR at `addr`. The payload is
+// staged through a uniquely-named temporary file which system-console then
+// pushes to the device; the file is removed afterwards unless
+// DLA_PRESERVE_TEMP_FILES is set. Throws std::runtime_error if the staging
+// file cannot be created or the subprocess hits EOF.
+static void write_to_ddr(bp::opstream& in, bp::ipstream& out, uint64_t addr, uint64_t length, const void* data)
+{
+ boost::filesystem::path temp_file_name = boost::filesystem::unique_path();
+ boost::filesystem::path temppath = temp_file_path / temp_file_name;
+ boost::filesystem::ofstream ofs(temppath, std::ios::out | std::ios::binary);
+ if (ofs.fail()) {
+ throw std::runtime_error("Failed to access the temporary file " + temppath.generic_string());
+ }
+ ofs.write(static_cast<const char *>(data), length);
+ ofs.close();
+ send_command(in, "master_write_from_file $::g_emif_ddr_service " + temppath.generic_string() + str( boost::format(" 0x%|08x|") % addr ) );
+ if (0 != wait_for_prompt(out))
+ {
+ throw std::runtime_error("Unexpected EOF");
+ }
+
+ if (!preserve_temp_files) {
+ try {
+ boost::filesystem::remove(temppath);
+ } catch (const boost::filesystem::filesystem_error& ex) {
+ std::cerr << "Error removing file: " << ex.what() << std::endl;
+ }
+ }
+}
+
+// Bring up the system-console JTAG backend:
+//  1. resolve the Tcl setup script, temp-file directory and .sof bitstream
+//     from environment variables (with defaults),
+//  2. spawn system-console -cli as a child process with piped stdin/stdout,
+//  3. source the setup script (which loads the .sof and claims services),
+//  4. reset the IP and record design constants.
+// Throws std::runtime_error if any required file or executable is missing.
+MmdWrapper::MmdWrapper() {
+ // Check for the environment variable
+ auto env = boost::this_process::environment();
+ tcl_file_path = env.find("DLA_SYSCON_SOURCE_FILE") != env.end() ?
+ boost::filesystem::path(env["DLA_SYSCON_SOURCE_FILE"].to_string()) :
+ boost::filesystem::path(xstr(DLA_SYSCON_SOURCE_ROOT)) / "system_console_script.tcl";
+ if (!boost::filesystem::exists(tcl_file_path)) {
+ throw std::runtime_error("Cannot locate " + tcl_file_path.generic_string() + ". Please specify the path of the Tcl setup script by defining the environment variable DLA_SYSCON_SOURCE_FILE\n");
+ } else {
+ std::cout <<"Using the Tcl setup script at "<<tcl_file_path.generic_string()<<std::endl;
+ }
+
+ temp_file_path = env.find("DLA_TEMP_DIR") != env.end() ?
+ boost::filesystem::path(env["DLA_TEMP_DIR"].to_string()) :
+ boost::filesystem::current_path();
+ if (!boost::filesystem::exists(temp_file_path)) {
+ throw std::runtime_error("The temporary file storage directory specified via the environment variable DLA_TEMP_DIR does not exist.\n");
+ } else {
+ std::cout <<"Saving temporary files to "<<temp_file_path.generic_string()<<std::endl;
+ }
+
+ sof_file_path = env.find("DLA_SOF_PATH") != env.end() ?
+ boost::filesystem::path(env["DLA_SOF_PATH"].to_string()):
+ boost::filesystem::current_path() / "top.sof";
+ if (!boost::filesystem::exists(sof_file_path)) {
+ throw std::runtime_error("Cannot find the FPGA bitstream (.sof). Please specify its location via the environment variable DLA_SOF_PATH,"\
+ " or copy it as top.sof to the current working directory.\n");
+ } else {
+ std::cout <<"Using the FPGA bitstream at "<<sof_file_path.generic_string()<<" to configure the JTAG connection"<<std::endl;
+ }
+
+ // NOTE(review): this local shadows the file-scope hard-coded
+ // system_console_path constant, which is therefore never used here --
+ // the executable is resolved from PATH instead.
+ boost::filesystem::path system_console_path = bp::search_path("system-console");
+ if (system_console_path.empty()) {
+ throw std::runtime_error("Cannot find system-console in system PATH!\n");
+
+ }
+ enable_pmon = env.find("DLA_ENABLE_PMON") != env.end() ? 1 : 0;
+
+ preserve_temp_files = env.find("DLA_PRESERVE_TEMP_FILES") != env.end() ? true : false;
+
+ // Spawn system-console with both pipes attached, then wait for its first
+ // interactive prompt before issuing any commands.
+ subprocess = bp::child(system_console_path, "-cli", bp::std_out > out, bp::std_in < in);
+ if (wait_for_prompt(out))
+ {
+ throw std::runtime_error("Could not find initial prompt");
+ }
+ send_command(in, "set ::cl(sof) " + sof_file_path.generic_string());
+ if (enable_pmon == 1) {
+ send_command(in, "set ::cl(enable_pmon) 1");
+ }
+ send_command(in, "source " + tcl_file_path.generic_string());
+ std::basic_stringstream<char> s1;
+ if (0 != capture_till_prompt(out, s1))
+ {
+ throw std::runtime_error("Could not find prompt after source");
+ }
+ std::string captured(s1.str());
+
+ // Reset the IP
+ write_to_csr(in, out, DLA_DMA_CSR_OFFSET_IP_RESET, 1);
+ // Constants of the design
+ maxInstances_ = 1;
+ ddrSizePerInstance_ = 0x80000000;
+ // Need to change the frequencies below when their counterparts in the Qsys system are modified
+ coreDlaClockFreq_ = 200;
+ ddrClockFreq_ = 200;
+ // Initialize the handle_ object to a dummy value. It is not relevant to this MMD
+ handle_ = 0;
+}
+
+// Tear down the JTAG session: ask the Tcl side to release its claimed
+// services, request a clean exit, then forcibly terminate the child process.
+// Errors are logged rather than thrown -- destructors must not propagate
+// exceptions.
+MmdWrapper::~MmdWrapper() {
+ send_command(in, "close_services");
+ if (wait_for_prompt(out))
+ {
+ std::cout << "Could not find prompt after attempting to close system console services\n";
+ }
+ send_command(in, "exit");
+ try {
+ subprocess.terminate();
+ std::cout << "Successfully closed JTAG services.\n";
+ } catch (const boost::process::process_error& e) {
+ std::cerr << "Failed to terminate the system-console process due to reason: " << e.what() << std::endl;
+ }
+}
+
+// This MMD has no interrupt delivery path over JTAG, so ISR registration is
+// unsupported; callers must use the polling mode instead.
+void MmdWrapper::RegisterISR(interrupt_service_routine_signature func, void *data) const {
+ throw std::runtime_error("System Console plugin requires polling");
+}
+
+// Thin wrappers forwarding CSR/DDR accesses to the file-scope helpers that
+// talk to the system-console subprocess. `instance` is ignored: this MMD
+// reports maxInstances_ = 1 in the constructor.
+void MmdWrapper::WriteToCsr(int instance, uint32_t addr, uint32_t data) const {
+ write_to_csr(in, out, addr, data);
+}
+
+uint32_t MmdWrapper::ReadFromCsr(int instance, uint32_t addr) const {
+ return read_from_csr(in, out, addr);
+}
+
+void MmdWrapper::WriteToDDR(int instance, uint64_t addr, uint64_t length, const void *data) const {
+ write_to_ddr(in, out, addr, length, data);
+}
+
+void MmdWrapper::ReadFromDDR(int instance, uint64_t addr, uint64_t length, void *data) const {
+ read_from_ddr(in, out, addr, length, data);
+}
+
+// Stream controller access. Both branches report "not valid" / no-op here:
+// the system-console MMD does not implement stream-controller transfers.
+// NOTE(review): STREAM_CONTROLLER_ACCESS is #defined unconditionally in the
+// AOCL MMD header; confirm which header this TU includes and therefore which
+// branch actually compiles.
+#ifndef STREAM_CONTROLLER_ACCESS
+// Stream controller access is not supported by the platform abstraction
+bool MmdWrapper::bIsStreamControllerValid(int instance) const { return false; }
+
+// 32-bit handshake with each Stream Controller CSR
+void MmdWrapper::WriteToStreamController(int instance, uint32_t addr, uint64_t length, const void *data) const {
+ assert(false);
+}
+
+void MmdWrapper::ReadFromStreamController(int instance, uint32_t addr, uint64_t length, void *data) const {
+ assert(false);
+}
+#else
+// If the mmd layer supports accesses to the Stream Controller
+bool MmdWrapper::bIsStreamControllerValid(int instance) const {
+ return false;
+}
+
+// 32-bit handshake with each Stream Controller CSR
+void MmdWrapper::WriteToStreamController(int instance, uint32_t addr, uint64_t length, const void *data) const {
+}
+
+void MmdWrapper::ReadFromStreamController(int instance, uint32_t addr, uint64_t length, void *data) const {
+}
+#endif
diff --git a/python/openvino/runtime/coredla_device/mmd/system_console/system_console_script.tcl b/python/openvino/runtime/coredla_device/mmd/system_console/system_console_script.tcl
new file mode 100644
index 0000000..9e0e386
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/system_console/system_console_script.tcl
@@ -0,0 +1,79 @@
+# Author: linqiaol
+# Purpose: Perform write-read tests on external memory and CoreDLA CSR to make sure the registers can be accessed from host.
+
+# Declare and initialize CL arguments (defaults used unless the C++ runtime
+# overrides them with `set ::cl(...)` before sourcing this script)
+if {![info exists ::cl(sof)]} {
+ set ::cl(sof) "top.sof"
+}
+
+if {![info exists ::cl(enable_pmon)]} {
+ set ::cl(enable_pmon) 0
+}
+
+# Declare global variables (service handles filled in by initialization)
+set ::g_emif_calip_service ""
+set ::g_emif_ddr_service ""
+set ::g_dla_csr_service ""
+set ::g_pmon_service ""
+
+# Declare some constants: JTAG master address windows. These must match the
+# DLA_DDR_BASE_ADDRESS / DLA_CSR_BASE_ADDRESS constants in mmd_wrapper.cpp.
+set ::g_const_master_offset_emif 0x0
+set ::g_const_master_range_emif 0x080000000
+set ::g_const_master_offset_dla 0x080000000
+set ::g_const_master_range_dla 0x000001000
+
+#{{{ load_sof
+# Program the FPGA with the bitstream named in ::cl(sof).
+proc load_sof {} {
+ puts "loading sof: $::cl(sof)"
+ design_load $::cl(sof)
+}
+#}}}
+
+#{{{claim_emif_ddr_service
+# Claim exclusive master access to the EMIF (DDR) address window on the first
+# JTAG master found, and return the service handle.
+proc claim_emif_ddr_service {} {
+ set all_master_paths [get_service_paths master]
+ set path [lindex $all_master_paths [lsearch -glob $all_master_paths *jtag*master*]]
+ set service [claim_service master $path {} "\{${::g_const_master_offset_emif} ${::g_const_master_range_emif} EXCLUSIVE\}"]
+ return $service
+}
+#}}}
+
+#{{{claim_dla_csr_service
+# Claim exclusive master access to the CoreDLA CSR address window on the first
+# JTAG master found, and return the service handle.
+proc claim_dla_csr_service {} {
+ set all_master_paths [get_service_paths master]
+ set path [lindex $all_master_paths [lsearch -glob $all_master_paths *jtag*master*]]
+ set service [claim_service master $path {} "\{${::g_const_master_offset_dla} ${::g_const_master_range_dla} EXCLUSIVE\}"]
+ return $service
+}
+#}}}
+
+#{{{claim_pmon_service
+# Claim exclusive access to the AXI4 performance monitor master (only used
+# when ::cl(enable_pmon) is set), and return the service handle.
+proc claim_pmon_service {} {
+ set all_master_paths [get_service_paths master]
+ set path [lindex $all_master_paths [lsearch -glob $all_master_paths *pmon*master*]]
+ set service [claim_service master $path {} {{0x0 0x00001000 EXCLUSIVE}}]
+ return $service
+}
+#}}}
+
+# Entry point run at the bottom of this script: program the FPGA, then claim
+# the CSR / DDR (and optionally pmon) services used by the C++ runtime.
+proc initialization {} {
+ load_sof
+ puts "Claim required services"
+ set ::g_dla_csr_service [claim_dla_csr_service]
+ set ::g_emif_ddr_service [claim_emif_ddr_service]
+ if {$::cl(enable_pmon) == 1} {
+ puts "Claiming JTAG service to the AXI4 performance monitor"
+ set ::g_pmon_service [claim_pmon_service]
+ }
+}
+
+# Release every service claimed by initialization. Invoked by the C++
+# MmdWrapper destructor before the subprocess is terminated.
+proc close_services {} {
+ close_service master $::g_dla_csr_service
+ if {$::cl(enable_pmon) == 1} {
+ close_service master $::g_pmon_service
+ }
+ close_service master $::g_emif_ddr_service
+ puts "Closed DLA JTAG services"
+}
+
+initialization \ No newline at end of file
diff --git a/python/openvino/runtime/coredla_device/src/coredla_batch_job.cpp b/python/openvino/runtime/coredla_device/src/coredla_batch_job.cpp
new file mode 100644
index 0000000..9ac7598
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/src/coredla_batch_job.cpp
@@ -0,0 +1,125 @@
+// Copyright 2020-2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#include "coredla_batch_job.h" //CoreDlaBatchJob
+#include "dla_dma_constants.h" //DLA_DMA_CSR_OFFSET_***
+#include "stream_controller_comms.h"
+
+// Width of one config-reader word in bytes; totalConfigWords is a byte count
+// divided by this when programming the config-reader range register.
+static constexpr int CONFIG_READER_DATA_BYTES = 8;
+
+// Factory: construct a CoreDlaBatchJob (private ctor) and return it through
+// the BatchJob interface pointer.
+std::unique_ptr<BatchJob> CoreDlaBatchJob::MakeUnique(MmdWrapper* mmdWrapper,
+ uint64_t totalConfigWords,
+ uint64_t configBaseAddrDDR,
+ uint64_t inputAddrDDR,
+ uint64_t outputAddrDDR,
+ uint64_t inputSizeDDR,
+ uint64_t outputSizeDDR,
+ const bool enableIstream,
+ const bool enableOstream,
+ int instance,
+ std::shared_ptr<StreamControllerComms> spStreamControllerComms) {
+ return std::unique_ptr<BatchJob>(new CoreDlaBatchJob(mmdWrapper,
+ totalConfigWords,
+ configBaseAddrDDR,
+ inputAddrDDR,
+ outputAddrDDR,
+ inputSizeDDR,
+ outputSizeDDR,
+ enableIstream,
+ enableOstream,
+ instance,
+ spStreamControllerComms));
+}
+// Store the DDR layout (config/input/output regions), streaming flags and
+// hardware instance for this job. No device access happens here.
+CoreDlaBatchJob::CoreDlaBatchJob(MmdWrapper* mmdWrapper,
+ uint64_t totalConfigWords,
+ uint64_t configBaseAddrDDR,
+ uint64_t inputAddrDDR,
+ uint64_t outputAddrDDR,
+ uint64_t inputSizeDDR,
+ uint64_t outputSizeDDR,
+ const bool enableIstream,
+ const bool enableOstream,
+ int instance,
+ std::shared_ptr<StreamControllerComms> spStreamControllerComms)
+: mmdWrapper_(mmdWrapper)
+, instance_(instance)
+, totalConfigWords_(totalConfigWords)
+, configBaseAddrDDR_(configBaseAddrDDR)
+, inputAddrDDR_(inputAddrDDR)
+, outputAddrDDR_(outputAddrDDR)
+, inputSizeDDR_(inputSizeDDR)
+, outputSizeDDR_(outputSizeDDR)
+, enableIstream_(enableIstream)
+, enableOstream_(enableOstream)
+, lastJobQueueNumber_(0)
+, spStreamControllerComms_(spStreamControllerComms) {
+}
+
+// This function must be called by a single thread
+// It can be called on a different thread than StartDla or WaitForDla
+// Copy the input feature buffer into the job's input DDR region, then
+// immediately kick off inference via StartDla().
+void CoreDlaBatchJob::LoadInputFeatureToDDR(void* inputArray) {
+ mmdWrapper_->WriteToDDR(instance_, inputAddrDDR_, inputSizeDDR_, inputArray);
+ StartDla();
+}
+
+// When a stream controller is attached, queue this job's DDR addresses with
+// the NIOS-V stream controller instead of starting it directly. No-op when
+// spStreamControllerComms_ is null.
+// NOTE(review): the 64-bit DDR addresses are truncated to 32 bits for the
+// message payload -- presumably the stream controller only addresses the low
+// 4 GiB; confirm.
+void CoreDlaBatchJob::ScheduleInputFeature() const {
+ if (spStreamControllerComms_) {
+ // Send message to NIOS-V
+ uint64_t configurationSize64 = (totalConfigWords_ / CONFIG_READER_DATA_BYTES) - 2;
+ uint32_t configurationBaseAddressDDR = static_cast<uint32_t>(configBaseAddrDDR_);
+ uint32_t configurationSize = static_cast<uint32_t>(configurationSize64);
+ uint32_t inputAddressDDR = static_cast<uint32_t>(inputAddrDDR_);
+ uint32_t outputAddressDDR = static_cast<uint32_t>(outputAddrDDR_);
+
+ Payload<CoreDlaJobPayload> item;
+ item._configurationBaseAddressDDR = configurationBaseAddressDDR;
+ item._configurationSize = configurationSize;
+ item._inputAddressDDR = inputAddressDDR;
+ item._outputAddressDDR = outputAddressDDR;
+
+ spStreamControllerComms_->ScheduleItems( { item } );
+ }
+}
+
+// This function must be called by a single thread
+// It can be called on a different thread than WaitForDla or LoadInputFeatureToDDR
+// Program the DMA CSR with this job's config region and trigger inference:
+// either arming the streaming interface (when both streams are enabled) or
+// writing the input base address, which starts one run.
+void CoreDlaBatchJob::StartDla() {
+ //////////////////////////////////////
+ // Write to CSR to start the FPGA //
+ //////////////////////////////////////
+
+ // interrupt mask was already enabled in the DlaDevice constructor
+
+ // intermediate buffer address was already set when the graph was loaded
+
+ // base address for config reader
+ mmdWrapper_->WriteToCsr(instance_, DLA_DMA_CSR_OFFSET_CONFIG_BASE_ADDR, configBaseAddrDDR_);
+
+ // how many words for config reader to read
+ // hardware wants the number of words minus 2 since the implementation is a down counter which ends at -1, the sign
+ // bit is used to denote the end of the counter range
+ mmdWrapper_->WriteToCsr(instance_, DLA_DMA_CSR_OFFSET_CONFIG_RANGE_MINUS_TWO, (totalConfigWords_ / CONFIG_READER_DATA_BYTES) - 2);
+
+ if (enableIstream_ && enableOstream_) {
+ // Arm the streaming interface. Will continuously load configs.
+ const unsigned int enable = 1;
+ mmdWrapper_->WriteToCsr(instance_, DLA_CSR_OFFSET_READY_STREAMING_IFACE, enable);
+ } else {
+ // base address for feature reader -- this will trigger one run of DLA
+ mmdWrapper_->WriteToCsr(instance_, DLA_DMA_CSR_OFFSET_INPUT_OUTPUT_BASE_ADDR, inputAddrDDR_);
+ }
+}
+
+// Copy the job's output DDR region into the caller-provided buffer, which
+// must be at least outputSizeDDR_ bytes.
+void CoreDlaBatchJob::ReadOutputFeatureFromDDR(void* outputArray) const {
+ mmdWrapper_->ReadFromDDR(instance_, outputAddrDDR_, outputSizeDDR_, outputArray);
+}
diff --git a/python/openvino/runtime/coredla_device/src/coredla_device.cpp b/python/openvino/runtime/coredla_device/src/coredla_device.cpp
new file mode 100644
index 0000000..b28d8a2
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/src/coredla_device.cpp
@@ -0,0 +1,574 @@
+// Copyright 2020-2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#include "coredla_device.h" //CoreDlaDevice
+#include "coredla_batch_job.h" //CoreDlaBatchJob
+#include "coredla_graph_job.h" //CoreDlaBatchJob
+#include "dla_dma_constants.h" //DLA_DMA_CSR_OFFSET_***
+#include "stream_controller_comms.h"
+
+#include <algorithm> //std::count
+#include <cassert> //assert
+#include <chrono> //std::chrono::seconds
+#include <cstddef> //size_t
+#include <cstdlib> //std::getenv
+#ifndef USE_OLD_COREDLA_DEVICE
+#include <cinttypes> //printf formatters
+#endif
+#include <mutex> //std::mutex
+#include <stdexcept> //std::runtime_error
+#include <string> //std::string
+#include <iostream> //std::cerr
+#include <stdint.h> //
+#include <thread>
+#include <cinttypes>
+
+// Factory selecting the concrete Device implementation for this build.
+// `archParams` is accepted for interface compatibility but unused by the
+// hardware-backed CoreDlaDevice.
+std::unique_ptr<Device> Device::MakeUnique(const arch_params* archParams,
+ uint32_t waitForDlaTimeoutSeconds) {
+ return std::unique_ptr<Device>(new CoreDlaDevice(waitForDlaTimeoutSeconds));
+}
+
+// Interrupt handler registered with the MMD (non-polling mode). Clears the
+// pending interrupt bits on every instance, then refreshes each instance's
+// completion count and wakes any thread blocked in WaitForDla. `handle` is
+// unused; all state travels through the InterruptServiceRoutineData pointer.
+void InterruptServiceRoutine(int handle, void* data) {
+ InterruptServiceRoutineData* isrData = static_cast<InterruptServiceRoutineData*>(data);
+ // clear interrupt status -- write 1 to clear that bit
+ // (value 3 = both the done and error interrupt bits)
+ constexpr int writeDataToClearInterruptStatus = 3;
+ const int numInstances = static_cast<int>(isrData->jobsFinished.size());
+ for (int i = 0; i < numInstances; i++) {
+ isrData->mmdWrapper->WriteToCsr(i, DLA_DMA_CSR_OFFSET_INTERRUPT_CONTROL, writeDataToClearInterruptStatus);
+ }
+ for (int i = 0; i < numInstances; i++) {
+ isrData->desc_queue_diag[i] = isrData->mmdWrapper->ReadFromCsr(i, DLA_DMA_CSR_OFFSET_DESC_DIAGNOSTICS);
+ // ask the csr how many jobs have finished
+ uint32_t completionCount = isrData->mmdWrapper->ReadFromCsr(i, DLA_DMA_CSR_OFFSET_COMPLETION_COUNT);
+ // check if the completionCount wraps around (overflow detection) and save this information
+ if (isrData->prevCount[i] > completionCount)
+ isrData->base_multiplier[i] ++;
+ isrData->prevCount[i] = completionCount;
+ // we add base_multiplier to account for the fact that a wrap around is actually an increment of 1
+ // i.e. m*UINT32_MAX + count + m == m*2^32 + count, extending the 32-bit
+ // hardware counter to 64 bits
+ std::unique_lock<std::mutex> isrMutexLock(isrData->isrMutex[i]);
+ isrData->jobsFinished[i] = (uint64_t) isrData->base_multiplier[i] * UINT32_MAX + completionCount + isrData->base_multiplier[i];
+ isrData->isrCondVar[i].notify_all();
+ }
+}
+
+// Device bring-up: detect how many CoreDLA instances respond on the CSR,
+// check the license flag, set up interrupt or polling completion tracking,
+// snapshot the hardware performance counters, create one DDR allocator per
+// instance, and (optionally, via COREDLA_RUNTIME_MEMORY_TEST) run a full
+// DDR write/read-back test.
+CoreDlaDevice::CoreDlaDevice(uint32_t waitForDlaTimeoutSeconds)
+: waitForDlaTimeoutSeconds_(waitForDlaTimeoutSeconds) {
+#ifdef COREDLA_RUNTIME_POLLING
+ runtimePolling_ = true;
+#else
+ runtimePolling_ = false;
+#endif
+ // mmdWrapper_ ctor runs first, which will open a handle to the MMD. Now determine the number of hardware instances
+ // by writing a nonzero value to some offset and then reading it back. While trying to enable the interrupt
+ // mask, test for this.
+ numInstances_ = 0;
+ for (int i = 0; i < mmdWrapper_.GetMaxInstances(); i++) {
+ constexpr uint32_t allInterruptsMask = (1<<DLA_DMA_CSR_INTERRUPT_ERROR_BIT) | (1<<DLA_DMA_CSR_INTERRUPT_DONE_BIT);
+ // clear any pending interrupts (there may be pending interrupts from last run), then enable mask for instance count
+ mmdWrapper_.WriteToCsr(i, DLA_DMA_CSR_OFFSET_INTERRUPT_CONTROL, allInterruptsMask);
+ mmdWrapper_.WriteToCsr(i, DLA_DMA_CSR_OFFSET_INTERRUPT_MASK, allInterruptsMask);
+ uint32_t readData = mmdWrapper_.ReadFromCsr(i, DLA_DMA_CSR_OFFSET_INTERRUPT_MASK);
+ if (allInterruptsMask == readData) numInstances_ = i + 1;
+ }
+ LOG_AND_PRINT(Logger::INFO, "numInstances_: %d\n", numInstances_);
+ assert(numInstances_ >= 1);
+ jobsWaited_.resize(numInstances_, 0);
+
+ uint32_t license = mmdWrapper_.ReadFromCsr(0, DLA_DMA_CSR_OFFSET_LICENSE_FLAG);
+ if (license == 0) {
+ DLA_LOG("Using unlicensed IP\n");
+ }
+ else if (license == 1) {
+ DLA_LOG("Using licensed IP\n");
+ }
+ else {
+ throw std::runtime_error("Unrecongnized license flag");
+ }
+#ifndef USE_OLD_COREDLA_DEVICE
+ startClocksActive.resize(numInstances_, 0);
+ startClockAllJobs.resize(numInstances_, 0);
+#endif
+ startNumInputFeatureMemoryReads.resize(numInstances_, 0);
+ startNumFilterMemoryReads.resize(numInstances_, 0);
+ startNumOutputFeatureMemoryWrites.resize(numInstances_, 0);
+
+ // Package up the data that interrupt service routine needs
+ isrData_.mmdWrapper = &mmdWrapper_;
+ isrData_.jobsFinished = std::vector<uint64_t>(numInstances_, 0);
+ isrData_.base_multiplier = std::vector<uint32_t>(numInstances_, 0);
+ isrData_.prevCount = std::vector<uint32_t>(numInstances_, 0);
+ isrData_.desc_queue_diag = std::vector<uint32_t>(numInstances_, 0);
+ isrData_.isrMutex = std::vector<std::mutex>(numInstances_);
+ isrData_.isrCondVar = std::vector<std::condition_variable>(numInstances_);
+
+ if (runtimePolling_) {
+ // disable the interrupt mask -- it was originally enabled to determine how many instances were present
+ for (int i = 0; i < mmdWrapper_.GetMaxInstances(); i++) {
+ constexpr uint32_t disableInterruptMaskValue = 0;
+ mmdWrapper_.WriteToCsr(i, DLA_DMA_CSR_OFFSET_INTERRUPT_MASK, disableInterruptMaskValue);
+ }
+ }
+ else {
+ // register an interrupt handler
+ mmdWrapper_.RegisterISR(&InterruptServiceRoutine, &isrData_);
+ }
+
+ // Record the current counters
+ for(int i=0; i < numInstances_; i++) {
+#ifndef USE_OLD_COREDLA_DEVICE
+ jobsWaited_[i] = mmdWrapper_.ReadFromCsr(i, DLA_DMA_CSR_OFFSET_COMPLETION_COUNT);
+ isrData_.jobsFinished[i] = jobsWaited_[i];
+
+ startClocksActive[i] = GetClocksActive(i);
+ startClockAllJobs[i] = GetClocksAllJobs(i);
+#endif
+ startNumInputFeatureMemoryReads.at(i) = GetNumInputFeatureMemoryReadsTotal(i);
+ startNumFilterMemoryReads.at(i) = GetNumFilterMemoryReadsTotal(i);
+ startNumOutputFeatureMemoryWrites.at(i) = GetNumOutputFeatureMemoryWritesTotal(i);
+ }
+
+ // Allocator needs access to mmd to write to CSR the start address of the shared intermediate buffer allocated in DDR
+ ddrAllocator_ = std::unique_ptr<DeviceMemoryAllocator[]>(new DeviceMemoryAllocator[numInstances_]);
+ for (int i = 0; i < numInstances_; i++) {
+ ddrAllocator_[i].Initialize(mmdWrapper_.GetDDRSizePerInstance(), &mmdWrapper_);
+ }
+
+// Choose which data pattern you want, all zeros or all ones can also be useful for IP debug purposes
+#define DEBUG_RUNTIME_MEMORY_TEST_PATTERN(ADDR, INDEX) ((ADDR * 12345) + (INDEX * 6789))
+ //#define DEBUG_RUNTIME_MEMORY_TEST_PATTERN(ADDR,INDEX) (0)
+ //#define DEBUG_RUNTIME_MEMORY_TEST_PATTERN(ADDR,INDEX) (0xffffffffffffffffULL)
+ bool run_memory_test = getenv("COREDLA_RUNTIME_MEMORY_TEST") != nullptr;
+ if (run_memory_test) {
+ // Ensure host can access all of the device memory that is accessible by all CoreDLA instances
+ // This is not necessarily the total device memory e.g. only 1 CoreDLA instance but 2 DDR banks
+ DLA_LOG("starting memory test with %d instances\n", numInstances_);
+ constexpr uint64_t CHUNK_SIZE = 1ULL << 20; // one address check is 1 MB
+ const uint64_t ADDR_LIMIT = mmdWrapper_.GetDDRSizePerInstance();
+ int mismatch = 0;
+ uint64_t expected;
+ uint64_t* data = new uint64_t[CHUNK_SIZE / sizeof(uint64_t)];
+
+ for (int inst = 0; inst < numInstances_; ++inst) {
+ // write to entire fpga ddr
+ for (uint64_t addr = 0; addr < ADDR_LIMIT; addr += CHUNK_SIZE) {
+ for (uint64_t index = 0; index < CHUNK_SIZE / sizeof(uint64_t); index++)
+ data[index] = DEBUG_RUNTIME_MEMORY_TEST_PATTERN(addr, index);
+ mmdWrapper_.WriteToDDR(inst, addr, CHUNK_SIZE, static_cast<const void*>(data));
+ }
+ // read back entire fpga ddr and compare to expected
+ for (uint64_t addr = 0; addr < ADDR_LIMIT; addr += CHUNK_SIZE) {
+ mmdWrapper_.ReadFromDDR(inst, addr, CHUNK_SIZE, data);
+ for (uint64_t index = 0; index < CHUNK_SIZE / sizeof(uint64_t); index++) {
+ expected = DEBUG_RUNTIME_MEMORY_TEST_PATTERN(addr, index);
+ if (data[index] != expected) {
+ if (mismatch < 10) {
+#if (!defined(USE_OLD_COREDLA_DEVICE) || defined(_WIN32))
+ DLA_LOG("memory test mismatch, addr %" PRIu64 ", index %" PRIu64 ", got %" PRIu64 ", expected %" PRIu64
+ "\n",
+ addr,
+ index,
+ data[index],
+ expected);
+#else
+ DLA_LOG("memory test mismatch, addr %lu, index %lu, got %lu, expected %lu\n",
+ addr,
+ index,
+ data[index],
+ expected);
+#endif
+ }
+ mismatch++;
+ }
+ }
+ }
+ }
+ delete[] data;
+ DLA_LOG("finished memory test ");
+ if (mismatch == 0) {
+ DLA_LOG("SUCCESS\n");
+ } else {
+ DLA_LOG("FAILURE (%d mismatches)\n", mismatch);
+ }
+ }
+}
+
+CoreDlaDevice::~CoreDlaDevice() {
+ // Avoid the scenario where some CoreDLA job has been started but something goes wrong
+ // in the runtime which causes it to exit, e.g. assertion failure or uncaught exception.
+ // CoreDLA will still raise an interrupt when the job finishes, yet the runtime will no
+ // longer be able to deal with it. Better to shut off interrupts.
+ for (int instance = 0; instance < numInstances_; instance++) {
+ // MmDWrapper.WriteToCSR might throw exception, and the destructor should not have
+ // unhandled exception, so we need to handle exceptions internally
+ try {
+ mmdWrapper_.WriteToCsr(instance, DLA_DMA_CSR_OFFSET_INTERRUPT_MASK, 0);
+ } catch (const std::exception& e) {
+ std::cerr << "Failed to shut off the DMA CSR interrupt mask due to " << e.what() << std::endl;
+ }
+ }
+}
+
+// Create a graph job bound to one hardware instance and keep ownership in
+// allGraphJobs_; the returned raw pointer stays valid for the device's
+// lifetime. The AES/IV/encryption and parameter-rom arguments are forwarded
+// through CoreDlaGraphJob::MakeUnique untouched here.
+GraphJob* CoreDlaDevice::CreateGraphJob(const dla::CompiledResult* compiledResult,
+#ifndef USE_OLD_COREDLA_DEVICE
+ size_t numPipelines,
+#else
+ uint64_t numPipelines,
+#endif
+ int instance,
+ std::string AES_key,
+ std::string IV_key,
+ bool encryption_enabled,
+ const std::string export_dir,
+ const std::string parameter_rom_export_dir) {
+ assert(instance < numInstances_);
+ (void) export_dir; // unused in HW runtime. CoreDLA utilizes base pointers, which the SW emulator utilizes this variable. We void it here.
+ allGraphJobs_.push_back(move(
+ CoreDlaGraphJob::MakeUnique(&ddrAllocator_[instance], &mmdWrapper_, compiledResult, numPipelines, instance, spStreamControllerComms_)));
+ return (allGraphJobs_.back()).get();
+}
+
+// This function must be called by a single thread
+// Block until one more job than jobsWaited_[instance] has completed on the
+// given instance, either by polling the completion-count CSR or by waiting on
+// the ISR's condition variable. Throws on timeout or when the descriptor
+// diagnostics report the IP ran out of licensed inferences.
+// NOTE(review): the polling path assigns the raw 32-bit completion count and,
+// unlike the ISR, has no wraparound handling; desc_queue_diag is also only
+// refreshed by the ISR, so the license check below reads a stale value in
+// polling mode -- confirm whether that is intended.
+void CoreDlaDevice::WaitForDla(int instance, size_t threadId, std::function<bool()> isCancelledPredicate) {
+ // ISR updates jobsFinished, if not enough jobs have finished then sleep until ISR runs again
+ // it is possible that several hardware jobs could finish around the same time
+ // by the time software handles the first interrupt, hardware could report that 2 jobs have
+ // finished, for example the second time that waitForInterrupt runs, software already tracks
+ // that the second job has finished and therefore don't need to sleep waiting for ISR
+ std::unique_lock<std::mutex> isrMutexLock(isrData_.isrMutex[instance]);
+ uint32_t completionCount = 0;
+ bool timedOut = false;
+ auto timeoutDuration = std::chrono::seconds(waitForDlaTimeoutSeconds_);
+
+ if (runtimePolling_) {
+ std::chrono::time_point<std::chrono::system_clock> pollingEndingTime =
+ std::chrono::system_clock::now() + timeoutDuration;
+
+ while (isrData_.jobsFinished[instance] == jobsWaited_[instance]) {
+ // Update isrData_.jobsFinished[instance] here (polling)
+ if (isCancelledPredicate and isCancelledPredicate()) {
+ break;
+ }
+
+ completionCount = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_COMPLETION_COUNT);
+ isrData_.jobsFinished[instance] = completionCount;
+ if (std::chrono::system_clock::now() > pollingEndingTime) {
+ timedOut = true;
+ break;
+ }
+ }
+ } else {
+ while (isrData_.jobsFinished[instance] == jobsWaited_[instance]) {
+ // isrData_.jobsFinished[instance] is updated in the ISR
+ if (std::cv_status::timeout == isrData_.isrCondVar[instance].wait_for(isrMutexLock, timeoutDuration)) {
+ timedOut = true;
+ break;
+ }
+ }
+ }
+
+ if (timedOut) {
+ std::string str_poll_vs_int = "interrupt";
+ if (runtimePolling_) {
+ str_poll_vs_int = "polling";
+ }
+ std::string timeoutMsg = "WaitForDla " + str_poll_vs_int + " timeout with threadId_" + std::to_string(threadId) + "\n";
+
+ // Timeout has happened if we get here
+ timeoutMsg += "If inference on one batch is expected to take more than " +
+ std::to_string(waitForDlaTimeoutSeconds_) +
+ " seconds, then increase WAIT_FOR_DLA_TIMEOUT in dlia_plugin.cpp and "
+ "recompile the runtime.\n";
+ DLA_LOG("%s", timeoutMsg.c_str()); // this should always print, even if logging
+ // verbosity is too low
+ LOG(Logger::WARNING, "%s", timeoutMsg.c_str());
+ std::string exceptionMsg = "FATAL ERROR: inference on FPGA did not complete";
+ exceptionMsg += ", jobs finished " + std::to_string(isrData_.jobsFinished[instance]);
+ exceptionMsg += ", jobs waited " + std::to_string(jobsWaited_[instance]);
+ throw std::runtime_error(exceptionMsg);
+ }
+
+ if ((isrData_.desc_queue_diag[instance] >> DLA_DMA_CSR_DESC_DIAGNOSTICS_OUT_OF_INFERENCES_BIT) & 0x01) {
+ std::cerr << "ERROR: Out of free inferences on this IP. " <<
+ "The Intel FPGA AI suite cannot continue without a license!" << std::endl;
+ std::string exceptionMsg = "Inference on FPGA exited with a license error";
+ exceptionMsg += ", jobs finished " + std::to_string(isrData_.jobsFinished[instance]);
+ exceptionMsg += ", jobs waited " + std::to_string(jobsWaited_[instance]);
+ exceptionMsg += "\nPlease check your license. The Intel FPGA AI suite cannot continue without a license!";
+ throw std::runtime_error(exceptionMsg);
+ }
+
+ jobsWaited_[instance]++;
+}
+
+#ifndef USE_OLD_COREDLA_DEVICE
+// Read the 64-bit "clocks active" counter from the DMA CSR. The hardware's
+// anti-rollover scheme requires reading the low 32 bits first, then the high
+// 32 bits immediately afterwards.
+uint64_t CoreDlaDevice::GetClocksActive(int instance) const {
+ const uint32_t lo = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_CLOCKS_ACTIVE_LO);
+ const uint32_t hi = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_CLOCKS_ACTIVE_HI);
+ return (static_cast<uint64_t>(hi) << 32) | static_cast<uint64_t>(lo);
+}
+
+// Active hardware time, in milliseconds, since the start snapshot for this instance.
+double CoreDlaDevice::GetActiveHWTimeMs(int instance) const {
+ const uint64_t elapsedClocks = GetClocksActive(instance) - startClocksActive[instance];
+ // GetDDRClockFreq() is in MHz, so clocks / MHz yields microseconds; the extra
+ // factor of 1000 converts to milliseconds.
+ return elapsedClocks / (1000.0 * mmdWrapper_.GetDDRClockFreq());
+}
+
+// Read the 64-bit "clocks all jobs" counter from the DMA CSR. The hardware's
+// anti-rollover scheme requires reading the low 32 bits first, then the high
+// 32 bits immediately afterwards.
+uint64_t CoreDlaDevice::GetClocksAllJobs(int instance) const {
+ const uint32_t lo = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_CLOCKS_ALL_JOBS_LO);
+ const uint32_t hi = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_CLOCKS_ALL_JOBS_HI);
+ return (static_cast<uint64_t>(hi) << 32) | static_cast<uint64_t>(lo);
+}
+
+// Average hardware time per job, in milliseconds, since the start snapshot.
+// NOTE: assumes num_jobs > 0 (num_jobs == 0 yields inf/NaN) — callers supply the real job count.
+double CoreDlaDevice::GetAvgHWTimePerJobMs(uint64_t num_jobs, int instance) const {
+ const uint64_t elapsedClocks = GetClocksAllJobs(instance) - startClockAllJobs[instance];
+ // GetDDRClockFreq() is in MHz: clocks / MHz = microseconds, and the 1000 converts to ms.
+ return elapsedClocks / (1000.0 * mmdWrapper_.GetDDRClockFreq() * num_jobs);
+}
+#else
+// Legacy (USE_OLD_COREDLA_DEVICE) variant: reads the raw counter directly.
+// Anti-rollover: the low 32 bits must be read first, then the high 32 bits
+// immediately afterwards.
+double CoreDlaDevice::GetActiveHWTimeMs(int instance) const {
+ const uint32_t lo = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_CLOCKS_ACTIVE_LO);
+ const uint32_t hi = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_CLOCKS_ACTIVE_HI);
+ const uint64_t clocks = (static_cast<uint64_t>(hi) << 32) | static_cast<uint64_t>(lo);
+ // DDR clock freq is in MHz: clocks / MHz = microseconds, and the 1000 converts to ms.
+ return clocks / (1000.0 * mmdWrapper_.GetDDRClockFreq());
+}
+
+// Legacy (USE_OLD_COREDLA_DEVICE) variant: reads the raw counter directly.
+// Anti-rollover: the low 32 bits must be read first, then the high 32 bits
+// immediately afterwards. Assumes num_jobs > 0.
+double CoreDlaDevice::GetAvgHWTimePerJobMs(uint64_t num_jobs, int instance) const {
+ const uint32_t lo = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_CLOCKS_ALL_JOBS_LO);
+ const uint32_t hi = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_CLOCKS_ALL_JOBS_HI);
+ const uint64_t clocks = (static_cast<uint64_t>(hi) << 32) | static_cast<uint64_t>(lo);
+ // DDR clock freq is in MHz: clocks / MHz = microseconds, and the 1000 converts to ms.
+ return clocks / (1000.0 * mmdWrapper_.GetDDRClockFreq() * num_jobs);
+}
+#endif
+
+// Input-feature memory reads since the start snapshot for this instance.
+uint64_t CoreDlaDevice::GetNumInputFeatureMemoryReads(int instance) const {
+ const uint64_t total = GetNumInputFeatureMemoryReadsTotal(instance);
+ const uint64_t baseline = startNumInputFeatureMemoryReads.at(instance);
+ return total - baseline;
+}
+
+// Filter memory reads since the start snapshot for this instance.
+uint64_t CoreDlaDevice::GetNumFilterMemoryReads(int instance) const {
+ const uint64_t total = GetNumFilterMemoryReadsTotal(instance);
+ const uint64_t baseline = startNumFilterMemoryReads.at(instance);
+ return total - baseline;
+}
+
+// Output-feature memory writes since the start snapshot for this instance.
+uint64_t CoreDlaDevice::GetNumOutputFeatureMemoryWrites(int instance) const {
+ const uint64_t total = GetNumOutputFeatureMemoryWritesTotal(instance);
+ const uint64_t baseline = startNumOutputFeatureMemoryWrites.at(instance);
+ return total - baseline;
+}
+
+// Cumulative input-feature memory read count from the DMA CSR. Anti-rollover:
+// the low 32 bits must be read first, then the high 32 bits immediately after.
+uint64_t CoreDlaDevice::GetNumInputFeatureMemoryReadsTotal(int instance) const {
+ const uint32_t lo = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_INPUT_FEATURE_READ_COUNT_LO);
+ const uint32_t hi = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_INPUT_FEATURE_READ_COUNT_HI);
+ return (static_cast<uint64_t>(hi) << 32) | static_cast<uint64_t>(lo);
+}
+
+// Cumulative filter (weight) memory read count from the DMA CSR. Anti-rollover:
+// the low 32 bits must be read first, then the high 32 bits immediately after.
+uint64_t CoreDlaDevice::GetNumFilterMemoryReadsTotal(int instance) const {
+ const uint32_t lo = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_INPUT_FILTER_READ_COUNT_LO);
+ const uint32_t hi = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_INPUT_FILTER_READ_COUNT_HI);
+ return (static_cast<uint64_t>(hi) << 32) | static_cast<uint64_t>(lo);
+}
+
+// Cumulative output-feature memory write count from the DMA CSR. Anti-rollover:
+// the low 32 bits must be read first, then the high 32 bits immediately after.
+uint64_t CoreDlaDevice::GetNumOutputFeatureMemoryWritesTotal(int instance) const {
+ const uint32_t lo = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_OUTPUT_FEATURE_WRITE_COUNT_LO);
+ const uint32_t hi = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_OUTPUT_FEATURE_WRITE_COUNT_HI);
+ return (static_cast<uint64_t>(hi) << 32) | static_cast<uint64_t>(lo);
+}
+
+// Read one 32-bit value from the debug network; the return value indicates whether the
+// read succeeded. A read can fail if the module number and address are not implemented.
+// The debug network is fault tolerant: read requests may never be accepted and read
+// responses may never be produced, and neither condition hangs the runtime.
+bool CoreDlaDevice::ReadDebugCsr(
+ uint32_t moduleNum, uint32_t address, int instance, uint32_t& readData, bool verbose) const {
+ assert(moduleNum <= 0xff);
+ assert(address <= 0xffffff);
+ // Pack module number (top byte) and address (bottom 3 bytes) into one word.
+ const uint32_t addr = ((moduleNum & 0xff) << 24) | (address & 0xffffff);
+
+ // Step 1: writing the address register causes the debug network to issue one read request.
+ mmdWrapper_.WriteToCsr(instance, DLA_DMA_CSR_OFFSET_DEBUG_NETWORK_ADDR, addr);
+
+ // Sanity check: read the address back from the CSR (handled entirely inside the CSR,
+ // the debug network itself is not queried).
+ const uint32_t addrCheck = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_DEBUG_NETWORK_ADDR);
+ if (addrCheck != addr) {
+ if (verbose) DLA_LOG("ReadDebugCsr addr read back check failed, expected %u, got %u\n", addr, addrCheck);
+ return false;
+ }
+
+ // Step 2: poll the valid flag until the CSR has cached a read response, giving up after
+ // a bounded number of attempts (six reads total, matching the previous retry budget).
+ // A failure here must not throw: probing for attached modules legitimately fails.
+ uint32_t isValid = 0;
+ for (int attempt = 0; attempt < 6 && !isValid; ++attempt) {
+ isValid = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_DEBUG_NETWORK_VALID);
+ }
+ if (!isValid) {
+ if (verbose) DLA_LOG("ReadDebugCsr failed to read at addr %u\n", addr);
+ return false;
+ }
+
+ // Step 3: the CSR holds the cached response; fetch the data word.
+ readData = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_DEBUG_NETWORK_DATA);
+ if (verbose) DLA_LOG("ReadDebugCsr, addr %u, data %u\n", addr, readData);
+ return true;
+}
+
+// Helper: abort with a descriptive exception when a debug-network read fails. Only call
+// this after the runtime has confirmed a module is attached (i.e. an earlier read to the
+// same module number succeeded), since probing reads are allowed to fail silently.
+void ReadDebugNetworkError(int moduleNum, int address, int instance) {
+ throw std::runtime_error("ReadDebugNetwork failure, instance " + std::to_string(instance) +
+ ", failed to read at module number " + std::to_string(moduleNum) + " address " +
+ std::to_string(address));
+}
+
+// Modules attached to the debug network have a ROM to specify the offset and description of the registers. Traverse
+// this ROM, then return a map of key/value pairs, where the key is a human readable string describing what kind of
+// information the debug register contains, and the value is the data of the debug register. Note that the runtime must
+// completely traverse the ROM before reading any of the debug register values, and the runtime must read the debug
+// register values in the order that they occur inside the ROM. Usually profiling counters are 64-bit values, and since
+// there is only a 32-bit read available, it takes more than one read to get all the data. The counters could still be
+// updating when the runtime wants to read them, so typically there is a freeze register which can be activated by
+// reading from a special address (hardware will see an incoming read request to this address, that is how it knows to
+// freeze the counters). The offset for the freeze register will typically go first in the ROM, even if it is not the
+// first offset in the address space.
+DebugNetworkData CoreDlaDevice::ReadDebugNetwork(int instance) const {
+ DebugNetworkData result;
+ for (uint32_t moduleNum = 0; moduleNum < 256; moduleNum++) {
+ // Read the ROM to get the offsets and descriptions
+ std::vector<uint32_t> offset;
+ std::vector<std::string> description;
+ uint32_t address = 0, readData = 0;
+ bool first = true, success = false;
+ while (1) {
+ // Parse the offset
+ success = ReadDebugCsr(moduleNum, address, instance, readData);
+ if (!success) {
+ // Failure to read is allowed on the very first time, it is assumed that no external debug-capable module is
+ // attached to the debug network at this moduleNum
+ if (first)
+ break;
+ else
+ ReadDebugNetworkError(moduleNum, address, instance);
+ }
+ if (!readData) break; // end of list is indicated with offset = 0
+ first = false;
+ address += 4;
+ offset.push_back(readData);
+
+ // Parse the description string: packed 4 chars per word, NUL-terminated
+ std::string str;
+ bool endOfStringSeen = false;
+ while (!endOfStringSeen) {
+ success = ReadDebugCsr(moduleNum, address, instance, readData);
+ if (!success) ReadDebugNetworkError(moduleNum, address, instance);
+ address += 4;
+ for (int i = 0; i < 4; i++) {
+ if (readData & 0xff) {
+ str += ((char)(readData & 0xff));
+ readData >>= 8;
+ } else {
+ endOfStringSeen = true;
+ break;
+ }
+ }
+ }
+ description.push_back(str);
+ }
+
+ assert(offset.size() == description.size());
+
+ // Read the profiling counters, in ROM order
+ for (size_t i = 0; i < offset.size(); i++) {
+ address = offset[i];
+ success = ReadDebugCsr(moduleNum, address, instance, readData);
+ if (!success) ReadDebugNetworkError(moduleNum, address, instance);
+
+ int descriptionOccurenceCnt = result.count(description[i]);
+ // Same description name should show up 2 times in maximum
+ if (descriptionOccurenceCnt == 2) {
+ throw std::runtime_error("More than 2 profiling counter descriptions are the same.");
+ } else if (descriptionOccurenceCnt && (i == 0 || address - offset[i - 1] != 4)) {
+ // same description existed before
+ // check if the two addresses associated with the same description are consecutive (offset by 4)
+ // (the i == 0 guard fixes an out-of-bounds read of offset[i - 1] that occurred when the
+ // duplicate description was recorded while scanning a previous module)
+ throw std::runtime_error("Profiling counter addresses with name: " + description[i] + " are not consecutive");
+ } else if (std::count(offset.begin(), offset.end(), address) > 1) {
+ // same address shows up more than once
+ // fix: the original appended the raw integer to a string literal, which is pointer
+ // arithmetic (undefined behavior) rather than formatting; std::to_string is required
+ throw std::runtime_error("Duplicate profiling counter address: " + std::to_string(address));
+ }
+
+ // Avoid printing special stuff like _Freeze and _Unfreeze
+ if (description[i].at(0) != '_') {
+ if (descriptionOccurenceCnt) {
+ // This key has existed before, concatenate 2 uint32_t into uint64_t
+ result[description[i]] |= (((uint64_t)readData) << 32);
+ } else {
+ result[description[i]] = readData;
+ }
+ }
+ }
+ }
+ return result;
+}
+
+// Logical depth of the CSR descriptor queue, fixed by the DMA constants header.
+int CoreDlaDevice::GetSizeCsrDescriptorQueue() const {
+ return DLA_DMA_CSR_DESCRIPTOR_QUEUE_LOGICAL_SIZE;
+}
+
+// Clock frequency of the CoreDLA IP, as reported by the MMD layer.
+double CoreDlaDevice::GetCoreDlaClockFreq() const {
+ return mmdWrapper_.GetCoreDlaClockFreq();
+}
+
+// Return a human-readable scheduler status string, or "" when no stream
+// controller is attached.
+std::string CoreDlaDevice::SchedulerGetStatus() const {
+ if (spStreamControllerComms_ == nullptr) {
+ return "";
+ }
+ Payload<StatusMessagePayload> statusPayload = spStreamControllerComms_->GetStatus();
+ return spStreamControllerComms_->GetStatusString(statusPayload);
+}
+
+// Create the stream-controller comms channel and initialize the scheduler with the
+// given buffering/inference-request configuration.
+// Returns true only when a stream controller is present and accepted the configuration;
+// otherwise the comms object is released and false is returned.
+bool CoreDlaDevice::InitializeScheduler(uint32_t sourceBufferSize,
+ uint32_t dropSourceBuffers,
+ uint32_t numInferenceRequests,
+ const std::string source_fifo_file) {
+ // unused in the HW runtime (the SW emulator uses it); voided to silence -Wunused-parameter,
+ // matching the convention used for export_dir in CreateGraphJob
+ (void) source_fifo_file;
+ spStreamControllerComms_ = std::make_shared<StreamControllerComms>();
+ if (!spStreamControllerComms_->IsPresent()) {
+ spStreamControllerComms_.reset();
+ return false;
+ }
+ return spStreamControllerComms_->Initialize(sourceBufferSize, dropSourceBuffers, numInferenceRequests);
+}
diff --git a/python/openvino/runtime/coredla_device/src/coredla_graph_job.cpp b/python/openvino/runtime/coredla_device/src/coredla_graph_job.cpp
new file mode 100644
index 0000000..c1f349f
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/src/coredla_graph_job.cpp
@@ -0,0 +1,279 @@
+// Copyright 2020-2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#include "coredla_graph_job.h" //CoreDlaGraphJob
+
+#include <cinttypes>
+#include <cstdlib> //std::getenv
+#include <iomanip> //std::hex
+#include <iostream> //std::cerr
+#include <mutex> //std::lock_guard
+#include <sstream> //std::stringstream
+#include <string> //std::string
+
+#define BUILD_VERSION_CSR_OFFSET (ARCH_HASH_SIZE)
+#define ARCH_NAME_CSR_OFFSET (ARCH_HASH_SIZE + BUILD_VERSION_SIZE)
+
+#define FLAG_DISABLE_ARCH_CHECK "DLA_DISABLE_ARCH_CHECK"
+#define FLAG_DISABLE_VERSION_CHECK "DLA_DISABLE_VERSION_CHECK"
+
+// Factory: construct a CoreDlaGraphJob and hand it back as an opaque GraphJob.
+// (Kept as a raw-new factory because the constructor is only reachable from here.)
+std::unique_ptr<GraphJob> CoreDlaGraphJob::MakeUnique(DeviceMemoryAllocator *ddrBufferAllocator,
+ MmdWrapper *mmdWrapper,
+ const dla::CompiledResult *compiledResult,
+ uint64_t numPipelines,
+ int instance,
+ std::shared_ptr<StreamControllerComms> spStreamControllerComms) {
+ GraphJob *rawJob = new CoreDlaGraphJob(
+ ddrBufferAllocator, mmdWrapper, compiledResult, numPipelines, instance, spStreamControllerComms);
+ return std::unique_ptr<GraphJob>(rawJob);
+}
+
+// Return the value of an environment variable, or an empty string when it is unset.
+std::string get_env_var_wrapper(const std::string &env_var) {
+ const char *value = std::getenv(env_var.c_str());
+ return (value != nullptr) ? std::string(value) : std::string();
+}
+
+// Render the architecture hash as space-separated, zero-padded 8-digit hex words.
+std::string arch_hash_to_string(const std::vector<int> &arch_hash) {
+ std::stringstream out;
+ for (size_t word = 0; word < ARCH_HASH_WORD_SIZE; ++word) {
+ out << std::setfill('0') << std::setw(8) << std::hex << std::right << arch_hash[word] << " ";
+ }
+ return out.str();
+}
+
+// Read a NUL-terminated string from the bitstream discovery ROM. The string is packed
+// four chars per 32-bit CSR word, least-significant byte first; reading stops at the
+// first NUL byte or after str_word_size_in_bytes words, whichever comes first.
+std::string read_string_from_bitstream_rom(MmdWrapper *mmdWrapper,
+ const int instance,
+ const uint32_t str_word_size_in_bytes,
+ const uint32_t str_offset_in_rom) {
+ std::string result;
+ for (uint32_t word = 0; word < str_word_size_in_bytes; ++word) {
+ const int packed = mmdWrapper->ReadFromCsr(instance, str_offset_in_rom + word * 4);
+ for (int byteIdx = 0; byteIdx < 4; ++byteIdx) {
+ const char rom_char = (packed >> (byteIdx * 8)) & 0xFF;
+ if (rom_char == 0) {
+ return result; // NUL terminates the string
+ }
+ result.push_back(rom_char);
+ }
+ }
+ return result;
+}
+
+// Construct a graph job for one compiled graph on one DLA instance:
+// 1) read arch hash / build version / arch name from the bitstream discovery ROM and
+//    validate them against the compiled result (each check can be disabled via env var),
+// 2) allocate the shared intermediate buffer plus per-graph config/filter and
+//    per-pipeline input/output buffers in FPGA DDR,
+// 3) write the config/filter/bias data to DDR (unless it lives in a parameter ROM),
+// 4) create one CoreDlaBatchJob per pipeline.
+// On a failed arch/version check the process exits via std::exit(1).
+CoreDlaGraphJob::CoreDlaGraphJob(DeviceMemoryAllocator *ddrBufferAllocator,
+ MmdWrapper *mmdWrapper,
+ const dla::CompiledResult *compiledResult,
+ uint64_t numPipelines,
+ int instance,
+ std::shared_ptr<StreamControllerComms> spStreamControllerComms)
+ : configFilterBiasBufferSizeDDR_(0),
+ intermediateBufferSizeDDR_(0),
+ ddrBufferAllocator_(ddrBufferAllocator),
+ mmdWrapper_(mmdWrapper),
+ batchJobsRequested_(0),
+ instance_(instance) {
+ // First read the arch_md5, build_version_string and arch_name string from
+ // the metadata stored in the bitstream discovery ROM, then compare them
+ // against the information present in the compiled result. Fail if it does not match.
+
+ // ARCH_HASH_SIZE bytes for the arch hash.
+ std::vector<int> bitstream_arch_hash;
+ DLA_LOG("Read hash from bitstream ROM...\n");
+ for (size_t i = 0; i < ARCH_HASH_WORD_SIZE; ++i) {
+ bitstream_arch_hash.push_back(mmdWrapper_->ReadFromCsr(instance_, i * 4));
+ }
+
+ // Next BUILD_VERSION_SIZE bytes are for the build version string
+ DLA_LOG("Read build version string from bitstream ROM...\n");
+ std::string bitstream_build_version =
+ read_string_from_bitstream_rom(mmdWrapper_, instance_, BUILD_VERSION_WORD_SIZE, BUILD_VERSION_CSR_OFFSET);
+
+ // Next ARCH_NAME_SIZE bytes are for the arch name string
+ DLA_LOG("Read arch name string from bitstream ROM...\n");
+ std::string bitstream_arch_name =
+ read_string_from_bitstream_rom(mmdWrapper_, instance_, ARCH_NAME_WORD_SIZE, ARCH_NAME_CSR_OFFSET);
+
+ // ************************ Perform all checks *******************************
+ // ***************************************************************************
+ if (get_env_var_wrapper(FLAG_DISABLE_ARCH_CHECK) != "1") {
+ DLA_LOG("Runtime arch check is enabled. Check started...\n");
+
+ for (size_t i = 0; i < ARCH_HASH_WORD_SIZE; ++i) {
+ if (compiledResult->get_arch_hash()[i] != bitstream_arch_hash[i]) {
+ std::cerr << "Arch check failed: "
+ << "compiledResult arch hash is " << arch_hash_to_string(compiledResult->get_arch_hash())
+ << ", compiledResult arch is " << compiledResult->get_arch_name() << ", bitstream arch_hash is "
+ << arch_hash_to_string(bitstream_arch_hash) << ", bitstream arch is " << bitstream_arch_name
+ << std::endl;
+
+ std::cerr << "This check can be disabled by setting environment variable " << FLAG_DISABLE_ARCH_CHECK << "=1."
+ << std::endl;
+ std::exit(1);
+ }
+ }
+ DLA_LOG("Runtime arch check passed.\n");
+ } else {
+ DLA_ERROR(
+ "Environment variable %s is set to 1; "
+ "architecture check will be skipped. "
+ "This might cause undefined behavior including hanging, "
+ "and the user should only disable the check if "
+ "they understand the potential consequences.\n",
+ FLAG_DISABLE_ARCH_CHECK);
+ }
+
+ if (get_env_var_wrapper(FLAG_DISABLE_VERSION_CHECK) != "1") {
+ DLA_LOG(
+ "Runtime build version check is enabled. "
+ "Check started...\n");
+ if (bitstream_build_version != compiledResult->get_build_version_string()) {
+ std::cerr << "Build version check failed:"
+ << "compiledResult build version is " << compiledResult->get_build_version_string()
+ << ", bitstream build version is " << bitstream_build_version << std::endl;
+
+ std::cerr << "This check can be disabled by setting environment variable " << FLAG_DISABLE_VERSION_CHECK << "=1."
+ << std::endl;
+
+ std::exit(1);
+ }
+ DLA_LOG("Runtime build version check passed.\n");
+ } else {
+ DLA_ERROR(
+ "Environment variable %s is set to 1; "
+ "build version check will be skipped. "
+ "This might cause undefined behavior including hanging, "
+ "and the user should only disable the check if "
+ "they understand the potential consequences.\n",
+ FLAG_DISABLE_VERSION_CHECK);
+ }
+
+ // Checks completed. Allocate buffers and write to DDR
+ // When the parameters live in an on-chip ROM (ddrfree), the config/filter/bias
+ // data occupies no DDR space and is never written to DDR below.
+ intermediateBufferSizeDDR_ = compiledResult->get_conv_intermediate_size_in_bytes();
+ uint64_t totalConfigBytes = compiledResult->get_ddrfree_header().enable_parameter_rom ?
+ 0 :
+ compiledResult->get_config_size_in_bytes();
+ auto &config_fbs_array = compiledResult->get_config_filter_bias_scale_array();
+ auto config_fbs_raw_array = compiledResult->get_ddrfree_header().enable_parameter_rom ?
+ nullptr :
+ config_fbs_array[0].data();
+ configFilterBiasBufferSizeDDR_ = compiledResult->get_ddrfree_header().enable_parameter_rom ?
+ 0 :
+ config_fbs_array[0].size();
+
+ // TODO: uncomment when buffer_t object is added
+ // assert(config_filter_bias_graph_buffer_size_ddr == config_filter_bias_buffer->size_in_bytes());
+ // Allocate graph buffer (config, filter, bias, io) in DDR
+ uint64_t inputSizeDDR = compiledResult->get_conv_input_size_in_bytes();
+ uint64_t outputSizeDDR = compiledResult->get_conv_output_size_in_bytes();
+
+ // DMA data path width in bytes for feature and filter data
+ // TODO: move this into the arch
+ constexpr uint64_t featureWordSize = 32;
+ constexpr uint64_t filterWordSize = 64;
+
+ // Sanity check that buffer sizes are sufficiently aligned to ensure address alignment.
+ // Input, output, and intermediate buffers contain feature words.
+ assert(inputSizeDDR % featureWordSize == 0);
+ assert(outputSizeDDR % featureWordSize == 0);
+ assert(intermediateBufferSizeDDR_ % featureWordSize == 0);
+ // filter contains filter words, and config must be padded to a filter word size
+ assert(totalConfigBytes % filterWordSize == 0);
+ assert(configFilterBiasBufferSizeDDR_ % filterWordSize == 0);
+
+ // Allocate the intermediate buffer.
+ ddrBufferAllocator_->AllocateSharedBuffer(intermediateBufferSizeDDR_, instance_);
+
+ // Allocate the input/output buffer.
+ // Output buffer must come immediately after the input buffer, so from an allocation perspective this is one buffer.
+ // Note there is an input/output buffer pair allocated for each pipeline. The input/output pair must be contiguous for
+ // each pipeline, but input/output pairs from different pipelines are allowed to have a gap. We could call the
+ // allocator for each input/output buffer pair, however because everything is sized and aligned to the feature word
+ // size, we won't get gaps between them due to alignment. Calling the allocator once per pipeline would result in the
+ // same allocation as calling the allocator just once and using offsets within this big buffer for each pipeline.
+ uint64_t inputOutputBufferSize = numPipelines * (inputSizeDDR + outputSizeDDR); // how much space to allocate
+ uint64_t inputOutputBufferAlignment = featureWordSize; // starting address must be aligned to this
+ uint64_t inputOutputBufferAddr; // where did the allocator place this buffer
+ ddrBufferAllocator_->AllocatePrivateBuffer(inputOutputBufferSize, inputOutputBufferAlignment, inputOutputBufferAddr);
+
+ // Allocate the config/filter buffer.
+ // Filter buffer must come immediately after the config buffer, so from an allocation perspective this is one buffer.
+ uint64_t configFilterBufferSize = configFilterBiasBufferSizeDDR_;
+ uint64_t configFilterBufferAlignment = filterWordSize;
+ uint64_t configFilterBufferAddr;
+ ddrBufferAllocator_->AllocatePrivateBuffer(
+ configFilterBufferSize, configFilterBufferAlignment, configFilterBufferAddr);
+
+ // Print the allocation results
+ bool print_allocation_result = getenv("COREDLA_RUNTIME_DEBUG") != nullptr;
+ ios_base::fmtflags coutFlags = cout.flags(); // printing in both decimal and hex, save cout state to undo it later
+ if (print_allocation_result) {
+ DLA_LOG("FPGA DDR allocation results\n");
+ // Intermediate buffer address is hardcoded to 0 in device_memory_allocator.cpp, don't bother printing this
+ DLA_LOG(" Config buffer is at address %" PRIu64, configFilterBufferAddr);
+ DLA_LOG(" (%#" PRIx64 ")\n", configFilterBufferAddr);
+ const uint64_t filter_buffer_address = configFilterBufferAddr + totalConfigBytes;
+ DLA_LOG(" Filter/bias/scale buffer is at address %" PRIu64, filter_buffer_address);
+ DLA_LOG(" (%#" PRIx64 ")\n", filter_buffer_address);
+ }
+
+ // NOTE(review): assumes get_input_configuration() is non-empty — confirm the compiler
+ // guarantees at least one input entry before dereferencing begin().
+ const bool enable_istream = compiledResult->get_input_configuration().begin()->second.enable_input_streaming;
+ const bool enable_ostream = compiledResult->get_output_configuration().output_streaming_enabled;
+
+ // Write graph buffer to DDR
+ if (!compiledResult->get_ddrfree_header().enable_parameter_rom) {
+ mmdWrapper_->WriteToDDR(instance_, configFilterBufferAddr, configFilterBiasBufferSizeDDR_, config_fbs_raw_array);
+ } else {
+ DLA_LOG(" Ddrfree graph constants are not written to DDR.\n");
+ }
+
+ // One batch job per pipeline; each pipeline gets a contiguous input+output pair.
+ for (uint64_t i = 0; i < numPipelines; i++) {
+ uint64_t inputAddrDDR = inputOutputBufferAddr + i * (inputSizeDDR + outputSizeDDR);
+ uint64_t outputAddrDDR = inputAddrDDR + inputSizeDDR;
+ if (print_allocation_result) {
+ DLA_LOG(" Input buffer %" PRIu64 " is at address %" PRIu64, i, inputAddrDDR);
+ DLA_LOG(" (%#" PRIx64 ")\n", inputAddrDDR);
+ DLA_LOG(" Output buffer %" PRIu64 " is at address %" PRIu64, i, outputAddrDDR);
+ DLA_LOG(" (%#" PRIx64 ")\n", outputAddrDDR);
+ }
+ batchJobs_.push_back(move(CoreDlaBatchJob::MakeUnique(mmdWrapper_,
+ totalConfigBytes,
+ configFilterBufferAddr,
+ inputAddrDDR,
+ outputAddrDDR,
+ inputSizeDDR,
+ outputSizeDDR,
+ enable_istream,
+ enable_ostream,
+ instance_,
+ spStreamControllerComms)));
+ }
+ cout.flags(coutFlags); // restore the state of cout
+}
+
+// Hand out the next unused batch job for this graph, or nullptr once every
+// pipeline's job has been claimed. Thread-safe: guarded by graphJobMutex.
+// Uses RAII locking so the mutex is released even if an exception is thrown
+// (the original manual lock()/unlock() pairing was not exception-safe).
+BatchJob *CoreDlaGraphJob::GetBatchJob() {
+ std::lock_guard<std::mutex> lock(graphJobMutex);
+ if (batchJobsRequested_ >= batchJobs_.size()) {
+ return nullptr;
+ }
+ return batchJobs_[batchJobsRequested_++].get();
+}
diff --git a/python/openvino/runtime/coredla_device/src/device_memory_allocator.cpp b/python/openvino/runtime/coredla_device/src/device_memory_allocator.cpp
new file mode 100644
index 0000000..48844f4
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/src/device_memory_allocator.cpp
@@ -0,0 +1,80 @@
+// Copyright 2020 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#include "device_memory_allocator.h" //DeviceMemoryAllocator
+#include "dla_dma_constants.h" //DLA_DMA_CSR_OFFSET_***
+
+#include <stdexcept> //std::runtime_error
+#include <string> //std::string
+
+// Record the device-memory geometry and reset all allocation bookkeeping.
+void DeviceMemoryAllocator::Initialize(uint64_t totalSize, MmdWrapper* mmdWrapper) {
+ mmdWrapper_ = mmdWrapper;
+ totalGlobalMemSize_ = totalSize;
+ currentIntermediateMaxBufferSizeAllocated_ = 0;
+ currentStartAddressGraphBufferSpace_ = totalSize; // private buffers grow down from the top
+}
+
+// The intermediate buffer is shared among all graphs. It sits at the lowest address
+// and only ever grows upwards, when a newly added graph needs a bigger one.
+void DeviceMemoryAllocator::AllocateSharedBuffer(uint64_t bufferSize, int instance) {
+ if (bufferSize <= currentIntermediateMaxBufferSizeAllocated_) {
+ return; // the existing shared buffer is already large enough; CSR base address unchanged
+ }
+ currentIntermediateMaxBufferSizeAllocated_ = bufferSize;
+
+ // error intermediate buffer grows into the region of memory used for private buffers
+ if (currentIntermediateMaxBufferSizeAllocated_ > currentStartAddressGraphBufferSpace_) {
+ throw std::runtime_error("FPGA DDR allocation failed, intermediate buffer grew upwards to " +
+ std::to_string(currentIntermediateMaxBufferSizeAllocated_) +
+ ", remaining unallocated space is limited to " +
+ std::to_string(currentStartAddressGraphBufferSpace_));
+ }
+
+ // tell the fpga where the intermediate buffer is located. At address 0 now. Will change in future with multiple
+ // pe_arrays
+ mmdWrapper_->WriteToCsr(instance, DLA_DMA_CSR_OFFSET_INTERMEDIATE_BASE_ADDR, 0);
+}
+
+// The config, filter, input, and output buffers are specific to a graph and therefore
+// need their own space in device memory. Filter must immediately follow config (and
+// output must immediately follow input), so each pair is allocated as one buffer.
+// Private buffers are carved from the highest address downwards; the hardware-required
+// starting alignment is given by bufferAlignment, and the chosen start address is
+// returned through bufferAddr.
+void DeviceMemoryAllocator::AllocatePrivateBuffer(uint64_t bufferSize, uint64_t bufferAlignment, uint64_t& bufferAddr) {
+ // Conservative worst case: alignment rounding may consume up to bufferAlignment extra bytes.
+ const uint64_t worstCaseSize = bufferSize + bufferAlignment;
+ const uint64_t remainingSpace = currentStartAddressGraphBufferSpace_ - currentIntermediateMaxBufferSizeAllocated_;
+
+ // error if the graph does not fit in fpga ddr
+ if (worstCaseSize > remainingSpace) {
+ throw std::runtime_error(
+ "FPGA DDR allocation failed, allocating buffer of size " + std::to_string(worstCaseSize) +
+ " exceeds the remaining space available of size " + std::to_string(remainingSpace) +
+ ". This could be caused by the graph being too large or splitting the graph into too many subgraphs. " +
+ "Memory requirements for large graphs can be reduced by selecting different folding options, " +
+ "reducing batch size or selecting architectures with less padding.");
+ }
+
+ // allocate from highest to lowest address, then round down to the required alignment
+ uint64_t addr = currentStartAddressGraphBufferSpace_ - bufferSize;
+ addr -= addr % bufferAlignment;
+ currentStartAddressGraphBufferSpace_ = addr;
+ bufferAddr = addr;
+}
+
+// Forget all allocations; subsequent graphs start from a pristine address map.
+void DeviceMemoryAllocator::Clear() {
+ currentStartAddressGraphBufferSpace_ = totalGlobalMemSize_;
+ currentIntermediateMaxBufferSizeAllocated_ = 0;
+}
+
+DeviceMemoryAllocator::~DeviceMemoryAllocator() {
+ // Reset bookkeeping on teardown; the allocator owns no dynamic resources.
+ Clear();
+}
diff --git a/python/openvino/runtime/coredla_device/src/mmd_wrapper.cpp b/python/openvino/runtime/coredla_device/src/mmd_wrapper.cpp
new file mode 100644
index 0000000..bbb052a
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/src/mmd_wrapper.cpp
@@ -0,0 +1,172 @@
+// Copyright 2020-2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#include "mmd_wrapper.h"
+#include "aocl_mmd.h" // aocl_mmd_***
+#include "dla_dma_constants.h" // DLA_DMA_CSR_OFFSET_***
+
+#include <cassert> // assert
+#include <cstddef> // size_t
+#include <iostream> // std::cerr
+#include <stdexcept> // std::runtime_error
+#include <string> // std::string
+
+// All board variants must obey the CoreDLA CSR spec, which says that all access must be
+// - 32 bits in size
+// - address must be 4 byte aligned
+// - within the address range, CSR size is 2048 bytes
+constexpr uint64_t DLA_CSR_ALIGNMENT = 4;
+constexpr uint64_t DLA_CSR_SIZE = 2048;
+
+// assert(status == 0) is removed by the C++ preprocessor (NDEBUG) when compiling in release
+// mode; this is a handy workaround for suppressing the compiler warning about an unused variable.
+// NOTE(review): identifier misspells "variable" as "varible" — renaming would touch every
+// call site in this file, so it is left as-is here.
+template <class T>
+void suppress_warning_unused_varible(const T &) {}
+
+MmdWrapper::MmdWrapper() {
+ // Open the MMD
+ constexpr size_t MAX_BOARD_NAMES_LEN = 4096;
+ char name[MAX_BOARD_NAMES_LEN];
+ size_t sz;
+ int status = aocl_mmd_get_offline_info(AOCL_MMD_BOARD_NAMES, MAX_BOARD_NAMES_LEN, name, &sz);
+ if (status) {
+ std::string msg = "Failed to query a board name from MMD. Perhaps no FPGA device is available?";
+ throw std::runtime_error(msg);
+ }
+ // NOTE(review): the raw board-names buffer is passed straight to aocl_mmd_open; this
+ // presumably assumes a single board is present — confirm behavior with multiple boards.
+ int handle = aocl_mmd_open(name);
+ if (handle < 0) {
+ std::string msg = "Failed to open MMD";
+ throw std::runtime_error(msg);
+ }
+ handle_ = handle;
+
+ // Query some board-specific information from the MMD. Some values can be hardcoded constants
+ // where different boards have different constants, e.g. capacity of FPGA DDR. Others values may
+ // be determined experimentally e.g. start and stop a counter with a known duration in between to
+ // measure the clk_dla frequency.
+ maxInstances_ = dla_mmd_get_max_num_instances();
+ ddrSizePerInstance_ = dla_mmd_get_ddr_size_per_instance();
+ coreDlaClockFreq_ = dla_mmd_get_coredla_clock_freq(handle_);
+
+ // On DE10 Agilex boards with GCC 8.3.0, we noticed that the clock frequency was being read as 0,
+ // around 50% of the time, and around 10% of the time on GCC 9.2.0, causing failures on perf_est
+ // tests. This retry loop will recall the function until the coreDlaClockFreq is non zero, or
+ // it exhausts 10 retries.
+ // We have no idea why this happens currently, but it typically passes by the second try.
+ int clockFreqRetries = 10;
+ while (coreDlaClockFreq_ == 0 && clockFreqRetries > 0) {
+ coreDlaClockFreq_ = dla_mmd_get_coredla_clock_freq(handle_);
+ clockFreqRetries--;
+ }
+ // NOTE(review): if all retries are exhausted, coreDlaClockFreq_ may still be 0 — no error
+ // is raised here; confirm downstream consumers tolerate a zero frequency.
+ ddrClockFreq_ = dla_mmd_get_ddr_clock_freq();
+}
+
+MmdWrapper::~MmdWrapper() {
+ // Close the MMD
+ int status = aocl_mmd_close(handle_);
+ if (status) {
+ // Avoid throwing an exception from a Destructor. We are ultimately
+ // part of a (virtual) OpenVINO destructor, so we should follow the
+ // noexcept(true) that it advertises. Perhaps we can close the mmd
+ // as a separate step prior to destruction to make signaling errors
+ // easier?
+ std::cerr << "Failed to close MMD" << std::endl;
+ std::cerr << "Error status " << status << std::endl;
+ // NOTE(review): std::exit(1) terminates the process without unwinding other live
+ // objects' destructors; consider whether a log-and-continue would be safer here.
+ std::exit(1);
+ }
+}
+
+// Register `func` as the MMD interrupt handler; `data` is the opaque context pointer the
+// MMD passes back into the handler. Throws on registration failure.
+void MmdWrapper::RegisterISR(interrupt_service_routine_signature func, void *data) const {
+ // register an interrupt handler
+ int status = aocl_mmd_set_interrupt_handler(handle_, func, data);
+ if (status) {
+ std::string msg = "Failed to register an interrupt handler with MMD";
+ throw std::runtime_error(msg);
+ }
+}
+
+// Write one 32-bit word to the CoreDLA CSR of the given instance. Per the CSR spec at the
+// top of this file, addr must be 4-byte aligned and within the 2048-byte CSR window;
+// violations are debug-build assertions only.
+void MmdWrapper::WriteToCsr(int instance, uint32_t addr, uint32_t data) const {
+ assert(instance >= 0 && instance < maxInstances_);
+ assert(addr + sizeof(uint32_t) <= DLA_CSR_SIZE);
+ assert(addr % DLA_CSR_ALIGNMENT == 0);
+ int status = dla_mmd_csr_write(handle_, instance, addr, &data);
+ assert(status == 0); // errors are not surfaced in release builds
+ suppress_warning_unused_varible(status);
+}
+
+// Read one 32-bit word from the CoreDLA CSR of the given instance. Same alignment/range
+// preconditions as WriteToCsr; checked by assert in debug builds only.
+uint32_t MmdWrapper::ReadFromCsr(int instance, uint32_t addr) const {
+ assert(instance >= 0 && instance < maxInstances_);
+ assert(addr + sizeof(uint32_t) <= DLA_CSR_SIZE);
+ assert(addr % DLA_CSR_ALIGNMENT == 0);
+ uint32_t data;
+ int status = dla_mmd_csr_read(handle_, instance, addr, &data);
+ assert(status == 0); // errors are not surfaced in release builds
+ suppress_warning_unused_varible(status);
+ return data;
+}
+
+// Copy `length` bytes from host memory `data` into the instance's FPGA DDR at `addr`.
+// Range is asserted against ddrSizePerInstance_ in debug builds only.
+void MmdWrapper::WriteToDDR(int instance, uint64_t addr, uint64_t length, const void *data) const {
+ assert(instance >= 0 && instance < maxInstances_);
+ assert(addr + length <= ddrSizePerInstance_);
+ int status = dla_mmd_ddr_write(handle_, instance, addr, length, data);
+ assert(status == 0);
+ suppress_warning_unused_varible(status);
+}
+
+// Copy `length` bytes from the instance's FPGA DDR at `addr` into host memory `data`.
+// Mirror of WriteToDDR; same debug-only range assertions.
+void MmdWrapper::ReadFromDDR(int instance, uint64_t addr, uint64_t length, void *data) const {
+ assert(instance >= 0 && instance < maxInstances_);
+ assert(addr + length <= ddrSizePerInstance_);
+ int status = dla_mmd_ddr_read(handle_, instance, addr, length, data);
+ assert(status == 0);
+ suppress_warning_unused_varible(status);
+}
+
+#ifndef STREAM_CONTROLLER_ACCESS
+// Stream controller access is not supported by the platform abstraction
+bool MmdWrapper::bIsStreamControllerValid(int instance) const { return false; }
+
+// 32-bit handshake with each Stream Controller CSR
+// NOTE(review): assert(false) is compiled out under NDEBUG, so in release builds these
+// stubs are silent no-ops; callers are expected to gate on bIsStreamControllerValid first.
+void MmdWrapper::WriteToStreamController(int instance, uint32_t addr, uint64_t length, const void *data) const {
+ assert(false);
+}
+
+void MmdWrapper::ReadFromStreamController(int instance, uint32_t addr, uint64_t length, void *data) const {
+ assert(false);
+}
+#else
+// If the mmd layer supports accesses to the Stream Controller
+bool MmdWrapper::bIsStreamControllerValid(int instance) const {
+ assert(instance >= 0 && instance < maxInstances_);
+ bool status = dla_is_stream_controller_valid(handle_, instance);
+ return status;
+}
+
+// 32-bit handshake with each Stream Controller CSR
+// addr and length must both be whole 32-bit words (debug-only asserts).
+void MmdWrapper::WriteToStreamController(int instance, uint32_t addr, uint64_t length, const void *data) const {
+ assert(instance >= 0 && instance < maxInstances_);
+ assert(addr % sizeof(uint32_t) == 0);
+ assert(length % sizeof(uint32_t) == 0);
+ int status = dla_mmd_stream_controller_write(handle_, instance, addr, length, data);
+ assert(status == 0);
+ suppress_warning_unused_varible(status);
+}
+
+void MmdWrapper::ReadFromStreamController(int instance, uint32_t addr, uint64_t length, void *data) const {
+ assert(instance >= 0 && instance < maxInstances_);
+ assert(addr % sizeof(uint32_t) == 0);
+ assert(length % sizeof(uint32_t) == 0);
+ int status = dla_mmd_stream_controller_read(handle_, instance, addr, length, data);
+ assert(status == 0);
+ suppress_warning_unused_varible(status);
+}
+#endif
diff --git a/python/openvino/runtime/coredla_device/src/stream_controller_comms.cpp b/python/openvino/runtime/coredla_device/src/stream_controller_comms.cpp
new file mode 100644
index 0000000..677f6e4
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/src/stream_controller_comms.cpp
@@ -0,0 +1,274 @@
+// Copyright 2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#include "stream_controller_comms.h"
+#include <chrono>
+#include <cstring>
+#include <iostream>
+#include <sstream>
+#include <thread>
+
+// StreamControllerComms provides an interface to the Stream Controller
+// microcode running in the NIOS-V
+
+static const uint32_t messageReadyMagicNumber = 0x55225522;
+static constexpr uint32_t mailboxRamSize = 0x1000;
+
+// Trivial constructor: all state relies on member initialization declared in the header.
+StreamControllerComms::StreamControllerComms() {}
+
+// Returns true only when the platform exposes a stream-controller interface AND the
+// controller answers a ping, i.e. the NIOS-V firmware is actually running.
+bool StreamControllerComms::IsPresent() {
+ // Check there is an interface to the stream controller
+ if (!_mmdWrapper.bIsStreamControllerValid(_streamControllerInstance)) {
+ return false;
+ }
+
+ // Check that the stream controller responds
+ bool isPresent = Ping();
+ return isPresent;
+}
+
+// Query for the current status
+// Query for the current status. Returns a value-initialized (all-zero) payload when the
+// comms channel is busy, the send times out, or the reply is not a Status message.
+Payload<StatusMessagePayload> StreamControllerComms::GetStatus() {
+ BusyCheck busyCheck(_busyFlag); // RAII: takes the busy flag, releases on scope exit
+ if (!busyCheck) {
+ return {};
+ }
+
+ if (SendMessage(MessageType_GetStatus)) {
+ // StatusMessageHandler (called from ReceiveMessage) fills _receivedStatusMessage
+ if (ReceiveMessage() == MessageType_Status) {
+ return _receivedStatusMessage;
+ }
+ }
+
+ return {};
+}
+
+// Schedule an inference request with the stream controller
+// Schedule an inference request with the stream controller.
+// Sends every job even if an earlier one fails; returns false if any job was not
+// acknowledged (MessageType_NoOperation is the ack), true only when all succeeded.
+bool StreamControllerComms::ScheduleItems(std::vector<Payload<CoreDlaJobPayload>> items) {
+ BusyCheck busyCheck(_busyFlag);
+ if (!busyCheck) {
+ return false;
+ }
+
+ bool status = true;
+
+ for (auto& job : items) {
+ bool thisJobStatus = false;
+
+ if (SendMessage(MessageType_ScheduleItem, job.GetPayload(), job.GetSize())) {
+ if (ReceiveMessage() == MessageType_NoOperation) {
+ thisJobStatus = true;
+ }
+ }
+
+ if (!thisJobStatus) {
+ status = false; // remember the failure but keep scheduling the remaining jobs
+ }
+ }
+
+ return status;
+}
+
+// Send a ping command to the stream controller and wait for a pong
+// response.
+// Send a ping command to the stream controller and wait for a pong
+// response. Returns false when the channel is busy, the send times out,
+// or the reply is anything other than Pong.
+bool StreamControllerComms::Ping() {
+ BusyCheck busyCheck(_busyFlag);
+ if (!busyCheck) {
+ return false;
+ }
+
+ if (SendMessage(MessageType_Ping)) {
+ return (ReceiveMessage() == MessageType_Pong);
+ }
+
+ return false;
+}
+
+// Initialize and reset the stream controller
+//
+// sourceBufferSize:
+// The size of the MSGDMA buffers that the stream
+// controller will receive from the layout transform
+// dropSourceBuffers:
+// How many source buffers to drop between each
+// processed one. 0 by default unless set in the configuration
+// by the app with DLIAPlugin::properties::streaming_drop_source_buffers.name()
+// numInferenceRequest:
+// A constant value set in the executable network. The
+// stream controller will start executing once it has
+// received this number of inference requests from OpenVINO
+bool StreamControllerComms::Initialize(uint32_t sourceBufferSize,
+ uint32_t dropSourceBuffers,
+ uint32_t numInferenceRequests) {
+ BusyCheck busyCheck(_busyFlag);
+ if (!busyCheck) {
+ return false;
+ }
+
+ // Pack the three parameters into the wire payload (field meanings documented above)
+ Payload<InitializeStreamControllerPayload> initializePayload{};
+ initializePayload._sourceBufferSize = sourceBufferSize;
+ initializePayload._dropSourceBuffers = dropSourceBuffers;
+ initializePayload._numInferenceRequests = numInferenceRequests;
+
+ if (SendMessage(
+ MessageType_InitializeStreamController, initializePayload.GetPayload(), initializePayload.GetSize())) {
+ // MessageType_NoOperation is the firmware's acknowledgement
+ if (ReceiveMessage() == MessageType_NoOperation) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+// Receive a message from the stream controller by reading from the
+// mailbox memory until the magic number is set to indicate a message is ready.
+// Only the Status return message has a payload
+MessageType StreamControllerComms::ReceiveMessage() {
+ // The receive mailbox occupies the upper half of the mailbox RAM
+ uint32_t receiveMessageOffset = mailboxRamSize / 2;
+ MessageHeader* pReceiveMessage = nullptr;
+ uint32_t messageReadyMagicNumberOffset = receiveMessageOffset;
+ // NOTE(review): this computes the byte offset of MessageHeader::_payload by taking the
+ // address of a member through a null pointer — technically UB; offsetof(MessageHeader,
+ // _payload) is the portable spelling.
+ uint32_t payloadOffset = static_cast<uint32_t>(receiveMessageOffset + (size_t)&pReceiveMessage->_payload);
+ uint32_t waitCount = 0;
+
+ // Poll for up to ~100ms (100 x 1ms) for the firmware to mark a message ready
+ while (waitCount < 100) {
+ MessageHeader messageHeader;
+ _mmdWrapper.ReadFromStreamController(
+ _streamControllerInstance, receiveMessageOffset, sizeof(messageHeader), &messageHeader);
+ if (messageHeader._messageReadyMagicNumber == messageReadyMagicNumber) {
+ MessageType messageType = static_cast<MessageType>(messageHeader._messageType);
+ uint32_t sequenceId = messageHeader._sequenceID;
+
+ bool ok = false;
+
+ // Only Status carries a payload; Pong is payload-free
+ if (messageType == MessageType_Status) {
+ ok = StatusMessageHandler(payloadOffset);
+ } else if (messageType == MessageType_Pong) {
+ ok = true;
+ }
+
+ // NOTE(review): MessageType_NoOperation (the normal ack for ScheduleItem/Initialize)
+ // leaves ok == false, so every successful ack bumps _numBadMessages — confirm intent.
+ if (!ok) {
+ _numBadMessages++;
+ }
+
+ // Acknowledge by overwriting the magic-number slot with the sequence ID
+ _mmdWrapper.WriteToStreamController(
+ _streamControllerInstance, messageReadyMagicNumberOffset, sizeof(sequenceId), &sequenceId);
+ _lastReceiveSequenceID = sequenceId;
+ return messageType;
+ }
+
+ std::this_thread::sleep_for(std::chrono::milliseconds(1));
+ waitCount++;
+ }
+
+ // Timed out — no message arrived
+ return MessageType_Invalid;
+}
+
+// Send a message to the stream controller by writing to the mailbox memory,
+// and wait for the message to be received/processed
+bool StreamControllerComms::SendMessage(MessageType messageType, void* pPayload, size_t payloadSize) {
+ // The send mailbox occupies the lower half of the mailbox RAM (offset 0)
+ uint32_t sendMessageOffset = 0;
+ MessageHeader* pSendMessage = nullptr;
+ uint32_t messageReadyMagicNumberOffset = 0;
+ // NOTE(review): member offsets computed via a null pointer, as in ReceiveMessage —
+ // offsetof() would be the portable form.
+ uint32_t messageTypeOffset = static_cast<uint32_t>((size_t)&pSendMessage->_messageType);
+ uint32_t sequenceIDOffset = static_cast<uint32_t>((size_t)&pSendMessage->_sequenceID);
+ uint32_t payloadOffset = static_cast<uint32_t>((size_t)&pSendMessage->_payload);
+
+ uint32_t uintMessageType = static_cast<uint32_t>(messageType);
+
+ // Write type, sequence ID and (optional) payload before raising the ready flag
+ _mmdWrapper.WriteToStreamController(
+ _streamControllerInstance, messageTypeOffset, sizeof(uintMessageType), &uintMessageType);
+ _mmdWrapper.WriteToStreamController(
+ _streamControllerInstance, sequenceIDOffset, sizeof(_sendSequenceID), &_sendSequenceID);
+
+ if (payloadSize > 0) {
+ _mmdWrapper.WriteToStreamController(_streamControllerInstance, payloadOffset, payloadSize, pPayload);
+ }
+
+ // Signal the message as ready
+ _mmdWrapper.WriteToStreamController(_streamControllerInstance,
+ messageReadyMagicNumberOffset,
+ sizeof(messageReadyMagicNumber),
+ &messageReadyMagicNumber);
+
+ // Wait until the message has been processed by looking for the sequence ID
+ // in the magic number position (the firmware writes it back as the ack);
+ // poll for up to ~100ms (100 x 1ms).
+ uint32_t waitCount = 0;
+ while (waitCount < 100) {
+ MessageHeader messageHeader;
+ _mmdWrapper.ReadFromStreamController(
+ _streamControllerInstance, sendMessageOffset, sizeof(messageHeader), &messageHeader);
+
+ if (messageHeader._messageReadyMagicNumber == _sendSequenceID) {
+ _sendSequenceID++; // only advance the sequence on a confirmed delivery
+ return true;
+ }
+
+ std::this_thread::sleep_for(std::chrono::milliseconds(1));
+ waitCount++;
+ }
+
+ return false; // firmware never acknowledged within the timeout
+}
+
+// Read the status message payload
+// Read the status message payload from the mailbox into _receivedStatusMessage.
+// Always reports success; payloadOffset is the byte offset of the payload in the mailbox.
+bool StreamControllerComms::StatusMessageHandler(uint32_t payloadOffset) {
+ _mmdWrapper.ReadFromStreamController(
+ _streamControllerInstance, payloadOffset, sizeof(_receivedStatusMessage), &_receivedStatusMessage);
+ return true;
+}
+
+// Parse the status message payload into a string
+// Parse the status message payload into a comma-separated string:
+// "status,statusLineNumber,numReceivedSourceBuffers,numScheduledInferences,numExecutedJobs"
+std::string StreamControllerComms::GetStatusString(Payload<StatusMessagePayload>& statusPayload) {
+ std::ostringstream stringStream;
+ stringStream << static_cast<uint32_t>(statusPayload._status) << "," << statusPayload._statusLineNumber << ","
+ << statusPayload._numReceivedSourceBuffers << "," << statusPayload._numScheduledInferences << ","
+ << statusPayload._numExecutedJobs;
+ return stringStream.str();
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+// BusyFlag is used to prevent concurrent access to the stream controller,
+// without holding a mutex when sending/receiving commands
+using LockGuard = std::lock_guard<std::recursive_mutex>;
+
+// Try to claim the busy flag. Returns false (without blocking) if someone else holds it;
+// the mutex only guards the flag itself, not the subsequent send/receive work.
+bool BusyFlag::Lock() {
+ LockGuard lock(_mutex);
+ if (_busy) {
+ return false;
+ }
+
+ _busy = true;
+ return true;
+}
+
+// Release the busy flag so another caller's BusyCheck can succeed.
+void BusyFlag::Release() {
+ LockGuard lock(_mutex);
+ _busy = false;
+}
+
+// Construction does not lock; the lock attempt happens lazily in operator bool().
+BusyCheck::BusyCheck(BusyFlag& busyFlag) : _busyFlag(busyFlag), _haveLocked(false) {}
+
+// RAII release: only gives the flag back if this BusyCheck actually acquired it.
+BusyCheck::~BusyCheck() {
+ if (_haveLocked) {
+ _busyFlag.Release();
+ }
+}
+
+// Attempt to claim the flag; remembers success so the destructor knows whether to release.
+// Intended usage: BusyCheck bc(flag); if (!bc) return;  (see callers above)
+BusyCheck::operator bool() {
+ bool locked = _busyFlag.Lock();
+ if (locked) {
+ _haveLocked = true;
+ }
+ return locked;
+}
diff --git a/python/openvino/runtime/coredla_device/stream_controller/app/dla_registers.h b/python/openvino/runtime/coredla_device/stream_controller/app/dla_registers.h
new file mode 100644
index 0000000..d77c5ab
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/stream_controller/app/dla_registers.h
@@ -0,0 +1,45 @@
+// Copyright 2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+// NOTE(review): this header uses uint32_t but does not include <stdint.h> and has no
+// include guard / #pragma once — presumably it is only included after stdint by a single
+// translation unit; confirm before reusing it elsewhere.
+//the numbers below are byte addresses, must be a multiple of 4 since each access is 32 bits
+static const uint32_t DLA_DMA_CSR_OFFSET_INTERRUPT_CONTROL = 512; //0x200
+static const uint32_t DLA_DMA_CSR_OFFSET_INTERRUPT_MASK = 516;
+static const uint32_t DLA_DMA_CSR_OFFSET_CONFIG_BASE_ADDR = 528; //0x210
+static const uint32_t DLA_DMA_CSR_OFFSET_CONFIG_RANGE_MINUS_TWO = 532;
+static const uint32_t DLA_DMA_CSR_OFFSET_INPUT_OUTPUT_BASE_ADDR = 536;
+static const uint32_t DLA_DMA_CSR_OFFSET_DESC_DIAGNOSTICS = 540;
+static const uint32_t DLA_DMA_CSR_OFFSET_INTERMEDIATE_BASE_ADDR = 544; //0x220
+static const uint32_t DLA_DMA_CSR_OFFSET_COMPLETION_COUNT = 548;
+static const uint32_t DLA_DMA_CSR_OFFSET_CLOCKS_ACTIVE_LO = 576; //0x240
+static const uint32_t DLA_DMA_CSR_OFFSET_CLOCKS_ACTIVE_HI = 580;
+static const uint32_t DLA_DMA_CSR_OFFSET_CLOCKS_ALL_JOBS_LO = 584;
+static const uint32_t DLA_DMA_CSR_OFFSET_CLOCKS_ALL_JOBS_HI = 588;
+static const uint32_t DLA_DMA_CSR_OFFSET_DEBUG_NETWORK_ADDR = 592; //0x250
+static const uint32_t DLA_DMA_CSR_OFFSET_DEBUG_NETWORK_VALID = 596;
+static const uint32_t DLA_DMA_CSR_OFFSET_DEBUG_NETWORK_DATA = 600;
+
+//bit positions in interrupt control and mask
+static const uint32_t DLA_DMA_CSR_INTERRUPT_ERROR_BIT = 0;
+static const uint32_t DLA_DMA_CSR_INTERRUPT_DONE_BIT = 1;
+
+//bit positions in descriptor diagnostic
+static const uint32_t DLA_DMA_CSR_DESC_DIAGNOSTICS_OVERFLOW_BIT = 0;
+static const uint32_t DLA_DMA_CSR_DESC_DIAGNOSTICS_ALMOST_FULL_BIT = 1;
+static const uint32_t DLA_DMA_CSR_DESC_DIAGNOSTICS_OUT_OF_INFERENCES_BIT = 2;
+
+//descriptor queue
+//runtime knows how many jobs it has enqueued and how many jobs have finished
+//runtime is responsible for not overflowing the descriptor queue, it must limit the number of outstanding jobs queued in hardware
+static const uint32_t DLA_DMA_CSR_DESCRIPTOR_QUEUE_LOGICAL_SIZE = 64; //max number of jobs that runtime can enqueue
+static const uint32_t DLA_DMA_CSR_DESCRIPTOR_QUEUE_WORDS_PER_JOB = 8; //how many words in the queue are needed to enqueue 1 job
+static const uint32_t DLA_DMA_CSR_DESCRIPTOR_QUEUE_PHYSICAL_SIZE = DLA_DMA_CSR_DESCRIPTOR_QUEUE_LOGICAL_SIZE * DLA_DMA_CSR_DESCRIPTOR_QUEUE_WORDS_PER_JOB; //number of words in the hardware queue
diff --git a/python/openvino/runtime/coredla_device/stream_controller/app/message_handlers.c b/python/openvino/runtime/coredla_device/stream_controller/app/message_handlers.c
new file mode 100644
index 0000000..1a12def
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/stream_controller/app/message_handlers.c
@@ -0,0 +1,80 @@
+// Copyright 2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#include "message_handlers.h"
+#include "stream_controller_messages.h"
+
+// Handle MessageType_InitializeStreamController: unpack the three init parameters and
+// forward them to the controller, then acknowledge with a NoOperation reply.
+bool InitializeStreamControllerMessageHandler(StreamController* this, volatile uint32_t* pPayload)
+{
+ InitializeStreamControllerPayload* pInitializePayload = (InitializeStreamControllerPayload*)pPayload;
+ this->InitializeStreamController(this,
+ pInitializePayload->_sourceBufferSize,
+ pInitializePayload->_dropSourceBuffers,
+ pInitializePayload->_numInferenceRequests);
+ this->SendMessage(this, MessageType_NoOperation, NULL, 0);
+ return true;
+}
+
+// Handle MessageType_ScheduleItem: register the inference-request payload with the
+// controller and acknowledge with NoOperation.
+bool ScheduleItemMessageHandler(StreamController* this, volatile uint32_t* pPayload)
+{
+ volatile CoreDlaJobPayload* pCoreDlaJobPayload = (volatile CoreDlaJobPayload*)pPayload;
+ this->NewInferenceRequestReceived(this, pCoreDlaJobPayload);
+ this->SendMessage(this, MessageType_NoOperation, NULL, 0);
+ return true;
+}
+
+// Handle MessageType_Ping: reply with Pong so the host can detect a live firmware.
+bool PingMessageHandler(StreamController* this, volatile uint32_t* pPayload)
+{
+ this->SendMessage(this, MessageType_Pong, NULL, 0);
+ return true;
+}
+
+// Handle MessageType_GetStatus: snapshot the controller's counters into a Status reply.
+bool GetStatusMessageHandler(StreamController* this, volatile uint32_t* pPayload)
+{
+ StatusMessagePayload statusMessagePayload;
+ statusMessagePayload._status = this->_status;
+ statusMessagePayload._statusLineNumber = this->_statusLineNumber;
+ statusMessagePayload._numReceivedSourceBuffers = this->_numReceivedSourceBuffers;
+ statusMessagePayload._numScheduledInferences = this->_numScheduledInferences;
+ statusMessagePayload._numExecutedJobs = this->_numExecutedJobs;
+ this->SendMessage(this, MessageType_Status, &statusMessagePayload, sizeof(statusMessagePayload));
+ return true;
+}
+
+// Debug path: manually arm a single DMA transfer into _debugJob (bypassing the normal
+// job queue) and acknowledge with NoOperation. Also overwrites _sourceBufferSize.
+bool ManualArmDmaTransferMessageHandler(StreamController* this, volatile uint32_t* pPayload)
+{
+ ManualArmDmaTransferPayload* pManualArmDmaTransferPayload = (ManualArmDmaTransferPayload*)pPayload;
+ CoreDlaJobItem emptyJob = {};
+ this->_debugJob = emptyJob;
+ this->_debugJob._payload._inputAddressDDR = pManualArmDmaTransferPayload->_inputAddressDDR;
+ this->_sourceBufferSize = pManualArmDmaTransferPayload->_sourceBufferSize;
+ bool fromHPS = (pManualArmDmaTransferPayload->_fromHPS != 0);
+ this->ArmDmaTransfer(this, &this->_debugJob, fromHPS);
+ this->SendMessage(this, MessageType_NoOperation, NULL, 0);
+ return true;
+}
+
+// Debug path: manually schedule one DLA inference using addresses supplied by the host,
+// staged through _debugJob, then acknowledge with NoOperation.
+bool ManualScheduleDlaInferenceMessageHandler(StreamController* this, volatile uint32_t* pPayload)
+{
+ ManualScheduleDlaInferencePayload* pManualScheduleDlaInferencePayload = (ManualScheduleDlaInferencePayload*)pPayload;
+ CoreDlaJobItem emptyJob = {};
+ this->_debugJob = emptyJob;
+ this->_debugJob._payload._configurationBaseAddressDDR = pManualScheduleDlaInferencePayload->_configurationBaseAddressDDR;
+ this->_debugJob._payload._configurationSize = pManualScheduleDlaInferencePayload->_configurationSize;
+ this->_debugJob._payload._inputAddressDDR = pManualScheduleDlaInferencePayload->_inputAddressDDR;
+ this->ScheduleDlaInference(this, &this->_debugJob);
+ this->SendMessage(this, MessageType_NoOperation, NULL, 0);
+ return true;
+}
+
+
diff --git a/python/openvino/runtime/coredla_device/stream_controller/app/message_handlers.h b/python/openvino/runtime/coredla_device/stream_controller/app/message_handlers.h
new file mode 100644
index 0000000..a7e5187
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/stream_controller/app/message_handlers.h
@@ -0,0 +1,22 @@
+// Copyright 2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#pragma once
+#include "stream_controller.h"
+
+// One handler per host->firmware message type; each unpacks its payload, acts on the
+// StreamController, sends the reply message, and returns true on success.
+// NOTE(review): using `this` as a parameter name is legal C but makes these headers
+// un-includable from C++ translation units.
+extern bool InitializeStreamControllerMessageHandler(StreamController* this, volatile uint32_t* pPayload);
+extern bool ScheduleItemMessageHandler(StreamController* this, volatile uint32_t* pPayload);
+extern bool PingMessageHandler(StreamController* this, volatile uint32_t* pPayload);
+extern bool GetStatusMessageHandler(StreamController* this, volatile uint32_t* pPayload);
+extern bool ManualArmDmaTransferMessageHandler(StreamController* this, volatile uint32_t* pPayload);
+extern bool ManualScheduleDlaInferenceMessageHandler(StreamController* this, volatile uint32_t* pPayload);
diff --git a/python/openvino/runtime/coredla_device/stream_controller/app/stream_controller.c b/python/openvino/runtime/coredla_device/stream_controller/app/stream_controller.c
new file mode 100644
index 0000000..ad8b372
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/stream_controller/app/stream_controller.c
@@ -0,0 +1,426 @@
+// Copyright 2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#include "stream_controller.h"
+#include "message_handlers.h"
+#include "sys/alt_cache.h"
+#include "dla_registers.h"
+#include <string.h>
+
+static const uint32_t messageReadyMagicNumber = 0x55225522;
+static const uint32_t mailboxBaseAddress = 0x40000;
+static const uint32_t mailboxSize = 0x1000;
+static const uint32_t dlaBaseAddress = 0x30000;
+
+static void Start(StreamController* this);
+static void Reset(StreamController* this);
+static bool InitializeMsgDma(StreamController* this);
+static bool ArmDmaTransfer(StreamController* this, CoreDlaJobItem* pFillJob, bool fromHPS);
+static void RunEventLoop(StreamController* this);
+static void WriteToDlaCsr(StreamController* this, uint32_t addr, uint32_t data);
+static void InitializeStreamController(StreamController* this, uint32_t sourceBufferSize, uint32_t dropSourceBuffers, uint32_t numInferenceRequests);
+static void SetStatus(StreamController* this, NiosStatusType statusType, uint32_t lineNumber);
+static MessageType ReceiveMessage(StreamController* this, volatile MessageHeader* pReceiveMessage);
+static bool SendMessage(StreamController* this,
+ MessageType messageType,
+ void* pPayload,
+ size_t payloadSize);
+static void NewSourceBuffer(StreamController* this);
+static void ScheduleDlaInference(StreamController* this, CoreDlaJobItem* pJob);
+static void NewInferenceRequestReceived(StreamController* this, volatile CoreDlaJobPayload* pJobPayload);
+static void MsgDmaIsr(void* pContext);
+
+int main()
+{
+ // NOTE(review): `= {}` empty-brace initialization in C is a C23/GNU extension — confirm
+ // the NIOS toolchain settings; `= {0}` would be the portable C spelling.
+ StreamController streamController = {};
+ StreamController* this = &streamController;
+
+ // Wire up the object's "vtable" by hand: C-style OOP where every method is a function
+ // pointer stored on the struct and receives `this` explicitly.
+ this->Start = Start;
+ this->Reset = Reset;
+ this->InitializeMsgDma = InitializeMsgDma;
+ this->ArmDmaTransfer = ArmDmaTransfer;
+ this->RunEventLoop = RunEventLoop;
+ this->WriteToDlaCsr = WriteToDlaCsr;
+ this->InitializeStreamController = InitializeStreamController;
+ this->SetStatus = SetStatus;
+ this->ReceiveMessage = ReceiveMessage;
+ this->SendMessage = SendMessage;
+ this->NewSourceBuffer = NewSourceBuffer;
+ this->ScheduleDlaInference = ScheduleDlaInference;
+ this->NewInferenceRequestReceived = NewInferenceRequestReceived;
+
+ // Message handlers
+ this->GetStatusMessageHandler = GetStatusMessageHandler;
+ this->ScheduleItemMessageHandler = ScheduleItemMessageHandler;
+ this->PingMessageHandler = PingMessageHandler;
+ this->InitializeStreamControllerMessageHandler = InitializeStreamControllerMessageHandler;
+ this->ManualArmDmaTransferMessageHandler = ManualArmDmaTransferMessageHandler;
+ this->ManualScheduleDlaInferenceMessageHandler = ManualScheduleDlaInferenceMessageHandler;
+
+ // Start() enters RunEventLoop(), which never returns in normal operation
+ this->Reset(this);
+ this->Start(this);
+
+ return 0;
+}
+
+// Zero the shared mailbox RAM, bring up the msgdma driver, and enter the event loop.
+// Returns only if msgdma initialization fails (status already recorded by SetStatus).
+static void Start(StreamController* this)
+{
+ // Clear the mailbox memory
+ uint8_t* pMailbox = (uint8_t*)(mailboxBaseAddress);
+ memset(pMailbox, 0, mailboxSize);
+
+ if (this->InitializeMsgDma(this))
+ {
+ // Run the main event loop
+ this->RunEventLoop(this);
+ }
+}
+
+// Open the msgdma device, register MsgDmaIsr as its completion callback (with `this` as
+// the ISR context), and flush the data cache. Returns false and records
+// NiosStatusType_MsgDmaFailed when the device cannot be opened.
+static bool InitializeMsgDma(StreamController* this)
+{
+ this->_pMsgDevice = alt_msgdma_open(DLA_MSGDMA_0_CSR_NAME);
+ if (this->_pMsgDevice)
+ {
+ alt_msgdma_register_callback(this->_pMsgDevice, MsgDmaIsr, 0, this);
+ alt_dcache_flush_all();
+ return true;
+ }
+ else
+ {
+ this->SetStatus(this, NiosStatusType_MsgDmaFailed, __LINE__);
+ return false;
+ }
+}
+
+// Arm an asynchronous msgdma transfer of _sourceBufferSize bytes into pFillJob's input
+// buffer; completion raises MsgDmaIsr. fromHPS selects transfer direction:
+// true = streaming source into memory (st_to_mm), false = memory out to stream (mm_to_st).
+// Returns true when the descriptor was built and queued successfully.
+static bool ArmDmaTransfer(StreamController* this, CoreDlaJobItem* pFillJob, bool fromHPS)
+{
+ this->_pFillingImageJob = pFillJob;
+
+ // NOTE(review): the job's DDR address is cast directly to a CPU pointer — assumes the
+ // DDR is mapped 1:1 into the NIOS address space; confirm against the platform design.
+ alt_u32* pWriteBuffer = (alt_u32*)this->_pFillingImageJob->_payload._inputAddressDDR;
+ alt_u32 length = this->_sourceBufferSize;
+ alt_u32 control = ALTERA_MSGDMA_DESCRIPTOR_CONTROL_TRANSFER_COMPLETE_IRQ_MASK;
+
+ int r = 0;
+ if (fromHPS)
+ {
+ r = alt_msgdma_construct_extended_st_to_mm_descriptor(this->_pMsgDevice,
+ &this->_msgdmaDescriptor,
+ pWriteBuffer,
+ length,
+ control,
+ 0,
+ 0,
+ 1);
+ }
+ else
+ {
+ r = alt_msgdma_construct_extended_mm_to_st_descriptor(this->_pMsgDevice,
+ &this->_msgdmaDescriptor,
+ pWriteBuffer,
+ length,
+ control,
+ 0,
+ 0,
+ 1);
+ }
+
+ if (r == 0)
+ {
+ // Queue the descriptor; the transfer completes asynchronously via MsgDmaIsr
+ r = alt_msgdma_extended_descriptor_async_transfer(this->_pMsgDevice, &this->_msgdmaDescriptor);
+ if (r != 0)
+ {
+ this->SetStatus(this, NiosStatusType_AsyncTransferFailed, __LINE__);
+ }
+ }
+ else
+ {
+ this->SetStatus(this, NiosStatusType_BadDescriptor, __LINE__);
+ }
+
+ return (r == 0);
+}
+
+// Main firmware loop (never returns): busy-polls for (a) DMA-completion ticks signalled
+// by MsgDmaIsr via _isrCount and (b) host messages flagged in the mailbox by the magic
+// number.
+static void RunEventLoop(StreamController* this)
+{
+ volatile MessageHeader* pReceiveMessage = (MessageHeader*)(mailboxBaseAddress);
+
+ // NOTE(review): _isrCount is written from interrupt context; if it is not declared
+ // volatile in the header, the compiler may hoist this read out of the loop — confirm.
+ uint32_t previousIsrCount = this->_isrCount;
+
+ while (true)
+ {
+ uint32_t isrCount = this->_isrCount;
+
+ if (isrCount != previousIsrCount)
+ {
+ this->NewSourceBuffer(this);
+ }
+
+ if (pReceiveMessage->_messageReadyMagicNumber == messageReadyMagicNumber)
+ {
+ this->ReceiveMessage(this, pReceiveMessage);
+ }
+
+ previousIsrCount = isrCount;
+ }
+}
+
+// Dispatch one host message to its handler, then acknowledge it by overwriting the
+// mailbox magic-number slot with the message's sequence ID (the host polls for exactly
+// that value in StreamControllerComms::SendMessage). Also checks that sequence IDs are
+// consecutive, tolerating a restart of the DLA plugin (Initialize with ID 0).
+static MessageType ReceiveMessage(StreamController* this, volatile MessageHeader* pReceiveMessage)
+{
+ MessageType messageType = pReceiveMessage->_messageType;
+ uint32_t sequenceId = pReceiveMessage->_sequenceID;
+ this->_commandCounter++;
+
+ bool ok = false;
+
+ volatile uint32_t* pPayload = &pReceiveMessage->_payload;
+
+ // Unknown message types fall through with ok == false and are recorded as BadMessage
+ if (messageType == MessageType_GetStatus)
+ ok = this->GetStatusMessageHandler(this, pPayload);
+ else if (messageType == MessageType_ScheduleItem)
+ ok = this->ScheduleItemMessageHandler(this, pPayload);
+ else if (messageType == MessageType_Ping)
+ ok = this->PingMessageHandler(this, pPayload);
+ else if (messageType == MessageType_InitializeStreamController)
+ ok = this->InitializeStreamControllerMessageHandler(this, pPayload);
+ else if (messageType == MessageType_ManualArmDmaTransfer)
+ ok = this->ManualArmDmaTransferMessageHandler(this, pPayload);
+ else if (messageType == MessageType_ManualScheduleDlaInference)
+ ok = this->ManualScheduleDlaInferenceMessageHandler(this, pPayload);
+
+ if (!ok)
+ this->SetStatus(this, NiosStatusType_BadMessage, __LINE__);
+
+ // Ack: the host treats sequenceId appearing in the magic-number slot as "processed"
+ pReceiveMessage->_messageReadyMagicNumber = sequenceId;
+
+ if ((this->_lastReceiveSequenceID != 0) && ((this->_lastReceiveSequenceID + 1) != sequenceId))
+ {
+ // If the DLA plugin has restarted, the first message will be InitializeStreamController
+ // with a sequence ID of 0
+ if ((sequenceId != 0) || (messageType != MessageType_InitializeStreamController))
+ this->SetStatus(this, NiosStatusType_BadMessageSequence, __LINE__);
+ }
+
+ this->_lastReceiveSequenceID = sequenceId;
+ return messageType;
+}
+
+// Publish a Nios-to-host message in the send half of the mailbox. The payload
+// (if any) is copied first and the magic number is written last so the host
+// only observes a fully-formed message. Always returns true.
+static bool SendMessage(StreamController* this,
+                        MessageType messageType,
+                        void *pPayload,
+                        size_t payloadSize)
+{
+    // The mailbox is split in half: receive at the base, send at base + half.
+    uint32_t mailboxSendAddress = mailboxBaseAddress + (mailboxSize / 2);
+    uint32_t* pMailbox = (uint32_t*)mailboxSendAddress;
+    MessageHeader* pSendMessage = (MessageHeader*)(pMailbox);
+    void* pPayloadDestination = &pSendMessage->_payload;
+
+    pSendMessage->_messageType = messageType;
+    pSendMessage->_sequenceID = this->_sendSequenceID;
+
+    if (payloadSize > 0)
+        memcpy(pPayloadDestination, pPayload, payloadSize);
+
+    // Signal the message as ready
+    // NOTE(review): pSendMessage is not volatile-qualified, so the compiler
+    // could in principle reorder this store relative to the payload writes -
+    // confirm the toolchain/BSP guarantees ordering here.
+    pSendMessage->_messageReadyMagicNumber = messageReadyMagicNumber;
+
+    this->_sendSequenceID++;
+    return true;
+}
+
+// We have received a new source buffer via the msgdma
+// Book-keeping for the buffer that just landed: optionally drop it according
+// to the configured cadence, re-arm the DMA for the next capture, and
+// schedule an inference if the DLA input FIFO has room (at most two jobs may
+// be scheduled at once).
+static void NewSourceBuffer(StreamController* this)
+{
+    // Read the response to flush the buffer
+    CoreDlaJobItem* pJustFilledJob = this->_pFillingImageJob;
+    CoreDlaJobItem* pNextFillJob = NULL;
+
+    // Sequence number of this buffer, used for the drop cadence below.
+    uint32_t bufferSequence = this->_numReceivedSourceBuffers;
+    this->_numReceivedSourceBuffers++;
+
+    // Have we just captured a manually armed DMA transfer?
+    // Debug captures are not part of the streaming pipeline, so stop here.
+    if (pJustFilledJob == &this->_debugJob)
+        return;
+
+    if (this->_dropSourceBuffers > 0)
+    {
+        // If _dropSourceBuffers = 1, we process 1, drop 1 etc
+        // if _dropSourceBuffers = 2, we process 1, drop 2, process 1, drop 2 etc
+        if (bufferSequence % (this->_dropSourceBuffers + 1) != 0)
+        {
+            // Drop this buffer, capture the next one in its place
+            this->ArmDmaTransfer(this, pJustFilledJob, true);
+            return;
+        }
+    }
+
+    pJustFilledJob->_hasSourceBuffer = true;
+
+    if (pJustFilledJob->_pNextJob->_hasSourceBuffer)
+    {
+        // No space in the next job, so keep filling the same job
+        pNextFillJob = pJustFilledJob;
+
+        // It already has a buffer but we have to
+        // consider this as dropped as we will write another
+        // in its place
+        pNextFillJob->_hasSourceBuffer = false;
+    }
+    else
+    {
+        pNextFillJob = pJustFilledJob->_pNextJob;
+    }
+
+    // Re-arm the DMA transfer
+    this->ArmDmaTransfer(this, pNextFillJob, true);
+
+    // If there are less than two scheduled buffers, then we can schedule another one
+    // _pNextInferenceRequestJob is the executing job if it is marked as scheduled
+
+    uint32_t nScheduled = 0;
+    if (this->_pNextInferenceRequestJob->_scheduledWithDLA)
+        nScheduled++;
+    if (this->_pNextInferenceRequestJob->_pNextJob->_scheduledWithDLA)
+        nScheduled++;
+
+    if (nScheduled < 2)
+        this->ScheduleDlaInference(this, pJustFilledJob);
+}
+
+// Handle a new inference request from the host. The first
+// _totalNumInferenceRequests requests prime the job ring; once all have been
+// seen the controller enters the running state and starts capturing input
+// DMAs. After that, each arriving request recycles the job slot that just
+// completed and may schedule the next ready job with the DLA.
+static void NewInferenceRequestReceived(StreamController* this, volatile CoreDlaJobPayload* pJobPayload)
+{
+    // Once we have received all '_totalNumInferenceRequests' inference requests,
+    // we set the state to running and can now capture the input dma's
+    bool wasRunning = this->_running;
+    this->_numInferenceRequests++;
+    this->_running = (this->_numInferenceRequests >= this->_totalNumInferenceRequests);
+
+    CoreDlaJobItem* pThisJob = this->_pNextInferenceRequestJob;
+
+    // Store the job details and move to the next
+    uint32_t previousAddress = pThisJob->_payload._inputAddressDDR;
+    pThisJob->_payload = *pJobPayload;
+
+    // This job has just completed so clear its state
+    pThisJob->_scheduledWithDLA = false;
+    pThisJob->_hasSourceBuffer = false;
+
+    // The jobs are recycled by the DLA plugin so the inputAddrDDR should
+    // stay the same for each _jobs[n]
+    // (previousAddress == 0 means this slot has not been used yet)
+    if ((pThisJob->_payload._inputAddressDDR != previousAddress) && (previousAddress != 0))
+        this->SetStatus(this, NiosStatusType_Error, __LINE__);
+
+    this->_pNextInferenceRequestJob = this->_pNextInferenceRequestJob->_pNextJob;
+
+    if (wasRunning)
+    {
+        this->_numExecutedJobs++;
+
+        // Check if we have any jobs ready to be scheduled. Maximum of 2 can have _scheduledWithDLA set
+        if (!this->_pNextInferenceRequestJob->_scheduledWithDLA && this->_pNextInferenceRequestJob->_hasSourceBuffer)
+        {
+            this->ScheduleDlaInference(this, this->_pNextInferenceRequestJob);
+        }
+        else if (!this->_pNextInferenceRequestJob->_pNextJob->_scheduledWithDLA && this->_pNextInferenceRequestJob->_pNextJob->_hasSourceBuffer)
+        {
+            this->ScheduleDlaInference(this, this->_pNextInferenceRequestJob->_pNextJob);
+        }
+    }
+    else if (this->_running)
+    {
+        // We have just started running
+        // Arm the DMA transfer to start receiving source buffers
+        this->ArmDmaTransfer(this, &this->_jobs[0], true);
+    }
+}
+
+// Mark a job as scheduled and program the DLA CSRs with its configuration
+// and input/output addresses. NOTE(review): the CSR write order is assumed
+// to matter (setting a base address register appears to be what enqueues
+// the request, per the comment below) - keep these writes in this order.
+static void ScheduleDlaInference(StreamController* this, CoreDlaJobItem* pJob)
+{
+    // The DLA has an input FIFO. By setting the base address register,
+    // we add this request to the FIFO
+    pJob->_scheduledWithDLA = true;
+    this->_numScheduledInferences++;
+
+    CoreDlaJobPayload* pJobPayload = &pJob->_payload;
+    this->WriteToDlaCsr(this, DLA_DMA_CSR_OFFSET_CONFIG_BASE_ADDR, pJobPayload->_configurationBaseAddressDDR);
+    this->WriteToDlaCsr(this, DLA_DMA_CSR_OFFSET_CONFIG_RANGE_MINUS_TWO, pJobPayload->_configurationSize);
+    this->WriteToDlaCsr(this, DLA_DMA_CSR_OFFSET_INPUT_OUTPUT_BASE_ADDR, pJobPayload->_inputAddressDDR);
+}
+
+// Record a status code together with the source line that reported it.
+// The most recent report wins; statuses are not queued.
+static void SetStatus(StreamController* this, NiosStatusType statusType, uint32_t lineNumber)
+{
+    this->_statusLineNumber = lineNumber;
+    this->_status = statusType;
+}
+
+// Configure the controller from the host's InitializeStreamController
+// message and (re)allocate the job ring.
+//   sourceBufferSize      - size of each captured source buffer
+//   dropSourceBuffers     - drop N buffers for every one processed
+//   numInferenceRequests  - number of job slots / initial requests expected
+static void InitializeStreamController(StreamController* this,
+                                       uint32_t sourceBufferSize,
+                                       uint32_t dropSourceBuffers,
+                                       uint32_t numInferenceRequests)
+{
+    // This is called once when the inference app is run,
+    // so acts like a reset
+    this->_sourceBufferSize = sourceBufferSize;
+    this->_dropSourceBuffers = dropSourceBuffers;
+    this->_totalNumInferenceRequests = numInferenceRequests;
+
+    // NOTE(review): if the DLA plugin restarts, this message arrives again
+    // (see the sequence-ID handling in ReceiveMessage) and the previous
+    // _jobs allocation is leaked. A free(this->_jobs) here would fix that,
+    // provided _jobs is guaranteed to start out NULL.
+    this->_jobs = malloc(sizeof(CoreDlaJobItem) * this->_totalNumInferenceRequests);
+
+    // malloc can fail on a small embedded heap; report it instead of letting
+    // Reset() write through a NULL pointer.
+    if (this->_jobs == NULL)
+    {
+        this->SetStatus(this, NiosStatusType_Error, __LINE__);
+        return;
+    }
+
+    // Reset any previous state
+    this->Reset(this);
+}
+
+// Restore the controller to its freshly-initialized state: zero every job,
+// stitch the jobs into a circular doubly-linked ring, and clear all
+// counters, cursors and flags.
+static void Reset(StreamController* this)
+{
+    CoreDlaJobItem blankJob = {};
+    uint32_t jobCount = this->_totalNumInferenceRequests;
+
+    // Build the circular job ring; neighbour indices wrap via modulo.
+    for (uint32_t i = 0; i < jobCount; i++)
+    {
+        this->_jobs[i] = blankJob;
+        this->_jobs[i]._index = i;
+        this->_jobs[i]._pPreviousJob = &this->_jobs[(i + jobCount - 1) % jobCount];
+        this->_jobs[i]._pNextJob = &this->_jobs[(i + 1) % jobCount];
+    }
+
+    // Both cursors start at the first job slot.
+    this->_pNextInferenceRequestJob = &this->_jobs[0];
+    this->_pFillingImageJob = &this->_jobs[0];
+
+    // Clear status and statistics.
+    this->_status = NiosStatusType_OK;
+    this->_statusLineNumber = 0;
+    this->_commandCounter = 0;
+    this->_numInferenceRequests = 0;
+    this->_numExecutedJobs = 0;
+    this->_numScheduledInferences = 0;
+    this->_lastReceiveSequenceID = 0;
+    this->_sendSequenceID = 0;
+    this->_running = false;
+    this->_isrCount = 0;
+    this->_numReceivedSourceBuffers = 0;
+}
+
+// Write one 32-bit value into a memory-mapped DLA CSR at dlaBaseAddress+addr.
+// The pointer must be volatile-qualified: these are hardware registers, and
+// without volatile the compiler is free to elide or reorder the consecutive
+// CSR writes that ScheduleDlaInference depends on.
+static void WriteToDlaCsr(StreamController* this, uint32_t addr, uint32_t data)
+{
+    volatile uint32_t* pRegister = (volatile uint32_t*)(dlaBaseAddress + addr);
+    pRegister[0] = data;
+}
+
+// msgdma completion callback. It runs in interrupt context, so it does the
+// minimum possible work: bump a volatile counter. RunEventLoop polls the
+// counter and calls NewSourceBuffer when it changes.
+static void MsgDmaIsr(void* pContext)
+{
+    StreamController* pController = (StreamController*)pContext;
+    pController->_isrCount = pController->_isrCount + 1;
+}
+
+
diff --git a/python/openvino/runtime/coredla_device/stream_controller/app/stream_controller.h b/python/openvino/runtime/coredla_device/stream_controller/app/stream_controller.h
new file mode 100644
index 0000000..8b19066
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/stream_controller/app/stream_controller.h
@@ -0,0 +1,86 @@
+// Copyright 2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+#include "altera_msgdma.h"
+#include "system.h"
+#include "stream_controller_messages.h"
+
+// One slot in the circular ring of inference jobs. Each slot tracks whether
+// it currently holds a captured source buffer and whether it has been pushed
+// into the DLA's input FIFO.
+typedef struct CoreDlaJobItem
+{
+    uint32_t _index;                       // Position of this slot in the _jobs array
+    bool _hasSourceBuffer;                 // A source buffer has been captured into this job
+    bool _scheduledWithDLA;                // Job has been queued in the DLA input FIFO
+    CoreDlaJobPayload _payload;            // DDR addresses/sizes for this inference
+    struct CoreDlaJobItem* _pPreviousJob;  // Neighbours in the circular job ring
+    struct CoreDlaJobItem* _pNextJob;
+} CoreDlaJobItem;
+
+// Central state of the Nios stream controller. Behaviour is exposed through
+// function pointers (C-style virtual dispatch) so the implementation can be
+// bound at startup; every function receives the controller as 'this'.
+typedef struct StreamController
+{
+    void (*Start)(struct StreamController* this);
+    void (*Reset)(struct StreamController* this);
+    bool (*InitializeMsgDma)(struct StreamController* this);
+    // Arm the msgdma to capture the next source buffer into pFillJob.
+    bool (*ArmDmaTransfer)(struct StreamController* this, CoreDlaJobItem* pFillJob, bool fromHPS);
+    // Main polling loop; never returns.
+    void (*RunEventLoop)(struct StreamController* this);
+    // Write one word to a memory-mapped DLA CSR.
+    void (*WriteToDlaCsr)(struct StreamController* this, uint32_t addr, uint32_t data);
+    void (*InitializeStreamController)(struct StreamController* this,
+                                       uint32_t sourceBufferSize,
+                                       uint32_t dropSourceBuffers,
+                                       uint32_t numInferenceRequests);
+    // Record an error status plus the source line that raised it.
+    void (*SetStatus)(struct StreamController* this,
+                      NiosStatusType statusType, uint32_t lineNumber);
+    MessageType (*ReceiveMessage)(struct StreamController *this, volatile MessageHeader* pReceiveMessage);
+    bool (*SendMessage)(struct StreamController* this,
+                        MessageType messageType,
+                        void* pPayload,
+                        size_t payloadSize);
+    void (*NewSourceBuffer)(struct StreamController* this);
+    void (*ScheduleDlaInference)(struct StreamController* this, CoreDlaJobItem* pJob);
+    void (*NewInferenceRequestReceived)(struct StreamController* this, volatile CoreDlaJobPayload* pJob);
+
+    // Message handlers
+    bool (*GetStatusMessageHandler)(struct StreamController* this, volatile uint32_t* pPayload);
+    bool (*ScheduleItemMessageHandler)(struct StreamController* this, volatile uint32_t* pPayload);
+    bool (*PingMessageHandler)(struct StreamController* this, volatile uint32_t* pPayload);
+    bool (*InitializeStreamControllerMessageHandler)(struct StreamController* this, volatile uint32_t* pPayload);
+    bool (*ManualArmDmaTransferMessageHandler)(struct StreamController* this, volatile uint32_t* pPayload);
+    bool (*ManualScheduleDlaInferenceMessageHandler)(struct StreamController* this, volatile uint32_t* pPayload);
+
+    CoreDlaJobItem* _jobs;                     // malloc'd ring of job slots (_totalNumInferenceRequests entries)
+    CoreDlaJobItem* _pNextInferenceRequestJob; // Slot the next host request will recycle
+    CoreDlaJobItem* _pFillingImageJob;         // Slot currently targeted by the armed DMA
+    CoreDlaJobItem _debugJob;                  // Scratch slot for manually armed (debug) transfers
+    NiosStatusType _status;                    // Last reported status (see SetStatus)
+    uint32_t _statusLineNumber;                // __LINE__ of the last SetStatus call
+    uint32_t _commandCounter;                  // Total mailbox messages processed
+    uint32_t _sourceBufferSize;
+    uint32_t _dropSourceBuffers;               // Drop this many buffers for every one processed
+    uint32_t _totalNumInferenceRequests;       // Job slots / requests expected before running
+    uint32_t _numInferenceRequests;            // Requests received so far
+    uint32_t _numExecutedJobs;
+    uint32_t _numScheduledInferences;
+    uint32_t _lastReceiveSequenceID;           // For detecting dropped host messages
+    uint32_t _sendSequenceID;
+    bool _running;                             // True once all initial requests have arrived
+    uint32_t _numReceivedSourceBuffers;
+    volatile uint32_t _isrCount;               // Bumped by MsgDmaIsr; polled in RunEventLoop
+    alt_msgdma_dev* _pMsgDevice;               // HAL msgdma device handle
+    alt_msgdma_extended_descriptor _msgdmaDescriptor;
+} StreamController;
diff --git a/python/openvino/runtime/coredla_device/stream_controller/app/stream_controller_messages.h b/python/openvino/runtime/coredla_device/stream_controller/app/stream_controller_messages.h
new file mode 100644
index 0000000..3891326
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/stream_controller/app/stream_controller_messages.h
@@ -0,0 +1,90 @@
+// Copyright 2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#pragma once
+#include <stdint.h>
+
+// Message types exchanged through the shared mailbox between the DLA plugin
+// (host) and the Nios stream controller. ReceiveMessage dispatches the
+// host-to-Nios types to their handlers; Status and Pong are presumably the
+// corresponding replies (sent via SendMessage) - confirm against the host.
+typedef enum
+{
+    MessageType_Invalid,
+    MessageType_NoOperation,
+    MessageType_GetStatus,
+    MessageType_Status,
+    MessageType_ScheduleItem,                 // New inference request (CoreDlaJobPayload)
+    MessageType_Ping,
+    MessageType_Pong,
+    MessageType_InitializeStreamController,   // Carries InitializeStreamControllerPayload
+    MessageType_ManualArmDmaTransfer,         // Debug path: ManualArmDmaTransferPayload
+    MessageType_ManualScheduleDlaInference    // Debug path: ManualScheduleDlaInferencePayload
+} MessageType;
+
+// Status codes reported to the host via SetStatus / the status message.
+// Values start at 1000 (presumably so they are distinguishable from zeroed
+// memory and other small integers - unconfirmed).
+typedef enum
+{
+    NiosStatusType_OK = 1000,
+    NiosStatusType_Error,
+    NiosStatusType_BadMessage,          // Unknown message type or handler failure
+    NiosStatusType_BadMessageSequence,  // Gap detected in host sequence IDs
+    NiosStatusType_BadDescriptor,
+    NiosStatusType_AsyncTransferFailed,
+    NiosStatusType_MsgDmaFailed,
+    NiosStatusType_InvalidParameter
+} NiosStatusType;
+
+// Layout of a mailbox message. _messageReadyMagicNumber doubles as the
+// handshake: the sender writes messageReadyMagicNumber last, and the
+// receiver overwrites it (with the sequence ID) to acknowledge. _payload is
+// the first word of the message-type-specific payload that follows.
+typedef struct
+{
+    uint32_t _messageReadyMagicNumber;
+    uint32_t _messageType;   // A MessageType value
+    uint32_t _sequenceID;    // Monotonically increasing per sender
+    uint32_t _payload;       // First payload word; handlers index from here
+} MessageHeader;
+
+// Message payloads:
+
+// Describes one inference job: where its compiled configuration and its
+// input/output buffers live in DDR.
+typedef struct
+{
+    uint32_t _configurationBaseAddressDDR;
+    uint32_t _configurationSize;
+    uint32_t _inputAddressDDR;
+    uint32_t _outputAddressDDR;
+} CoreDlaJobPayload;
+
+// Payload of MessageType_InitializeStreamController; mirrors the parameters
+// of InitializeStreamController().
+typedef struct
+{
+    uint32_t _sourceBufferSize;
+    uint32_t _dropSourceBuffers;
+    uint32_t _numInferenceRequests;
+} InitializeStreamControllerPayload;
+
+// Snapshot of controller state/statistics returned for GetStatus.
+typedef struct
+{
+    NiosStatusType _status;
+    uint32_t _statusLineNumber;
+    uint32_t _numReceivedSourceBuffers;
+    uint32_t _numScheduledInferences;
+    uint32_t _numExecutedJobs;
+} StatusMessagePayload;
+
+// Debug payload: manually arm a single DMA capture.
+typedef struct
+{
+    uint32_t _sourceBufferSize;
+    uint32_t _inputAddressDDR;
+    uint32_t _fromHPS;
+} ManualArmDmaTransferPayload;
+
+// Debug payload: manually push one inference into the DLA FIFO.
+typedef struct
+{
+    uint32_t _configurationBaseAddressDDR;
+    uint32_t _configurationSize;
+    uint32_t _inputAddressDDR;
+} ManualScheduleDlaInferencePayload;
+
diff --git a/python/openvino/runtime/coredla_device/stream_controller/build.sh b/python/openvino/runtime/coredla_device/stream_controller/build.sh
new file mode 100755
index 0000000..2d22c5e
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/stream_controller/build.sh
@@ -0,0 +1,54 @@
+#! /bin/bash
+# Run in Nios V Command Shell, Quartus Prime 22.4 or later
+#
+# Builds the stream controller firmware hex image:
+#   1. Export the BSP from the Quartus project
+#   2. Generate the application build files
+#   3. Build stream_controller.elf and convert it to a .hex init file
+# All path arguments are quoted so paths containing spaces work.
+
+quartus_project=$1
+qsys_file=$2
+hex_file=$3
+
+usage()
+{
+    echo "Usage:"
+    echo " build.sh <quartus_project_file> <qsys_file> <destination_hex_file>"
+}
+
+if [ -z "$quartus_project" ]; then
+    usage
+    exit 1
+fi
+
+if [ -z "$qsys_file" ]; then
+    usage
+    exit 1
+fi
+
+if [ -z "$hex_file" ]; then
+    usage
+    exit 1
+fi
+
+if [ ! -f "$quartus_project" ]; then
+    echo Quartus project file not found "$quartus_project"
+    usage
+    exit 1
+fi
+
+if [ ! -f "$qsys_file" ]; then
+    echo qsys file not found "$qsys_file"
+    usage
+    exit 1
+fi
+
+# Export the bsp folder from the Quartus project, create the
+# CMakeFiles.txt for the application, build the app, then
+# build the stream_controller.hex binary, in the 'build' folder
+
+niosv-bsp -c --quartus-project="$quartus_project" --qsys="$qsys_file" --type=hal bsp/settings.bsp
+niosv-app --bsp-dir=bsp --app-dir=app --srcs=app --elf-name=stream_controller.elf
+
+# cmake dependency, version 3.14.10 or later. https://cmake.org/download/
+cmake -B build -DCMAKE_BUILD_TYPE=Release app
+cmake --build build
+elf2hex build/stream_controller.elf -b 0x0 -w 32 -e 0x1ffff -r 4 -o build/stream_controller.hex
+cp build/stream_controller.hex "$hex_file"
+
+exit 0
diff --git a/python/openvino/runtime/create_hps_image.sh b/python/openvino/runtime/create_hps_image.sh
new file mode 100755
index 0000000..74df01c
--- /dev/null
+++ b/python/openvino/runtime/create_hps_image.sh
@@ -0,0 +1,488 @@
+#! /bin/bash
+# Abort immediately when any command fails. NOTE: this also means that '$?'
+# checks placed after a failing command are never reached - see the
+# NOTE(review) comments inside the build functions below.
+set -o errexit
+
+#
+# This script is a starter script for building an ED4 SD Card image on the HPS platform.
+# This script wraps the following steps:
+# 1. Build bitstreams (S2M) if specified
+# 2. Build Yocto SD card image (.wic) to obtain the toolchain SDK or use prebuilt .wic.
+# 3. Use the SDK to cross-build HPS packages and CoreDLA runtime
+# 4. Update the .wic image with CoreDLA executable, libraries, and FPGA bitstreams
+
+if [[ -z "${COREDLA_ROOT}" ]]; then
+    echo "Error: COREDLA_ROOT environment variable not set. Run init_env.sh script first."
+    exit 1
+fi
+
+
+SCRIPT_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# runtime
+RUNTIME_DIR="${SCRIPT_PATH}" # this script should always be located in runtime
+RUNTIME_BUILD_TYPE="Release" # Release or Debug. Debug will build OpenVINO and DLA in debug mode
+RUNTIME_BUILD_DIR="${RUNTIME_DIR}/build_${RUNTIME_BUILD_TYPE}"
+BUILD_RUNTIME_SCRIPT="${RUNTIME_DIR}/build_runtime.sh"
+
+# Yocto
+ED4_DIR="${COREDLA_ROOT}/hps/ed4"
+ED4_SCRIPT_DIR="${ED4_DIR}/scripts"
+UPDATE_SDCARD_SCRIPT="${ED4_SCRIPT_DIR}/update_sd_card.sh"
+USING_PREBUILT_YOCTO=false
+UPDATE_SDCARD=false
+YOCTO_DIR="${ED4_DIR}/yocto"
+YOCTO_BUILD_SCRIPT="${YOCTO_DIR}/run-build.sh"
+YOCTO_BUILD_DIR="${RUNTIME_DIR}/build_Yocto"
+MACHINE="arria10" # default device family
+
+# HPS packages
+BUILD_HPSPACKAGES_SCRIPT="${RUNTIME_DIR}/build_hpspackages.sh"
+HPSPACKAGES_BUILD_DIR="${RUNTIME_DIR}/hps_packages"
+
+# Build bitstreams
+BUILD_BITSTREAM=false
+BUILD_BITSTREAM_SCRIPT="${COREDLA_ROOT}/bin/dla_build_example_design.py"
+#################################################################
+# Print the command-line help text. Called for -h and before exiting on any
+# argument-validation failure.
+function usage
+{
+    echo -e "\nThis script is a starter script for building an ED4 SD Card image on the HPS platform."\
+        "This script wraps the following steps:"\
+        "\n\t1. Build an S2M bitstream of choice for Arm-based SoC (if specified)"\
+        "\n\t2. Build a Yocto SD card image (.wic) and obtain the toolchain SDK."\
+        "\n\t3. Use the SDK from step 2 to cross-compile dependency packages and CoreDLA runtime"\
+        "\n\t4. Update the .wic image with CoreDLA runtime, libraries, and FPGA bitstream"\
+        "\n"
+    echo "create_hps_image.sh -o <PATH> [-f <PATH>] [-y] [-u] [-b] [-h] [-a <PATH>] [-m <FPGA Machine>]"
+    echo ""
+    echo "Options:"
+    echo -e " -h Display usage"
+    echo -e " -c Clean the runtime directory back to its default state"
+    echo -e " -o (Required) Path to the output directory to save the updated SD card image."
+    echo -e " -u (Optional) Specify this to update the SD card .wic image at the end. If this option is set, the DLA runtime"
+    echo -e " and the FPGA bitstream specified by -f will be written to .wic image at the end. This option is helpful"
+    echo -e " if a user only wants to build the DLA runtime without a working bitstream. In this case, you can skip"
+    echo -e " setting -u and -f."
+    echo -e " -b (Optional) Whether to build a bitstream (S2M) using the arch file specified by -a."
+    echo -e " -a (Optional) Path to the architecture file. Required if -b is set. "
+    echo -e " -f (Optional) Path to the FPGA directory that contains the target FPGA bitstreams."
+    echo -e " This is a REQUIRED argument if at least one of the following conditions is met: "
+    echo -e " 1. -u is set, i.e., update SD card with the bitstreams in this directory at the end."
+    echo -e " The directory must contain top.core.rbf top.periph.rbf;"
+    echo -e " 2. -b is set, i.e., build bitstream and save to this location"
+    echo -e " -y (Optional) Path to a pre-built Yocto image directory. If given, this script skips"
+    echo -e " building the Yocto image from scratch and use the .wic image and the poky SDK file from"
+    echo -e " this build directory instead"
+    echo -e " -m (Optional) FPGA Machine. Options: arria10 (Arria 10 SoC), agilex7_dk_si_agi027fa (Agilex 7 SoC). Default: arria10"
+    echo
+}
+
+function clean_runtime()
+{
+    # Remove every build artifact this script creates, returning the runtime
+    # directory to its pristine state. All rm targets are quoted so paths
+    # containing spaces cannot expand into multiple words.
+    echo "Cleaning runtime directory"
+
+    echo "rm -rf ${RUNTIME_BUILD_DIR}"
+    rm -rf "${RUNTIME_BUILD_DIR}"
+
+    echo "rm -rf ${RUNTIME_DIR}/embedded_arm_sdk"
+    rm -rf "${RUNTIME_DIR}/embedded_arm_sdk"
+
+    echo "rm -rf ${HPSPACKAGES_BUILD_DIR}"
+    rm -rf "${HPSPACKAGES_BUILD_DIR}"
+
+    # Search for presence of Poky SDK file
+    # (quote the -name pattern so the shell cannot glob-expand it in $PWD)
+    ED4_POKY_SDK_NAME="poky*.sh"
+    ED4_POKY_SDK_FILE_LOC="$(find "${RUNTIME_DIR}" -maxdepth 1 -type f -name "${ED4_POKY_SDK_NAME}")"
+
+    # Confirm presence of Poky SDK file and remove it
+    # NOTE(review): if find matches more than one file, -e sees a multi-line
+    # string and the test fails, so nothing is removed in that case.
+    if [ -e "${ED4_POKY_SDK_FILE_LOC}" ]; then
+        echo "rm -rf ${ED4_POKY_SDK_FILE_LOC}"
+        rm -rf "${ED4_POKY_SDK_FILE_LOC}"
+    fi
+}
+
+# Parse command-line options. Relative paths are normalized to absolute
+# paths (prefixed with $PWD) so later pushd/popd calls cannot break them.
+while getopts "hcbuf:o:y:m:a:" optname; do
+    case "$optname" in
+    h)
+        usage
+        exit 0
+        ;;
+    c)
+        clean_runtime
+        exit 0
+        ;;
+    f)
+        # FPGA bitstream directory
+        if [[ ${OPTARG} != /* ]]; then
+            ED4_BITSTREAM_DIR="$(pwd)/${OPTARG}"
+        else
+            ED4_BITSTREAM_DIR=${OPTARG}
+        fi
+        ;;
+    o)
+        # Output directory; the rootfs overlay staged for the SD card lives
+        # beneath it
+        if [[ ${OPTARG} != /* ]]; then
+            ED4_SDCARD_DIR="$(pwd)/${OPTARG}"
+        else
+            ED4_SDCARD_DIR=${OPTARG}
+        fi
+        ED4_ROOTFS_DIR="${ED4_SDCARD_DIR}/ed4_root"
+        ED4_APP_DIR="${ED4_ROOTFS_DIR}/home/root/app"
+        ;;
+    u)
+        UPDATE_SDCARD=true
+        ;;
+    b)
+        BUILD_BITSTREAM=true
+        ;;
+    y)
+        # Use a prebuilt Yocto build directory instead of building from scratch
+        if [[ ${OPTARG} != /* ]]; then
+            YOCTO_BUILD_DIR="$(pwd)/${OPTARG}"
+        else
+            YOCTO_BUILD_DIR=${OPTARG}
+        fi
+        USING_PREBUILT_YOCTO=true
+        ;;
+    m)
+        MACHINE=${OPTARG}
+        # NOTE(review): stratix10 is accepted here but not listed in usage()
+        if ! [[ "${MACHINE}" == "agilex7_dk_si_agi027fa" || "${MACHINE}" == "stratix10" || "${MACHINE}" == "arria10" ]]; then
+            usage
+            exit 1
+        fi
+        ;;
+    a)
+        # Architecture (.arch) file, used when building a bitstream (-b)
+        if [[ ${OPTARG} != /* ]]; then
+            ARCH_FILE="$(pwd)/${OPTARG}"
+        else
+            ARCH_FILE=${OPTARG}
+        fi
+        ;;
+    esac
+done
+
+# -o is mandatory; everything else is staged relative to it.
+if [[ -z ${ED4_SDCARD_DIR} ]]; then
+    usage
+    echo "Error: -o is required"
+    exit 1;
+fi
+
+# Updating the SD card (-u) needs bitstreams: either an existing directory
+# (-f) or a fresh build (-b).
+if [[ "${UPDATE_SDCARD}" == true && ! -d ${ED4_BITSTREAM_DIR} && "${BUILD_BITSTREAM}" == false ]]; then
+    usage
+    echo "Error: -f is required and must exist if -u is set. "\
+        "Add -b if you want to build bitstreams"
+    exit 1;
+fi
+
+if [[ "${BUILD_BITSTREAM}" == true ]]; then
+    # Bitstream builds need an output location (-f) and an arch file (-a).
+    if [[ -z ${ED4_BITSTREAM_DIR} ]]; then
+        usage
+        echo "Error: -f must be specified if building a bitstream"
+        exit 1;
+    fi
+    if [[ -z ${ARCH_FILE} ]]; then
+        usage
+        echo "Error: -a must be specified if building a bitstream"
+        exit 1;
+    fi
+    # Refuse to clobber an existing non-empty output directory.
+    if [[ -d ${ED4_BITSTREAM_DIR} && "$(ls -A ${ED4_BITSTREAM_DIR})" ]]; then
+        echo "Error: ${ED4_BITSTREAM_DIR} is not empty. "
+        exit 1;
+    fi
+    # Validate $DISPLAY up front when it is set - presumably because the
+    # downstream build tools touch the X display (confirm with Quartus docs).
+    if [[ -n "${DISPLAY}" ]]; then
+        # Check if xdpyinfo command is available
+        if command -v xdpyinfo &> /dev/null; then
+            # xdpyinfo exits successfully if a valid display is detected
+            if ! xdpyinfo &> /dev/null; then
+                echo "Error: X-11 Forwarding is enabled and connected to an invalid display: ${DISPLAY}. Unset DISPLAY environment variable to continue."
+                exit 1;
+            fi
+        else
+            echo "Warning: X-11 Forwarding is enabled and potentially connected to an invalid display: ${DISPLAY}. Results may be unpredictable."
+        fi
+    fi
+fi
+
+function build_s2m_bitstream()
+{
+    # Build the S2M example-design bitstream for ${MACHINE} using the arch
+    # file given with -a, then normalize the output file names that
+    # update_sd_card.sh expects (top.*.rbf for Arria 10, top.sof otherwise).
+    if [ ! -f "${BUILD_BITSTREAM_SCRIPT}" ]; then
+        echo "Error: Cannot find ${BUILD_BITSTREAM_SCRIPT}."
+        exit 1
+    fi
+
+    echo "Building bitstream for ${MACHINE}"
+
+    # Map the Yocto MACHINE name onto the example-design identifier.
+    ed="4_A10_S2M"
+    if [ "${MACHINE}" == "stratix10" ]; then
+        ed="4_S10_S2M"
+    elif [ "${MACHINE}" == "agilex7_dk_si_agi027fa" ]; then
+        ed="4_AGX7_S2M"
+    fi
+
+    # Bug fix: this script runs with 'set -o errexit', so the previous
+    # "capture $? and compare with -eq 1" pattern was dead code - the script
+    # exited before the check ever ran (and other non-zero codes were
+    # missed). Running the command inside 'if !' suppresses errexit and lets
+    # us print the error message ourselves.
+    if ! ${BUILD_BITSTREAM_SCRIPT} -a "${ARCH_FILE}" --build --output-dir="${ED4_BITSTREAM_DIR}" -n 1 -ed=${ed}; then
+        echo "Bitstream failed to build. "
+        exit 1
+    fi
+
+    if [ "${MACHINE}" == "arria10" ]; then
+        # Arria 10 produces split periph/core .rbf files; exactly one of each.
+        if [[ $(find ${ED4_BITSTREAM_DIR} -name '*.periph.rbf' | wc -l) -eq 1 &&
+              $(find ${ED4_BITSTREAM_DIR} -name '*.core.rbf' | wc -l) -eq 1 ]]; then
+            periph=$(find ${ED4_BITSTREAM_DIR} -name '*.periph.rbf')
+            core=$(find ${ED4_BITSTREAM_DIR} -name '*.core.rbf')
+            mv "$periph" "${ED4_BITSTREAM_DIR}/top.periph.rbf"
+            mv "$core" "${ED4_BITSTREAM_DIR}/top.core.rbf"
+        else
+            echo "Error: You should have exactly 1 periph.rbf and 1 core.rbf in ${ED4_BITSTREAM_DIR}"
+            exit 1
+        fi
+    else
+        # Stratix 10 / Agilex 7 produce a single .sof file.
+        if [[ $(find ${ED4_BITSTREAM_DIR} -name '*.sof' | wc -l) -eq 1 ]]; then
+            top=$(find ${ED4_BITSTREAM_DIR} -name '*.sof')
+            mv "$top" "${ED4_BITSTREAM_DIR}/top.sof"
+        else
+            echo "Error: You should have exactly 1 .sof file in ${ED4_BITSTREAM_DIR}"
+            exit 1
+        fi
+    fi
+}
+
+function build_yocto()
+{
+    # Build (or reuse, with -y) the Yocto SD card image and the Poky
+    # cross-toolchain SDK for ${MACHINE}. On success, exports
+    # ED4_POKY_SDK_LOC and ensures .wic/.cpio images exist under
+    # ${YOCTO_BUILD_DIR}/build/tmp/deploy.
+    if [ ! -f "${YOCTO_BUILD_SCRIPT}" ]; then
+        echo "Error: Cannot find run-build.sh at ${YOCTO_DIR}."
+        exit 1
+    fi
+    echo "Building Yocto for ${MACHINE}"
+    if [[ "${USING_PREBUILT_YOCTO}" == false ]]; then
+        # Relax the umask for the build - presumably so the generated
+        # artifacts are readable by other users/steps (confirm).
+        umask a+rx u+rwx
+    fi
+    pushd $YOCTO_DIR
+    # -i: build_image; -s: build_sdk, -b <build_directory>: build location
+    if [[ "${USING_PREBUILT_YOCTO}" == false ]]; then
+        ${YOCTO_BUILD_SCRIPT} -is -b ${YOCTO_BUILD_DIR} ${MACHINE}
+    else
+        echo "Using prebuilt Yocto: ${YOCTO_BUILD_DIR}"
+    fi
+    # NOTE(review): with 'set -o errexit' a failing build exits the script
+    # before this line, and in the prebuilt branch $? is the echo's status -
+    # the "failed to build" else branch below is effectively unreachable.
+    YOCTO_BUILD_RESULT=$?
+    if [ $YOCTO_BUILD_RESULT -eq 0 ]; then
+        echo "Yocto built successfully. "
+        YOCTO_SDK_DIR="${YOCTO_BUILD_DIR}/build/tmp/deploy/sdk"
+        export SEARCH_NAME="poky*${MACHINE}*.sh"
+        export ED4_POKY_SDK_LOC="$(find ${YOCTO_SDK_DIR} -name ${SEARCH_NAME})"
+        if [ ! -f "${ED4_POKY_SDK_LOC}" ]; then
+            echo "Error: Cannot find the POKY SDK in ${YOCTO_SDK_DIR}. "
+            exit 1
+        else
+            echo "The POKY SDK can be found at ${ED4_POKY_SDK_LOC}"
+        fi
+
+        # Newer versions of Yocto generate a symlink to the .wic file with an extension of .rootfs.wic
+        # instead of just .wic. To keep everything working, make a copy of the file without the .rootfs extension.
+        YOCTO_WIC_DIR="${YOCTO_BUILD_DIR}/build/tmp/deploy/images/${MACHINE}"
+        YOCTO_WIC_FILE="coredla-image-${MACHINE}.wic"
+        YOCTO_ROOTFS_WIC_FILE="coredla-image-${MACHINE}.rootfs.wic"
+
+        # Check if the .wic file is missing
+        if [ ! -f "${YOCTO_WIC_DIR}/${YOCTO_WIC_FILE}" ]; then
+
+            # Check if the .rootfs.wic file is present
+            if [ -f "${YOCTO_WIC_DIR}/${YOCTO_ROOTFS_WIC_FILE}" ]; then
+                cp -a "${YOCTO_WIC_DIR}/${YOCTO_ROOTFS_WIC_FILE}" "${YOCTO_WIC_DIR}/${YOCTO_WIC_FILE}"
+            else
+                echo "Error: ${YOCTO_WIC_DIR}/${YOCTO_ROOTFS_WIC_FILE} missing"
+                exit 1
+            fi
+        fi
+
+        # NOTE(review): $? here is the status of the preceding 'if' compound
+        # command, not of 'cp' specifically; cp failures already aborted the
+        # script via errexit, so the else branch below is effectively dead.
+        WIC_COPY_RESULT=$?
+        if [ $WIC_COPY_RESULT -eq 0 ]; then
+            echo "The .wic file can be found at ${YOCTO_WIC_DIR}/${YOCTO_WIC_FILE}"
+        else
+            echo "Error: ${YOCTO_WIC_DIR}/${YOCTO_WIC_FILE} missing"
+            exit 1
+        fi
+
+        # The same issue as above applies to the .cpio file
+        YOCTO_CPIO_FILE="coredla-image-${MACHINE}.cpio"
+        YOCTO_ROOTFS_CPIO_FILE="coredla-image-${MACHINE}.rootfs.cpio"
+
+        # Check if the .cpio file is missing
+        if [ ! -f "${YOCTO_WIC_DIR}/${YOCTO_CPIO_FILE}" ]; then
+
+            # Check if the .rootfs.cpio file is present
+            if [ -f "${YOCTO_WIC_DIR}/${YOCTO_ROOTFS_CPIO_FILE}" ]; then
+                cp -a "${YOCTO_WIC_DIR}/${YOCTO_ROOTFS_CPIO_FILE}" "${YOCTO_WIC_DIR}/${YOCTO_CPIO_FILE}"
+            else
+                echo "Error: ${YOCTO_WIC_DIR}/${YOCTO_ROOTFS_CPIO_FILE} missing"
+                exit 1
+            fi
+        fi
+
+        # NOTE(review): same dead-check pattern as WIC_COPY_RESULT above.
+        CPIO_COPY_RESULT=$?
+        if [ $CPIO_COPY_RESULT -eq 0 ]; then
+            echo "The .cpio file can be found at ${YOCTO_WIC_DIR}/${YOCTO_CPIO_FILE}"
+        else
+            echo "Error: ${YOCTO_WIC_DIR}/${YOCTO_CPIO_FILE} missing"
+            exit 1
+        fi
+
+    else
+        echo "Yocto failed to build"
+        exit 1
+    fi
+    popd
+}
+
+function build_hpspackages()
+{
+    # Cross-compile the HPS dependency packages (OpenVINO, OpenCV, ...) with
+    # the Yocto SDK. -b builds, -s fetches sources, -d adds a debug OpenVINO.
+    if [ ! -f "${BUILD_HPSPACKAGES_SCRIPT}" ]; then
+        echo "Error: Cannot find build_hpspackages.sh at ${RUNTIME_DIR}."
+        exit 1
+    fi
+
+    echo "Building hps packages..."
+    pushd $RUNTIME_DIR
+    # Bug fix: with 'set -o errexit' the old "capture $? then test" pattern
+    # was dead code (the script exited before the check on failure). Running
+    # the command inside 'if !' makes the failure message reachable.
+    if [[ $RUNTIME_BUILD_TYPE == "Release" ]]; then
+        if ! ${BUILD_HPSPACKAGES_SCRIPT} -bs; then # -b: build; -s: get sources
+            echo "HPS packages failed to build. "
+            exit 1
+        fi
+    elif [[ $RUNTIME_BUILD_TYPE == "Debug" ]]; then
+        if ! ${BUILD_HPSPACKAGES_SCRIPT} -bds; then # -b: build; -s: get sources -d: build openvino with cmake debug
+            echo "HPS packages failed to build. "
+            exit 1
+        fi
+    fi
+    echo "HPS packages built successfully. "
+    popd
+
+}
+
+function build_hps_dla_runtime()
+{
+    # Cross-compile the CoreDLA runtime for the HPS platform, then stage the
+    # executable, shared libraries, scripts and data files into the rootfs
+    # overlay (ED4_APP_DIR) that later lands on the SD card image.
+    if [ ! -f "${BUILD_RUNTIME_SCRIPT}" ]; then
+        echo "Error: Cannot find build_runtime.sh at ${RUNTIME_DIR}."
+        exit 1
+    fi
+
+    # arm32 or arm64 platform depends on arria10 or stratix10
+    ARM_ARCH="armv7l"
+    if [[ "${MACHINE}" == "agilex7_dk_si_agi027fa" || "${MACHINE}" == "stratix10" ]]; then
+        ARM_ARCH="aarch64"
+    fi
+
+    echo "Building runtime with the HPS platform..."
+    pushd $RUNTIME_DIR
+    if [[ $RUNTIME_BUILD_TYPE == "Release" ]]; then
+        ${BUILD_RUNTIME_SCRIPT} --hps_platform --hps_machine=${MACHINE} --build_dir=${RUNTIME_BUILD_DIR}
+    elif [[ $RUNTIME_BUILD_TYPE == "Debug" ]]; then
+        # NOTE(review): '-cmake_debug' uses a single dash while the other
+        # flags use '--'; confirm build_runtime.sh accepts this spelling.
+        ${BUILD_RUNTIME_SCRIPT} --hps_platform --hps_machine=${MACHINE} --build_dir=${RUNTIME_BUILD_DIR} -cmake_debug
+    else
+        echo "Unrecognized build type ${RUNTIME_BUILD_TYPE}. "
+        exit 1
+    fi
+
+    # NOTE(review): with 'set -o errexit' a failing build exits the script
+    # before this check, so the failure branch below is unreachable.
+    RUNTIME_BUILD_RESULT=$?
+    if [ $RUNTIME_BUILD_RESULT -eq 0 ]; then
+        echo "DLA runtime built successfully. "
+    else
+        echo "DLA runtime failed to build. "
+        exit 1
+    fi
+    popd
+
+    # extract executable and libraries for ED4
+    mkdir -p "${ED4_APP_DIR}"
+
+    pushd $ED4_APP_DIR
+    # rsync is used for convenience to select/exclude files destined for the SD card
+    rsync -avzP ${RUNTIME_BUILD_DIR}/dla_benchmark/dla_benchmark .
+
+    # OpenVINO runtime libraries; the ARM CPU plugin is always taken from
+    # the Release tree, even for Debug runtime builds.
+    rsync -avzP ${HPSPACKAGES_BUILD_DIR}/openvino/bin/${ARM_ARCH}/${RUNTIME_BUILD_TYPE}/*.so* .
+    rsync -avzP ${HPSPACKAGES_BUILD_DIR}/openvino/bin/${ARM_ARCH}/Release/libopenvino_arm_cpu_plugin.so .
+
+    # CoreDLA runtime plugins and their support libraries.
+    rsync -avzP ${RUNTIME_BUILD_DIR}/common/format_reader/libformat_reader.so .
+    rsync -avzP ${RUNTIME_BUILD_DIR}/coredla_device/mmd/hps_platform/libhps_platform_mmd.so .
+    rsync -avzP ${RUNTIME_BUILD_DIR}/hetero_plugin/libcoreDLAHeteroPlugin.so .
+    rsync -avzP ${RUNTIME_BUILD_DIR}/libcoreDlaRuntimePlugin.so .
+    rsync -avzP ${RUNTIME_BUILD_DIR}/plugins.xml .
+
+    # OpenCV: ship everything except the base modules excluded below.
+    rsync -avzP --exclude libopencv_highgui.so \
+        --exclude libopencv_core.so \
+        --exclude libopencv_imgcodecs.so \
+        --exclude libopencv_imgproc.so \
+        --exclude libopencv_videoio.so \
+        ${HPSPACKAGES_BUILD_DIR}/armcpu_package/opencv/lib/*.so* .
+
+    # Streaming demo applications, launcher scripts and metadata files.
+    rsync -avzP ${RUNTIME_BUILD_DIR}/streaming/image_streaming_app/image_streaming_app .
+    rsync -avzP ${RUNTIME_BUILD_DIR}/streaming/streaming_inference_app/streaming_inference_app .
+    rsync -avzP ${RUNTIME_DIR}/streaming/runtime_scripts/run_image_stream.sh .
+    rsync -avzP ${RUNTIME_DIR}/streaming/runtime_scripts/run_inference_stream.sh .
+    rsync -avzP ${RUNTIME_DIR}/streaming/streaming_inference_app/categories.txt .
+    rsync -avzP ${COREDLA_ROOT}/build_version.txt .
+    rsync -avzP ${COREDLA_ROOT}/build_os.txt .
+    popd
+
+    # Derive the .arch file from the selected MACHINE
+    ARCH_FILE="A10_Performance.arch"
+    if [ "${MACHINE}" == "agilex7_dk_si_agi027fa" ]; then
+        ARCH_FILE="AGX7_Performance.arch"
+    elif [ "${MACHINE}" == "stratix10" ]; then
+        ARCH_FILE="S10_Performance.arch"
+    fi
+
+    # update the -arch flag in run_inference_stream.sh to match the selected MACHINE
+    sed -i "s/A10_Performance.arch/$ARCH_FILE/" $ED4_APP_DIR/run_inference_stream.sh
+
+    # change the CPU plugin in plugins.xml to armPlugin
+    sed -i "s/libopenvino_intel_cpu_plugin.so/libopenvino_arm_cpu_plugin.so/" $ED4_APP_DIR/plugins.xml
+}
+
+function update_sd_card_image()
+{
+    # Write the DLA runtime (rootfs overlay) and the FPGA bitstreams into the
+    # Yocto-built .wic SD card image via the ED4 update_sd_card.sh helper.
+    if [ ! -f "${UPDATE_SDCARD_SCRIPT}" ]; then
+        echo "Error: Cannot find update_sd_card.sh at ${ED4_SCRIPT_DIR}."
+        exit 1
+    fi
+    export SDCARD_IMAGE_LOC="$(find ${YOCTO_BUILD_DIR}/build/tmp/deploy/images -name *coredla-image-${MACHINE}.wic)"
+    if [ ! -f "${SDCARD_IMAGE_LOC}" ]; then
+        echo "Error: Cannot find SD Card image (.wic) at ${YOCTO_BUILD_DIR}/build/tmp/deploy/images. Have you built Yocto?."
+        exit 1
+    fi
+
+    echo "Updating SD Card image..."
+    pushd $ED4_SCRIPT_DIR
+    # Bug fix: with 'set -o errexit' the old "capture $? then test" pattern
+    # never ran on failure; 'if !' makes the error message reachable.
+    if ! ${UPDATE_SDCARD_SCRIPT} -w ${SDCARD_IMAGE_LOC} -o ${ED4_SDCARD_DIR} -f ${ED4_BITSTREAM_DIR} -r ${ED4_ROOTFS_DIR}; then
+        echo "Failed to update SD card image. "
+        exit 1
+    fi
+    echo "Updated SD card image successfully. "
+    popd
+
+    # Bug fix: '[[ $(... | wc -l) ]]' was always true, because wc always
+    # prints a non-empty string (even "0" is truthy as a [[ ]] operand), so
+    # the missing-image branch could never trigger. Compare numerically.
+    if [[ $(find ${ED4_SDCARD_DIR} -name '*.wic' | wc -l) -gt 0 ]]; then
+        echo "The .wic image can be found here:"
+        echo $(find ${ED4_SDCARD_DIR} -name '*.wic')
+    else
+        echo "Cannot find updated .wic image in ${ED4_SDCARD_DIR}"
+        exit 1
+    fi
+}
+#################################################################
+
+
+# Main flow: the steps run in order; 'set -o errexit' (set at the top of the
+# script) aborts on the first failing command in any step.
+# step 0: build bitstream
+if [[ "${BUILD_BITSTREAM}" == true ]]; then
+    build_s2m_bitstream
+fi
+# step 1: build Yocto
+build_yocto
+# step 2: build hps packages
+build_hpspackages
+# step 3: build coredla runtime on the hps platform
+build_hps_dla_runtime
+# step 4: update the .wic image from step 1 with hps runtime and fpga bitstreams
+if [[ "${UPDATE_SDCARD}" == true ]]; then
+    update_sd_card_image
+fi
+
+echo "All steps succeeded"
diff --git a/python/openvino/runtime/devel_package/dla/compiler/core/src/fpga_inc/version_checksum.h b/python/openvino/runtime/devel_package/dla/compiler/core/src/fpga_inc/version_checksum.h
new file mode 100644
index 0000000..bb09083
--- /dev/null
+++ b/python/openvino/runtime/devel_package/dla/compiler/core/src/fpga_inc/version_checksum.h
@@ -0,0 +1,29 @@
+// Copyright 2015-2020 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+
+#ifndef __VERSION_CHECKSUM_H__
+#define __VERSION_CHECKSUM_H__
+
+// -------------------------------------------------------------------------- //
+// version_checksum.h.template:
+//
+// A version_checksum.h will be created during the cmake flow. version_checksum.h
+// contains DLA_CHECKSUM macro which is the MD5SUM of fpga_config.h
+// -------------------------------------------------------------------------- //
+
+#define DLA_CHECKSUM DLA_CHECKSUM_36b92a7249e7ad81
+#define DLA_VERSION DLA_VERSION_v0_6
+
+#endif // __VERSION_CHECKSUM_H__
+
diff --git a/python/openvino/runtime/dla/compiler/core/src/fpga_inc/version_checksum.h b/python/openvino/runtime/dla/compiler/core/src/fpga_inc/version_checksum.h
new file mode 100644
index 0000000..f318474
--- /dev/null
+++ b/python/openvino/runtime/dla/compiler/core/src/fpga_inc/version_checksum.h
@@ -0,0 +1,28 @@
+// Copyright 2015-2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+
+#ifndef __VERSION_CHECKSUM_H__
+#define __VERSION_CHECKSUM_H__
+
+// -------------------------------------------------------------------------- //
+// version_checksum.h.template:
+//
+// A version_checksum.h will be created during the cmake flow. version_checksum.h
+// contains DLA_CHECKSUM macro which is the MD5SUM of fpga_config.h
+// -------------------------------------------------------------------------- //
+
+#define DLA_CHECKSUM DLA_CHECKSUM_36b92a7249e7ad81
+#define DLA_VERSION DLA_VERSION_v0_6
+
+#endif // __VERSION_CHECKSUM_H__
diff --git a/python/openvino/runtime/dla_aot_splitter/CMakeLists.txt b/python/openvino/runtime/dla_aot_splitter/CMakeLists.txt
new file mode 100644
index 0000000..0e1e4f8
--- /dev/null
+++ b/python/openvino/runtime/dla_aot_splitter/CMakeLists.txt
@@ -0,0 +1,71 @@
+cmake_minimum_required(VERSION 3.10)
+
+# Use <package>_ROOT variables to help find_package locate packages
+if (POLICY CMP0074)
+ cmake_policy(SET CMP0074 NEW)
+endif()
+
+find_package(OpenCV COMPONENTS core highgui imgcodecs imgproc videoio REQUIRED)
+find_package(gflags COMPONENTS shared REQUIRED)
+
+add_subdirectory(dla_aot_splitter_plugin)
+add_subdirectory(dla_aot_splitter_example)
+
+if (DE10_AGILEX)
+ add_library(de10_agilex ALIAS de10_agilex_mmd)
+elseif (SYSTEM_CONSOLE_PLATFORM)
+ # DO NOTHING
+elseif (PAC_A10)
+ add_library(dcp_a10_pac ALIAS intel_opae_mmd)
+elseif(AGX7_IDK)
+ add_library(agx7_i_dk ALIAS intel_opae_mmd)
+elseif(AGX7_N6001)
+ add_library(agx7_n6001 ALIAS intel_opae_mmd)
+endif()
+
+add_executable(dla_aot_splitter ${CMAKE_CURRENT_SOURCE_DIR}/src/main.cpp)
+
+target_compile_features(dla_aot_splitter PUBLIC cxx_std_11)
+
+target_sources(dla_aot_splitter PRIVATE
+ ${CMAKE_CURRENT_SOURCE_DIR}/src/main.cpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/inc/dla_aot_splitter.hpp
+ $ENV{COREDLA_ROOT}/runtime/dla_benchmark/inputs_filling.cpp #TODO REMOVE and replace with link library
+ $ENV{COREDLA_ROOT}/runtime/dla_benchmark/utils.cpp #TODO REMOVE and replace with link library
+ $ENV{COREDLA_ROOT}/runtime/common/utils/src/slog.cpp
+ $ENV{COREDLA_ROOT}/runtime/common/utils/src/args_helper.cpp
+ $ENV{COREDLA_ROOT}/runtime/common/utils/src/common.cpp
+ $ENV{COREDLA_ROOT}/runtime/common/utils/src/latency_metrics.cpp
+)
+
+target_include_directories(dla_aot_splitter PRIVATE
+ ${CMAKE_CURRENT_SOURCE_DIR}/inc
+ $ENV{COREDLA_ROOT}/util/inc
+ $ENV{COREDLA_ROOT}/dla_plugin/inc
+ $ENV{COREDLA_ROOT}/dla_plugin/inc/dlia
+ $ENV{COREDLA_ROOT}/runtime/dla_benchmark #TODO REMOVE and replace with link library
+)
+
+if (WIN32)
+ target_include_directories(dla_aot_splitter PRIVATE
+ $ENV{COREDLA_ROOT}/compiler/inc # dla_performance_estimator.h
+ )
+endif()
+
+
+target_link_libraries(dla_aot_splitter PRIVATE
+ openvino::runtime
+ openvino_dev_api
+ format_reader
+ ie_samples_utils
+ ${OpenCV_LIBRARIES} # Needed for the directly compiled inputs_filling
+ dla_aot_splitter_plugin
+ gflags
+)
+
+if (NOT WIN32)
+ target_link_libraries(dla_aot_splitter PRIVATE
+ ${LIB_DL}
+ pthread
+ )
+endif()
diff --git a/python/openvino/runtime/dla_aot_splitter/CPPLINT.cfg b/python/openvino/runtime/dla_aot_splitter/CPPLINT.cfg
new file mode 100644
index 0000000..4bdae97
--- /dev/null
+++ b/python/openvino/runtime/dla_aot_splitter/CPPLINT.cfg
@@ -0,0 +1,8 @@
+set noparent
+filter=-build/header_guard,-runtime/explicit,-build/include_subdir,-runtime/references,-build/c++11,-runtime/int,-runtime/string,-runtime/printf,-build/namespaces,-readability/todo,-readability/casting
+
+# Exclude example code
+exclude_files=dla_aot_splitter_example
+
+linelength=160
+headers=h,hpp
diff --git a/python/openvino/runtime/dla_aot_splitter/README.md b/python/openvino/runtime/dla_aot_splitter/README.md
new file mode 100644
index 0000000..ffefe0d
--- /dev/null
+++ b/python/openvino/runtime/dla_aot_splitter/README.md
@@ -0,0 +1,52 @@
+# Intel AI Suite Core DLA 'AoT Splitter'
+
+This tool is intended to split a compiled HETERO:FPGA OpenVINO model into Input memory, Config memory, and Filter memory data blobs that would normally exist in the DDR memory of a runtime CoreDLA IP. These blobs can be used to directly run an inference on the IP without using OpenVINO InferenceEngine.
+
+# How to Build the Splitter, Plugin, and Example
+
+First, follow all instructions to install CoreDLA compiler development environment
+
+Change directory to the dla runtime folder
+
+```
+sh build_runtime.sh -target_de10_agilex
+```
+
+# How to Run the Splitter Executable
+
+The executable outputs the memory blobs to the current working directory. Change directory to the location where you want the outputs to be generated
+
+```
+cd directory_where_you_want_output
+
+runtime/build_Release/dla_aot_splitter/dla_aot_splitter -cm compiled_hetero_fpga_model.bin -i path/to/image.bmp -bgr -plugins runtime/dla_aot_splitter/dla_aot_splitter_plugin/plugins_aot_splitter.xml
+```
+
+Ensure that the libdla_aot_splitter.so, libcoreDLAHeteroPlugin.so and other shared libraries are available to the utility.
+
+The tool outputs the following artifacts:
+ - arch_build.mem / arch_build.bin
+ - config.mem / config.bin
+ - filter.mem / filter.bin
+ - input.mem / input.bin
+ - inter_size.mem
+ - output_size.mem
+
+# Building the Example Inference Program
+
+The example inference program with static input,config,filter data is compiled with the following environment variables
+and option to build_runtime.sh
+
+## DE10 Agilex
+```
+export AOT_SPLITTER_EXAMPLE_MODEL=<path/to/model.xml>
+export AOT_SPLITTER_EXAMPLE_INPUT=<path/to/image.bmp>
+sh build_runtime.sh -aot_splitter_example -target_de10_agilex
+```
+
+This program directly embeds the input, config and filter data into the resulting executable file for direct use.
+
+## PCIE
+
+The emulation inference program uses the PCIE MMD driver from the example design to connect to and provision the IP.
+Your system may require a different driver to provision the IP
diff --git a/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_example/CMakeLists.txt b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_example/CMakeLists.txt
new file mode 100644
index 0000000..a6f2ce8
--- /dev/null
+++ b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_example/CMakeLists.txt
@@ -0,0 +1,209 @@
+# Copyright (C) 2018-2020 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+
+add_executable(dla_aot_splitter_example EXCLUDE_FROM_ALL src/main.cpp)
+
+target_compile_features(dla_aot_splitter_example PUBLIC cxx_std_11)
+
+target_compile_definitions(dla_aot_splitter_example PRIVATE DLA_MMD)
+
+file(GLOB SOURCES
+ # coredla_device
+ $ENV{COREDLA_ROOT}/runtime/coredla_device/inc/device_memory_allocator.h
+ $ENV{COREDLA_ROOT}/runtime/coredla_device/inc/dla_dma_constants.h
+ $ENV{COREDLA_ROOT}/runtime/coredla_device/inc/mmd_wrapper.h
+ $ENV{COREDLA_ROOT}/runtime/coredla_device/src/device_memory_allocator.cpp
+ #
+ src/main.cpp
+)
+if (SYSTEM_CONSOLE_PLATFORM)
+ list(APPEND SOURCES ${CMAKE_SOURCE_DIR}/coredla_device/mmd/system_console/mmd_wrapper.cpp)
+else ()
+ list(APPEND SOURCES $ENV{COREDLA_ROOT}/runtime/coredla_device/src/mmd_wrapper.cpp)
+endif ()
+
+target_sources (dla_aot_splitter_example PRIVATE ${SOURCES})
+
+if (DISABLE_JIT)
+# for dla_dma_constants.svh
+ if (EXISTS $ENV{COREDLA_ROOT}/inc)
+ target_include_directories(dla_aot_splitter_example PRIVATE $ENV{COREDLA_ROOT}/inc)
+ else()
+ target_include_directories(dla_aot_splitter_example PRIVATE $ENV{COREDLA_ROOT}/build/coredla/dla/inc)
+ endif()
+endif()
+
+target_link_libraries(dla_aot_splitter_example PRIVATE
+ pthread
+)
+
+if (DISABLE_JIT)
+ target_include_directories(dla_aot_splitter_example PRIVATE
+ $ENV{COREDLA_ROOT}/util/inc
+ $ENV{COREDLA_XUTIL_DIR}/compiled_result/inc
+ )
+ target_sources(dla_aot_splitter_example PRIVATE $ENV{COREDLA_XUTIL_DIR}/compiled_result/src/compiled_result_reader_writer.cpp)
+else()
+ target_link_libraries(dla_aot_splitter_example
+ PRIVATE
+ dla_compiled_result
+ )
+endif()
+
+if (DE10_AGILEX)
+ target_link_libraries(dla_aot_splitter_example PRIVATE de10_agilex)
+elseif(PAC_A10)
+ target_link_libraries(dla_aot_splitter_example PRIVATE dcp_a10_pac)
+elseif(AGX7_IDK)
+ target_link_libraries(dla_aot_splitter_example PRIVATE agx7_i_dk)
+ find_library(libjson-c_LIBRARIES
+ NAMES json-c
+ PATHS ${LIBOPAE-C_ROOT}/lib
+ ${LIBOPAE-C_ROOT}/lib64
+ /usr/local/lib
+ /usr/lib
+ /lib
+ /usr/lib/x86_64-linux-gnu
+ ${CMAKE_EXTRA_LIBS})
+ target_link_libraries(dla_aot_splitter_example PRIVATE ${libjson-c_LIBRARIES})
+elseif(AGX7_N6001)
+ target_link_libraries(dla_aot_splitter_example PRIVATE agx7_n6001)
+ find_library(libjson-c_LIBRARIES
+ NAMES json-c
+ PATHS ${LIBOPAE-C_ROOT}/lib
+ ${LIBOPAE-C_ROOT}/lib64
+ /usr/local/lib
+ /usr/lib
+ /lib
+ /usr/lib/x86_64-linux-gnu
+ ${CMAKE_EXTRA_LIBS})
+ target_link_libraries(dla_aot_splitter_example PRIVATE ${libjson-c_LIBRARIES})
+elseif(SYSTEM_CONSOLE_PLATFORM)
+ # Agilex 5 JTAG ED: do nothing
+elseif(REFERENCE)
+ # Reference: do nothing
+else()
+ message(FATAL_ERROR "Building DLA AOT Aplitter Example with unsupported platform")
+endif()
+
+target_include_directories(dla_aot_splitter_example PRIVATE
+ $ENV{COREDLA_ROOT}/runtime/coredla_device/inc
+ if(PAC_A10)
+ $ENV{COREDLA_ROOT}/runtime/coredla_device/mmd/dcp_a10_pac/host
+ endif()
+)
+
+file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/include)
+
+target_sources (dla_aot_splitter_example PRIVATE
+ ${CMAKE_CURRENT_BINARY_DIR}/include/arch_build.mem
+ ${CMAKE_CURRENT_BINARY_DIR}/include/config.mem
+ ${CMAKE_CURRENT_BINARY_DIR}/include/filter.mem
+ ${CMAKE_CURRENT_BINARY_DIR}/include/input.mem
+ ${CMAKE_CURRENT_BINARY_DIR}/include/inter_size.mem
+ ${CMAKE_CURRENT_BINARY_DIR}/include/output_size.mem
+)
+target_include_directories(dla_aot_splitter_example PRIVATE
+ ${CMAKE_CURRENT_BINARY_DIR}/include
+)
+
+if (DEFINED ENV{AOT_SPLITTER_EXAMPLE_MODEL})
+ set (AOT_SPLITTER_EXAMPLE_MODEL $ENV{AOT_SPLITTER_EXAMPLE_MODEL})
+else()
+ if (EXISTS $ENV{COREDLA_WORK}/demo/models/public/resnet-50-tf/FP32/resnet-50-tf.xml)
+ set (AOT_SPLITTER_EXAMPLE_MODEL $ENV{COREDLA_WORK}/demo/models/public/resnet-50-tf/FP32/resnet-50-tf.xml)
+ else()
+ # The path below is for Intel internal use only
+ if (EXISTS /p/psg/swip/dla/caffe/caffe_reference/ngraph_ir/coredla/ModelZoo/2021_4_1/resnet_50_tf/FP32/resnet-50-tf.xml)
+ set (AOT_SPLITTER_EXAMPLE_MODEL /p/psg/swip/dla/caffe/caffe_reference/ngraph_ir/coredla/ModelZoo/2021_4_1/resnet_50_tf/FP32/resnet-50-tf.xml)
+ endif()
+ endif()
+endif()
+
+if (DEFINED ENV{AOT_SPLITTER_EXAMPLE_INPUT})
+ set (AOT_SPLITTER_EXAMPLE_INPUT $ENV{AOT_SPLITTER_EXAMPLE_INPUT})
+else()
+ if (EXISTS $ENV{COREDLA_ROOT}/demo/sample_images/val_00000000.bmp)
+ set (AOT_SPLITTER_EXAMPLE_INPUT $ENV{COREDLA_ROOT}/demo/sample_images/val_00000000.bmp)
+ else()
+ # The path below is for Intel internal use only
+ if (EXISTS /p/psg/swip/dla/images/imagenet/ILSVRC2012_224x224/BMP/BMP/ILSVRC2012_val_00000000.bmp)
+ set (AOT_SPLITTER_EXAMPLE_INPUT /p/psg/swip/dla/images/imagenet/ILSVRC2012_224x224/BMP/BMP/ILSVRC2012_val_00000000.bmp)
+ endif()
+ endif()
+endif()
+
+if (EXISTS ${CoreDLA_DIR}/../bin)
+ set(COREDLA_BIN ${CoreDLA_DIR}/../bin)
+ set(COREDLA_LIB ${CoreDLA_DIR}/../lib)
+ set(COREDLA_EXARCH ${CoreDLA_DIR}/../example_architectures)
+ if(DE10_AGILEX OR AGX7_IDK OR AGX7_N6001)
+ set (AOT_SPLITTER_EXAMPLE_ARCH AGX7_Performance.arch)
+ elseif(SYSTEM_CONSOLE_PLATFORM)
+ set (AOT_SPLITTER_EXAMPLE_ARCH AGX5_Small_Softmax.arch)
+ else()
+ set (AOT_SPLITTER_EXAMPLE_ARCH A10_Performance.arch)
+ endif()
+else()
+ set(COREDLA_BIN $ENV{COREDLA_ROOT}/build/coredla/dla/bin)
+ set(COREDLA_LIB $ENV{COREDLA_ROOT}/build/coredla/dla/lib)
+ set(COREDLA_EXARCH $ENV{COREDLA_ROOT}/example_architectures)
+
+ # The paths below are for Intel internal use only
+ if(DE10_AGILEX)
+ set (AOT_SPLITTER_EXAMPLE_ARCH arch/descriptions/AGX7/64x32_i5x1_fp13agx_sb31744_xbark32_clamp_preluk32_poolk4_softmax_1inst.arch)
+ elseif(AGX7_IDK OR AGX7_N6001)
+ set (AOT_SPLITTER_EXAMPLE_ARCH arch/descriptions/AGX7/32x64_i5x1_fp13agx_sb32768_poolk4_actk32_prelu_rclamp_sig_softmaxk1.arch)
+ elseif(SYSTEM_CONSOLE_PLATFORM)
+ set (AOT_SPLITTER_EXAMPLE_ARCH 16x16_i12x1_fp12agx_sb8192_poolk4_actk16_clamp_softmaxk1.arch)
+ else()
+ set (AOT_SPLITTER_EXAMPLE_ARCH arch/descriptions/A10/64x32_i4x1_fp11_sb31744_xbark32_clamp_preluk32_poolk4_softmax.arch)
+ endif()
+endif()
+
+if (NOT DEFINED AOT_SPLITTER_INPUT_ARGUMENTS)
+ set (AOT_SPLITTER_INPUT_ARGUMENTS )
+ if (DEFINED AOT_SPLITTER_EXAMPLE_INPUT)
+ set (AOT_SPLITTER_INPUT_ARGUMENTS -i ${AOT_SPLITTER_EXAMPLE_INPUT} -bgr)
+ endif()
+endif()
+
+# Need to copy the system console script for Agilex 5E JTAG ED
+# Also link against Boost
+if (SYSTEM_CONSOLE_PLATFORM)
+ find_package(Boost REQUIRED COMPONENTS filesystem)
+ target_link_libraries(dla_aot_splitter_example PRIVATE Boost::filesystem)
+ add_custom_command(
+ TARGET dla_aot_splitter_example POST_BUILD
+ COMMAND ${CMAKE_COMMAND} -E copy
+ ${CMAKE_SOURCE_DIR}/coredla_device/mmd/system_console/system_console_script.tcl
+ ${CMAKE_CURRENT_BINARY_DIR}/system_console_script.tcl
+ )
+ target_compile_definitions(dla_aot_splitter_example PRIVATE DLA_SYSCON_SOURCE_ROOT=${CMAKE_CURRENT_BINARY_DIR})
+endif()
+
+add_custom_command(
+ OUTPUT
+ ${CMAKE_CURRENT_BINARY_DIR}/include/arch_build.mem
+ ${CMAKE_CURRENT_BINARY_DIR}/include/config.mem
+ ${CMAKE_CURRENT_BINARY_DIR}/include/filter.mem
+ ${CMAKE_CURRENT_BINARY_DIR}/include/input.mem
+ ${CMAKE_CURRENT_BINARY_DIR}/include/inter_size.mem
+ ${CMAKE_CURRENT_BINARY_DIR}/include/output_size.mem
+ COMMAND
+ LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:${COREDLA_LIB} ${COREDLA_BIN}/dlac --network-file ${AOT_SPLITTER_EXAMPLE_MODEL} --march ${COREDLA_EXARCH}/${AOT_SPLITTER_EXAMPLE_ARCH} --foutput-format open_vino_hetero --o ${CMAKE_CURRENT_BINARY_DIR}/resnet.bin
+ COMMAND
+ LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:${COREDLA_LIB} $<TARGET_FILE:dla_aot_splitter> ${AOT_SPLITTER_INPUT_ARGUMENTS} -cm ${CMAKE_CURRENT_BINARY_DIR}/resnet.bin -plugins $<TARGET_FILE_DIR:dla_aot_splitter_plugin>/plugins_aot_splitter.xml
+ DEPENDS
+ ${COREDLA_BIN}/dlac
+ dla_benchmark
+ dla_aot_splitter
+ dla_aot_splitter_plugin
+ ${AOT_SPLITTER_EXAMPLE_MODEL}
+ ${COREDLA_EXARCH}/${AOT_SPLITTER_EXAMPLE_ARCH}
+ ${AOT_SPLITTER_EXAMPLE_INPUT}
+ $<TARGET_FILE_DIR:dla_aot_splitter_plugin>/plugins_aot_splitter.xml
+ WORKING_DIRECTORY
+ ${CMAKE_CURRENT_BINARY_DIR}/include
+)
diff --git a/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_example/src/main.cpp b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_example/src/main.cpp
new file mode 100644
index 0000000..b90ccd5
--- /dev/null
+++ b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_example/src/main.cpp
@@ -0,0 +1,180 @@
+// Copyright 2022 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+//
+// This small tool demonstrates the minimum number of steps necessary to run an
+// inference on the FPGA while using the output files from the AoT splitter.
+//
+
+#include <iostream>
+#include <iomanip>
+#include <fstream>
+#include <stdint.h>
+#include <array>
+#include <cstring> //memcpy
+
+uint32_t arch_build_mem_32[] =
+{
+ #include "arch_build.mem"
+};
+uint8_t* const arch_build_mem = (uint8_t*)&arch_build_mem_32[0];
+const uint32_t arch_build_mem_size = sizeof(arch_build_mem_32);
+
+uint32_t input_mem_32[] =
+{
+ #include "input.mem"
+};
+uint8_t* const input_mem = sizeof(input_mem_32) ? (uint8_t*)&input_mem_32[0] : nullptr;
+const uint32_t input_mem_size = sizeof(input_mem_32);
+
+uint32_t config_mem_32[] =
+{
+ #include "config.mem"
+};
+uint8_t* const config_mem = (uint8_t*)&config_mem_32[0];
+const uint32_t config_mem_size = sizeof(config_mem_32);
+
+uint32_t filter_mem_32[] =
+{
+ #include "filter.mem"
+};
+uint8_t* const filter_mem = (uint8_t*)&filter_mem_32[0];
+const uint32_t filter_mem_size = sizeof(filter_mem_32);
+
+constexpr uint32_t output_mem_size =
+ #include "output_size.mem"
+;
+
+constexpr uint32_t inter_mem_size =
+ #include "inter_size.mem"
+;
+
+#include "mmd_wrapper.h"
+#include "device_memory_allocator.h"
+#include "dla_dma_constants.h" //DLA_DMA_CSR_OFFSET_***
+
+int main(int argc, char *argv[]) {
+ std::array<uint8_t, output_mem_size> actual_output_mem;
+ for (uint64_t i=0u; i < actual_output_mem.size();i++)
+ {
+ actual_output_mem[i] = (0xDEADBEEF) >> ((3-(i%4)) * 8);
+ }
+
+ std::cout << "AOT Splitter Example" << std::endl;
+
+ constexpr int instance = 0;
+
+ constexpr int _maxNumPipelines = 5;
+ constexpr int numPipelines = _maxNumPipelines;
+
+ // TODO: retrieve this from the arch file
+ constexpr uint64_t featureWordSize = 32;
+ constexpr uint64_t filterWordSize = 64;
+
+
+ constexpr int ARCH_HASH_SIZE = 16;
+ constexpr int BUILD_VERSION_SIZE = 32;
+
+ MmdWrapper mmdWrapper{};
+ DeviceMemoryAllocator ddrAllocator{};
+
+ for (size_t i = 0; i < ARCH_HASH_SIZE; i+=4) {
+ uint32_t arch_build_word_from_device = mmdWrapper.ReadFromCsr(instance, i);
+ if (arch_build_mem_32[i/4] != arch_build_word_from_device)
+ {
+ std::cout << "Arch hash mismatch at word " << i << " : expected " <<
+ std::setfill('0') << std::setw(8) << std::uppercase << std::hex << (uint32_t)arch_build_mem_32[i/4] <<
+ " != " <<
+ std::setfill('0') << std::setw(8) << std::uppercase << std::hex << (uint32_t)arch_build_word_from_device << std::endl;
+ return 1;
+ }
+ }
+ char expected_build_version[BUILD_VERSION_SIZE + 1];
+ expected_build_version[BUILD_VERSION_SIZE] = '\0';
+ std::memcpy(expected_build_version, (uint8_t*)&arch_build_mem_32[ARCH_HASH_SIZE/sizeof(uint32_t)], BUILD_VERSION_SIZE);
+
+ char actual_build_version[BUILD_VERSION_SIZE + 1];
+ actual_build_version[BUILD_VERSION_SIZE] = '\0';
+
+ for (uint32_t i=0;i < BUILD_VERSION_SIZE; i+=4)
+ {
+ uint32_t chunk = mmdWrapper.ReadFromCsr(instance, ARCH_HASH_SIZE + i);
+ for (uint8_t j=0;j < 4; j++)
+ {
+ actual_build_version[i+j] = chunk & 0xFF;
+ chunk >>= 8;
+ }
+ }
+ if (0 != std::strncmp(expected_build_version, actual_build_version, BUILD_VERSION_SIZE))
+ {
+ std::cout << "Build version mismath. Expected " << expected_build_version << " actual " << actual_build_version << std::endl;
+ return 1;
+ }
+
+ ddrAllocator.Initialize(mmdWrapper.GetDDRSizePerInstance(), &mmdWrapper);
+
+ ddrAllocator.AllocateSharedBuffer(inter_mem_size, instance);
+ //mmdWrapper.WriteToCsr(instance, DLA_DMA_CSR_OFFSET_INTERMEDIATE_BASE_ADDR, 0);
+
+
+ uint64_t inputOutputBufferSize = numPipelines * (input_mem_size + output_mem_size); // how much space to allocate
+ uint64_t inputOutputBufferAlignment = featureWordSize; // starting address must be aligned to this
+ uint64_t inputOutputBufferAddr; // where did the allocator place this buffer
+ ddrAllocator.AllocatePrivateBuffer(inputOutputBufferSize, inputOutputBufferAlignment, inputOutputBufferAddr);
+
+ uint64_t configFilterBufferSize = config_mem_size + filter_mem_size;
+ uint64_t configFilterBufferAlignment = filterWordSize;
+ uint64_t configFilterBufferAddr;
+ ddrAllocator.AllocatePrivateBuffer(configFilterBufferSize, configFilterBufferAlignment, configFilterBufferAddr);
+
+ mmdWrapper.WriteToCsr(instance, DLA_DMA_CSR_OFFSET_INTERRUPT_MASK, 0);
+ mmdWrapper.WriteToCsr(instance, DLA_DMA_CSR_OFFSET_INTERRUPT_CONTROL, 3);
+ uint32_t completionCount = mmdWrapper.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_COMPLETION_COUNT);
+ std::cout << "Initial completion count " << completionCount << std::endl;
+
+ mmdWrapper.WriteToDDR(instance, inputOutputBufferAddr, input_mem_size, input_mem);
+
+ mmdWrapper.WriteToDDR(instance, configFilterBufferAddr, config_mem_size, config_mem);
+ mmdWrapper.WriteToDDR(instance, configFilterBufferAddr + config_mem_size, filter_mem_size, filter_mem);
+
+ mmdWrapper.WriteToCsr(instance, DLA_DMA_CSR_OFFSET_CONFIG_BASE_ADDR, configFilterBufferAddr);
+ constexpr int CONFIG_READER_DATA_BYTES = 8; // May want to move to a header in production code
+ mmdWrapper.WriteToCsr(instance, DLA_DMA_CSR_OFFSET_CONFIG_RANGE_MINUS_TWO, ((config_mem_size) / CONFIG_READER_DATA_BYTES) - 2);
+
+
+ // base address for feature reader -- this will trigger one run of DLA
+ mmdWrapper.WriteToCsr(instance, DLA_DMA_CSR_OFFSET_INPUT_OUTPUT_BASE_ADDR, inputOutputBufferAddr);
+
+ int i=0;
+ while(mmdWrapper.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_COMPLETION_COUNT) == completionCount)
+ {
+ i++;
+ if (i % 100000 == 0) {
+ std::cout << "Timeout" << std::endl;
+ return 1;
+ }
+ }
+
+ std::cout << "Completed infered in " << i << " polling intervals" << std::endl;
+
+ //Reading from pipeline zero
+ mmdWrapper.ReadFromDDR(instance, inputOutputBufferAddr + input_mem_size, actual_output_mem.size(), actual_output_mem.data());
+
+ std::ofstream of ("actual_output.mem", std::ios_base::out | std::ios_base::binary);
+ if (of) {
+ of.write((const char*)actual_output_mem.data(), actual_output_mem.size());
+ }
+ of.close();
+
+ return 0;
+}
diff --git a/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/CMakeLists.txt b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/CMakeLists.txt
new file mode 100644
index 0000000..6f5e916
--- /dev/null
+++ b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/CMakeLists.txt
@@ -0,0 +1,113 @@
+cmake_minimum_required(VERSION 3.10)
+
+add_library(dla_aot_splitter_plugin SHARED)
+
+target_compile_features(dla_aot_splitter_plugin PUBLIC cxx_std_11)
+
+target_compile_definitions(dla_aot_splitter_plugin PUBLIC DISABLE_JIT)
+
+set_target_properties(dla_aot_splitter_plugin PROPERTIES POSITION_INDEPENDENT_CODE ON)
+
+if (WIN32)
+ # Fix warning C4273: inconsistent dll linkage
+ target_compile_definitions(dla_aot_splitter_plugin PRIVATE XBYAK_NO_OP_NAMES
+ IMPLEMENT_INFERENCE_ENGINE_PLUGIN
+ $<TARGET_PROPERTY:openvino::runtime,INTERFACE_COMPILE_DEFINITIONS>)
+endif()
+
+target_include_directories(dla_aot_splitter_plugin PRIVATE
+ $ENV{COREDLA_ROOT}/dla_plugin
+ $ENV{COREDLA_ROOT}/dla_plugin/inc
+ $ENV{COREDLA_ROOT}/dla_plugin/inc/dlia
+ $ENV{COREDLA_ROOT}/util/inc # dla_error.h
+ $ENV{COREDLA_ROOT}/inc # dla_dma_constants.svh
+ $ENV{COREDLA_ROOT}/runtime/coredla_device/inc # For abstract classes (BatchJob, Device etc.)
+ #
+ ${CMAKE_CURRENT_SOURCE_DIR}/inc
+)
+
+target_sources(dla_aot_splitter_plugin PRIVATE
+##
+ $ENV{COREDLA_ROOT}/dla_plugin/inc/dla_async_infer_request.h
+ $ENV{COREDLA_ROOT}/dla_plugin/inc/dla_config.hpp
+ $ENV{COREDLA_ROOT}/dla_plugin/inc/dla_compiled_model.h
+ $ENV{COREDLA_ROOT}/dla_plugin/inc/dla_runtime_log.h
+ $ENV{COREDLA_ROOT}/dla_plugin/inc/dlia_infer_request.h
+ $ENV{COREDLA_ROOT}/dla_plugin/inc/dlia_plugin.h
+ $ENV{COREDLA_ROOT}/dla_plugin/inc/dlia_utils.h
+ $ENV{COREDLA_ROOT}/dla_plugin/inc/dla_plugin_config.hpp
+##
+ $ENV{COREDLA_ROOT}/dla_plugin/src/dla_async_infer_request.cpp
+ $ENV{COREDLA_ROOT}/dla_plugin/src/dla_config.cpp
+ $ENV{COREDLA_ROOT}/dla_plugin/src/dla_compiled_model.cpp
+ $ENV{COREDLA_ROOT}/dla_plugin/src/dlia_infer_request.cpp
+ $ENV{COREDLA_ROOT}/dla_plugin/src/dlia_plugin.cpp
+ $ENV{COREDLA_ROOT}/dla_plugin/src/dla_plugin_jit_functions.cpp
+ $ENV{COREDLA_ROOT}/dla_plugin/src/dlia_utils.cpp
+ $ENV{COREDLA_ROOT}/util/src/dla_numeric_utils.cpp
+##
+ $ENV{COREDLA_ROOT}/runtime/coredla_device/inc/graph_job.h
+ $ENV{COREDLA_ROOT}/runtime/coredla_device/inc/batch_job.h
+ $ENV{COREDLA_ROOT}/runtime/coredla_device/inc/device.h
+##
+ ${CMAKE_CURRENT_SOURCE_DIR}/src/raw_graph_job.cpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/src/raw_device.cpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/src/raw_batch_job.cpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/src/dla_aot_utils.cpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/inc/raw_graph_job.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/inc/raw_device.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/inc/raw_batch_job.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/inc/dla_aot_utils.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/inc/dla_aot_structs.h
+)
+
+if (WIN32)
+ target_link_libraries(dla_aot_splitter_plugin
+ PRIVATE
+##
+ dla_op_transformation
+ dliaPluginIOTransformations
+ openvino::runtime
+ openvino_dev_api
+ ${TBB_IMPORTED_TARGETS}
+)
+else()
+ target_link_libraries(dla_aot_splitter_plugin
+ PRIVATE
+##
+ pthread
+ dla_op_transformation
+ dliaPluginIOTransformations
+ openvino::runtime
+ openvino_dev_api
+ ${TBB_IMPORTED_TARGETS}
+)
+endif()
+
+if (DISABLE_JIT)
+ target_include_directories(dla_aot_splitter_plugin PRIVATE
+ $ENV{COREDLA_ROOT}/util/inc
+ $ENV{COREDLA_XUTIL_DIR}/compiled_result/inc
+ )
+ target_sources(dla_aot_splitter_plugin PRIVATE $ENV{COREDLA_XUTIL_DIR}/compiled_result/src/compiled_result_reader_writer.cpp)
+
+ if (EXISTS $ENV{COREDLA_ROOT}/inc)
+ target_include_directories(dla_aot_splitter_plugin PUBLIC $ENV{COREDLA_ROOT}/inc)
+ else()
+ target_include_directories(dla_aot_splitter_plugin PUBLIC $ENV{COREDLA_ROOT}/build/coredla/dla/inc)
+ endif()
+else()
+ target_link_libraries(dla_aot_splitter_plugin
+ PRIVATE
+ dla_compiled_result
+ archparam
+ )
+endif()
+
+set_target_properties(dliaPluginIOTransformations PROPERTIES POSITION_INDEPENDENT_CODE ON)
+
+if (WIN32)
+ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/plugins_aot_splitter_win.xml ${CMAKE_CURRENT_BINARY_DIR}/plugins_aot_splitter.xml COPYONLY)
+else()
+ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/plugins_aot_splitter.xml ${CMAKE_CURRENT_BINARY_DIR}/ COPYONLY)
+endif()
diff --git a/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/inc/dla_aot_structs.h b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/inc/dla_aot_structs.h
new file mode 100644
index 0000000..697b5d2
--- /dev/null
+++ b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/inc/dla_aot_structs.h
@@ -0,0 +1,38 @@
+// Copyright 2020 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#ifndef _DLA_AOT_STRUCTS_H_
+#define _DLA_AOT_STRUCTS_H_
+
+#include "compiled_result.h"
+
+// Custom type
+typedef unsigned char uint8_t;
+
+// All size and offset fields are in bytes.
+typedef struct {
+ const dla::CompiledResult* compiled_result;
+ uint32_t config_buffer_size;
+ uint32_t filter_bias_scale_buffer_size;
+ uint8_t *input_feature_buffer;
+ uint32_t input_feature_buffer_size;
+ uint32_t output_feature_buffer_size;
+ uint32_t intermediate_feature_buffer_size;
+} DLAInput;
+
+typedef struct {
+ // Its size is output_feature_buffer_size in DLAInput.
+ uint8_t *output_feature_buffer;
+} DLAOutput;
+
+#endif // _DLA_AOT_STRUCTS_H_
diff --git a/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/inc/dla_aot_utils.h b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/inc/dla_aot_utils.h
new file mode 100644
index 0000000..7fa23e8
--- /dev/null
+++ b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/inc/dla_aot_utils.h
@@ -0,0 +1,49 @@
+// Copyright 2020-2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#ifndef _DLA_AOT_UTILS_H_
+#define _DLA_AOT_UTILS_H_
+
+#include <fcntl.h>
+#include <google/protobuf/io/zero_copy_stream_impl.h>
+#include <google/protobuf/text_format.h>
+#include <sys/stat.h>
+
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "dla_aot_structs.h"
+
+using google::protobuf::io::FileInputStream;
+
+// fp16 feature element (in bytes)
+// TODO: extract it from arch / compiled result
+const uint32_t feature_elem_size = 2;
+
+//////////////////////////////////////////////////////////////////////////////
+// Dump DLA input and output to the following files:
+// - config_filter.mem: config + filter buffer
+// - input_feature.mem: input feature buffer
+// - output_feature.mem: output feature buffer (emulation results)
+//
+// Each .mem file is a text file, with one byte (in hex) per line.
+//////////////////////////////////////////////////////////////////////////////
+
+void writeInputOutputToFiles(const std::vector<int>& arch_hash,
+ const std::string& build_version,
+ const std::string& arch_name,
+ const DLAInput& input,
+ const DLAOutput& output);
+
+#endif // _DLA_AOT_UTILS_H_
diff --git a/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/inc/raw_batch_job.h b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/inc/raw_batch_job.h
new file mode 100644
index 0000000..dd8e5fa
--- /dev/null
+++ b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/inc/raw_batch_job.h
@@ -0,0 +1,79 @@
+// Copyright 2020-2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
#ifndef RAW_BATCH_JOB_H
#define RAW_BATCH_JOB_H

#include <assert.h>
#include <cstdio>
#if defined(_WIN32) || defined(_WIN64)
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>
#else
#include <dlfcn.h>
#endif
#include <cstring>
#include <iostream>
#include <string>
#include <thread>
#include <memory>

#include "batch_job.h"
#include "dla_aot_structs.h"
#include "raw_device.h"

// RawBatchJob represents one batch execution
// Contains functions to start DLA
// NOTE(review): the unqualified `unique_ptr` below resolves through the
// `using namespace std;` pulled in via raw_device.h.
class RawBatchJob : public BatchJob {
 private:
  const CompiledResult* compiledResult;  // not owned; source of arch/config data
  DLAInput* dlaBuffers_;                 // not owned; shared with the owning graph job
  DLAOutput output_;                     // holds the per-batch output feature buffer
  int instance_;                         // DLA instance index
  uint32_t debugLevel_;
  std::string AES_key_;                  // encryption key material
  std::string IV_key_;                   // encryption initialization vector
  bool encryption_enabled_;
  // Private: instances are created exclusively through MakeUnique().
  RawBatchJob(const CompiledResult* compiledResult,
              DLAInput* dlaBuffers,
              int instance,
              uint32_t debugLevel,
              std::string AES_key,
              std::string IV_key,
              bool encryption_enabled);

 public:
  // Non-copyable: output_ carries a raw buffer pointer.
  RawBatchJob(const RawBatchJob&) = delete;
  RawBatchJob(RawBatchJob&) = delete;
  RawBatchJob& operator=(const RawBatchJob&) = delete;
  // Factory; hands the job back through the BatchJob interface.
  // NOTE(review): no destructor is declared, so any buffer the implementation
  // allocates for output_ is never released -- TODO confirm leak.
  static unique_ptr<BatchJob> MakeUnique(const CompiledResult* compiledResult,
                                         DLAInput* dlaBuffers,
                                         int instance,
                                         uint32_t debugLevel,
                                         std::string AES_key,
                                         std::string IV_key,
                                         bool encryption_enabled);
  // @param inputArray - ptr to CPU array containing input data tp be copied to DDR
  // blocking function
  void LoadInputFeatureToDDR(void* inputArray);
  // Starts DLA by writing to CSR in DLA DMA; the DDR addresses of graph config and input data
  void StartDla() override;
  // @param outputArray - ptr to CPU array where the output data in DDR is copied into
  // outputArray must be allocated by the caller (size >= output_size_ddr)
  // blocking function
  void ReadOutputFeatureFromDDR(void* outputArray) const;
  // No-op: the raw device has no scheduler to notify.
  void ScheduleInputFeature() const {}
};

#endif  // RAW_BATCH_JOB_H
diff --git a/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/inc/raw_device.h b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/inc/raw_device.h
new file mode 100644
index 0000000..168707e
--- /dev/null
+++ b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/inc/raw_device.h
@@ -0,0 +1,81 @@
+// Copyright 2020-2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+#ifndef RAW_DEVICE_H
+#define RAW_DEVICE_H
+
+#include <assert.h>
+#include <chrono>
+#include <cstdio>
+#include <cstring>
+#include <iostream>
+#include <memory>
+#include <string>
+#include <thread>
+#include <vector>
+#include <map>
+#include "arch_params.h"
+#include "compiled_result.h"
+#include "device.h"
+using namespace std;
+using namespace dla;
+class GraphJob;
+
+class RawDevice : public Device {
+ public:
+ GraphJob* CreateGraphJob(const CompiledResult* compiledResult,
+ size_t numPipelines,
+ int instance,
+ std::string AES_key,
+ std::string IV_key,
+ bool encryption_enabled,
+ const std::string export_dir,
+ const std::string parameter_rom_export_dir);
+ // Return number of DLA jobs completed till now
+ // Used for debugging
+ int GetNumInferencesCompleted(int instance) const override;
+ // Must be called when there are no active jobs on DLA
+ // Returns the total time taken by DLA jobs on hardware (in milliseconds)
+ double GetActiveHWTimeMs(int instance) const override;
+ // Must be called when there are no active jobs on DLA
+ // Returns the average of time taken per job (in milliseconds)
+ // Avg Time per job < Active Time
+ double GetAvgHWTimePerJobMs(size_t num_jobs, int instance) const override;
+ RawDevice(const arch_params* archParams);
+ void WaitForDla(int instance,
+ size_t threadId = 0,
+ std::function<bool()> isCancelled = nullptr) override; // threadId is for debugging purpose only
+ std::string SchedulerGetStatus() const override { return ""; }
+ bool InitializeScheduler(uint32_t sourceBufferSize,
+ uint32_t dropSourceBuffers,
+ uint32_t numInferenceRequests,
+ const std::string source_fifo_file = "") override {
+ return true;
+ }
+ int GetNumInstances() const override { return numInstances_; }
+ int GetSizeCsrDescriptorQueue() const override { return -1; } // meaningless here
+ double GetCoreDlaClockFreq() const override { return -1.0; } // meaningless here
+ std::map<std::string, uint64_t> ReadDebugNetwork(int instance) const override {
+ return std::map<std::string, uint64_t>();
+ };
+ uint64_t GetNumInputFeatureMemoryReads(int instance) const override { return 0; };
+ uint64_t GetNumFilterMemoryReads(int instance) const override {return 0; };
+ uint64_t GetNumOutputFeatureMemoryWrites(int instance) const override {return 0; };
+
+ private:
+ RawDevice() = delete;
+ vector<unique_ptr<GraphJob>> allGraphJobs_;
+ int numInstances_;
+ const arch_params* archParams_;
+};
+
+#endif // REF_DEVCE_H
diff --git a/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/inc/raw_graph_job.h b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/inc/raw_graph_job.h
new file mode 100644
index 0000000..38ad075
--- /dev/null
+++ b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/inc/raw_graph_job.h
@@ -0,0 +1,80 @@
+// Copyright 2020-2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+#ifndef RAW_GRAPH_JOB_H
+#define RAW_GRAPH_JOB_H
+
+#include <assert.h>
+#include <cstdio>
+#include <memory>
+#include <vector>
+#if defined(_WIN32) || defined(_WIN64)
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+#include <windows.h>
+#else
+#include <dlfcn.h>
+#endif
+#include <cstring>
+#include <iostream>
+#include <string>
+#include <thread>
+#include "compiled_result.h"
+
+#include "dla_aot_structs.h"
+#include "graph_job.h"
+#include "raw_batch_job.h"
+#include "raw_device.h"
+using namespace dla;
+/*! RawGraphJob is a DLA compiled graph loaded onto a emulation device
+ * Initialized with Emulator Device object
+ * RawGraphJob stores arrays filter, bias, config, inputs and outputs
+ * It provides handle to "batch job" objects that are used to load input and start DLA for one batch
+ */
+class RawGraphJob : public GraphJob {
+ public:
+ static unique_ptr<GraphJob> MakeUnique(const arch_params* archParams,
+ const CompiledResult* compiled_result,
+ size_t numPipelines,
+ int instance,
+ uint32_t debugLevel,
+ std::string AES_key,
+ std::string IV_key,
+ bool encryption_enabled);
+ // Returns an unused batch job object
+ // If all batch jobs are used, returns null
+ // Increments batchJobsRequested_
+ // Thread safe
+ BatchJob* GetBatchJob();
+ RawGraphJob(const GraphJob&) = delete;
+ RawGraphJob(RawGraphJob&) = delete;
+ RawGraphJob& operator=(const RawGraphJob&) = delete;
+
+ private:
+ DLAInput dlaBuffers_;
+ vector<unique_ptr<BatchJob>> batchJobs_;
+ int instance_;
+ uint32_t debugLevel_;
+ unsigned int batchJobsRequested_;
+ std::mutex graphJobMutex;
+ RawGraphJob(const arch_params* archParams,
+ const CompiledResult* compiledResult,
+ size_t numPipelines,
+ int instance,
+ uint32_t debugLevel,
+ std::string AES_key,
+ std::string IV_key,
+ bool encryption_enabled);
+};
+
+#endif
diff --git a/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/plugins_aot_splitter.xml b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/plugins_aot_splitter.xml
new file mode 100644
index 0000000..2f2d24e
--- /dev/null
+++ b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/plugins_aot_splitter.xml
@@ -0,0 +1,18 @@
<ie>
    <!-- OpenVINO plugin registry (Linux): maps device names to plugin shared
         libraries; the FPGA device is routed to the AOT splitter plugin. -->
    <plugins>
        <plugin name="GNA" location="libopenvino_intel_gna_plugin.so">
        </plugin>
        <plugin name="HETERO" location="libcoreDLAHeteroPlugin.so">
        </plugin>
        <plugin name="CPU" location="libopenvino_intel_cpu_plugin.so">
        </plugin>
        <plugin name="MULTI" location="libopenvino_auto_plugin.so">
        </plugin>
        <plugin name="GPU" location="libopenvino_intel_gpu_plugin.so">
        </plugin>
        <plugin name="MYRIAD" location="libopenvino_intel_myriad_plugin.so">
        </plugin>
        <plugin name="FPGA" location="libdla_aot_splitter_plugin.so">
        </plugin>
    </plugins>
</ie>
diff --git a/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/plugins_aot_splitter_win.xml b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/plugins_aot_splitter_win.xml
new file mode 100755
index 0000000..aeeedde
--- /dev/null
+++ b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/plugins_aot_splitter_win.xml
@@ -0,0 +1,22 @@
+<ie>
+ <plugins>
+ <plugin name="AUTO" location="openvino_auto_plugin.dll">
+ </plugin>
+ <plugin name="BATCH" location="openvino_auto_batch_plugin.dll">
+ </plugin>
+ <plugin name="CPU" location="openvino_intel_cpu_plugin.dll">
+ </plugin>
+ <plugin name="GNA" location="openvino_intel_gna_plugin.dll">
+ </plugin>
+ <plugin name="GPU" location="openvino_intel_gpu_plugin.dll">
+ </plugin>
+ <plugin name="HETERO" location="coreDLAHeteroPlugin.dll">
+ </plugin>
+ <plugin name="MULTI" location="openvino_auto_plugin.dll">
+ </plugin>
+ <plugin name="MYRIAD" location="openvino_intel_myriad_plugin.dll">
+ </plugin>
+ <plugin name="FPGA" location="dla_aot_splitter_plugin.dll">
+ </plugin>
+ </plugins>
+</ie>
diff --git a/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/src/CPPLINT.cfg b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/src/CPPLINT.cfg
new file mode 100644
index 0000000..3288819
--- /dev/null
+++ b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/src/CPPLINT.cfg
@@ -0,0 +1,4 @@
+filter=-build/header_guard,-runtime/explicit,-build/include_subdir,-runtime/references,-build/c++11,-runtime/int
+exclude_files=^(?!pe_array_sim.cpp).*\.cpp
+linelength=160
+headers=h,hpp
diff --git a/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/src/dla_aot_utils.cpp b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/src/dla_aot_utils.cpp
new file mode 100644
index 0000000..4317201
--- /dev/null
+++ b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/src/dla_aot_utils.cpp
@@ -0,0 +1,117 @@
+// Copyright 2020 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+/*
+ This file contains some helper utilities to output coredla data blobs to files
+ in the current working directory
+*/
+
+#include "dla_aot_utils.h"
+
// Write `buffer_size` raw bytes from `buffer` to `file_path`.
// The resulting file is expected to be consumed by RTL testbench or hardware.
static void writeBufferToBinFile(const uint8_t *buffer, uint32_t buffer_size,
                                 const char *file_path) {
  FILE *fp = fopen(file_path, "wb");
  assert(nullptr != fp);
  // BUGFIX: assert() is compiled out in release builds, so a failed fopen
  // previously led to an fwrite on a null FILE*. Report and bail out instead.
  if (nullptr == fp) {
    std::cout << "ERROR opening output file " << file_path << std::endl;
    return;
  }

  // A zero-size buffer legitimately produces an empty file.
  if (buffer_size && !fwrite(buffer, buffer_size, 1, fp))
  {
    std::cout << "ERROR writing to output file " << file_path << std::endl;
  }

  fclose(fp);
}
+
// Write `buffer` to `file_path` as text: one 32-bit word in hex per entry,
// comma separated, with a newline after every 128 bytes (32 words).
// The resulting file is expected to be consumed by RTL testbench or hardware.
static void writeBufferToFile(const uint8_t *buffer, uint32_t buffer_size,
                              const char *file_path) {
  FILE *fp = fopen(file_path, "w");
  assert(nullptr != fp);
  // assert() is compiled out in release builds; avoid fprintf on a null FILE*.
  if (nullptr == fp) {
    std::cout << "ERROR opening output file " << file_path << std::endl;
    return;
  }

  for (uint32_t b = 0; b < buffer_size; b += 4) {
    if (b && ((b % 128) == 0))
    {
      fprintf(fp, "\n");
    }
    // BUGFIX: the previous *(uint32_t*)&buffer[b] cast was an unaligned,
    // aliasing-violating read and ran past the end of the buffer whenever
    // buffer_size was not a multiple of 4. memcpy only the bytes that exist;
    // the tail word is zero-padded.
    uint32_t word = 0;
    const uint32_t remaining = buffer_size - b;
    memcpy(&word, &buffer[b], remaining < 4 ? remaining : 4);
    fprintf(fp, "0x%08x", word);
    if (b + 4 < buffer_size)
    {
      fprintf(fp, ",");
    }
  }

  fclose(fp);
}
+
+// Create all files that the splitter is responsible for
+void writeInputOutputToFiles (
+ const std::vector<int>& arch_hash,
+ const std::string& build_version,
+ const std::string& arch_name,
+ const DLAInput &input,
+ const DLAOutput &output
+) {
+ uint8_t arch_build[ARCH_HASH_SIZE + BUILD_VERSION_SIZE + ARCH_NAME_SIZE];
+
+ memset(&arch_build[0], 0, ARCH_HASH_SIZE + BUILD_VERSION_SIZE);
+ memcpy(&arch_build[0], arch_hash.data(), ARCH_HASH_SIZE);
+ memcpy(&arch_build[ARCH_HASH_SIZE], build_version.c_str(), std::min(build_version.length(),static_cast<size_t>(BUILD_VERSION_SIZE)));
+ memcpy(&arch_build[ARCH_HASH_SIZE + BUILD_VERSION_SIZE], arch_name.c_str(), std::min(arch_name.length(),static_cast<size_t>(ARCH_NAME_SIZE)));
+ writeBufferToFile(arch_build,
+ sizeof(arch_build),
+ "arch_build.mem");
+ writeBufferToFile(arch_build,
+ sizeof(arch_build),
+ "arch_build.bin");
+ const auto &config_fbs_buffer =
+ input.compiled_result->get_config_filter_bias_scale_array();
+
+ // Only dump filters and config memory file when they are saved in DDR
+ if (!input.compiled_result->get_ddrfree_header().enable_parameter_rom) {
+ writeBufferToFile(&(config_fbs_buffer[0][0]),
+ input.config_buffer_size,
+ "config.mem");
+ writeBufferToBinFile(&(config_fbs_buffer[0][0]),
+ input.config_buffer_size,
+ "config.bin");
+ writeBufferToFile(&(config_fbs_buffer[0][0]) + input.config_buffer_size,
+ input.filter_bias_scale_buffer_size,
+ "filter.mem");
+ writeBufferToBinFile(&(config_fbs_buffer[0][0]) + input.config_buffer_size,
+ input.filter_bias_scale_buffer_size,
+ "filter.bin");
+ } else {
+ std::cout << "Graph filters and DLA configs are not dumped because parameter ROM is enabled in the AOT file." << std::endl;
+ }
+ uint8_t* input_buffer = nullptr;
+ size_t input_size = 0;
+ if (input.input_feature_buffer) {
+ input_buffer = input.input_feature_buffer;
+ input_size = input.input_feature_buffer_size;
+ }
+ writeBufferToFile(input_buffer,
+ input_size,
+ "input.mem");
+ writeBufferToBinFile(input_buffer,
+ input_size,
+ "input.bin");
+ uint32_t inter_size = input.intermediate_feature_buffer_size;
+ writeBufferToFile((const uint8_t*)&inter_size,
+ sizeof(inter_size),
+ "inter_size.mem");
+ uint32_t output_size = input.output_feature_buffer_size;
+ writeBufferToFile((const uint8_t*)&output_size,
+ sizeof(output_size),
+ "output_size.mem");
+}
diff --git a/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/src/raw_batch_job.cpp b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/src/raw_batch_job.cpp
new file mode 100644
index 0000000..23247d5
--- /dev/null
+++ b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/src/raw_batch_job.cpp
@@ -0,0 +1,68 @@
+// Copyright 2022 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+/*
+ The raw_batch_job, raw_graph_job, and raw_device implement the interfaces
+ used by dliaPlugin to mimic a inference flow without actually providing a
+ inference. It is used to get the transformed input performed by the dliaPlugin
+ upper layers
+*/
+
+#include "raw_batch_job.h"
+#include "dla_aot_utils.h"
+
+unique_ptr<BatchJob> RawBatchJob::MakeUnique(const CompiledResult * compiledResult,
+ DLAInput* dlaBuffers,
+ int instance,
+ uint32_t debugLevel,
+ std::string AES_key,
+ std::string IV_key,
+ bool encryption_enabled) {
+ return unique_ptr<BatchJob>(new RawBatchJob(compiledResult, dlaBuffers, instance, debugLevel, AES_key, IV_key, encryption_enabled));
+}
+
+RawBatchJob::RawBatchJob(const CompiledResult * compiledResult,
+ DLAInput* dlaBuffers,
+ int instance,
+ uint32_t debugLevel,
+ std::string AES_key,
+ std::string IV_key,
+ bool encryption_enabled) : compiledResult(compiledResult) {
+ dlaBuffers_ = dlaBuffers;
+ instance_ = instance;
+ debugLevel_= debugLevel;
+ AES_key_ = AES_key;
+ IV_key_ = IV_key;
+ encryption_enabled_ = encryption_enabled;
+ output_.output_feature_buffer = new uint8_t[dlaBuffers_->output_feature_buffer_size];
+ memset(output_.output_feature_buffer, 0, dlaBuffers_->output_feature_buffer_size);
+ assert(nullptr != output_.output_feature_buffer);
+}
+
+// Emulation device has no DDR. This function is just storing a pointer to the array
+// Note: inputAray should not be deleted until the end of the Emulation runs
+// i.e. StartDla completes
+void RawBatchJob::LoadInputFeatureToDDR(void* inputArray) {
+ dlaBuffers_->input_feature_buffer = (uint8_t*) inputArray;
+ StartDla();
+}
+
+void RawBatchJob::StartDla() {
+ // Write input / output buffers to files
+ writeInputOutputToFiles(compiledResult->get_arch_hash(), compiledResult->get_build_version_string(), compiledResult->get_arch_name(), *dlaBuffers_, output_);
+}
+
+// Emulation device has no DDR. Output is copied into the outputArray.
+void RawBatchJob::ReadOutputFeatureFromDDR(void* outputArray) const {
+ memcpy(outputArray, output_.output_feature_buffer, dlaBuffers_->output_feature_buffer_size);
+}
diff --git a/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/src/raw_device.cpp b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/src/raw_device.cpp
new file mode 100644
index 0000000..0b8e838
--- /dev/null
+++ b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/src/raw_device.cpp
@@ -0,0 +1,67 @@
+// Copyright 2022 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+/*
+ The raw_batch_job, raw_graph_job, and raw_device implement the interfaces
+ used by dliaPlugin to mimic a inference flow without actually providing a
+ inference. It is used to get the transformed input performed by the dliaPlugin
+ upper layers
+*/
+
+#include "raw_device.h"
+#include "raw_graph_job.h"
+unique_ptr<Device> Device::MakeUnique(const arch_params* archParams,
+ uint32_t waitForDlaTimeoutSeconds) {
+ return unique_ptr<Device>(new RawDevice(archParams));
+}
+
+RawDevice::RawDevice(const arch_params* archParams) {
+ numInstances_ = 1;
+ archParams_ = archParams;
+}
+
+GraphJob* RawDevice::CreateGraphJob(const CompiledResult * compiledResult,
+ size_t numPipelines,
+ int instance,
+ std::string AES_key,
+ std::string IV_key,
+ bool encryption_enabled,
+ const std::string export_dir,
+ const std::string parameter_rom_export_dir)
+{
+ (void) export_dir; // unused in HW runtime. CoreDLA utilizes base pointers, which the SW reference utilizes this variable. We void it here.
+ (void) parameter_rom_export_dir;
+ assert(instance < numInstances_);
+ allGraphJobs_.push_back(move(RawGraphJob::MakeUnique(archParams_, compiledResult, numPipelines, instance, 0,
+ AES_key, IV_key, encryption_enabled)));
+ return (allGraphJobs_.back()).get();
+}
+
+void RawDevice::WaitForDla(int instance, size_t threadId/* = 0 */, std::function<bool()> isCancelled) {
+ //RawDevice does not do any real work. No need to wait
+}
+
+int RawDevice::GetNumInferencesCompleted(int instance) const {
+ std::cout << "This function, GetNumInferencesCompleted, is not implemented for raw device" << std::endl;
+ return 0;
+}
+
+double RawDevice::GetActiveHWTimeMs(int instance) const {
+ std::cout << "This function, GetActiveHWTimeMs, is not implemented for raw device" << std::endl;
+ return 0;
+}
+
+double RawDevice::GetAvgHWTimePerJobMs(size_t num_jobs, int instance) const {
+ std::cout << "This function, GetAvgHWTimePerJobMs, is not implemented for raw device" << std::endl;
+ return 0;
+}
diff --git a/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/src/raw_graph_job.cpp b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/src/raw_graph_job.cpp
new file mode 100644
index 0000000..c698110
--- /dev/null
+++ b/python/openvino/runtime/dla_aot_splitter/dla_aot_splitter_plugin/src/raw_graph_job.cpp
@@ -0,0 +1,89 @@
+// Copyright 2022 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+/*
+ The raw_batch_job, raw_graph_job, and raw_device implement the interfaces
+ used by dliaPlugin to mimic a inference flow without actually providing a
+ inference. It is used to get the transformed input performed by the dliaPlugin
+ upper layers
+*/
+
+#include "raw_graph_job.h"
+#include "dla_aot_utils.h"
+#include <fstream>
+#include "dla_defines.h"
+
+unique_ptr<GraphJob> RawGraphJob::MakeUnique(const arch_params* archParams,
+ const CompiledResult * compiledResult,
+ size_t numPipelines,
+ int instance,
+ uint32_t debugLevel = 0,
+ std::string AES_key = "",
+ std::string IV_key = "",
+ bool encryption_enabled = false)
+{
+ return unique_ptr<GraphJob>(new RawGraphJob(archParams, compiledResult, numPipelines, instance, debugLevel, AES_key, IV_key, encryption_enabled));
+}
+
+RawGraphJob::RawGraphJob(const arch_params* archParams,
+ const CompiledResult * compiledResult,
+ size_t numPipelines,
+ int instance,
+ uint32_t debugLevel,
+ std::string AES_key,
+ std::string IV_key,
+ bool encryption_enabled)
+{
+ assert(numPipelines);
+ instance_ = instance;
+ debugLevel_ = debugLevel;
+ batchJobsRequested_ = 0;
+ // input feature buffer size
+ // TODO: support multi-input graph
+ dlaBuffers_.input_feature_buffer_size =
+ compiledResult->get_conv_input_size_in_bytes();
+ // input feature buffer to be allocated outside this routine
+
+ // output buffer size
+ dlaBuffers_.output_feature_buffer_size =
+ compiledResult->get_conv_output_size_in_bytes();
+
+ // intermediate buffer size
+ dlaBuffers_.intermediate_feature_buffer_size =
+ compiledResult->get_conv_intermediate_size_in_bytes();
+
+ // config and filter buffer size
+ size_t num_config_words = compiledResult->get_num_config_words();
+ dlaBuffers_.config_buffer_size = num_config_words * CONFIG_WORD_SIZE;
+ dlaBuffers_.filter_bias_scale_buffer_size =
+ compiledResult->get_total_filter_bias_scale_buffer_size();
+ // store a pointer to CompiledResult to use config and filter buffer directly without copying
+ dlaBuffers_.compiled_result = compiledResult;
+ for(size_t i = 0; i < numPipelines; i++) {
+ batchJobs_.push_back(move(RawBatchJob::MakeUnique(compiledResult, &dlaBuffers_, instance_, debugLevel_, AES_key, IV_key, encryption_enabled)));
+ }
+
+ dlaBuffers_.input_feature_buffer = NULL;
+}
+
+BatchJob* RawGraphJob::GetBatchJob() {
+ graphJobMutex.lock();
+ if(batchJobsRequested_ >= batchJobs_.size()) {
+ graphJobMutex.unlock();
+ return nullptr;
+ }
+ auto * batchJob = batchJobs_[batchJobsRequested_].get();
+ batchJobsRequested_++;
+ graphJobMutex.unlock();
+ return batchJob;
+}
diff --git a/python/openvino/runtime/dla_aot_splitter/inc/dla_aot_splitter.hpp b/python/openvino/runtime/dla_aot_splitter/inc/dla_aot_splitter.hpp
new file mode 100644
index 0000000..44448e8
--- /dev/null
+++ b/python/openvino/runtime/dla_aot_splitter/inc/dla_aot_splitter.hpp
@@ -0,0 +1,130 @@
+// Copyright 2022-2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#pragma once
+
+#include <gflags/gflags.h>
+#include <iostream>
+#include <string>
+#include <vector>
+
/// @brief message for help argument
static const char help_message[] = "Print a usage message";

/// @brief message for images argument
static const char input_message[] =
    "Optional. Path to a folder with images and/or binaries or to specific image or binary file.";

/// @brief message for compiled model argument
static const char compiled_model_message[] = "Optional. Path to a .bin file with a trained compiled model";

// @brief message for the custom plugins.xml file option
// BUGFIX: grammar of the user-visible help text ("a custom plugins to use").
static const char plugins_message[] = "Optional. Select a custom plugins file to use.";

// @brief message folding_option flag
static const char folding_option_message[] = "Optional. Set the folding options for dla compiler: options 0-3.";

// @brief message fold_preprocessing flag
static const char fold_preprocessing_message[] = "Optional. Enable fold preprocessing option for dla compiler.";

// @brief message bgr flag
static const char bgr_message[] = "Optional. Indicate images are in bgr format.";

// @brief message encryption_key flag
// BUGFIX: "hexidecimal" -> "hexadecimal" in the user-visible help text.
static const char encryption_key_message[] =
    "Optional. Encryption key (using hexadecimal characters, 16 bytes - 32 hexadecimal char).";

// @brief message encryption_iv flag
static const char encryption_iv_message[] =
    "Optional. Initialization vector for encryption. (8 bytes - 16 hexadecimal char)";

// @brief message binary flag
static const char bin_data_message[] =
    "Optional. Specify that the input should be read as binary data (otherwise, if input tensor has depth 1, or 3 it "
    "will default to U8 image processing).";

/// @brief message resize flag
static const char input_image_resize_message[] =
    "Optional. Input image resizing methods when the input image width and height do not match the desired "
    "input width and height of the model. resize: Resizing the input image to the model input size; "
    "pad_resize: Pad the input image with black pixels (i.e., 0) into a squared image and "
    "resize the padded image to model input size.";

/// @brief message enable early-access features flag
static const char enable_early_access_message[] =
    "Optional. Enables early access (EA) features of FPGA AI Suite. These are features that are actively being "
    "developed and have not yet met production quality standards. These features may have flaws. "
    "Consult the FPGA AI Suite documentation for details.";
+
// NOTE(review): gflags DEFINE_* macros in a header define the flag objects in
// every translation unit that includes this file; this links only if the
// header is included from a single TU -- TODO confirm.
/// @brief Define flag for showing help message <br>
DEFINE_bool(h, false, help_message);

/// @brief Declare flag for showing help message <br>
DECLARE_bool(help);

/// @brief Define parameter for set image file <br>
/// i or mif is a required parameter
DEFINE_string(i, "", input_message);

/// @brief Define parameter for compiled model file <br>
/// It is not a required parameter
DEFINE_string(cm, "", compiled_model_message);

/// @brief Path to a plugins_xml file
DEFINE_string(plugins, "", plugins_message);

/// @brief Define flag whether the image is in bgr format
DEFINE_bool(bgr, false, bgr_message);

/// Select folding options; 0,1,2,3
DEFINE_int32(folding_option, 1, folding_option_message);

/// @brief Define flag for enabling folding preprocessing
DEFINE_bool(fold_preprocessing, false, fold_preprocessing_message);

/// @brief encryption key
DEFINE_string(encryption_key, "", encryption_key_message);

/// @brief initialization vector
DEFINE_string(encryption_iv, "", encryption_iv_message);

/// @brief Specify that the inputs should be read as binary.
DEFINE_bool(bin_data, false, bin_data_message);

/// @brief Define flag for using input image resize <br>
DEFINE_string(resize_type, "", input_image_resize_message);

/// @brief Enables early-access (EA) features of CoreDLA <br>
DEFINE_bool(enable_early_access, false, enable_early_access_message);
+
+/**
+ * @brief This function show a help message
+ */
+static void showUsage() {
+ std::cout << std::endl;
+ std::cout << "aot_splitter [OPTION]" << std::endl;
+ std::cout << "Options:" << std::endl;
+ std::cout << std::endl;
+ std::cout << " -h, --help " << help_message << std::endl;
+ std::cout << " -i \"<path>\" " << input_message << std::endl;
+ std::cout << " -cm \"<path>\" " << compiled_model_message << std::endl;
+ std::cout << " -plugins " << plugins_message << std::endl;
+ std::cout << " -bgr " << bgr_message << std::endl;
+ std::cout << " -bin_data " << bin_data_message << std::endl;
+ std::cout << " -resize_type \"resize/pad_resize\" " << input_image_resize_message << std::endl;
+ std::cout << " -folding_option " << folding_option_message << std::endl;
+ std::cout << " -fold_preprocessing " << fold_preprocessing_message << std::endl;
+ std::cout << " -encryption_key " << encryption_key_message << std::endl;
+ std::cout << " -encryption_iv " << encryption_iv_message << std::endl;
+ std::cout << " -enable_early_access " << enable_early_access_message << std::endl;
+}
diff --git a/python/openvino/runtime/dla_aot_splitter/sdl.cmake b/python/openvino/runtime/dla_aot_splitter/sdl.cmake
new file mode 100644
index 0000000..3f8af7a
--- /dev/null
+++ b/python/openvino/runtime/dla_aot_splitter/sdl.cmake
@@ -0,0 +1,96 @@
+
####################################################################
## SDL required compiler flags
####################################################################
# Needed for all builds
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wformat -Wformat-security")
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC")

set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wformat -Wformat-security")
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")

set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-deprecated-declarations")

set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -fPIE")
set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fPIE")

# Release build only
set (CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -D_FORTIFY_SOURCE=2")
if (GCC_VERSION VERSION_GREATER 4.9 OR GCC_VERSION VERSION_EQUAL 4.9)
  set (CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -fstack-protector-strong")
  set (CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -z noexecstack -z relro -z now")
else()
  set (CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -fstack-protector-all")
endif()

set (CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -fPIC -D_FORTIFY_SOURCE=2")
if (GCC_VERSION VERSION_GREATER 4.9 OR GCC_VERSION VERSION_EQUAL 4.9)
  set (CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -fstack-protector-strong")
  set (CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -z noexecstack -z relro -z now")
endif()

# These are for 8478-CT158 in the SDL process
# ( https://sdp-prod.intel.com/bunits/intel/coredla/coredla-ip-20212/tasks/phase/development/8478-CT158/ )
#
# Fix: append to the per-configuration *_RELEASE variables themselves.
# The previous version rebuilt both release variables from ${CMAKE_CXX_FLAGS},
# which (a) applied C++ flags to the C compiler and (b) silently discarded the
# hardening flags added above (-D_FORTIFY_SOURCE=2, stack protector, relro/now);
# it also contained a duplicated CMAKE_CXX_FLAGS_RELEASE line.
set (CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -fno-strict-overflow -fno-delete-null-pointer-checks -fwrapv")
set (CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -fno-strict-overflow -fno-delete-null-pointer-checks -fwrapv")

####################################################################

set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -O3")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3")

set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O0 -ggdb3")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -ggdb3")

#### Sanitizer settings ####
# Address (build with -DCMAKE_BUILD_TYPE=ASAN)
set(CMAKE_C_FLAGS_ASAN "-O1 -g -fsanitize=address -fno-omit-frame-pointer -fno-optimize-sibling-calls")
set(CMAKE_CXX_FLAGS_ASAN "-O1 -g -fsanitize=address -fno-omit-frame-pointer -fno-optimize-sibling-calls")

# Memory (clang only; build with -DCMAKE_BUILD_TYPE=MSAN)
set(CMAKE_C_FLAGS_MSAN "-O1 -g -fsanitize=memory -fno-omit-frame-pointer -fno-optimize-sibling-calls")
set(CMAKE_CXX_FLAGS_MSAN "-O1 -g -fsanitize=memory -fno-omit-frame-pointer -fno-optimize-sibling-calls")

# Thread (build with -DCMAKE_BUILD_TYPE=TSAN)
set(CMAKE_C_FLAGS_TSAN "-O1 -g -fsanitize=thread -fno-omit-frame-pointer -fno-optimize-sibling-calls")
set(CMAKE_CXX_FLAGS_TSAN "-O1 -g -fsanitize=thread -fno-omit-frame-pointer -fno-optimize-sibling-calls")


set (CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
# Enable all warnings except unknown-pragmas. Wunknown-pragmas must be excluded because
# it is triggered by header file included from OpenCL runtime
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wno-unknown-pragmas")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wno-unknown-pragmas")

# Make warnings errors to avoid having them in SDL report
#set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror")
#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror")

# Should cleanup the signed and unsigned compares then remove this exception
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=sign-compare -Wno-error=unused-function -Wno-error=switch -Wno-error=unused-variable -Wno-error=unused-value -Wno-error=unused-but-set-variable -Wno-error=undef -Wno-error=return-type -Wno-error=reorder")

# This is required on Ubuntu 18; the new linker behaviour transforms
# RPATH into RUNPATH (which can be seen in the output of 'readelf -d').
# However, RUNPATH does not work recursively, so when OpenVINO reads
# the plugins.xml file and searches for the specified libcoreDlaRuntimePlugin.so
# library, it fails. The --disable-new-dtags option causes the linker
# to keep RPATH as RPATH (rather than morphing to RUNPATH).
#
# References:
# https://stackoverflow.com/questions/52018092/how-to-set-rpath-and-runpath-with-gcc-ld
# https://stackoverflow.com/questions/59248421/c-secondary-dependency-resolution-with-runpath
#
# The solution below seems preferable to setting LD_LIBRARY_PATH, if only barely.
# For additional motivation, go ahead and throw away part of your day reading either
# of the screeds:
# http://xahlee.info/UnixResource_dir/_/ldpath.html
# https://gms.tf/ld_library_path-considered-harmful.html
# You may find that neither is fully convincing, of course.
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,--disable-new-dtags")
diff --git a/python/openvino/runtime/dla_aot_splitter/src/main.cpp b/python/openvino/runtime/dla_aot_splitter/src/main.cpp
new file mode 100644
index 0000000..ffc098e
--- /dev/null
+++ b/python/openvino/runtime/dla_aot_splitter/src/main.cpp
@@ -0,0 +1,475 @@
+// Copyright 2022-2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#include <stdio.h>
+#include <sys/stat.h>
+#include <algorithm>
+#include <map>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+#if defined(_WIN32) || defined(_WIN64)
+#else
+#include <dirent.h>
+#include <unistd.h>
+#endif
+
+#include <openvino/openvino.hpp>
+#include "samples/args_helper.hpp"
+#include "samples/common.hpp"
+#include "samples/slog.hpp"
+
+// #include "average_precision.hpp"
+#include "dla_aot_splitter.hpp"
+// #include "infer_request_wrap.hpp"
+#include "dla_plugin_config.hpp"
+#include "inputs_filling.hpp"
+#include "utils.hpp"
+
+// Maps a debug-network counter/statistic name to its 64-bit value.
+// NOTE(review): not referenced anywhere in the visible part of this file — confirm it is still needed.
+using DebugNetworkData = std::map<std::string, uint64_t>;
+
// Returns true when a file or directory named `name` exists on disk
// (i.e. stat() succeeds for the path).
bool exists_test(const std::string& name) {
  struct stat info;
  return stat(name.c_str(), &info) == 0;
}
+
+// This function appears in dla_benchmark/main.cpp too.
+bool dir_open_test(const std::string& name) {
+#if (!defined(_WIN32) && !defined(_WIN64))
+ // If we can open the directory then return true
+ DIR* dp = opendir(name.c_str());
+ if (dp != nullptr) {
+ closedir(dp);
+ return true;
+ }
+#endif // !_WIN32 && !_WIN64
+ struct stat sb;
+ if (stat(name.c_str(), &sb) == 0) {
+ if ((sb.st_mode & S_IFMT) != S_IFREG) {
+ slog::err << "File " << name << " cannot be opened!" << slog::endl;
+ throw std::logic_error("File cannot be opened!");
+ }
+ }
+ return true;
+}
+
+// copy arguments into a new array to split the '-i=<arg>' into
+// two arguments (i.e. '-i' and '<arg>') to overcome a bug
+// parseInputFilesArguments function where is doesn't recognize
+// the -i=<arg> format
+void parseCommandLine(int argc, char** argv) {
+ int num_args = argc;
+ // allocated enough memory in case we needed to split the -i argument into two
+ char** arguments = new char*[num_args + 1];
+ for (int i = 0, j = 0; j < argc; ++i, ++j) {
+ if (strstr(argv[j], "-i=")) {
+ // number of arguments will increase by one after splitting
+ num_args++;
+ arguments[i] = new char[3];
+ strcpy(arguments[i++], "-i");
+ // copy the reset of the argument (i.e. post "-i=")
+ arguments[i] = new char[strlen(argv[j]) - 2];
+ strcpy(arguments[i], argv[j] + 3);
+ continue;
+ }
+ arguments[i] = new char[strlen(argv[j]) + 1];
+ strcpy(arguments[i], argv[j]);
+ }
+ // the parse function is modifying the arguments point so we need to keep
+ // a copy of the original pointer value to delete it properly
+ char** orig_arg_ptr = arguments;
+ gflags::ParseCommandLineNonHelpFlags(&num_args, &arguments, true);
+ // delete the allocated memory
+ for (int i = 0; i < num_args; ++i) {
+ delete[] orig_arg_ptr[i];
+ }
+ delete[] orig_arg_ptr;
+}
+
+/**
+ * @brief Parses argv with gflags and validates the required aot_splitter options.
+ *
+ * Rejects arguments that are missing their leading '-', handles -h/--help,
+ * verifies that every path listed in -cm exists, and checks the plugins file.
+ *
+ * @param argc    argument count from main()
+ * @param argv    argument vector from main()
+ * @param netSize [out] number of compiled graphs listed in -cm (MULTIGRAPH_SEP separated)
+ * @return false when -h/--help was requested (usage has already been printed), true otherwise
+ * @throws std::logic_error on malformed arguments or missing/invalid paths
+ */
+bool ParseAndCheckCommandLine(int argc, char* argv[], size_t& netSize) {
+  // ---------------------------Parsing and validating input arguments--------------------------------------
+  slog::info << "Parsing input parameters" << slog::endl;
+
+  // Check for any flags that are missing their preceding dashes
+  // GFlags quietly ignores any flags missing their dashes, which can cause
+  // aot_splitter to run with settings other than what the user intended
+
+  // GFlags supports two different styles of flag:
+  // 1. --<flag>
+  // 2. -<flag>
+  // It also supports two different ways of specifying values for flags which
+  // take values:
+  // 1. --<flag>=<value>
+  // 2. --<flag> <value>
+
+  // If we are not expecting a flag, we are expecting a value for the
+  // preceding flag
+  bool expectingFlag = true;
+  // Start at 1 to skip the command itself
+  for (int i = 1; i < argc; i++) {
+    if (expectingFlag) {
+      // A flag is always denoted by the first char being '-'
+      if (argv[i][0] != '-') {
+        slog::err << "Argument " << argv[i] << " is invalid. You"
+                  << " may have forgotten a preceding '-'." << slog::endl;
+        throw std::logic_error("One or more invalid arguments");
+      }
+
+      // Strip one or two leading dashes to get the bare flag name.
+      char* flagNameStart = (argv[i][1] == '-') ? &argv[i][2] : &argv[i][1];
+      std::string flagName;
+
+      gflags::CommandLineFlagInfo flagInfo;
+      // For "-flag=value" forms, keep only the part before '='.
+      if (strstr(flagNameStart, "=")) {
+        flagName = std::string(flagNameStart, size_t(strstr(flagNameStart, "=") - flagNameStart));
+      } else {
+        flagName = std::string(flagNameStart);
+      }
+
+      // We expect a flag in the next argv if the current flag is a bool,
+      // because bool flags do not take a value.
+      // If GetCommandLineFlagInfo returns false, we assume the current
+      // flag is a boolean because boolean flags can be specified as
+      // -no<flag>, which is equivalent to -<flag>=false, or the flag
+      // simply being omitted. However, "no<flag>" is not recognized by
+      // GetCommandLineFlagInfo.
+      // Therefore, if the name is not recognized either the flag is a
+      // boolean flag or doesn't exist. In the latter case, gflags errors
+      // when we call parseCommandLine so we can assume here it's a bool.
+      if (!GetCommandLineFlagInfo(flagName.c_str(), &flagInfo) || strstr(argv[i], "=") || flagInfo.type == "bool") {
+        expectingFlag = true;
+      } else {
+        expectingFlag = false;
+      }
+    } else {
+      // If we were expecting a value, doesn't matter what it is
+      // gflags will check all values are the correct type, and
+      // aot_splitter checks if the values received are sane
+      expectingFlag = true;
+    }
+  }
+
+  parseCommandLine(argc, argv);
+
+  if (FLAGS_help || FLAGS_h) {
+    showUsage();
+    // CoreDLA: Version 2020.3 of OpenVINO assumes that the PAC board with OPAE on it
+    // is an OpenCL/DLAv1 device. Since it is not, it then errors-out when the device
+    // does not response as expected to the OpenCL query.
+    // showAvailableDevices();
+    std::cout << "\n";
+    return false;
+  }
+
+  if (FLAGS_cm.empty()) {
+    throw std::logic_error("Model is required but not set. Please set -cm option.");
+  } else {
+    // -cm may name several compiled graphs separated by MULTIGRAPH_SEP.
+    std::vector<std::string> m_paths = split(FLAGS_cm, MULTIGRAPH_SEP);
+    netSize = m_paths.size();
+    slog::info << "Found " << netSize << " compiled graph" << (netSize == 1 ? "" : "s") << slog::endl;
+    for (auto& m_path : m_paths) {
+      if (!exists_test(m_path)) {
+        // NOTE(review): the message prints the full -cm string rather than the
+        // individual missing path (m_path) — consider reporting m_path instead.
+        slog::err << "compiled model file: " << FLAGS_cm << " doesn't exist. Please provide a valid path with -cm."
+                  << slog::endl;
+        throw std::logic_error("Compiled model file path does not exist.");
+      }
+    }
+  }
+
+  if (!FLAGS_plugins.empty()) {
+    slog::info << "Using custom plugins xml file - " << FLAGS_plugins << slog::endl;
+  }
+
+  // NOTE(review): this check also runs when -plugins was not supplied, so an
+  // empty path fails exists_test("") and aborts — confirm plugins.xml is mandatory.
+  if (!exists_test(FLAGS_plugins)) {
+    slog::err << "plugins_xml file: " << FLAGS_plugins << " doesn't exist. Please provide a valid path." << slog::endl;
+    throw std::logic_error("plugins_xml file path does not exist.");
+  }
+
+  return true;
+}
+
+static void next_step(const std::string additional_info = "") {
+ static size_t step_id = 0;
+ static const std::map<size_t, std::string> step_names = {
+ {1, "Parsing and validating input arguments"},
+ {2, "Loading Inference Engine"},
+ {3, "Setting device configuration"},
+ {4, "Reading the Intermediate Representation network"},
+ {5, "Resizing network to match image sizes and given batch"},
+ {6, "Configuring input of the model"},
+ {7, "Loading the model to the device"},
+ {8, "Setting optimal runtime parameters"},
+ {9, "Creating infer requests and filling input blobs with images"},
+ {10, "Measuring performance"},
+ {11, "Dumping statistics report"},
+ {12, "Dumping the output values"}};
+
+ step_id++;
+ if (step_names.count(step_id) == 0) {
+ THROW_IE_EXCEPTION << "Step ID " << step_id << " is out of total steps number " << step_names.size();
+ }
+
+ std::cout << "[Step " << step_id << "/" << step_names.size() << "] " << step_names.at(step_id)
+ << (additional_info.empty() ? "" : " (" + additional_info + ")") << std::endl;
+}
+
/**
 * @brief Returns the median of a vector without modifying the input.
 *
 * Works on a sorted copy: odd-length inputs yield the middle element,
 * even-length inputs yield the mean of the two middle elements (integer
 * division applies for integral T). Precondition: vec is non-empty.
 */
template <typename T>
T getMedianValue(const std::vector<T>& vec) {
  std::vector<T> ordered(vec.begin(), vec.end());
  std::sort(ordered.begin(), ordered.end());
  const size_t mid = ordered.size() / 2;
  if (ordered.size() % 2 != 0) {
    return ordered[mid];
  }
  return (ordered[mid] + ordered[mid - 1]) / static_cast<T>(2.0);
}
+
+/**
+ * @brief Entry point of the aot_splitter tool: imports pre-compiled graph(s),
+ * feeds them the provided (or random) inputs, and runs one inference per
+ * request to generate the ahead-of-time artifacts.
+ */
+int main(int argc, char* argv[]) {
+  try {
+    // Declaring the ExecutableNetwork object as a pointer to workaround the segfault
+    // that occurs when destructing the object. Now that it's declared as a pointer
+    // the compiler won't automatically call the destructor of the object at the end
+    // of this scope and we won't delete the allocated memory either
+    std::vector<ov::CompiledModel*> exeNetworks;
+    size_t netSize = 0;  // parse the size of networks for arguments check
+
+    size_t return_code = 0;  // universal return code, return this value after dumping out Debug info
+
+    // ----------------- 1. Parsing and validating input arguments -------------------------------------------------
+    next_step();
+
+    if (!ParseAndCheckCommandLine(argc, argv, netSize)) {
+      return 0;
+    }
+
+    // Always true here: ParseAndCheckCommandLine throws when -cm is empty.
+    bool isNetworkCompiled = !FLAGS_cm.empty();
+    if (isNetworkCompiled) {
+      slog::info << "Network is compiled" << slog::endl;
+    }
+
+    // The set of arguments printed is meant to be a useful summary to the
+    // user, rather than all of the arguments to aot_splitter
+    slog::info << "Printing summary of arguments being used by aot_splitter" << slog::endl
+               << "Device (-d) .......................... "
+               << "HETERO:FPGA" << slog::endl
+               << "Compiled model (-cm) ................. " << FLAGS_cm << slog::endl
+               << "Input images directory (-i) .......... "
+               << (!FLAGS_i.empty() ? FLAGS_i : "Not specified, will use randomly-generated images") << slog::endl
+               << "Plugins file (-plugins) ..... " << FLAGS_plugins << slog::endl
+               << "Reverse input image channels (-bgr) .. " << (FLAGS_bgr ? "True" : "False") << slog::endl;
+
+    /** This vector stores paths to the processed images **/
+    auto multiInputFiles = VectorMap<std::vector<std::string>>(
+        SplitMultiInputFilesArguments(netSize),  // get input directory list
+        [&](const std::vector<std::string>& inputArgs) mutable {
+          std::vector<std::string> files;
+          for (auto& inputArg : inputArgs) {
+            // Test if the path exists
+            if (!exists_test(inputArg)) {
+              slog::err << "Specified image path: " << inputArg << " does not exist" << slog::endl;
+              throw std::logic_error("Image path does not exist");
+            }
+            // Test whether the path can be opened if it's a directory
+            dir_open_test(inputArg);
+            readInputFilesArguments(files, inputArg);
+          }
+
+          return files;
+        });
+    if (multiInputFiles.size() == 0) {
+      // failed to read input files
+      slog::err << "Failed to read input files" << slog::endl;
+      return 1;
+    }
+
+    uint32_t num_batches = 1;
+
+    // ----------------- 2. Loading the Inference Engine -----------------------------------------------------------
+    next_step();
+
+    // Get optimal runtime parameters for device
+    std::string device_name = "HETERO:FPGA";
+    ov::Core core(FLAGS_plugins);
+
+    // Forward the model-decryption settings to the FPGA plugin before importing.
+    if (device_name.find("FPGA") != std::string::npos) {
+      if (FLAGS_encryption_key != "") {
+        core.set_property("FPGA", {{DLIAPlugin::properties::encryption_key.name(), FLAGS_encryption_key}});
+      }
+      if (FLAGS_encryption_iv != "") {
+        core.set_property("FPGA", {{DLIAPlugin::properties::encryption_iv.name(), FLAGS_encryption_iv}});
+      }
+    }
+
+    slog::info << "OpenVINO: " << ov::get_openvino_version() << slog::endl;
+
+    // ----------------- 3. Setting device configuration -----------------------------------------------------------
+    next_step();
+
+    size_t batchSize = 1;
+    std::vector<std::string> topology_names;
+    // NOTE(review): the first branch is intentionally empty — -cm is mandatory,
+    // so only the compiled-network path below is ever taken.
+    if (!isNetworkCompiled) {
+    } else {
+      next_step();
+      slog::info << "Skipping the step for compiled network" << slog::endl;
+      next_step();
+      slog::info << "Skipping the step for compiled network" << slog::endl;
+      next_step();
+      slog::info << "Skipping the step for compiled network" << slog::endl;
+      // ----------------- 7. Loading the model to the device --------------------------------------------------------
+      next_step();
+
+      // Forward the graph-folding knobs to the FPGA plugin (defaults shown).
+      int folding_option = 1;
+      bool fold_preprocessing = false;
+      bool enable_early_access = false;
+      if (FLAGS_folding_option) {
+        folding_option = FLAGS_folding_option;
+      }
+      if (FLAGS_fold_preprocessing) {
+        fold_preprocessing = FLAGS_fold_preprocessing;
+      }
+      if (FLAGS_enable_early_access) {
+        enable_early_access = FLAGS_enable_early_access;
+      }
+      core.set_property("FPGA", {{DLIAPlugin::properties::folding_option.name(), std::to_string(folding_option)}});
+      core.set_property("FPGA",
+                        {{DLIAPlugin::properties::fold_preprocessing.name(), fold_preprocessing}});
+      core.set_property("FPGA",
+                        {{DLIAPlugin::properties::enable_early_access.name(), enable_early_access}});
+
+      // Import every compiled graph listed in -cm and sanity-check its batch size.
+      auto compiled_graph_paths = split(FLAGS_cm, MULTIGRAPH_SEP);
+      exeNetworks = vectorMapWithIndex<ov::CompiledModel*>(
+          split(FLAGS_cm, MULTIGRAPH_SEP),  // get a list of compiled graphs
+          [&](const std::string& compiled_graph_path, size_t index) {
+            std::stringstream generated_name;
+            generated_name << "Graph_" << index;
+            slog::info << "Importing model from " << compiled_graph_paths[index] << " to " << device_name << " as "
+                       << generated_name.str() << slog::endl;
+            std::filebuf objFileBuf;
+            objFileBuf.open(compiled_graph_paths[index].c_str(), std::ios::in | std::ios::binary);
+            std::istream objIstream(&objFileBuf);
+            auto exeNetwork = new ov::CompiledModel();
+            *exeNetwork = core.import_model(objIstream, device_name, {});
+            topology_names.push_back(generated_name.str());
+            objFileBuf.close();
+            printInputAndOutputsInfoShort(*exeNetwork);
+            if (batchSize == 0) {
+              batchSize = 1;
+            }
+            // The batch dimension of every input must match the requested batch size.
+            const auto& inputs = exeNetwork->inputs();
+            for (const auto& item : inputs) {
+              auto& dims = item.get_shape();
+              if (dims[0] != batchSize) {
+                slog::err << "Batch size of the compiled model is " << dims[0] << " and batch size provided is "
+                          << batchSize << slog::endl;
+                std::cout << "Set the same batch size = " << dims[0] << " when running the app" << std::endl;
+                std::cout << "Or recompile model with batch size = " << batchSize << std::endl;
+                exit(5);
+              }
+            }
+            return exeNetwork;
+          });
+    }
+    // ----------------- 8. Setting optimal runtime parameters -----------------------------------------------------
+    next_step();
+
+    // Number of requests
+    // NOTE(review): nireq/niter are hard-coded to 1 — the splitter performs a
+    // single inference per request; the guards below are therefore dead code.
+    uint32_t nireq = 1;
+    if (nireq == 0) {
+      nireq = 1;
+    }
+    int niter = 1;
+
+    if (niter > 0) {
+      num_batches = niter;
+    }
+
+    // ----------------- 9. Creating infer requests and filling input blobs ----------------------------------------
+    next_step();
+    std::vector<dla_benchmark::InputsInfo> inputInfos;
+    // Data structure hierarchy
+    // Outermost vec: which model it corresponds to (multigraph)
+    // Map: input/output name and its corresponding TensorVector
+    // TensorVector: An alias for vector<ov::tensor> where each vector element corresponds to the batch
+    std::vector<std::map<std::string, ov::TensorVector>> inputsData;
+    std::vector<std::map<std::string, ov::TensorVector>> outputTensors(exeNetworks.size());
+
+    std::vector<std::unique_ptr<InferRequestsQueue>> inferRequestsQueues;
+    const std::string resize_type = FLAGS_resize_type.empty() ? "resize" : FLAGS_resize_type;
+    for (size_t netIdx = 0; netIdx < exeNetworks.size(); netIdx++) {
+      // Handle the case that use same inputs for all networks
+      const auto& inputFiles = netIdx >= multiInputFiles.size() ? multiInputFiles.back() : multiInputFiles[netIdx];
+      inputInfos.push_back(GetInputsInfo(batchSize, exeNetworks[netIdx]->inputs(), FLAGS_bin_data));
+      inputsData.push_back(GetStaticTensors(inputFiles.empty() ? std::vector<std::string>{} : inputFiles,
+                                            batchSize,
+                                            inputInfos[netIdx],
+                                            num_batches,
+                                            resize_type,
+                                            FLAGS_bgr,
+                                            FLAGS_bin_data,
+                                            false /* verbose outputs not supported for aot splitter */));
+      // Use unique_ptr to create InferRequestsQueue objects and avoid copying mutex and cv
+      inferRequestsQueues.push_back(
+          std::move(std::unique_ptr<InferRequestsQueue>(new InferRequestsQueue(*(exeNetworks[netIdx]), nireq))));
+    }
+
+    /** Start inference & calculate performance **/
+    /** to align number of iterations to guarantee that last infer requests are executed in the same conditions **/
+    // NOTE(review): iterations[] is never incremented in this function, so
+    // index 0 is always used below — confirm this is intended for the
+    // single-shot artifact generation.
+    std::vector<size_t> iterations(exeNetworks.size(), 0);
+
+    try {
+      {
+        // set up all infer request and prep all i/o Blobs
+        for (size_t net_id = 0; net_id < exeNetworks.size(); net_id++) {
+          for (size_t iireq = 0; iireq < nireq; iireq++) {
+            auto inferRequest = inferRequestsQueues.at(net_id)->get_idle_request();
+            if (!inferRequest) {
+              THROW_IE_EXCEPTION << "No idle Infer Requests!";
+            }
+
+            if (niter != 0LL) {
+              // Pre-allocate output tensors and bind both outputs and inputs
+              // to the request before running it.
+              const auto& outputs = exeNetworks[net_id]->outputs();
+              for (const auto& output : outputs) {
+                const std::string& name = output.get_any_name();
+                outputTensors.at(net_id)[name].emplace_back(output.get_element_type(), output.get_shape());
+                inferRequest->set_tensor(name, outputTensors.at(net_id).at(name).at(iterations.at(net_id)));
+              }
+              const auto& inputs = exeNetworks[net_id]->inputs();
+              for (auto& input : inputs) {
+                const std::string& inputName = input.get_any_name();
+                const auto& data = inputsData.at(net_id).at(inputName)[iterations.at(net_id)];
+                inferRequest->set_tensor(inputName, data);
+              }
+            }
+
+            {
+              std::cout << "Generating Artifacts" << std::endl;
+              inferRequest->infer();
+            }
+          }
+        }
+      }
+    } catch (const std::exception& ex) {
+      std::cerr << ex.what() << std::endl;
+      slog::err << "Generation failed" << slog::endl;
+      return_code = 1;
+    }
+
+    if (return_code) return return_code;
+  } catch (const std::exception& ex) {
+    slog::err << ex.what() << slog::endl;
+    return 3;
+  }
+
+  return 0;
+}
diff --git a/python/openvino/runtime/dla_benchmark/CMakeLists.txt b/python/openvino/runtime/dla_benchmark/CMakeLists.txt
new file mode 100644
index 0000000..3a50459
--- /dev/null
+++ b/python/openvino/runtime/dla_benchmark/CMakeLists.txt
@@ -0,0 +1,82 @@
+# Copyright (C) 2018-2020 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+
+# Build configuration for the dla_benchmark executable.
+set (TARGET_NAME "dla_benchmark")
+
+# MSVC builds use C++20; all other toolchains stay on C++14.
+if (CMAKE_CXX_COMPILER_ID MATCHES MSVC)
+    set(CMAKE_CXX_STANDARD 20)
+else()
+    set (CMAKE_CXX_STANDARD 14)
+endif()
+set (CMAKE_CXX_STANDARD_REQUIRED ON)
+if (NOT WIN32)
+    if (NOT("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel"))
+        set (CMAKE_CXX_FLAGS "-std=c++14 ${CMAKE_CXX_FLAGS}")
+    endif()
+endif()
+
+# NOTE(review): file(GLOB ...) is evaluated at configure time only — newly
+# added sources require re-running cmake before they are picked up.
+file (GLOB MAIN_SRC
+        ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp
+        ${CMAKE_CURRENT_SOURCE_DIR}/../common/utils/src/*.cpp
+        )
+
+file (GLOB MAIN_HEADERS
+        ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp
+        )
+
+# Create named folders for the sources within the .vcproj
+# Empty name lists them directly under the .vcproj
+source_group("src" FILES ${MAIN_SRC})
+source_group("include" FILES ${MAIN_HEADERS})
+
+if (DE10_AGILEX)
+  add_definitions(-DDE10_AGILEX)
+endif()
+
+# Find OpenCV components if exist
+find_package(OpenCV COMPONENTS core highgui imgcodecs imgproc videoio REQUIRED)
+
+# Create library file from sources.
+add_executable(${TARGET_NAME} ${MAIN_SRC} ${MAIN_HEADERS})
+
+# For FPGA plugin configs and properties.
+target_include_directories(${TARGET_NAME} PRIVATE
+    "$ENV{COREDLA_ROOT}/dla_plugin/inc/"
+    "$ENV{COREDLA_ROOT}/util/inc/"
+)
+
+if (NOT WIN32)
+    set (LIB_DL dl)
+endif()
+
+target_link_libraries(${TARGET_NAME} PRIVATE
+    openvino::runtime
+    openvino_dev_api
+    ${OpenCV_LIBRARIES}
+    coreDLAHeteroPlugin
+    format_reader
+    ie_samples_utils
+)
+
+if (NOT WIN32)
+    target_link_libraries(${TARGET_NAME} PRIVATE ${LIB_DL} pthread)
+endif()
+
+set_target_properties(${TARGET_NAME} PROPERTIES INSTALL_RPATH "\$ORIGIN/../lib")
+
+# For libcoreDlaRuntimePlugin.so - typically specified by $COREDLA_ROOT/runtime/plugins.xml
+set_target_properties(${TARGET_NAME} PROPERTIES BUILD_RPATH "\$ORIGIN/..")
+
+# Enable high graph logging by defining its macro
+# Change to add_compile_definitions() once we move to cmake >= 3.12
+if (DLA_ENABLE_LOGGING)
+  target_compile_definitions(${TARGET_NAME} PRIVATE -DENABLE_HG_LOGGING)
+endif()
+
+# Ensure number of inference request is 1 when using the system-console plugin
+if (SYSTEM_CONSOLE_PLATFORM)
+  target_compile_definitions(${TARGET_NAME} PRIVATE -DMAX_NUM_INFERENCE_REQUEST=1)
+endif()
+
+install(TARGETS ${TARGET_NAME} RUNTIME DESTINATION "dla/bin" COMPONENT DEMO)
diff --git a/python/openvino/runtime/dla_benchmark/README.md b/python/openvino/runtime/dla_benchmark/README.md
new file mode 100644
index 0000000..9734013
--- /dev/null
+++ b/python/openvino/runtime/dla_benchmark/README.md
@@ -0,0 +1,179 @@
+# Benchmark C++ Tool
+
+This topic demonstrates how to use the Benchmark C++ Tool to estimate deep learning inference performance on supported devices. Performance can be measured for two inference modes: synchronous (latency-oriented) and asynchronous (throughput-oriented).
+
+> **NOTE:** This topic describes usage of C++ implementation of the Benchmark Tool. For the Python* implementation, refer to [Benchmark Python* Tool](../python_demos/OpenVINO_benchmark_app/README.md).
+
+## New Features Added
+
+Some of the changes made in the dla_benchmark C++ tool for the Intel FPGA AI Suite are:
+* Dumping output values into a text file named `result.txt`.
+* In the `result.txt` file, in addition to output values, output tensor index is added to each value after the # sign to allow easier identification when the graph has multiple outputs.
+* In addition to `result.txt`, the dla_benchmark will generate another text file named `result_tensor_boundaries.txt` that lists which lines of the result.txt file are for which output tensor as well as the layout and dimension of each output tensor.
+* Top1/top5 accuracy check is added.
+
+> **NOTE**: The following README is directly from OpenVINO.
+
+## How It Works
+
+Upon start-up, the application reads command-line parameters and loads a network and images/binary files to the Inference Engine plugin, which is chosen depending on a specified device. The number of infer requests and execution approach depend on the mode defined with the `-api` command-line parameter.
+
+> **NOTE**: By default, Inference Engine samples, tools and demos expect input with BGR channels order. If you trained your model to work with RGB order, you need to manually rearrange the default channels order in the sample or demo application or reconvert your model using the Model Optimizer tool with `--reverse_input_channels` argument specified. For more information about the argument, refer to **When to Reverse Input Channels** section of [Converting a Model Using General Conversion Parameters](./docs/MO_DG/prepare_model/convert_model/Converting_Model_General.md).
+
+If you run the application in the synchronous mode, it creates one infer request and executes the `Infer` method.
+If you run the application in the asynchronous mode, it creates as many infer requests as specified in the `-nireq` command-line parameter and executes the `StartAsync` method for each of them. If `-nireq` is not set, the application will use the default value for specified device.
+
+The number of execution steps is defined by one of the following parameters:
+* Number of iterations specified with the `-niter` command-line argument
+* Time duration specified with the `-t` command-line argument
+* Both of them (execution will continue until both conditions are met)
+* Predefined duration if `-niter` and `-t` are not specified. Predefined duration value depends on a device.
+
+During the execution, the application collects latency for each executed infer request.
+
+Reported latency value is calculated as a median value of all collected latencies. Throughput value is reported
+in frames per second (FPS) and calculated as a derivative from:
+* Reported latency in the Sync mode
+* The total execution time in the Async mode
+
+Throughput value also depends on batch size.
+
+The application can save a summary of the run, including the selected command line parameters and a copy of the high-level execution statistics (e.g. overall throughput, execution wall-clock time), by setting the `-save_run_summary` flag. This summary is saved in dla_benchmark_run_summary.csv.
+
+The application also saves executable graph information serialized to an XML file if you specify a path to it with the
+`-exec_graph_path` parameter.
+
+
+## Run the Tool
+Notice that the dla_benchmark usually produces optimal performance for any device out of the box.
+
+**So in most cases you don't need to play the app options explicitly and the plain device name is enough**, for example, for CPU:
+```sh
+./dla_benchmark -m <model> -i <input> -d CPU
+```
+
+But it still may be non-optimal for some cases, especially for very small networks. More details can be read in [Introduction to Performance Topics](./docs/IE_DG/Intro_to_Performance.md).
+
+As explained in the [Introduction to Performance Topics](./docs/IE_DG/Intro_to_Performance.md) section, for all devices, including new [MULTI device](./docs/IE_DG/supported_plugins/MULTI.md) it is preferable to use the FP16 IR for the model.
+Also if latency of the CPU inference on the multi-socket machines is of concern, please refer to the same
+[Introduction to Performance Topics](./docs/IE_DG/Intro_to_Performance.md) document.
+
+Running the application with the `-h` option yields the following usage message:
+```
+./dla_benchmark -h
+InferenceEngine:
+ API version ............ <version>
+ Build .................. <number>
+[ INFO ] Parsing input parameters
+
+dla_benchmark [OPTION]
+Options:
+
+ -h, --help Print a usage message
+ -i "<path>" Optional. Path to a folder with images and/or binaries or to specific image or binary file.
+ -m "<path>" Required. Path to an .xml file with a trained model.
+ -d "<device>" Optional. Specify a target device to infer on (the list of available devices is shown below). Default value is CPU.
+ Use "-d HETERO:<comma-separated_devices_list>" format to specify HETERO plugin.
+ -l "<absolute_path>" Required for CPU custom layers. Absolute path to a shared library with the kernels implementations.
+ -api "<sync/async>" Optional. Enable Sync/Async API. Default value is "async".
+ -niter "<integer>" Optional. Number of iterations. If not specified, the number of iterations is calculated depending on a device.
+ -nireq "<integer>" Optional. Number of infer requests. Default value is determined automatically for a device.
+ -b "<integer>" Optional. Batch size value. If not specified, the batch size value is determined from Intermediate Representation.
+ -stream_output Optional. Print progress as a plain text. When specified, an interactive progress bar is replaced with a multiline output.
+ -t Optional. Time in seconds to execute topology.
+ -progress Optional. Show progress bar (can affect performance measurement). Default value is "false".
+
+ CPU-specific performance options:
+ -nstreams "<integer>" Optional. Number of streams to use for inference on the CPU in throughput mode
+ (for HETERO device cases use format <device1>:<nstreams1>,<device2>:<nstreams2> or just <nstreams>).
+ Default value is determined automatically for a device.
+ Please note that although the automatic selection usually provides a reasonable performance,
+ it still may be non-optimal for some cases, especially for very small networks.
+ -nthreads "<integer>" Optional. Number of threads to use for inference on the CPU (including HETERO cases).
+ -pin "YES"/"NUMA"/"NO" Optional. Enable threads->cores ("YES", default), threads->(NUMA)nodes ("NUMA") or completely disable ("NO")
+ CPU threads pinning for CPU-involved inference.
+
+ Statistics dumping options:
+ -save_run_summary Optional. Enable saving a summary of the run containing the specified command-line parameters and a copy of the performance report printed to stdout.
+ -report_folder Optional. Path to a folder where statistics report is stored.
+ -exec_graph_path Optional. Path to a file where to store executable graph information serialized.
+```
+
+Running the application with the empty list of options yields the usage message given above and an error message.
+
+Application supports topologies with one or more inputs. If a topology is not data sensitive, you can skip the input parameter. In this case, inputs are filled with random values.
+If a model has only image input(s), please provide a folder with images or a path to an image as input.
+If a model has some specific input(s) (not images), please prepare a binary file(s), which is filled with data of appropriate precision and provide a path to them as input.
+If a model has mixed input types, input folder should contain all required files. Image inputs are filled with image files one by one. Binary inputs are filled with binary files one by one.
+
+To run the tool, you can use public or Intel's pre-trained models. To download the models, use the OpenVINO [Model Downloader](./tools/downloader/README.md) or go to [https://download.01.org/opencv/](https://download.01.org/opencv/).
+
+> **NOTE**: Before running the tool with a trained model, make sure the model is converted to the Inference Engine format (\*.xml + \*.bin) using the [Model Optimizer tool](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md).
+
+## Examples of Running the Tool
+
+This section provides step-by-step instructions on how to run the Benchmark Tool with the `googlenet-v1` public model on CPU or FPGA devices. As an input, the `car.png` file from the `<INSTALL_DIR>/deployment_tools/demo/` directory is used.
+
+> **NOTE:** The Internet access is required to execute the following steps successfully. If you have access to the Internet through the proxy server only, please make sure that it is configured in your OS environment.
+
+1. Download the model. Go to the Model Downloader directory and run the `downloader.py` script with specifying the model name and directory to download the model to:
+ ```sh
+ cd <INSTALL_DIR>/deployment_tools/open_model_zoo/tools/downloader
+ ```
+ ```sh
+ python3 downloader.py --name googlenet-v1 -o <models_dir>
+ ```
+2. Convert the model to the Inference Engine IR format. Go to the Model Optimizer directory and run the `mo.py` script with specifying the path to the model, model format (which must be FP32 for CPU and FPGA) and output directory to generate the IR files:
+ ```sh
+ cd <INSTALL_DIR>/deployment_tools/model_optimizer
+ ```
+ ```sh
+ python3 mo.py --input_model <models_dir>/public/googlenet-v1/googlenet-v1.caffemodel --data_type FP32 --output_dir <ir_dir>
+ ```
+3. Run the tool with specifying the `<INSTALL_DIR>/deployment_tools/demo/car.png` file as an input image, the IR of the `googlenet-v1` model and a device to perform inference on. The following commands demonstrate running the Benchmark Tool in the asynchronous mode on CPU and FPGA devices:
+
+ * On CPU:
+ ```sh
+ ./dla_benchmark -m <ir_dir>/googlenet-v1.xml -d CPU -api async -i <INSTALL_DIR>/deployment_tools/demo/car.png --progress true
+ ```
+ * On FPGA:
+ ```sh
+ ./dla_benchmark -m <ir_dir>/googlenet-v1.xml -d HETERO:FPGA,CPU -api async -i <INSTALL_DIR>/deployment_tools/demo/car.png --progress true
+ ```
+
+The application outputs the number of executed iterations, total duration of execution, latency and throughput.
+Additionally, if you set the `-save_run_summary` flag the application saves a report containing the selected command line parameters and a copy of the overall performance report printed to stdout. If you set `-exec_graph_path`, the application reports executable graph information serialized. All measurements are reported in milliseconds.
+
+Below are fragments of sample output for CPU and FPGA devices:
+
+* For CPU:
+ ```
+ [Step 8/9] Measuring performance (Start inference asynchronously, 60000 ms duration, 4 inference requests in parallel using 4 streams)
+ Progress: [....................] 100.00% done
+
+ [Step 9/9] Dumping statistics report
+ [ INFO ] Statistics collecting was not requested. No reports are dumped.
+ Progress: [....................] 100.00% done
+
+ Count: 4612 iterations
+ Duration: 60110.04 ms
+ Latency: 50.99 ms
+ Throughput: 76.73 FPS
+ ```
+
+* For FPGA:
+ ```
+ [Step 10/11] Measuring performance (Start inference asynchronously, 5 inference requests using 4 streams for CPU, limits: 120000 ms duration)
+ Progress: [....................] 100% done
+
+ [Step 11/11] Dumping statistics report
+ Count: 102515 iterations
+ Duration: 120007.38 ms
+ Latency: 5.84 ms
+ Throughput: 854.24 FPS
+ ```
+
+## See Also
+* [Using Inference Engine Samples](./docs/IE_DG/Samples_Overview.md)
+* [Model Optimizer](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md)
+* [Model Downloader](./tools/downloader/README.md)
diff --git a/python/openvino/runtime/dla_benchmark/average_precision.cpp b/python/openvino/runtime/dla_benchmark/average_precision.cpp
new file mode 100644
index 0000000..84008b7
--- /dev/null
+++ b/python/openvino/runtime/dla_benchmark/average_precision.cpp
@@ -0,0 +1,696 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+// The function of this file is to provide mAP and COCO AP calculation in metrics_eval
+// and metrics_update. The calculation is comprised of two parts, 1) data preprocessing,
+// and 2) metrics calculation. Data preprocessing consists of prediction box parsing;
+// resize and filtering; non-max suppression; and clipping. The preprocessed data is stored
+// in `PredictionEntry` and `AnnotationEntry` structs, which are used in the `metrics_update`
+// and `metrics_eval`. `metrics_update` updates intermediate statistics to form the batched
+// statistics, and the `metrics_eval` calculates the integral of the P-R curve. All of
+// the metadata should be set in the header file and the runtime invariants are set using
+// `set_runtime`. The validate_yolo_wrapper is the main entry point of the subroutine.
+//
+// The mAP algorithm is built according to the section 2.2 in https://arxiv.org/pdf/1607.03476.pdf
+// and OpenVINO's accuracy_checker. The COCO AP algorithm is specified in
+// https://cocodataset.org/#detection-eval. The result is compared value-by-value with the
+// result from OpenVINO's accuracy_checker using dlsdk launcher. To obtain the golden
+// result, apply the steps in https://docs.openvino.ai/latest/omz_models_model_yolo_v3_tf.html.
+
+#include "average_precision.hpp"
+#include <cmath>
+#if defined(_WIN32) || defined(_WIN64)
+#include <io.h>
+#else
+#include <dirent.h>
+#endif
+#if ((defined(_MSVC_LANG) && _MSVC_LANG >= 201703L) || __cplusplus >= 201703L)
+#include <filesystem>
+namespace fs = std::filesystem;
+#endif
+#include <algorithm>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <numeric>
+#include <utility>
+#include <sstream>
+
+#include <opencv2/core.hpp>
+#include <opencv2/imgcodecs.hpp>
+#include <samples/slog.hpp>
+#include "utils.hpp"
+
+#define VERBOSE 0
+
+// Parses predicted boxes in `results_data` to a 2d tensor `raw_predictions`. The parameter
+// `batch` indicates the image which corresponds to those predicted boxes.
+// Order: conv2d_12[1x255x26x26] -> conv2d_9[1x255x13x13], NCHW order
+void parse_prediction_boxes(std::vector<double> &predicted_val, Tensor2d<double> &raw_predictions) {
+ raw_predictions.emplace_back(std::vector<double>{});
+ const std::vector<unsigned> &grid_sizes = yolo_meta.grid_sizes.at(runtime_vars.name);
+
+ int total_boxes{0};
+ std::for_each(std::begin(grid_sizes), std::end(grid_sizes), [&](unsigned n) {
+ total_boxes += std::pow(n, 2) * yolo_meta.box_per_channel;
+ });
+
+ for (int count = 0; count < total_boxes; count++) {
+ raw_predictions.emplace_back(Box<double>{});
+ raw_predictions[count].reserve(yolo_meta.pbox_size);
+ }
+
+ auto index_of = [=](int n, int c, int h, int w, int C, int H, int W) {
+ return n * C * H * W + c * H * W + h * W + w;
+ };
+
+ // first are boxes in 26x26 grid
+ // treat each tensor as 3 batches
+ for (int grid : grid_sizes) {
+ // offset to where the data is retrieved
+ int data_offset{0};
+ // offset to where the data is inserted
+ int position_offset{0};
+ for (int n : grid_sizes) {
+ if (n == grid) break;
+ data_offset += pow(n, 2) * yolo_meta.channel;
+ position_offset += pow(n, 2) * yolo_meta.box_per_channel;
+ }
+
+ int N = yolo_meta.box_per_channel, C = yolo_meta.pbox_size, H = grid, W = grid;
+
+ for (int n = 0; n < N; n++) {
+ for (int c = 0; c < C; c++) {
+ for (int h = 0; h < H; h++) {
+ for (int w = 0; w < W; w++) {
+ // corresponds to #c data for grid #h,w, of the #n anchor
+ Box<double> &pbox = raw_predictions[position_offset + n * H * W + h * W + w];
+ // fills prediction boxes
+ pbox.emplace_back(predicted_val[data_offset + index_of(n, c, h, w, C, H, W)]);
+ }
+ }
+ }
+ }
+ }
+}
+
+// Parses annotation boxes stored in text file and stores in a 3d tensor `raw_annotation`.
+// Precondition: the file is formatted such that each line contains 5 doubles separated
+// by spaces, i.e. [class, x, y, width, height]. Returns -3 if the file cannot be read.
+int parse_annotation_boxes(Tensor3d<double> &raw_annotation, const std::string &path) {
+ int err = 0;
+ std::ifstream annotation_file(path);
+ if (!annotation_file.is_open()) {
+ slog::err << "Couldn't access path: " << path << slog::endl;
+ err = -3;
+ } else {
+ Tensor2d<double> annotation_box;
+ int class_id;
+ double x, y, w, h;
+ while (annotation_file >> class_id >> x >> y >> w >> h) {
+ annotation_box.emplace_back(Box<double>{x, y, w, h, (double)class_id});
+ }
+ raw_annotation.emplace_back(annotation_box);
+ }
+ return err;
+}
+
+// Extracts filenames in `path` with the extension specified in `ext`.
+// Returns the number of files with extension `ext`, or -1 for error.
+int walk_dirent(std::vector<std::string> &names, const std::string &path, std::string ext) {
+#if defined(_WIN32) || defined(_WIN64)
+#if ((defined(_MSVC_LANG) && _MSVC_LANG >= 201703L) || __cplusplus >= 201703L)
+ int count = 0;
+ for (const auto &entry : fs::directory_iterator(path)) {
+ if (fs::is_regular_file(entry)) {
+ std::string filename = entry.path().filename().string();
+ std::string file_extension = filename.substr(filename.find_last_of(".") + 1);
+ if (file_extension == ext) {
+ names.emplace_back(filename);
+ count++;
+ }
+ }
+ }
+#endif
+#else
+ DIR *dir = opendir(path.c_str());
+ int count = 0;
+ if (!dir) {
+ slog::err << "Couldn't access path: " << path << slog::endl;
+ count = -1;
+ } else {
+ for (struct dirent *dent; (dent = readdir(dir)) != nullptr;) {
+ std::string dirname(dent->d_name);
+ std::string stem = GetStem(dirname);
+ std::string extension = GetExtension(dirname);
+ if (stem == "" || stem == "." || extension != ext) continue;
+ names.emplace_back(stem);
+ count += 1;
+ }
+ closedir(dir);
+ }
+#endif
+ return count;
+}
+
+// Dispatches each step of collecting predicted boxes, annotation boxes, and shapes.
+// The function returns 0 on success, -1 for mismatch in the number of annotation files
+// and validation images, -2 for missing annotation file, -3 for failing to access annotation
+// file, and -4 for failing to access validation image.
+int collect_validation_dataset(std::vector<std::string> &image_paths,
+ Tensor3d<double> &raw_annotations,
+ Tensor2d<double> &shapes) {
+ int err = 0;
+
+ // set of annotation file name
+ std::vector<std::string> tmp;
+ int num_file = walk_dirent(tmp, runtime_vars.groundtruth_loc, runtime_vars.gt_extension);
+ if (num_file < (int)(runtime_vars.batch_size * runtime_vars.niter)) {
+ if (num_file >= 0) {
+ slog::err << "Not enough validation data found. " << runtime_vars.batch_size * runtime_vars.niter << " required, "
+ << num_file << " provided." << slog::endl;
+ }
+ err = -1;
+ } else {
+ std::set<std::string> annotation_file_index(tmp.begin(), tmp.end());
+
+ // gets all images, corresponding annotation, and shapes
+ std::string gt_path;
+ for (unsigned batch = 0; batch < runtime_vars.batch_size * runtime_vars.niter; batch++) {
+ std::string image_path(image_paths[batch]);
+ std::string img_name = GetStem(image_path);
+ if (annotation_file_index.find(img_name) == annotation_file_index.end()) {
+ slog::err << "Missing annotation file for validation image: " << image_paths[batch] << slog::endl;
+ err = -2;
+ break;
+ } else {
+ gt_path = runtime_vars.groundtruth_loc + "/" + img_name + "." + runtime_vars.gt_extension;
+
+ // gets image dimensions
+ cv::Mat image = cv::imread(image_paths[batch]);
+ if (image.data == nullptr || image.empty()) {
+ slog::err << "Couldn't open input image: " << image_paths[batch] << slog::endl;
+ err = -4;
+ break;
+ }
+
+ err = parse_annotation_boxes(raw_annotations, gt_path);
+ if (err != 0) break;
+ shapes.emplace_back(Box<double>{(double)image.cols, (double)image.rows});
+ }
+ }
+ }
+ return err;
+}
+
+// Removes items at `indices` in the vector `vec`
+template <typename T>
+void reduce_by_index(std::vector<T> &vec, std::vector<unsigned> indices) {
+ std::sort(indices.begin(), indices.end());
+ for (auto it = indices.rbegin(); it != indices.rend(); it++) {
+ vec.erase(vec.begin() + *it);
+ }
+}
+
+// Calculates and returns the Intersection over Union score for two boxes by
+// calculating their area of overlap and area of union.
+double intersection_over_union(Box<double> box1, Box<double> box2) {
+ using namespace std;
+ {
+ double intersect_length_x =
+ max(0.0, min(box1[X_MAX], box2[X_MAX]) - max(box1[X_MIN], box2[X_MIN]) + yolo_meta.boundary);
+ double intersect_length_y =
+ max(0.0, min(box1[Y_MAX], box2[Y_MAX]) - max(box1[Y_MIN], box2[Y_MIN]) + yolo_meta.boundary);
+ double intersection_of_area = intersect_length_x * intersect_length_y;
+ double box1_area =
+ (box1[X_MAX] - box1[X_MIN] + yolo_meta.boundary) * (box1[Y_MAX] - box1[Y_MIN] + yolo_meta.boundary);
+ double box2_area =
+ (box2[X_MAX] - box2[X_MIN] + yolo_meta.boundary) * (box2[Y_MAX] - box2[Y_MIN] + yolo_meta.boundary);
+ double union_of_area = box1_area + box2_area - intersection_of_area;
+ return (union_of_area > 0.0) ? intersection_of_area / union_of_area : 0.0;
+ } // end of local scope for "using namespace std"
+}
+
+// This function returns the index of the largest element in the vector `vec`.
+template <typename T>
+int argmax(std::vector<T> vec) {
+ return std::distance(vec.begin(), std::max_element(vec.begin(), vec.end()));
+}
+
+// This function returns the index of the largest element in the iterator from `begin` to `end`.
+template <typename Iter>
+int argmax(Iter begin, Iter end) {
+ return std::distance(begin, std::max_element(begin, end));
+}
+
+// Resize the coordinates of bounding boxes from relative ratio to grid cell to the actual coordinates in pixel.
+// This function resizes prediction boxes in the 2d tensor `raw_predictions` based on the definition in page 4 of
+// https://arxiv.org/pdf/1612.08242.pdf. The prediction boxes are also filtered based on their confidence score
+// and class specific score. The result is stored in an instance of `PredictionEntry` which is used for statistics
+// calculation.
+void resize_and_filter_prediction_boxes(Tensor2d<double> &raw_predictions,
+ PredictionEntry &prediction,
+ const unsigned batch) {
+ unsigned size = 0;
+
+#if VERBOSE == 1
+ unsigned c12 = 0, c9 = 0, c58 = 0, c66 = 0, c74 = 0;
+#endif
+
+ for (unsigned grid : yolo_meta.grid_sizes.at(runtime_vars.name)) {
+ unsigned offset{0};
+ for (unsigned n : yolo_meta.grid_sizes.at(runtime_vars.name)) {
+ if (n == grid) break;
+ offset += pow(n, 2) * yolo_meta.box_per_channel;
+ }
+ for (unsigned x = 0; x < grid; x++) {
+ for (unsigned y = 0; y < grid; y++) {
+ for (unsigned n = 0; n < yolo_meta.box_per_channel; n++) {
+ unsigned bbox_idx = offset + n * pow(grid, 2) + y * grid + x;
+ Box<double> &bbox = raw_predictions[bbox_idx];
+
+ // find the predicted label as the class with highest score
+ int label = argmax(bbox.begin() + (yolo_meta.pbox_size - yolo_meta.num_classes), bbox.end());
+ double cls_score = bbox[BBOX_CONFIDENCE] * bbox[(yolo_meta.pbox_size - yolo_meta.num_classes) + label];
+ // filter outliers with low confidence score or class score
+ if (bbox[BBOX_CONFIDENCE] < yolo_meta.confidence_threshold || cls_score < yolo_meta.confidence_threshold)
+ continue;
+ prediction.cls.push_back(label);
+ prediction.cls_score.push_back(cls_score);
+#if VERBOSE == 1
+ c74 += (unsigned)(grid == 52);
+ c66 += (unsigned)(grid == 26);
+ c58 += (unsigned)(grid == 13);
+ c12 += (unsigned)(grid == 26);
+ c9 += (unsigned)(grid == 13);
+#endif
+ // deduce anchor box width and height
+ unsigned dim = yolo_meta.anchor_sizes.at(runtime_vars.name).at(grid).size() / yolo_meta.box_per_channel;
+ double anchor_w = yolo_meta.anchor_sizes.at(runtime_vars.name).at(grid)[n * dim];
+ double anchor_h = yolo_meta.anchor_sizes.at(runtime_vars.name).at(grid)[n * dim + 1];
+
+ // calculate width and height of bbox
+ double bbox_center_x = (bbox[BBOX_X] + x) / grid;
+ double bbox_center_y = (bbox[BBOX_Y] + y) / grid;
+ double bbox_w = exp(bbox[BBOX_W]) * anchor_w / yolo_meta.dst_image_size[IMG_W];
+ double bbox_h = exp(bbox[BBOX_H]) * anchor_h / yolo_meta.dst_image_size[IMG_H];
+
+ // calculate actual coordinates of bbox
+ double x_max, x_min, y_max, y_min;
+ double w = runtime_vars.source_image_sizes[batch][IMG_W];
+ double h = runtime_vars.source_image_sizes[batch][IMG_H];
+
+ x_max = w * (bbox_center_x + bbox_w / 2.0);
+ x_min = w * (bbox_center_x - bbox_w / 2.0);
+ y_max = h * (bbox_center_y + bbox_h / 2.0);
+ y_min = h * (bbox_center_y - bbox_h / 2.0);
+
+ prediction.x_max.emplace_back(x_max);
+ prediction.x_min.emplace_back(x_min);
+ prediction.y_max.emplace_back(y_max);
+ prediction.y_min.emplace_back(y_min);
+
+ size += 1;
+ }
+ }
+ }
+ }
+ prediction.size = size;
+#if VERBOSE == 1
+ if (runtime_vars.name == "yolo-v3-tf") {
+ slog::info << "prediction boxes from conv2d58: " << c58 << slog::endl;
+ slog::info << "prediction boxes from conv2d66: " << c66 << slog::endl;
+ slog::info << "prediction boxes from conv2d74: " << c74 << slog::endl;
+ } else if (runtime_vars.name == "yolo-v3-tiny-tf") {
+ slog::info << "prediction boxes from conv2d12: " << c12 << slog::endl;
+ slog::info << "prediction boxes from conv2d9: " << c9 << slog::endl;
+ }
+#endif
+}
+
+// Returns indices of `vec` sorted in descending order.
+std::vector<unsigned> argsort_gt(const std::vector<double> &vec) {
+ std::vector<unsigned> order(vec.size());
+ std::generate(order.begin(), order.end(), [n = 0]() mutable { return n++; });
+ std::sort(order.begin(), order.end(), [&](int i1, int i2) { return vec[i1] > vec[i2]; });
+ return order;
+}
+
+// Performs non-maximum suppression algorithm to eliminate repetitive bounding boxes.
+// A bounding box is preserved iff. it has the highest confidence score over all
+// overlapping bounding boxes.
+void nms(PredictionEntry &prediction) {
+ if (prediction.size == 0) return;
+ std::vector<unsigned> &&order = argsort_gt(prediction.cls_score);
+ std::vector<unsigned> keep;
+ std::set<unsigned> discard;
+ unsigned top_score_idx;
+
+ while (discard.size() < order.size()) {
+ bool has_top = false;
+ for (unsigned idx : order) {
+ if (discard.find(idx) != discard.end()) continue;
+ if (!has_top) {
+ has_top = true;
+ top_score_idx = idx;
+ keep.emplace_back(top_score_idx);
+ discard.insert(top_score_idx);
+ continue;
+ }
+ double iou = intersection_over_union(prediction.box_at(idx), prediction.box_at(top_score_idx));
+ if (iou > yolo_meta.iou_threshold) {
+ discard.insert(idx);
+ }
+ }
+ }
+
+ std::vector<unsigned> discard_idx(discard.size());
+ std::vector<unsigned> indexes(discard.begin(), discard.end());
+ std::sort(indexes.begin(), indexes.end());
+ std::sort(keep.begin(), keep.end());
+ std::vector<unsigned>::iterator it =
+ std::set_difference(indexes.begin(), indexes.end(), keep.begin(), keep.end(), discard_idx.begin());
+ discard_idx.resize(it - discard_idx.begin());
+
+ // remove filtered predicted bounding boxes.
+ reduce_by_index(prediction.x_max, discard_idx);
+ reduce_by_index(prediction.x_min, discard_idx);
+ reduce_by_index(prediction.y_max, discard_idx);
+ reduce_by_index(prediction.y_min, discard_idx);
+ reduce_by_index(prediction.cls_score, discard_idx);
+ reduce_by_index(prediction.cls, discard_idx);
+ prediction.size -= discard_idx.size();
+}
+
+// Calculates the actual size of the groundtruth bounding boxes.
+void resize_annotation_boxes(Tensor3d<double> &raw_annotations, AnnotationEntry &annotation, const unsigned batch) {
+ for (auto &gt_box : raw_annotations[batch]) {
+ annotation.x_max.emplace_back(gt_box[BBOX_X] + gt_box[BBOX_W]);
+ annotation.x_min.emplace_back(gt_box[BBOX_X]);
+ annotation.y_max.emplace_back(gt_box[BBOX_Y] + gt_box[BBOX_H]);
+ annotation.y_min.emplace_back(gt_box[BBOX_Y]);
+ annotation.cls.emplace_back(gt_box[BBOX_CONFIDENCE]);
+ }
+ annotation.size = raw_annotations[batch].size();
+}
+
+// Limits the coordinates of predicted bounding boxes within the dimension of source image.
+void clip_box(PredictionEntry &prediction, const unsigned batch) {
+ if (prediction.size == 0) return;
+ double x_upper_bound = runtime_vars.source_image_sizes[batch][IMG_W];
+ double y_upper_bound = runtime_vars.source_image_sizes[batch][IMG_H];
+ auto _clip = [](double v, double lower, double upper) { return (v < lower) ? lower : ((v > upper) ? upper : v); };
+ for (unsigned idx = 0; idx < prediction.size; idx++) {
+ prediction.x_max[idx] = _clip(prediction.x_max[idx], 0, x_upper_bound);
+ prediction.x_min[idx] = _clip(prediction.x_min[idx], 0, x_upper_bound);
+ prediction.y_max[idx] = _clip(prediction.y_max[idx], 0, y_upper_bound);
+ prediction.y_min[idx] = _clip(prediction.y_min[idx], 0, y_upper_bound);
+ }
+}
+
+// Calculates area under the PR curve using 11-intervaled sum.
+double average_precision(const std::vector<double> &precision, const std::vector<double> &recall, unsigned interval) {
+ double result = 0.0;
+ double step = 1 / (double)(interval - 1);
+ for (unsigned intvl = 0; intvl < interval; intvl++) {
+ double point = step * intvl;
+ double max_precision = 0.0;
+ for (unsigned idx = 0; idx < recall.size(); idx++) {
+ if (recall[idx] >= point) {
+ if (precision[idx] > max_precision) {
+ max_precision = precision[idx];
+ }
+ }
+ }
+ result += max_precision / (double)interval;
+ }
+ return result;
+}
+
+// Stores intermediate statistics for AP calculation. AP's are calculated from
+// true-positive, false-positive, and the number of targets, sorted
+// by the class score of the predicted bounding box.
+typedef struct _map_stats {
+ int num_gt_object;
+ std::vector<double> scores;
+ std::vector<int> true_positive;
+ std::vector<int> false_positive;
+
+ _map_stats() { this->num_gt_object = 0; }
+} mAPStats;
+
// Gathers per-class detection statistics (confidence scores and TP/FP flags) for a
// single image at IoU threshold `thresh`. The caller later folds these into the
// 11-point (or 101-point) interpolated AP via `metrics_eval`.
std::vector<mAPStats> mean_average_precision(PredictionEntry &prediction, AnnotationEntry &annotation, double thresh) {
  // class ids 0 .. num_classes-1
  std::vector<int> class_list(yolo_meta.num_classes);
  std::generate(class_list.begin(), class_list.end(), [n = 0]() mutable { return n++; });

  std::vector<mAPStats> image_result(yolo_meta.num_classes, mAPStats{});

  // average precision for each class
  for (int category : class_list) {
    // total number of bounding boxes in the annotation.
    int num_gt_object =
        std::count_if(annotation.cls.begin(), annotation.cls.end(), [&](int &cls) { return (cls == (int)category); });

    // total number of predicted bounding boxes.
    int num_pred_boxes =
        std::count_if(prediction.cls.begin(), prediction.cls.end(), [&](int &cls) { return (cls == (int)category); });

    image_result[category].num_gt_object = num_gt_object;

    // stores the scores for sorting out the correct order of TP and FP.
    image_result[category].true_positive.resize(num_pred_boxes, 0);
    image_result[category].false_positive.resize(num_pred_boxes, 0);
    image_result[category].scores.resize(num_pred_boxes, 0.0);
    // ground-truth boxes already claimed by a prediction; each may be matched at most once.
    std::set<unsigned> matched_gtbox;

    unsigned pred_num = 0;
    // visit predictions in the order produced by argsort_gt over confidence scores.
    std::vector<unsigned> &&sorted_pbox_idx = argsort_gt(prediction.cls_score);
    for (unsigned &pbox_idx : sorted_pbox_idx) {
      if (prediction.cls[pbox_idx] != category) continue;
      image_result[category].scores[pred_num] = prediction.cls_score[pbox_idx];

      unsigned most_overlapped_idx = 0;
      double most_overlapped_iou = 0.0;

      // finds the ground-truth box most overlapped with this predicted box.
      for (unsigned gtbox_idx = 0; gtbox_idx < annotation.size; gtbox_idx++) {
        if (annotation.cls[gtbox_idx] != category) continue;
        double iou = intersection_over_union(prediction.box_at(pbox_idx), annotation.box_at(gtbox_idx));
        if (iou > most_overlapped_iou) {
          most_overlapped_iou = iou;
          most_overlapped_idx = gtbox_idx;
        }
      }
      // when there is no ground truth, all predicted boxes are false positive,
      // and they are preserved for batched AP calculation.
      if (!num_gt_object) {
        image_result[category].false_positive[pred_num++] = 1;
      } else {
        // the predicted bounding box is a true positive iff. it is the most overlapped,
        // the matched groundtruth bounding box has not been matched previously, and
        // the iou is above `thresh`.
        if (most_overlapped_iou >= thresh) {
          if (matched_gtbox.find(most_overlapped_idx) == matched_gtbox.end()) {
            matched_gtbox.insert(most_overlapped_idx);
            image_result[category].true_positive[pred_num++] = 1;
          } else {
            // duplicate detection of an already-matched ground-truth box.
            image_result[category].false_positive[pred_num++] = 1;
          }
        } else {
          image_result[category].false_positive[pred_num++] = 1;
        }
      }
    }
  }
  return image_result;
}
+
+// Initializes runtime variables in `runtime_vars` struct.
+void set_runtime(std::string name,
+ unsigned niter,
+ unsigned batch_size,
+ const std::string &input_loc,
+ const std::string &annotation_loc) {
+ runtime_vars.name = name;
+ runtime_vars.niter = niter;
+ runtime_vars.batch_size = batch_size;
+ runtime_vars.groundtruth_loc = annotation_loc;
+ runtime_vars.input_loc = input_loc;
+}
+
// Return type of function `validate_yolo`.
struct metrics {
  std::vector<mAPStats> map;  // per-class stats at the PASCAL VOC IoU threshold (yolo_meta.pascal_voc_metric)
  Tensor2d<mAPStats> coco;    // per-class stats for each IoU threshold in yolo_meta.coco_metric
};
+
// Main function that takes one image's raw results data and the parsed annotations, and
// calculates the per-class AP statistics for that image.
// `results_data` holds the flattened output tensors of the YOLO result layers;
// `raw_annotations` holds one list of ground-truth boxes per validation image;
// `batch` is the global index of the image being scored.
struct metrics validate_yolo(std::vector<double> &results_data,
                             Tensor3d<double> &raw_annotations,
                             const unsigned batch) {
  Tensor2d<double> raw_predictions;
  PredictionEntry prediction;
  AnnotationEntry annotation;

  // executes accuracy check recipes: decode raw tensors into boxes, rescale and
  // filter predictions and annotations, then run NMS and clip to image bounds.
  try {
    parse_prediction_boxes(results_data, raw_predictions);
    resize_and_filter_prediction_boxes(raw_predictions, prediction, batch);
    resize_annotation_boxes(raw_annotations, annotation, batch);
    nms(prediction);
    clip_box(prediction, batch);
  } catch (const std::exception &e) {
    // Postprocessing failures are fatal: the accuracy report would be meaningless.
    slog::err << "Abort postprocessing." << slog::endl;
    std::cerr << e.what() << std::endl;
    exit(EXIT_FAILURE);
  }

  // mAP statistics at the PASCAL VOC IoU threshold.
  std::vector<mAPStats> map_stats = mean_average_precision(prediction, annotation, yolo_meta.pascal_voc_metric);

  // COCO metric: the same statistics gathered at each IoU threshold in yolo_meta.coco_metric.
  Tensor2d<mAPStats> coco_ap_stats;
  std::for_each(std::begin(yolo_meta.coco_metric), std::end(yolo_meta.coco_metric), [&](const double thresh) {
    coco_ap_stats.emplace_back(mean_average_precision(prediction, annotation, thresh));
  });

  return {map_stats, coco_ap_stats};
}
+
// Appends every element of `v2` to the tail of `v1`, preserving order.
// Capacity is grown once up front so the copy never reallocates mid-way.
template <typename T>
void extend(std::vector<T> &v1, const std::vector<T> &v2) {
  v1.reserve(v1.size() + v2.size());
  std::copy(v2.cbegin(), v2.cend(), std::back_inserter(v1));
}
+
+// Updates the batched statistics from individual image's result. The final batched AP and COCO AP is
+// calculated based on updated `batched_stats`.
+void metrics_update(std::vector<mAPStats> &batched_stats, const std::vector<mAPStats> &img_stats) {
+ for (unsigned cat = 0; cat < yolo_meta.num_classes; cat++) {
+ batched_stats[cat].num_gt_object += img_stats[cat].num_gt_object;
+ // updates batched statistics. omits the class where no prediction presents.
+ if (!img_stats[cat].scores.size()) continue;
+ extend(batched_stats[cat].scores, img_stats[cat].scores);
+ extend(batched_stats[cat].true_positive, img_stats[cat].true_positive);
+ extend(batched_stats[cat].false_positive, img_stats[cat].false_positive);
+ }
+}
+
+// Calculates AP using the given integral function.
+double metrics_eval(const std::vector<mAPStats> &stats, unsigned interval) {
+ std::vector<double> class_aps;
+ for (unsigned category = 0; category < yolo_meta.num_classes; category++) {
+ // omits the class when there is no prediction presents.
+ if (!stats[category].scores.size()) continue;
+ // the predictions are false-positive when there is no groundtruth for this
+ // class, and therefore the class AP is 0.0
+ if (stats[category].num_gt_object == 0 && stats[category].scores.size()) {
+ class_aps.push_back(0.0);
+ continue;
+ }
+
+ int TP = 0, FP = 0;
+ std::vector<double> precision, recall;
+
+ // sorts the tp and fp based on the order of confidence score.
+ std::vector<unsigned> &&sorted_stats_index = argsort_gt(stats[category].scores);
+ // calculates intermediate statistics calculation.
+ for (unsigned idx : sorted_stats_index) {
+ TP += stats[category].true_positive[idx];
+ FP += stats[category].false_positive[idx];
+ precision.emplace_back(TP / (double)(TP + FP));
+ recall.emplace_back(TP / (double)stats[category].num_gt_object);
+ }
+ // returns ROC of P-R curve.
+ class_aps.emplace_back(average_precision(precision, recall, interval));
+ }
+ return std::accumulate(class_aps.begin(), class_aps.end(), 0.0) / (double)class_aps.size();
+}
+
// Wrapper of the function `validate_yolo`. This function prepares data and dispatches metrics calculations for each
// validation image, accumulates metrics results, writes per-image and batched scores to `ap_report.txt`, and
// returns the batched {mAP, COCO AP} pair.
std::pair<double, double> validate_yolo_wrapper(std::map<std::string, ov::TensorVector> &raw_results,
                                                const std::vector<ov::Output<const ov::Node>> &result_layout,
                                                std::vector<std::string> input_files) {
  slog::info << "Start validating yolo." << slog::endl;
  std::ofstream fout;
  fout.open("ap_report.txt");
  // preserves all correct paths to validation images.
  int num_image = runtime_vars.niter * runtime_vars.batch_size;
  std::vector<std::string> input_image_paths;
  // deterministic order so image paths line up with the annotation files collected below.
  std::sort(std::begin(input_files), std::end(input_files));
  // input_files is guaranteed not to be empty since that case is filtered out.
  for (auto &path : input_files) {
    if (path == "") break;
    if (num_image == 0) break;
    input_image_paths.push_back(path);
    num_image--;
  }

  // checks if there exists enough image files; this should always pass unless the image file is
  // deleted right after the inferencing step.
  if (num_image != 0) {
    slog::err << "Not enough image input found. " << runtime_vars.batch_size * runtime_vars.niter << " required, "
              << (runtime_vars.batch_size * runtime_vars.niter - num_image) << " provided." << slog::endl;
    exit(EXIT_FAILURE);
  }
  // stores all annotation boxes for each image from groundtruth file.
  // if an input image does not have a corresponding groundtruth file, an error occurs.
  Tensor3d<double> raw_annotations;
  int err = collect_validation_dataset(input_image_paths, raw_annotations, runtime_vars.source_image_sizes);
  if (err) exit(EXIT_FAILURE);

  // updates the metrics each image at a time to reduce memory overhead. the result for each image
  // is accumulated in `batched_stats` and it will be used for batched mAP and COCO AP calculation.
  metrics batched_stats;
  batched_stats.map.resize(yolo_meta.num_classes, mAPStats{});
  batched_stats.coco.resize(yolo_meta.coco_metric.size(), std::vector<mAPStats>{});
  std::for_each(batched_stats.coco.begin(), batched_stats.coco.end(), [&](std::vector<mAPStats> &stats) {
    stats.resize(yolo_meta.num_classes, mAPStats{});
  });

  for (unsigned batch = 0; batch < runtime_vars.niter; batch++) {
    for (unsigned img = 0; img < runtime_vars.batch_size; img++) {
      // stores the flattened output tensors from the resulting convolution layers.
      std::vector<double> curr_img_data;
      for (auto &item : result_layout) {
        const std::string &name = item.get_any_name();
        // each output tensor packs `batch_size` images back to back; slice out this image's span.
        auto curr_outputBlob = raw_results.at(name).at(batch);
        auto output_tensor_start = curr_outputBlob.data<float>();
        unsigned output_size = curr_outputBlob.get_size() / runtime_vars.batch_size;
        unsigned offset = img * output_size;
        for (unsigned idx = 0; idx < output_size; idx++) {
          curr_img_data.push_back(output_tensor_start[idx + offset]);
        }
      }

      // per-image statistics at every IoU threshold of interest.
      struct metrics &&curr_img_stats =
          validate_yolo(curr_img_data, raw_annotations, img + batch * runtime_vars.batch_size);
      metrics_update(batched_stats.map, curr_img_stats.map);
      for (unsigned thresh = 0; thresh < yolo_meta.coco_metric.size(); thresh++) {
        metrics_update(batched_stats.coco[thresh], curr_img_stats.coco[thresh]);
      }

      // writes one per-image AP (at the PASCAL VOC threshold) per line.
      double img_AP = metrics_eval(curr_img_stats.map, yolo_meta.ap_interval);
      // fout << "image " << input_files[img] << " AP @ 0.5" << std::endl;
      fout << std::fixed << std::setprecision(10) << img_AP << std::endl;
    }
  }

  // batched metrics over the whole validation set.
  double map = metrics_eval(batched_stats.map, yolo_meta.ap_interval);
  double coco_ap = 0.0;
  for (auto &coco_stats : batched_stats.coco) {
    coco_ap += metrics_eval(coco_stats, yolo_meta.coco_interval);
  }
  coco_ap /= (double)yolo_meta.coco_metric.size();

  fout << "\nAP at IoU=.50: " << std::fixed << std::setprecision(6) << map * 100 << "%" << std::endl;
  fout << "AP at IoU=.50:.05:.95: " << std::fixed << std::setprecision(10) << coco_ap * 100 << "%" << std::endl;
  fout.close();

  std::cout << "ap_report.txt has been generated in the current directory." << std::endl;

  return std::make_pair(map, coco_ap);
}
diff --git a/python/openvino/runtime/dla_benchmark/average_precision.hpp b/python/openvino/runtime/dla_benchmark/average_precision.hpp
new file mode 100644
index 0000000..821eaa1
--- /dev/null
+++ b/python/openvino/runtime/dla_benchmark/average_precision.hpp
@@ -0,0 +1,156 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
// Description: The file defines functions that calculate the mAP and COCO AP metrics. See
// average_precision.cpp for a detailed explanation.
+
+#ifndef DLA_BENCHMARK_OBJECT_DETECTION_H_
+#define DLA_BENCHMARK_OBJECT_DETECTION_H_
+
+#pragma once
+
+#include <map>
+#include <set>
+#include <string>
+#include <vector>
+#include <utility>
+
+#include <inference_engine.hpp>
+
+#undef UNICODE
+
// Indices into a raw bounding box.
+#define BBOX_X 0
+#define BBOX_Y 1
+#define BBOX_W 2
+#define BBOX_H 3
+#define BBOX_CONFIDENCE 4
+
+// Indices for input image shapes.
+#define IMG_W 0
+#define IMG_H 1
+
+// Indices for parsed bounding boxes.
+#define X_MAX 0
+#define X_MIN 1
+#define Y_MAX 2
+#define Y_MIN 3
+
+// Convenient aliases.
+template <typename T>
+using Box = std::vector<T>;
+
+template <typename T>
+using Tensor2d = std::vector<std::vector<T>>;
+
+template <typename T>
+using Tensor3d = std::vector<std::vector<std::vector<T>>>;
+
+using Blob_t = std::vector<InferenceEngine::BlobMap>;
+
+// A set of supported YOLO graphs and its variants.
+static std::set<std::string> supported_yolo_versions = {"yolo-v3-tf", "yolo-v3-tiny-tf"};
+
+// Each image will have a prediction entry containing coordinates,
+// class scores of prediction boxes, predicted class, and size.
+typedef struct prediction_entry_t {
+ std::vector<double> x_max;
+ std::vector<double> x_min;
+ std::vector<double> y_max;
+ std::vector<double> y_min;
+ // scores for highest class
+ std::vector<double> cls_score;
+ // class with highest probability
+ std::vector<int> cls;
+ unsigned size;
+
+ Box<double> box_at(unsigned idx) { return {x_max[idx], x_min[idx], y_max[idx], y_min[idx]}; }
+} PredictionEntry;
+
+// Each image will have an annotation entry containing coordinates and
+// the true label specified in `cls`.
+typedef struct annotation_entry_t {
+ std::vector<double> x_max;
+ std::vector<double> x_min;
+ std::vector<double> y_max;
+ std::vector<double> y_min;
+ std::vector<int> cls;
+ unsigned size;
+
+ Box<double> box_at(unsigned idx) { return {x_max[idx], x_min[idx], y_max[idx], y_min[idx]}; }
+} AnnotationEntry;
+
// Stores runtime variables shared across the validation subroutine.
static struct runtime_const_t {
  // Actually means number of validation image.
  // NOTE(review): validate_yolo_wrapper derives the total image count as
  // niter * batch_size — confirm which meaning is intended.
  unsigned niter;
  unsigned batch_size;
  // Name of the YOLO graph variant being validated.
  std::string name;
  // Location of the ground-truth annotation files.
  std::string groundtruth_loc;
  // Location of the validation images.
  std::string input_loc;
  std::string report_folder;
  // File extension expected for ground-truth annotation files.
  const std::string gt_extension = "txt";

  Tensor2d<std::string> input_image_path;
  // Original (pre-resize) sizes of each source image, filled by collect_validation_dataset.
  Tensor2d<double> source_image_sizes;
} runtime_vars;
+
// Stores constants for evaluation.
static struct meta_t {
  // Filtering parameters.
  const double confidence_threshold = 0.001;
  const double iou_threshold = 0.5;

  // Parameters for parsing and resizing.
  const unsigned num_classes = 80;
  // channel = box_per_channel * pbox_size (3 * 85 = 255).
  const unsigned channel = 255;
  const unsigned box_per_channel = 3;
  // per-box payload size (85 = 4 coords + 1 confidence + num_classes scores — see BBOX_* macros).
  const unsigned pbox_size = 85;
  // network input resolution as {width, height} (see IMG_W/IMG_H).
  const std::vector<double> dst_image_size = {416, 416};

  // Dimensions of grid cells and anchor boxes, keyed by graph variant and then by grid size.
  const std::map<std::string, std::map<unsigned, std::vector<double>>> anchor_sizes{
      {
          "yolo-v3-tiny-tf",
          {{13, {81, 82, 135, 169, 344, 319}}, {26, {23, 27, 37, 58, 81, 82}}},
      },
      {
          "yolo-v3-tf",
          {{13, {116, 90, 156, 198, 373, 326}}, {26, {30, 61, 62, 45, 59, 119}}, {52, {10, 13, 16, 30, 33, 23}}},
      }};
  // Output grid sizes for each supported graph variant.
  const std::map<std::string, std::vector<unsigned>> grid_sizes = {
      {"yolo-v3-tiny-tf", {26, 13}},
      {"yolo-v3-tf", {13, 26, 52}},
  };

  // Use of `boundary` in IoU calculation.
  const int boundary = 1;

  // IoU thresholds for metrics calculation.
  const double strict_metric = 0.75;
  const double pascal_voc_metric = 0.5;
  // COCO averages AP over IoU thresholds 0.50:0.05:0.95.
  const std::vector<double> coco_metric = {0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95};

  // Number of sample points used by the interpolated AP integral
  // (11-point PASCAL VOC, 101-point COCO).
  const unsigned ap_interval = 11;
  const unsigned coco_interval = 101;
} yolo_meta;
+
+// Returns `true` if the given YOLO graph, `name`, is supported. Else, `false` is returned.
+bool inline is_yolo_supported(std::string &name) {
+ return (supported_yolo_versions.find(name) != supported_yolo_versions.end());
+}
+
+// Sets runtime variables.
+void set_runtime(std::string name,
+ unsigned niter,
+ unsigned batch_size,
+ const std::string &input_loc,
+ const std::string &annotation_loc);
+
+// Entry point of this subroutine.
+std::pair<double, double> validate_yolo_wrapper(std::map<std::string, ov::TensorVector> &raw_results,
+ const std::vector<ov::Output<const ov::Node>> &result_layout,
+ std::vector<std::string> input_files);
+
#endif  // DLA_BENCHMARK_OBJECT_DETECTION_H_
diff --git a/python/openvino/runtime/dla_benchmark/convert_annotations.py b/python/openvino/runtime/dla_benchmark/convert_annotations.py
new file mode 100755
index 0000000..0f3d9e6
--- /dev/null
+++ b/python/openvino/runtime/dla_benchmark/convert_annotations.py
@@ -0,0 +1,90 @@
+#!/usr/bin/env python3
+# ============================================================================
+# This script takes two paths as input. The first path is to the annotation file
+# in json format. This annotation file is the validation data used in the 2017 COCO
+# competition for object detection, downloaded from https://cocodataset.org/
+# The second path indicates the folder where the user wants to store the converted
+# annotation files in plain text format. Each file in the destination folder contains
+# the true label and the bounding boxes for its corresponding validation image.
+# To use the average precision calculation in the dla_benchmark, you must
+# provide the text-formatted annotation files.
+# Note that 91 classes are used in the mscoco paper https://arxiv.org/pdf/1405.0312.pdf,
+# whereas 80 are used in validation 2014/2017 dataset.
+# ============================================================================
+
+import json
+import sys
+
+
def cat80(cat: int) -> int:
    '''
    Maps a COCO 91-class category id (as used in the json annotation file)
    to the contiguous 80-class id used by the dla_benchmark. The validation
    dataset omits 11 classes, which would otherwise cause mismatches with
    the classes predicted by the benchmark app.
    '''
    # Ids above each breakpoint shift down by one extra slot; 28 and 67 appear
    # twice because two ids are skipped at those points (same offsets as the
    # original if-chain).
    breakpoints = (11, 25, 28, 28, 44, 65, 67, 67, 70, 82, 90)
    offset = 1 + sum(1 for b in breakpoints if cat > b)
    return cat - offset


def parse_annotation_file(path_to_annotation: str, destination_folder: str) -> None:
    '''
    Converts a COCO-style json annotation file into one plain-text file per
    validation image inside `destination_folder`. Each output line holds the
    remapped class id followed by the four bbox values, space-separated.
    Images without annotations produce an empty file.
    '''
    # Context manager guarantees the file is closed even if json.load raises
    # (the original leaked the handle on error and returned None despite -> int).
    with open(path_to_annotation) as fin:
        json_data = json.load(fin)

    # Gets all bounding boxes and labels w.r.t. each validation image.
    per_image_data = {}
    for annotation in json_data["annotations"]:
        image_id = annotation["image_id"]
        bbox_data = [str(cat80(annotation["category_id"]))] + list(map(str, annotation["bbox"]))
        per_image_data.setdefault(image_id, []).append(bbox_data)

    # Creates and writes to text files, one per image, named after the image file
    # with its extension replaced by .txt.
    for image_meta in json_data["images"]:
        file_path = rf'{destination_folder}/{image_meta["file_name"][:-4]}.txt'
        bboxes = per_image_data.get(image_meta["id"], [])
        with open(file_path, "w") as fout:
            fout.write("\n".join(" ".join(bbox) for bbox in bboxes))
+
+
+if __name__ == "__main__":
+ if len(sys.argv) != 3:
+ sys.exit(
+ ("Usage: {0} "
+ "<path to the validation file in json format> "
+ "<path to the folder to store the annotation text files> "
+ )
+ .format(sys.argv[0]))
+
+ json_instances = sys.argv[1]
+ destination = sys.argv[2]
+
+ try:
+ parse_annotation_file(json_instances, destination)
+ except Exception as err:
+ print(err)
+ else:
+ print("Finished.")
diff --git a/python/openvino/runtime/dla_benchmark/dla_benchmark.hpp b/python/openvino/runtime/dla_benchmark/dla_benchmark.hpp
new file mode 100644
index 0000000..8d3eb80
--- /dev/null
+++ b/python/openvino/runtime/dla_benchmark/dla_benchmark.hpp
@@ -0,0 +1,495 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <gflags/gflags.h>
+#include <iostream>
+#include <string>
+#include <vector>
+
+/// @brief message for help argument
+static const char help_message[] = "Print a usage message";
+
+/// @brief message for images argument
+static const char input_message[] =
+ "Optional. Path to a folder with images and/or binaries or to specific image or binary file.";
+
+/// @brief message for model argument
+static const char model_message[] =
+ "Required unless running the ahead-of-time flow using -cm. Path to an .xml file with a trained model";
+
+static const char network_file_alias_message[] = "Required unless -m or -cm is present. Alias for -m";
+
+/// @brief message for compiled model argument
+static const char compiled_model_message[] = "Optional. Path to a .bin file with a trained compiled model";
+
+/// @brief message for execution mode
+static const char api_message[] = "Optional. Enable Sync/Async API. Default value is \"async\".";
+
/// @brief message for compile/inference device type.
/// (fixes the missing sentence break: "infer on Use" -> "infer on. Use")
static const char target_device_message[] =
    "Optional. Specify a target device to infer on. Use \"-d HETERO:<comma-separated_devices_list>\" format to specify HETERO plugin. ";
+
+/// @brief message for iterations count
+/** static const char iterations_count_message[] = "Optional. Number of iterations. " \
+"If not specified, the number of iterations is calculated depending on a device."; **/
+static const char iterations_count_message[] = "Required. Number of iterations.";
+
+/// @brief message for requests count
+static const char infer_requests_count_message[] =
+ "Optional. Number of infer requests. Default value is determined automatically for device.";
+
+/// @brief message for #threads for CPU inference
+static const char infer_num_threads_message[] =
+ "Optional. Number of threads to use for inference on the CPU "
+ "(including HETERO).";
+
+/// @brief message for #streams for CPU inference
+static const char infer_num_streams_message[] =
+ "Optional. Number of streams to use for inference on the CPU in throughput mode "
+ "(for HETERO device cases use format <dev1>:<nstreams1>,<dev2>:<nstreams2> or just <nstreams>). "
+ "Default value is determined automatically for a device. Please note that although the automatic selection "
+ "usually provides a reasonable performance, it still may be non - optimal for some cases, especially for "
+ "very small networks. See sample's README for more details.";
+
+/// @brief message for user library argument
+static const char custom_cpu_library_message[] =
+ "Required for CPU custom layers. Absolute path to a shared library with the kernels implementations.";
+
+static const char batch_size_message[] =
+ "Optional. Batch size value. If not specified, the batch size value is determined from Intermediate "
+ "Representation.";
+
+static const char batch_size_alias_message[] = "Optional. Alias for -b.";
+
+static const char min_subgraph_layers_message[] =
+ "Optional. Minimum number of layers allowed in a subgraph that runs on FPGA. Subgraph with fewer"
+ " layers than this value will run on CPU in Hetero plugin. Must be >= 1";
+
+/// @brief message for CPU threads pinning option
+static const char infer_threads_pinning_message[] =
+ "Optional. Enable threads->cores (\"YES\", default), threads->(NUMA)nodes (\"NUMA\") "
+ "or completely disable (\"NO\") "
+ "CPU threads pinning for CPU-involved inference.";
+
+/// @brief message for stream_output option
+static const char stream_output_message[] =
+ "Optional. Print progress as a plain text. When specified, an interactive progress bar is replaced with a "
+ "multiline output.";
+
+/// @brief message for the save_run_summary option
+static const char save_run_summary_message[] =
+ "Optional. Enable saving a summary of the run containing the "
+ "specified command line parameters and a copy of the performance report "
+ "printed to stdout.";
+
+/// @brief message for report_folder option
+static const char report_folder_message[] = "Optional. Path to a folder where statistics report is stored.";
+
+// @brief message for progress bar option
+static const char progress_message[] =
+ "Optional. Show progress bar (can affect performance measurement). Default values is \"false\".";
+
+/// @brief message for the custom plugins.xml file option
+static const char plugins_message[] = "Optional. Select a custom plugins_xml file to use. "
+ "-plugins=emulation to use xml file for software emulation";
+
+/// @brief message for the custom plugins_xml_file.xml file option
+static const char old_plugins_message[] =
+ "***DEPRECATED OPTION*** Please use NEW -plugins option to specify which custom plugins xml file to use";
+
+/// @brief message for ground truth file
+static const char groundtruth_loc_message[] =
+ "Optional. Select a ground truth file to use for calculating top 1 top 5 results.";
+
+/// @brief message for architecture .arch file
+static const char arch_file_message[] = "Optional. Provide a path for the architecture .arch file.";
+
+/// @brief message for --arch flag.
+static const char arch_alias_message[] = "Optional. Alias for -arch_file.";
+
+/// @brief message performance estimation
+static const char perf_est_message[] = "Optional. Perform performance estimation.";
+
+/// @brief message folding_option flag
+static const char folding_option_message[] = "Optional. Set the folding options for dla compiler: options 0-3.";
+
+/// @brief message fold_preprocessing flag
+static const char fold_preprocessing_message[] = "Optional. Enable fold preprocessing option for dla compiler.";
+
+/// @brief message bgr flag
+static const char bgr_message[] = "Optional. Indicate images are in bgr format.";
+
+/// @brief message dump_output flag
+static const char dump_output_message[] = "Optional. Dumps output of graph to result.txt and result.bin file(s).";
+
+/// @brief message for output_dir option
+static const char output_dir_message[] = "Optional. Path to a folder where result files are dumped to.";
+
/// @brief message encryption_key flag
/// (fixes the "hexidecimal" -> "hexadecimal" typo in the user-facing help text)
static const char encryption_key_message[] =
    "Optional. Encryption key (using hexadecimal characters, 16 bytes- 32 hexadecimal char).";

/// @brief message encryption_iv flag
static const char encryption_iv_message[] =
    "Optional. Initialization vector for encryption. (8 bytes - 16 hexadecimal char)";
+
+/// @brief message debug network flag
+static const char debug_network_message[] = "Optional. Dump the contents from the debug network.";
+
+/// @brief message emulator_decryption flag
+static const char emulator_decryption_message[] =
+ "Optional. Set to true to enable decryption using emulator. Disable encryption in the import.";
+
+/// @brief message hidden_help flag
+static const char hidden_help_message[] = "Print help options that are experimental or for internal use.";
+
+/// @brief message estimate_per_layer_latencies flag
+static const char estimate_per_layer_latencies_message[] =
+ "Optional. Estimates the number of cycles each layer will consume during execution based on the internal model "
+ "Performance Estimator uses to estimate throughput. For internal use only.";
+
+/// @brief message average_precision flag
+static const char enable_object_detection_ap_message[] =
+ "Optional. Set to true to show average precision and COCO average precision for YOLO graphs in the report.";
+
+/// @brief message yolo_version flag
+static const char yolo_version_message[] = "Optional. The version of the YOLO graph. Required for average precision report.";
+
+/// @brief message binary flag
+static const char bin_data_message[] =
+ "Optional. Specify that the input should be read as binary data (otherwise, if input tensor has depth 1, or 3 it "
+ "will default to U8 image processing).";
+
+/// @brief message pc flag
+static const char pc_message[] = "Optional. Report performance counters for the CPU subgraphs, if there is any.";
+
+/// @brief message pcsort flag
+static const char pcsort_message[] =
+ "Optional. Report performance counters for the CPU subgraphs and analysis sort hotpoint opts. "
+ "sort: Analysis opts time cost, print by hotpoint order; "
+ "no_sort: Analysis opts time cost, print by normal order; "
+ "simple_sort: Analysis opts time cost, only print EXECUTED opts by normal order.";
+
+/// @brief message scale flag
+static constexpr char input_image_scale_message[] =
+ "Optional. Scale factors for each channel in [R, G, B] format. "
+ "Applies normalization as (x - mean) / scale. "
+ "Example: -scale_values input[1, 1, 1]. Not performed on FPGA.";
+
+/// @brief message mean flag
+static constexpr char input_image_mean_message[] =
+ "Optional. Per-channel mean subtraction values in [R, G, B] format. "
+ "Used for model input normalization as (x - mean) / scale. "
+ "Example: -mean_values input[255,255,255]. Not performed on FPGA.";
+
+/// @brief message resize flag
+static const char input_image_resize_message[] =
+ "Optional. Image resizing when the input image dimensions do not match the model."
+ "'resize': Resizing the image to the model input size."
+ "'pad_resize': Pad the image with zeros and resize to model input size.";
+
+/// @brief message enable early-access features flag
+static const char enable_early_access_message[] =
+ "Optional. Enables early access (EA) features of FPGA AI Suite. These are features that are actively being "
+ "developed and have not yet met production quality standards. These features may have flaws. "
+ "Consult the FPGA AI Suite documentation for details.";
+
+/// @brief message report LSU memory access count
+static const char report_lsu_counters_message[] =
+ "Optional. Report the number of memory accesses made by the "
+ "input feature reader, output feature writer, and filter reader "
+ "of each CoreDLA instance since device initialization. No report from the counters by default.";
+
+/// @brief message for verbose flag
+static const char verbose_message[] = "Optional. If true DLA Benchmark outputs detailed logs.";
+
+/// @brief mesage for maximum file size flag
+static const char output_output_file_size_message[] =
+ "Optional. Maximum file size in MB that can be dumped to a txt. Used to avoid creating files that cannot be opened.";
+
+/// @brief Define flag for showing help message <br>
+DEFINE_bool(h, false, help_message);
+
+/// @brief Declare flag for showing help message <br>
+DECLARE_bool(help);
+
+/// @brief Define parameter for set image file <br>
+/// i or mif is a required parameter
+DEFINE_string(i, "", input_message);
+
+/// @brief Define parameter for set model file <br>
+/// It is a required parameter
+DEFINE_string(m, "", model_message);
+
+/// @brief Alias for -m
+DEFINE_string(network_file, "", network_file_alias_message);
+
+/// @brief Define parameter for compiled model file <br>
+/// It is not a required parameter
+DEFINE_string(cm, "", compiled_model_message);
+
+/// @brief Define execution mode
+DEFINE_string(api, "async", api_message);
+
+/// @brief device the target device to infer on <br>
+DEFINE_string(d, "", target_device_message);
+
+/// @brief Absolute path to CPU library with user layers <br>
+/// It is a required parameter
+DEFINE_string(l, "", custom_cpu_library_message);
+
+/// @brief Iterations count (default 0)
+/// Sync mode: iterations count
+/// Async mode: StartAsync counts
+DEFINE_int32(niter, 0, iterations_count_message);
+
+/// @brief Number of infer requests in parallel
+DEFINE_int32(nireq, 0, infer_requests_count_message);
+
+/// @brief Number of threads to use for inference on the CPU in throughput mode (also affects Hetero cases)
+DEFINE_int32(nthreads, 0, infer_num_threads_message);
+
+/// @brief Number of streams to use for inference on the CPU (also affects Hetero cases)
+DEFINE_string(nstreams, "", infer_num_streams_message);
+
+/// @brief Define parameter for batch size <br>
+/// Default is 1
+DEFINE_int32(b, 1, batch_size_message);
+
+/// @brief alias for -b
+DEFINE_int32(batch_size, 1, batch_size_alias_message);
+
+/// @brief Minimum number of layers allowed in a subgraph that runs on FPGA
+DEFINE_int32(min_subgraph_layers, 2, min_subgraph_layers_message);
+
+// @brief Enable plugin messages
+DEFINE_string(pin, "YES", infer_threads_pinning_message);
+
+/// @brief Enables multiline text output instead of progress bar
+DEFINE_bool(stream_output, false, stream_output_message);
+
+/// @brief Enables saving a summary of the run
+DEFINE_bool(save_run_summary, false, save_run_summary_message);
+
+/// @brief Path to a folder where statistics report is stored
+DEFINE_string(report_folder, "", report_folder_message);
+
+/// @brief Define flag for showing progress bar <br>
+DEFINE_bool(progress, false, progress_message);
+
+/// @brief Path to a plugins_xml file
+DEFINE_string(plugins, "", plugins_message);
+
+/// @brief Deprecated argument for path to a plugins_xml file
+DEFINE_string(plugins_xml_file, "", old_plugins_message);
+
+/// @brief Path to a groundtruth file
+DEFINE_string(groundtruth_loc, "", groundtruth_loc_message);
+
+/// @brief Path to arch file
+DEFINE_string(arch_file, "", arch_file_message);
+
+/// @brief Path to arch file, same as arch_file
+DEFINE_string(arch, "", arch_alias_message);
+
+/// @brief Define flag for enabling performance estimation
+DEFINE_bool(perf_est, false, perf_est_message);
+
+/// @brief Define flag whether the image is in bgr format
+DEFINE_bool(bgr, false, bgr_message);
+
+/// @brief Define flag for enabling output results dumping
+DEFINE_bool(dump_output, false, dump_output_message);
+
+/// @brief Define flag for output directory where result files are dumped to
+DEFINE_string(output_dir, "", output_dir_message);
+
+/// @brief Select folding option; valid values are 0,1,2,3
+DEFINE_int32(folding_option, 1, folding_option_message);
+
+/// @brief Define flag for enabling folding preprocessing
+DEFINE_bool(fold_preprocessing, false, fold_preprocessing_message);
+
+/// @brief encryption key
+DEFINE_string(encryption_key, "", encryption_key_message);
+
+/// @brief initialization vector
+DEFINE_string(encryption_iv, "", encryption_iv_message);
+
+/// @brief Define flag for enabling dump of debug network values
+DEFINE_bool(debug_network, false, debug_network_message);
+
+/// @brief Define flag for enabling decryption in the emulator
+DEFINE_bool(emulator_decryption, false, emulator_decryption_message);
+
+/// @brief Flag for printing the hidden help message
+DEFINE_bool(hidden_help, false, hidden_help_message);
+
+/// @brief Whether Performance Estimator should calculate theoretical per-layer cycle counts. Internal use only. Must be
+/// called with -perf_est.
+DEFINE_bool(estimate_per_layer_latencies, false, estimate_per_layer_latencies_message);
+
+/// @brief Show average precision in the report
+DEFINE_bool(enable_object_detection_ap, false, enable_object_detection_ap_message);
+
+/// @brief Let user specify the version of their YOLO graph.
+DEFINE_string(yolo_version, "", yolo_version_message);
+
+/// @brief Specify that the inputs should be read as binary.
+DEFINE_bool(bin_data, false, bin_data_message);
+
+/// @brief Report performance counters for the CPU subgraphs.
+DEFINE_bool(pc, false, pc_message);
+
+/// @brief Report performance counters for the CPU subgraphs and analysis sort hotpoint opts.
+DEFINE_string(pcsort, "", pcsort_message);
+
+/// @brief Define flag for using input image scale <br>
+DEFINE_string(scale_values, "", input_image_scale_message);
+
+/// @brief Define flag for using input image mean <br>
+DEFINE_string(mean_values, "", input_image_mean_message);
+
+/// @brief Define flag for using input image resize <br>
+DEFINE_string(resize_type, "", input_image_resize_message);
+
+/// @brief Enables early-access (EA) features of CoreDLA <br>
+DEFINE_bool(enable_early_access, false, enable_early_access_message);
+
+/// @brief Pass the name of the streaming input linux FIFO for use in the emulator model
+DEFINE_string(streaming_input_pipe, "", "");
+
+/// @brief Report the input feature reader, output feature writer, and filter reader memory access counts
+DEFINE_bool(report_lsu_counters, false, report_lsu_counters_message);
+
+/// @brief Define flag for dla_benchmark verbosity
+DEFINE_bool(verbose, false, verbose_message);
+
+/// @brief Maximum file size in MB that can be dumped to a txt. Used to avoid creating files that cannot be opened.
+DEFINE_int32(max_output_file_size, 200, output_output_file_size_message);
+
+/**
+ * @brief Prints options that impact graph compile.
+ * Keep the help text aligned with the other options printed on the command line.
+ */
+static void ShowCompileOptions() {
+  std::cout << std::endl << "Graph Compile Options:" << std::endl;
+  std::cout << "    -folding_option                " << folding_option_message << std::endl;
+  std::cout << "    -fold_preprocessing            " << fold_preprocessing_message << std::endl;
+  std::cout << "    -min-subgraph-layers \"<integer>\" " << min_subgraph_layers_message << std::endl;
+}
+
+/**
+ * @brief Prints options that evaluate the correctness of the inference result.
+ * Keep the help text aligned with the other options printed on the command line.
+ */
+static void ShowAccuracyOptions() {
+  std::cout << std::endl << "Accuracy Options:" << std::endl;
+  std::cout << "    -dump_output                   " << dump_output_message << std::endl;
+  std::cout << "    -groundtruth_loc               " << groundtruth_loc_message << std::endl;
+  std::cout << "    -enable_object_detection_ap    " << enable_object_detection_ap_message << std::endl;
+  std::cout << "    -yolo_version \"yolo-v3-tf/yolo-v3-tiny-tf\"  " << yolo_version_message << std::endl;
+}
+
+/**
+ * @brief Prints options for statistics dumping and report dumping.
+ * Keep the help text aligned with the other options printed on the command line.
+ */
+static void ShowStatsOrReportDumpingOptions() {
+  std::cout << std::endl << "Statistics dumping options:" << std::endl;
+  std::cout << "    -perf_est                      " << perf_est_message << std::endl;
+  std::cout << "    -progress                      " << progress_message << std::endl;
+  std::cout << "    -stream_output                 " << stream_output_message << std::endl;
+  std::cout << "    -save_run_summary              " << save_run_summary_message << std::endl;
+  std::cout << "    -report_folder                 " << report_folder_message << std::endl;
+}
+
+/**
+ * @brief Prints preprocessing options for input data.
+ * Keep the help text aligned with the other options printed on the command line.
+ */
+static void ShowPreprocessingOptions() {
+  std::cout << std::endl << "Preprocessing Options:" << std::endl;
+  std::cout << "    -bgr                           " << bgr_message << std::endl;
+  std::cout << "    -resize_type \"resize/pad_resize\"  " << input_image_resize_message << std::endl;
+  std::cout << "    -scale_values                  " << input_image_scale_message << std::endl;
+  std::cout << "    -mean_values                   " << input_image_mean_message << std::endl;
+}
+
+/**
+ * @brief Prints help options for inference on the FPGA or any OpenVINO device.
+ * Keep the help text aligned with the other options printed on the command line.
+ */
+static void ShowInferenceOptions() {
+  std::cout << std::endl << "Inference Options:" << std::endl;
+  std::cout << "    -api \"<sync/async>\"            " << api_message << std::endl;
+  std::cout << "    -niter \"<integer>\"             " << iterations_count_message << std::endl;
+  std::cout << "    -nireq \"<integer>\"             " << infer_requests_count_message << std::endl;
+  std::cout << "    -b \"<integer>\"                 " << batch_size_message << std::endl;
+  std::cout << "    -batch-size \"<integer>\"        " << batch_size_alias_message << std::endl;
+}
+
+/**
+ * @brief Prints help options for OpenVINO devices (CPU, GPU).
+ * Keep the help text aligned with the other options printed on the command line.
+ */
+static void ShowOpenVINODeviceOptions() {
+  std::cout << std::endl << "CPU or GPU options:" << std::endl;
+  std::cout << "    -nstreams \"<integer>\"          " << infer_num_streams_message << std::endl;
+  std::cout << "    -nthreads \"<integer>\"          " << infer_num_threads_message << std::endl;
+  std::cout << "    -pin \"YES/NO\"                  " << infer_threads_pinning_message << std::endl;
+  std::cout << "    -l \"<absolute_path>\"           " << custom_cpu_library_message << std::endl;
+  std::cout << "    -pc                            " << pc_message << std::endl;
+  std::cout << "    -pcsort \"sort/no_sort/simple_sort\"  " << pcsort_message << std::endl;
+}
+
+/**
+ * @brief Prints a help message outlining options that are hidden from the user.
+ * Options listed here should be experimental or features for internal use.
+ * Keep the help text aligned with the other options printed on the command line.
+ */
+static void PrintHiddenHelp() {
+  std::cout << std::endl << "Hidden Options. Experimental, early access or internal options." << std::endl;
+  std::cout << "    -enable_early_access           " << enable_early_access_message << std::endl;
+  std::cout << "    -estimate_per_layer_latencies  " << estimate_per_layer_latencies_message << std::endl;
+  std::cout << "    -debug_network                 " << debug_network_message << std::endl;
+  std::cout << "    -max_output_file_size          " << output_output_file_size_message << std::endl;
+}
+
+/**
+ * @brief Prints the top-level help message. Add your new option in the appropriate section.
+ * Keep the help text aligned with the other options printed on the command line.
+ */
+static void ShowUsage() {
+  std::cout << std::endl;
+  std::cout << "dla_benchmark [OPTION]" << std::endl;
+  std::cout << "Options:" << std::endl;
+  std::cout << std::endl;
+  std::cout << "    -h, --help                     " << help_message << std::endl;
+  std::cout << "    -m \"<path>\"                    " << model_message << std::endl;
+  std::cout << "    -network-file \"<path>\"         " << network_file_alias_message << std::endl;
+  std::cout << "    -cm \"<path>\"                   " << compiled_model_message << std::endl;
+  std::cout << "    -d \"<device>\"                  " << target_device_message << std::endl;
+  std::cout << "    -plugins                       " << plugins_message << std::endl;
+  std::cout << "    -plugins_xml_file              " << old_plugins_message << std::endl;
+  std::cout << "    -arch_file                     " << arch_file_message << std::endl;
+  std::cout << "    -arch                          " << arch_alias_message << std::endl;
+  std::cout << "    -i \"<path>\"                    " << input_message << std::endl;
+  std::cout << "    -bin_data                      " << bin_data_message << std::endl;
+  std::cout << "    -output_dir                    " << output_dir_message << std::endl;
+  std::cout << "    -encryption_key                " << encryption_key_message << std::endl;
+  std::cout << "    -encryption_iv                 " << encryption_iv_message << std::endl;
+  std::cout << "    -emulator_decryption           " << emulator_decryption_message << std::endl;
+  std::cout << "    -verbose                       " << verbose_message << std::endl;
+  std::cout << "    -hidden_help                   " << hidden_help_message << std::endl;
+  ShowInferenceOptions();
+  ShowCompileOptions();
+  ShowPreprocessingOptions();
+  ShowAccuracyOptions();
+  ShowStatsOrReportDumpingOptions();
+  ShowOpenVINODeviceOptions();
+}
+
diff --git a/python/openvino/runtime/dla_benchmark/infer_request_wrap.hpp b/python/openvino/runtime/dla_benchmark/infer_request_wrap.hpp
new file mode 100644
index 0000000..9ddc3dd
--- /dev/null
+++ b/python/openvino/runtime/dla_benchmark/infer_request_wrap.hpp
@@ -0,0 +1,168 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+// Description: Wrappers for single inference requests and queues of inference requests.
+// Largely based off OpenVino's benchmark_app/infer_request_wrap.hpp
+// [openvinotoolkit/openvino › samples/cpp/benchmark_app/infer_request_wrap.hpp]
+// Note: Not all functions of ov::InferRequest are wrapped. More functions can be added.
+
+#pragma once
+
+#include <condition_variable>
+#include <functional>
+#include <map>
+#include <memory>
+#include <mutex>
+#include <queue>
+#include <string>
+#include <vector>
+#include <algorithm>
+
+#include <openvino/openvino.hpp>
+#include "statistics_report.hpp"
+#include "utils.hpp"
+
+typedef std::function<void(size_t id, const double latency, const std::exception_ptr& ptr)> QueueCallbackFunction;
+
+// Wrapper class for ov::InferRequest. Handles asynchronous callbacks and measures per-request latency.
+class InferReqWrap final {
+ public:
+  using Ptr = std::shared_ptr<InferReqWrap>;
+
+  ~InferReqWrap() = default;
+
+  explicit InferReqWrap(ov::CompiledModel& model, size_t id, QueueCallbackFunction callbackQueue)
+      : _request(model.create_infer_request()), _id(id), _callbackQueue(callbackQueue) {
+    _request.set_callback([&](const std::exception_ptr& ptr) {  // fires when the async request completes
+      _endTime = Time::now();
+      _callbackQueue(_id, get_execution_time_in_milliseconds(), ptr);
+    });
+  }
+
+  // Starts the request asynchronously; completion is reported via the callback above.
+  void start_async() {
+    _startTime = Time::now();
+    _request.start_async();
+  }
+
+  void wait() { _request.wait(); }
+
+  // Runs the request synchronously and reports the latency through the callback (no exception).
+  void infer() {
+    _startTime = Time::now();
+    _request.infer();
+    _endTime = Time::now();
+    _callbackQueue(_id, get_execution_time_in_milliseconds(), nullptr);
+  }
+
+  std::vector<ov::ProfilingInfo> get_performance_counts() { return _request.get_profiling_info(); }
+
+  ov::Tensor get_tensor(const std::string& name) { return _request.get_tensor(name); }
+
+  double get_execution_time_in_milliseconds() const {
+    auto execTime = std::chrono::duration_cast<ns>(_endTime - _startTime);
+    return static_cast<double>(execTime.count()) * 0.000001;  // ns -> ms
+  }
+
+  void set_tensor(const std::string& name, const ov::Tensor& data) { _request.set_tensor(name, data); }
+
+  void set_tensor(const ov::Output<const ov::Node>& port, const ov::Tensor& data) { _request.set_tensor(port, data); }
+
+  ov::Tensor get_output_tensor() { return _request.get_output_tensor(); }
+
+ private:
+  ov::InferRequest _request;
+  Time::time_point _startTime;
+  Time::time_point _endTime;
+  size_t _id;
+  QueueCallbackFunction _callbackQueue;
+};
+
+// Handles a pool of inference requests: hands out idle requests and records per-request latencies.
+class InferRequestsQueue final {
+ public:
+  InferRequestsQueue(ov::CompiledModel& model, size_t nireq) {
+    for (size_t id = 0; id < nireq; id++) {
+      requests.push_back(std::make_shared<InferReqWrap>(model,
+                                                        id,
+                                                        std::bind(&InferRequestsQueue::put_idle_request,
+                                                                  this,
+                                                                  std::placeholders::_1,
+                                                                  std::placeholders::_2,
+                                                                  std::placeholders::_3)));
+      _idleIds.push(id);
+    }
+    reset_times();
+  }
+
+  ~InferRequestsQueue() {
+    // Inference Request guarantee that it will wait for all asynchronous internal tasks in destructor
+    // So it should be released before any context that the request can use inside internal asynchronous tasks
+    // For example all members of InferRequestsQueue would be destroyed before `requests` vector
+    // So requests can try to use this members from `put_idle_request()` that would be called from request callback
+    // To avoid this we should move this vector declaration after all members declaration or just clear it manually in
+    // destructor
+    requests.clear();
+  }
+
+  // Resets the start/end sentinels and clears the recorded latencies.
+  void reset_times() {
+    _startTime = Time::time_point::max();  // sentinel; lowered by std::min in get_idle_request()
+    _endTime = Time::time_point::min();  // sentinel; raised by std::max in put_idle_request()
+    _latencies.clear();
+  }
+
+  double get_durations_in_milliseconds() {
+    return std::chrono::duration_cast<ns>(_endTime - _startTime).count() * 0.000001;  // ns -> ms
+  }
+
+  // Callback from InferReqWrap: records the latency and returns the request to the idle pool.
+  void put_idle_request(size_t id, const double latency, const std::exception_ptr& ptr = nullptr) {
+    std::unique_lock<std::mutex> lock(_mutex);
+    if (ptr) {
+      inferenceException = ptr;  // failure is rethrown in the waiters; note the id is not re-queued
+    } else {
+      _latencies.push_back(latency);
+      _idleIds.push(id);
+      _endTime = std::max(Time::now(), _endTime);
+    }
+    _cv.notify_one();
+  }
+
+  // Blocks until a request is idle (or rethrows a stored inference exception).
+  InferReqWrap::Ptr get_idle_request() {
+    std::unique_lock<std::mutex> lock(_mutex);
+    _cv.wait(lock, [this] {
+      if (inferenceException) {
+        std::rethrow_exception(inferenceException);
+      }
+      return _idleIds.size() > 0;
+    });
+    auto request = requests.at(_idleIds.front());
+    _idleIds.pop();
+    _startTime = std::min(Time::now(), _startTime);
+    return request;
+  }
+
+  // Blocks until every request is idle (or rethrows a stored inference exception).
+  void wait_all() {
+    std::unique_lock<std::mutex> lock(_mutex);
+    _cv.wait(lock, [this] {
+      if (inferenceException) {
+        std::rethrow_exception(inferenceException);
+      }
+      return _idleIds.size() == requests.size();
+    });
+  }
+
+  std::vector<double>& get_latencies() { return _latencies; }
+
+  Time::time_point get_start_time() { return _startTime; }
+
+  Time::time_point get_end_time() { return _endTime; }
+
+  std::vector<InferReqWrap::Ptr> requests;
+
+ private:
+  std::queue<size_t> _idleIds;
+  std::mutex _mutex;
+  std::condition_variable _cv;
+  Time::time_point _startTime;
+  Time::time_point _endTime;
+  std::vector<double> _latencies;
+  std::exception_ptr inferenceException = nullptr;
+};
diff --git a/python/openvino/runtime/dla_benchmark/inputs_filling.cpp b/python/openvino/runtime/dla_benchmark/inputs_filling.cpp
new file mode 100644
index 0000000..0d20a14
--- /dev/null
+++ b/python/openvino/runtime/dla_benchmark/inputs_filling.cpp
@@ -0,0 +1,885 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+// Description: This file implements all supported formats of filling input tensors with input data.
+//              Functions in this file have been based on/modified from OpenVINO's input filling algorithms,
+// which would be a good place to start for future OpenVINO uplifts.
+// Ref: [openvinotoolkit/openvino › samples/cpp/benchmark_app/input_fillings.cpp]
+
+#include "inputs_filling.hpp"
+
+#include <algorithm>
+#include <cstdlib>
+#include <memory>
+#include <functional>
+#include <limits>
+#include <tuple>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include <opencv2/videoio.hpp>
+#include <samples/ocv_common.hpp>
+#include <samples/slog.hpp>
+#include "format_reader_ptr.h"
+#include "shared_tensor_allocator.hpp"
+#include "utils.hpp"
+
+/**
+ * @brief Struct to store info of an image read by the FormatReader::Reader class
+*/
+struct ReaderInfo {
+  std::shared_ptr<uint8_t> data;  // Decoded pixel data returned by the reader
+  const size_t file_index;        // Index of the image in the file_paths vector (used for error messages)
+  const size_t channels;          // Number of channels used by the reader to store the image
+
+  ReaderInfo(std::shared_ptr<uint8_t>& data, size_t file_index, size_t channels)
+      : data(data), file_index(file_index), channels(channels) {}
+};
+
+// The reader always expands the image being read into an rgb image, so the
+// only way to tell that an image is in fact an rgb and not a grayscale
+// image is to find if the values in channel 0 differ from channel 1 or 2.
+// Return true if this is a grayscale image or an rgb image that can safely
+// be considered a grayscale image since all channel values are the same.
+static bool IsGrayScaleImage(const ReaderInfo& reader_info, uint32_t image_size) {
+  const auto num_channels = reader_info.channels;
+  const auto& image_data = reader_info.data;
+  // Iterate through the image surface
+  for (size_t pid = 0; pid < image_size; pid++) {
+    // Iterate through the image channels
+    for (size_t ch = 1; ch < num_channels; ++ch) {
+      if (image_data.get()[pid * num_channels + ch] != image_data.get()[pid * num_channels]) return false;
+    }
+  }
+  return true;
+}
+
+// Selects std::uniform_real_distribution for floating-point T and
+// std::uniform_int_distribution for integral T (void otherwise).
+template <typename T>
+using uniformDistribution = typename std::conditional<
+    std::is_floating_point<T>::value,
+    std::uniform_real_distribution<T>,
+    typename std::conditional<std::is_integral<T>::value, std::uniform_int_distribution<T>, void>::type>::type;
+
+/**
+ * @brief Fills a tensor with image data from input files
+ *
+ * Helper function to GetStaticTensors(), not used outside this file.
+ * Determines which image to use based on input_id, batch_size, input_size, and request_id.
+ * Reads that data as uint8 and creates an input tensor of type T corresponding to input element type.
+ *
+ * @param files vector of file paths to the input images
+ * @param input_id image input id, ie image 1, image 2...
+ * @param batch_size batch size of the tensor
+ * @param input_size number of images to be used
+ * @param request_id infer request id
+ * @param input_info InputInfo struct corresponding to the input node of the tensor
+ * @param input_name name of the input
+ * @param bgr boolean indicating if input channels need to be reversed
+ * @param verbose prints extra logging information if true
+ * @return ov::Tensor containing the input data extracted from the image
+*/
+template <typename T>
+ov::Tensor CreateTensorFromImage(const std::vector<std::string>& files,
+                                 const size_t input_id,
+                                 const size_t batch_size,
+                                 const size_t input_size,
+                                 const size_t request_id,
+                                 const dla_benchmark::InputInfo& input_info,
+                                 const std::string& input_name,
+                                 const FormatReader::Reader::ResizeType resize_type,
+                                 const bool bgr = false,
+                                 const bool verbose = false) {
+  size_t tensor_size =
+      std::accumulate(input_info.data_shape.begin(), input_info.data_shape.end(), 1, std::multiplies<size_t>());
+  auto allocator = std::make_shared<SharedTensorAllocator>(tensor_size * sizeof(T));
+  auto data = reinterpret_cast<T*>(allocator->get_buffer());
+  /** Collect images data ptrs **/
+  std::vector<ReaderInfo> vreader;
+  vreader.reserve(batch_size);
+
+  size_t img_batch_size = 1;
+  if (!input_info.layout.empty() && ov::layout::has_batch(input_info.layout)) {
+    img_batch_size = batch_size;
+  } else {
+    slog::warn << input_name << ": layout does not contain batch dimension. Assuming batch 1 for this input"
+               << slog::endl;
+  }
+
+  for (size_t i = 0, input_idx = request_id * batch_size * input_size + input_id; i < img_batch_size; i++, input_idx += input_size) {
+    input_idx %= files.size();
+    FormatReader::ReaderPtr reader(files[input_idx].c_str());
+    if (input_idx <= MAX_COUT_WITHOUT_VERBOSE || verbose) {
+      slog::info << "Prepare image " << files[input_idx] << slog::endl;
+      if (!verbose && input_idx == MAX_COUT_WITHOUT_VERBOSE) {
+        slog::info << "Truncating list of input files. Run with --verbose for complete list." << slog::endl;
+      }
+    }
+    if (reader.get() == nullptr) {
+      slog::warn << "Image " << files[input_idx] << " cannot be read!" << slog::endl << slog::endl;
+      continue;
+    }
+
+    /** Getting image data **/
+    std::shared_ptr<uint8_t> image_data(reader->getData(input_info.GetWidth(), input_info.GetHeight(), resize_type));
+    if (image_data) {
+      // Store the number of channels used in storing the image in the reader
+      // If the image is grayscale, the reader will still store it as a three
+      // channel image and therefore to read the image correctly we need to read the
+      // first channel value and then skip the next two.
+      const auto reader_channels = reader->size() / (reader->width() * reader->height());
+      vreader.emplace_back(image_data, input_idx, reader_channels);
+    }
+  }
+
+  /** Fill input tensor with image. First b channel, then g and r channels **/
+  const size_t num_channels = input_info.GetChannels();
+  const size_t width = input_info.GetWidth();
+  const size_t height = input_info.GetHeight();
+  const size_t batch = input_info.GetBatch();
+
+  const size_t image_size = width * height;  // Calculate the image size
+
+  // Lambda expression for calculating the pixel index in inputBlobData
+  const auto get_index = [=](size_t image_id, size_t pid, size_t ch) {
+    // Keep the channel index when bgr is set; otherwise reverse it (reader channel order assumed BGR -- confirm)
+    return image_id * image_size * num_channels + (bgr ? ch : (num_channels - ch - 1)) * image_size + pid;
+  };
+
+  // Maps a channel index the same way as get_index: identity when bgr, reversed otherwise
+  const auto get_channel = [=](size_t ch) {
+    return bgr ? ch : (num_channels - ch - 1);
+  };
+
+  /** Iterate over all input images **/
+  for (size_t image_id = 0; image_id < vreader.size(); ++image_id) {
+    const auto& reader_info = vreader.at(image_id);
+    // Error out if the graph has a single channel input and the image is not grayscale
+    if (num_channels == 1 && !IsGrayScaleImage(reader_info, image_size)) {
+      THROW_IE_EXCEPTION
+          << "Graph input is grayscale (has a single channel) and the following image is in RGB format:\n\t"
+          << files.at(reader_info.file_index);
+    }
+    const auto reader_channels = reader_info.channels;
+    /** Iterate over all pixel in image (b,g,r) **/
+    for (size_t pid = 0; pid < image_size; pid++) {
+      /** Iterate over all channels **/
+      for (size_t ch = 0; ch < num_channels; ++ch) {
+        // check if scale values are 0
+        if (input_info.scale_values[get_channel(ch)] == 0) {
+          throw ov::Exception("Cannot apply scale value of 0");
+        }
+        // Reader is created with the assumption that the number of channels is always the maximum
+        data[get_index(image_id, pid, ch)] = static_cast<T>(
+            (reader_info.data.get()[pid * reader_channels + ch] - input_info.mean_values[get_channel(ch)]) /
+            input_info.scale_values[get_channel(ch)]);
+      }
+    }
+  }
+
+  // NOTE(review): shape hard-coded as NCHW; confirm it always matches input_info.data_shape
+  auto tensor = ov::Tensor(input_info.type, {batch, num_channels, height, width}, ov::Allocator(allocator));
+  return tensor;
+}
+
+/**
+ * @brief Fills a tensor with video data from input files
+ *
+ * Helper function to GetStaticTensors(), not used outside this file.
+ * Determines which video to use based on input_id, batch_size, input_size, and request_id.
+ * Reads that and creates an input tensor of type T corresponding to input element type.
+ *
+ * @param file_paths vector of file paths to the input images
+ * @param input_id video input id, ie video 1, video 2...
+ * @param batch_size batch size of the tensor
+ * @param input_size number of images to be used
+ * @param request_id infer request id
+ * @param input_info InputInfo struct corresponding to the input node of the tensor
+ * @param input_name name of the input
+ * @param bgr boolean indicating if input channels need to be reversed
+ * @param verbose prints extra logging information if true
+ * @return ov::Tensor containing the input data extracted from the video
+*/
+template <typename T>
+ov::Tensor CreateTensorFromVideo(const std::vector<std::string>& file_paths,
+                                 const size_t input_id,
+                                 const size_t batch_size,
+                                 const size_t input_size,
+                                 const size_t request_id,
+                                 const dla_benchmark::InputInfo& input_info,
+                                 const std::string& input_name,
+                                 const bool bgr = false,
+                                 const bool verbose = false) {
+  size_t tensor_size =
+      std::accumulate(input_info.data_shape.begin(), input_info.data_shape.end(), 1, std::multiplies<size_t>());
+  auto allocator = std::make_shared<SharedTensorAllocator>(tensor_size * sizeof(T));
+  auto data = reinterpret_cast<T*>(allocator->get_buffer());
+
+  const size_t input_idx = (request_id * input_size + input_id) % file_paths.size();
+
+  const size_t channels = input_info.GetChannels();
+  const size_t height = input_info.GetHeight();
+  const size_t width = input_info.GetWidth();
+  const size_t frame_count = input_info.GetDepth();
+  const size_t batch = input_info.GetBatch();
+
+  std::vector<cv::Mat> frames_to_write;
+  frames_to_write.reserve(batch_size * frame_count);
+  if (verbose) slog::info << "Prepare Video " << file_paths[input_idx] << slog::endl;
+
+  // Open Video
+  cv::VideoCapture cap(file_paths[input_idx]);
+  if (!cap.isOpened()) {
+    throw std::runtime_error("Video file " + file_paths[input_idx] + " cannot be read!");
+  }
+
+  // Get amount of frames in video and calculate a step to partition the video into clips
+  size_t video_frames = 0;
+  size_t step;
+  size_t cur_video_pos = 0;
+  cv::Mat calc_frame;
+
+  // Using while loop instead of cv::get() since cv::get() isn't guaranteed to return
+  // the correct amount of frames
+  while ((cap.read(calc_frame))) {
+    if (calc_frame.empty()) {
+      break;
+    }
+    video_frames++;
+  }
+
+  // Reopen the file at the starting position
+  cap.release();
+  cap.open(file_paths[input_idx].c_str());
+  if (!cap.isOpened()) {
+    throw std::runtime_error("Video file " + file_paths[input_idx] + " cannot be read!");
+  }
+
+  if (verbose) {
+    slog::info << "Video file " << file_paths[input_idx] << " contains " << video_frames << " readable frames."
+               << slog::endl;
+  }
+
+  // Calculate step to partition video into "batch_size" amount of clips
+  if (batch_size == 1) {
+    step = frame_count;
+  } else if (video_frames < frame_count) {
+    step = 1;
+  } else {
+    step = std::max((size_t)1, (video_frames - frame_count) / (batch_size - 1));
+  }
+
+  // Get frames
+  for (size_t clip_start = 0; clip_start < batch_size * step; clip_start += step) {
+    // Attempt to set position using OpenCV + Video Codec
+    bool success = cap.set(cv::CAP_PROP_POS_FRAMES, clip_start);
+
+    // Unsupported by codec, set manually
+    if (!success) {
+      if (cur_video_pos < clip_start) {
+        while (cur_video_pos != clip_start) {
+          cap.read(calc_frame);
+          cur_video_pos++;
+        }
+      } else if (cur_video_pos > clip_start) {
+        // Reopen the file at the starting position
+        cap.release();
+        cap.open(file_paths[input_idx].c_str());
+        if (!cap.isOpened()) {
+          throw std::runtime_error("Video file " + file_paths[input_idx] + " cannot be read!");
+        }
+        cur_video_pos = 0;
+        while (cur_video_pos != clip_start) {
+          cap.read(calc_frame);
+          cur_video_pos++;
+        }
+      }
+    }
+
+    for (size_t curr_frame = 0; curr_frame < frame_count; curr_frame++) {
+      cv::Mat frame;
+      cap.read(frame);
+
+      // Frame is empty -> Clip is shorter than frame_count, loop from start of clip
+      if (frame.empty()) {
+        if (verbose)
+          slog::info << "A video clip was shorter than the desired frame count, looping video." << slog::endl;
+        bool success = cap.set(cv::CAP_PROP_POS_FRAMES, clip_start);
+
+        // If unsupported by codec, set manually
+        if (!success) {
+          // Reopen the file at the starting position
+          cap.release();
+          cap.open(file_paths[input_idx].c_str());
+          if (!cap.isOpened()) {
+            throw std::runtime_error("Video file " + file_paths[input_idx] + " cannot be read!");
+          }
+          cur_video_pos = 0;
+          while (cur_video_pos != clip_start) {
+            cap.read(calc_frame);
+            cur_video_pos++;
+          }
+        } else {
+          cur_video_pos = clip_start;
+        }
+
+        cap.read(frame);
+
+        // If it's still empty, then there's an error with reading
+        if (frame.empty()) {
+          slog::err << "Video file " << file_paths[input_idx] << " frames cannot be read!" << slog::endl << slog::endl;
+          continue;
+        }
+      }
+
+      cur_video_pos++;
+      // If bgr=false, convert to RGB
+      if (!bgr) {
+        cv::cvtColor(frame, frame, cv::COLOR_BGR2RGB);
+      }
+
+      // Check frame sizing, resize if it doesn't match expected blob size
+      cv::Mat resized_frame(frame);
+      if (static_cast<int>(width) != frame.size().width || static_cast<int>(height) != frame.size().height) {
+        // Resizes to 256 and centre crops based on actual needed dimensions, may add a flag for this in the future
+        // to be cleaner
+        if (static_cast<int>(width) < 256 && static_cast<int>(height) < 256) {
+          double scale;
+          if (frame.size().width <= frame.size().height)
+            scale = double(256) / frame.size().width;
+          else
+            scale = double(256) / frame.size().height;
+          cv::resize(frame, resized_frame, cv::Size(0, 0), scale, scale);
+          const int offsetW = (resized_frame.size().width - static_cast<int>(width)) / 2;
+          const int offsetH = (resized_frame.size().height - static_cast<int>(height)) / 2;
+          const cv::Rect roi(offsetW, offsetH, static_cast<int>(width), static_cast<int>(height));
+          resized_frame = resized_frame(roi).clone();
+        } else {
+          cv::resize(frame, resized_frame, cv::Size(width, height));
+        }
+      }
+      // Save frame to write
+      frames_to_write.emplace_back(resized_frame);
+    }
+  }
+
+  // Write frames to blob in NCDHW order (frames assumed 3-channel 8-bit, per the Vec3b access below)
+  for (size_t b = 0; b < batch_size; b++) {
+    size_t batch_offset = b * channels * frame_count * height * width;
+    for (size_t c = 0; c < channels; c++) {
+      size_t channel_offset = c * frame_count * height * width;
+      for (size_t frameId = b * frame_count; frameId < (b + 1) * frame_count; frameId++) {
+        const cv::Mat& frame_to_write = frames_to_write.at(frameId);
+        size_t frame_offset_id = frameId % frame_count;
+        size_t frame_offset = frame_offset_id * height * width;
+        for (size_t h = 0; h < height; h++) {
+          for (size_t w = 0; w < width; w++) {
+            data[batch_offset + channel_offset + frame_offset + h * width + w] = frame_to_write.at<cv::Vec3b>(h, w)[c];
+          }
+        }
+      }
+    }
+  }
+  cap.release();
+  return ov::Tensor(input_info.type, {batch, channels, frame_count, height, width}, ov::Allocator(allocator));
+}
+
+/**
+ * @brief Fills a tensor with image info data
+ *
+ * Helper function to GetStaticTensors(), not used outside this file.
+ *
+ * @param image_size Size of image width x height; written to the first two elements of each
+ *                   batch entry, the remaining elements are set to 1
+ * @param batch_size batch size of the tensor
+ * @param input_info InputInfo struct corresponding to the input node of the tensor
+ * @param input_name name of the input
+ * @return ov::Tensor containing the input data
+*/
+template <typename T>
+ov::Tensor CreateTensorImInfo(const std::pair<size_t, size_t>& image_size,
+                              size_t batch_size,
+                              const dla_benchmark::InputInfo& input_info,
+                              const std::string& input_name) {
+  size_t tensor_size =
+      std::accumulate(input_info.data_shape.begin(), input_info.data_shape.end(), 1, std::multiplies<size_t>());
+  auto allocator = std::make_shared<SharedTensorAllocator>(tensor_size * sizeof(T));
+  auto data = reinterpret_cast<T*>(allocator->get_buffer());
+
+  size_t info_batch_size = 1;
+  if (!input_info.layout.empty() && ov::layout::has_batch(input_info.layout)) {
+    info_batch_size = batch_size;
+  } else {
+    slog::warn << input_name << ": layout is not set or does not contain batch dimension. Assuming batch 1. "
+               << slog::endl;
+  }
+
+  for (size_t b = 0; b < info_batch_size; b++) {
+    size_t im_info_size = tensor_size / info_batch_size;
+    for (size_t i = 0; i < im_info_size; i++) {
+      size_t index = b * im_info_size + i;
+      if (0 == i)
+        data[index] = static_cast<T>(image_size.first);
+      else if (1 == i)
+        data[index] = static_cast<T>(image_size.second);
+      else
+        data[index] = 1;
+    }
+  }
+
+  auto tensor = ov::Tensor(input_info.type, input_info.data_shape, ov::Allocator(allocator));
+  return tensor;
+}
+
+/**
+ * @brief Fills a tensor with binary data from input files
+ *
+ * Helper function to GetStaticTensors(), not used outside this file.
+ * Determines which file to use based on input_id, batch_size, input_size, and request_id.
+ * Reads that and creates an input tensor of type T corresponding to input element type.
+ *
+ * @param files vector of file paths to the input images
+ * @param input_id binary input id, ie binary 1, binary 2...
+ * @param batch_size batch size of the tensor
+ * @param input_size number of images to be used
+ * @param request_id infer request id
+ * @param input_info InputInfo struct corresponding to the input node of the tensor
+ * @param input_name name of the input
+ * @param verbose prints extra logging information if true
+ * @return ov::Tensor containing the input data extracted from the binary
+*/
+template <typename T>
+ov::Tensor CreateTensorFromBinary(const std::vector<std::string>& files,
+                                  const size_t input_id,
+                                  const size_t batch_size,
+                                  const size_t input_size,
+                                  const size_t request_id,
+                                  const dla_benchmark::InputInfo& input_info,
+                                  const std::string& input_name,
+                                  const bool verbose = false) {
+  size_t tensor_size =
+      std::accumulate(input_info.data_shape.begin(), input_info.data_shape.end(), 1, std::multiplies<size_t>());
+  auto allocator = std::make_shared<SharedTensorAllocator>(tensor_size * sizeof(T));
+  char* data = allocator->get_buffer();
+  size_t binary_batch_size = 1;
+  if (!input_info.layout.empty() && ov::layout::has_batch(input_info.layout)) {
+    binary_batch_size = batch_size;
+  } else {
+    slog::warn << input_name
+               << ": layout is not set or does not contain batch dimension. Assuming that binary "
+                  "data read from file contains data for all batches."
+               << slog::endl;
+  }
+
+  for (size_t b = 0, input_idx = request_id * batch_size * input_size + input_id; b < binary_batch_size; b++, input_idx += input_size) {
+    input_idx %= files.size();
+    if (input_idx <= MAX_COUT_WITHOUT_VERBOSE || verbose) {
+      slog::info << "Prepare binary file " << files[input_idx] << slog::endl;
+      if (!verbose && input_idx == MAX_COUT_WITHOUT_VERBOSE) {
+        slog::info << "Truncating list of input files. Run with --verbose for complete list." << slog::endl;
+      }
+    }
+    std::ifstream binary_file(files[input_idx], std::ios_base::binary | std::ios_base::ate);
+    OPENVINO_ASSERT(binary_file, "Cannot open ", files[input_idx]);
+
+    auto file_size = static_cast<std::size_t>(binary_file.tellg());
+    binary_file.seekg(0, std::ios_base::beg);
+    OPENVINO_ASSERT(binary_file.good(), "Can not read ", files[input_idx]);
+    auto input_size = tensor_size * sizeof(T) / binary_batch_size;  // bytes per batch entry; shadows the input_size parameter
+    OPENVINO_ASSERT(file_size == input_size,
+                    "File ",
+                    files[input_idx],
+                    " contains ",
+                    file_size,
+                    " bytes, but the model expects ",
+                    input_size);
+
+    if (input_info.layout != "CN") {
+      binary_file.read(&data[b * input_size], input_size);
+    } else {
+      // "CN" layout: read one value of type T per channel, interleaved across batch entries
+      for (size_t i = 0; i < input_info.GetChannels(); i++) {
+        binary_file.read(&data[(i * binary_batch_size + b) * sizeof(T)], sizeof(T));
+      }
+    }
+  }
+
+  auto tensor = ov::Tensor(input_info.type, input_info.data_shape, ov::Allocator(allocator));
+  return tensor;
+}
+
+/**
+ * @brief Fills an input tensor with synthetic data, used when no input files are provided
+ *
+ * Helper function to GetStaticTensors(), not used outside this file.
+ *
+ * @tparam T element type the buffer is written as
+ * @tparam T2 element type of the uniform distribution (see NOTE below)
+ * @param input_info InputInfo struct corresponding to the input node of the tensor
+ * @param rand_min Min. random value
+ * @param rand_max Max. random value
+ * @return ov::Tensor containing the generated input data
+*/
+template <typename T, typename T2>
+ov::Tensor CreateTensorRandom(const dla_benchmark::InputInfo& input_info,
+                              T rand_min = std::numeric_limits<uint8_t>::min(),
+                              T rand_max = std::numeric_limits<uint8_t>::max()) {
+  size_t tensor_size =
+      std::accumulate(input_info.data_shape.begin(), input_info.data_shape.end(), 1, std::multiplies<size_t>());
+  auto allocator = std::make_shared<SharedTensorAllocator>(tensor_size * sizeof(T));
+  auto data = reinterpret_cast<T*>(allocator->get_buffer());
+
+  // NOTE(review): the RNG and distribution below are constructed but never used; the buffer
+  // is filled with the deterministic pattern i % 255, so rand_min/rand_max currently have
+  // no effect. Presumably intentional for reproducible runs/diff checks -- confirm, or
+  // fill with distribution(gen) for truly random data.
+  std::mt19937 gen(0);
+  uniformDistribution<T2> distribution(rand_min, rand_max);
+  for (size_t i = 0; i < tensor_size; i++) {
+    data[i] = static_cast<T>(i%255);
+  }
+
+  ov::Shape tensor_shape = input_info.data_shape;
+  // FPGA model only supports channel first.
+  // The transpose for case NHWC and HWC below is ok since the tensor has randomly generated input data.
+  if (input_info.layout == "NHWC") {
+    // Use NCHW instead of NHWC since FPGA model only supports channel first.
+    tensor_shape = {input_info.GetBatch(), input_info.GetChannels(),
+                    input_info.GetHeight(), input_info.GetWidth()};
+  } else if (input_info.layout == "HWC") {
+    // Use CHW instead of HWC since FPGA model only supports channel first.
+    tensor_shape = {input_info.GetChannels(), input_info.GetHeight(), input_info.GetWidth()};
+  }
+
+  auto tensor = ov::Tensor(input_info.type, tensor_shape, ov::Allocator(allocator));
+  return tensor;
+}
+
+/**
+ * @brief Wrapper for CreateTensorFromImage, uses appropriate stl data type for precision
+ *
+ * See CreateTensorFromImage for params. Helper for GetStaticTensors, not used outside this file.
+*/
+ov::Tensor GetImageTensor(const std::vector<std::string>& files,
+                          const size_t input_id,
+                          const size_t batch_size,
+                          const size_t input_size,
+                          const size_t request_id,
+                          const std::pair<std::string, dla_benchmark::InputInfo>& input_info,
+                          const FormatReader::Reader::ResizeType resize_type,
+                          const bool bgr = false,
+                          const bool verbose = false) {
+  // Edwinzha: All image data will be read as U8 but saved as a float in tensor data structure.
+  // Saving as U8 results in accuracy loss in diff check, especially in mobilenet graphs.
+  const ov::element::Type_t type = input_info.second.type;
+  if (type == ov::element::f16) {
+    return CreateTensorFromImage<ov::float16>(
+        files, input_id, batch_size, input_size, request_id, input_info.second, input_info.first, resize_type, bgr, verbose);
+  } else {
+    // Every non-f16 precision is stored through float (see note above).
+    return CreateTensorFromImage<float>(
+        files, input_id, batch_size, input_size, request_id, input_info.second, input_info.first, resize_type, bgr, verbose);
+  }
+}
+
+/**
+ * @brief Wrapper for CreateTensorFromVideo, uses appropriate stl data type for precision
+ *
+ * See CreateTensorFromVideo for params. Helper for GetStaticTensors, not used outside this file.
+*/
+ov::Tensor GetVideoTensor(const std::vector<std::string>& files,
+                          const size_t input_id,
+                          const size_t batch_size,
+                          const size_t input_size,
+                          const size_t request_id,
+                          const std::pair<std::string, dla_benchmark::InputInfo>& input_info,
+                          const bool bgr = false,
+                          const bool verbose = false) {
+  // Dispatch on the input node's element type; unsupported precisions are a hard error.
+  auto type = input_info.second.type;
+  if (type == ov::element::f32) {
+    return CreateTensorFromVideo<float>(
+        files, input_id, batch_size, input_size, request_id, input_info.second, input_info.first, bgr, verbose);
+  } else if (type == ov::element::u8) {
+    return CreateTensorFromVideo<uint8_t>(
+        files, input_id, batch_size, input_size, request_id, input_info.second, input_info.first, bgr, verbose);
+  } else if (type == ov::element::i32) {
+    return CreateTensorFromVideo<int32_t>(
+        files, input_id, batch_size, input_size, request_id, input_info.second, input_info.first, bgr, verbose);
+  } else if (type == ov::element::f16) {
+    return CreateTensorFromVideo<ov::float16>(
+        files, input_id, batch_size, input_size, request_id, input_info.second, input_info.first, bgr, verbose);
+  } else {
+    throw ov::Exception("Video input tensor type is not supported: " + input_info.first);
+  }
+}
+
+/**
+ * @brief Wrapper for CreateTensorRandom, uses appropriate stl data type for precision
+ *
+ * See CreateTensorRandom for params. Helper for GetStaticTensors, not used outside this file.
+*/
+ov::Tensor GetRandomTensor(const std::pair<std::string, dla_benchmark::InputInfo>& input_info) {
+  auto type = input_info.second.type;
+  if (type == ov::element::f32) {
+    return CreateTensorRandom<float, float>(input_info.second);
+  } else if (type == ov::element::f16) {
+    // NOTE(review): f16 data is generated through 16-bit integer storage (short) rather
+    // than ov::float16 like the sibling wrappers in this file -- confirm this is intended.
+    return CreateTensorRandom<short, short>(input_info.second);
+  } else if (type == ov::element::i32) {
+    return CreateTensorRandom<int32_t, int32_t>(input_info.second);
+  } else if (type == ov::element::u8) {
+    // uniform_int_distribution<uint8_t> is not allowed in the C++17
+    // standard and vs2017/19
+    return CreateTensorRandom<uint8_t, uint32_t>(input_info.second);
+  } else if (type == ov::element::i8) {
+    // uniform_int_distribution<int8_t> is not allowed in the C++17 standard
+    // and vs2017/19
+    return CreateTensorRandom<int8_t, int32_t>(
+        input_info.second, std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max());
+  } else if (type == ov::element::u16) {
+    return CreateTensorRandom<uint16_t, uint16_t>(input_info.second);
+  } else if (type == ov::element::i16) {
+    return CreateTensorRandom<int16_t, int16_t>(input_info.second);
+  } else {
+    throw ov::Exception("Random input tensor type is not supported: " + input_info.first);
+  }
+}
+
+/**
+ * @brief Wrapper for CreateTensorImInfo, uses appropriate stl data type for precision
+ *
+ * See CreateTensorImInfo for params. Helper for GetStaticTensors, not used outside this file.
+*/
+ov::Tensor GetImInfoTensor(const std::pair<size_t, size_t>& image_size,
+                           size_t batch_size,
+                           const std::pair<std::string, dla_benchmark::InputInfo>& input_info) {
+  auto type = input_info.second.type;
+  if (type == ov::element::f32) {
+    return CreateTensorImInfo<float>(image_size, batch_size, input_info.second, input_info.first);
+  } else if (type == ov::element::f64) {
+    return CreateTensorImInfo<double>(image_size, batch_size, input_info.second, input_info.first);
+  } else if (type == ov::element::f16) {
+    return CreateTensorImInfo<ov::float16>(image_size, batch_size, input_info.second, input_info.first);
+  } else if (type == ov::element::i32) {
+    return CreateTensorImInfo<int32_t>(image_size, batch_size, input_info.second, input_info.first);
+  } else if (type == ov::element::i64) {
+    return CreateTensorImInfo<int64_t>(image_size, batch_size, input_info.second, input_info.first);
+  } else {
+    // NOTE(review): message is missing a space after the colon (runtime string left unchanged).
+    throw ov::Exception("Image info input tensor type is not supported:" + input_info.first);
+  }
+}
+
+/**
+ * @brief Wrapper for CreateTensorFromBinary, uses appropriate stl data type for precision
+ *
+ * See CreateTensorFromBinary for params. Helper for GetStaticTensors, not used outside this file.
+*/
+ov::Tensor GetBinaryTensor(const std::vector<std::string>& files,
+                           const size_t input_id,
+                           const size_t batch_size,
+                           const size_t input_size,
+                           const size_t request_id,
+                           const std::pair<std::string, dla_benchmark::InputInfo>& input_info,
+                           const bool verbose = false) {
+  const auto& type = input_info.second.type;
+  if (type == ov::element::f32) {
+    return CreateTensorFromBinary<float>(
+        files, input_id, batch_size, input_size, request_id, input_info.second, input_info.first, verbose);
+  } else if (type == ov::element::f16) {
+    return CreateTensorFromBinary<ov::float16>(
+        files, input_id, batch_size, input_size, request_id, input_info.second, input_info.first, verbose);
+  } else if (type == ov::element::i32) {
+    return CreateTensorFromBinary<int32_t>(
+        files, input_id, batch_size, input_size, request_id, input_info.second, input_info.first, verbose);
+  } else if ((type == ov::element::u8)) {
+    return CreateTensorFromBinary<uint8_t>(
+        files, input_id, batch_size, input_size, request_id, input_info.second, input_info.first, verbose);
+  } else {
+    throw ov::Exception("Binary input tensor type is not supported: " + input_info.first);
+  }
+}
+
+/**
+ * @brief Main function used by DLA benchmark, creates input tensors based off of input files and precision
+ *
+ * Only creates static tensors (no dims of -1). Calls all other functions in this file.
+ *
+ * @param input_files vector of input file paths
+ * @param batch_size batch size of input
+ * @param inputs_info map of input name to InputInfo struct which contains useful input information
+ * such as precision, tensor layout
+ * @param requests_num number of infer requests
+ * @param resize_type image resize mode, either "resize" or "pad_resize"
+ * @param bgr boolean indicating if channels are reversed, corresponds to user bgr flag
+ * @param is_binary_data boolean indicating if the image data should be binary, corresponding to user binary flag
+ * @param verbose Verbosity boolean. If true, additional logs are printed
+ * @return A map of input name with tensor vectors. TensorVector being an alias of ov::Tensors where
+ * each index corresponds to the batch
+*/
+std::map<std::string, ov::TensorVector> GetStaticTensors(const std::vector<std::string>& input_files,
+                                                         const size_t& batch_size,
+                                                         dla_benchmark::InputsInfo& inputs_info,
+                                                         size_t requests_num,
+                                                         std::string resize_type,
+                                                         bool bgr = false,
+                                                         bool is_binary_data = false,
+                                                         bool verbose = false) {
+  std::map<std::string, ov::TensorVector> blobs;
+  std::vector<std::pair<size_t, size_t>> net_input_im_sizes;
+  std::vector<std::tuple<size_t, size_t, size_t>> net_input_vid_sizes;
+  FormatReader::Reader::ResizeType resize_type_enum;
+
+  // Map the user-facing resize mode string onto the FormatReader enum; anything else is a
+  // fatal usage error.
+  if (resize_type == "resize") {
+    resize_type_enum = FormatReader::Reader::ResizeType::RESIZE;
+  } else if (resize_type == "pad_resize") {
+    resize_type_enum = FormatReader::Reader::ResizeType::PAD_RESIZE;
+  } else {
+    slog::err << resize_type << " is not a valid -resize_type option" << slog::endl;
+    exit(1);
+  }
+
+  // First pass: collect image/video dimensions and log each input's precision and shape.
+  for (auto& item : inputs_info) {
+    const std::string& name = item.first;
+    const auto& input_info = item.second;
+    if (input_info.IsImage() && !is_binary_data) {
+      net_input_im_sizes.emplace_back(input_info.GetWidth(), input_info.GetHeight());
+    } else if (input_info.IsVideo()) {
+      net_input_vid_sizes.emplace_back(input_info.GetDepth(), input_info.GetWidth(), input_info.GetHeight());
+    }
+    slog::info << "Network input '" << name << "' precision " << input_info.type << ", dimensions "
+               << input_info.layout.to_string() << ": ";
+    slog::info << "[";
+    for (size_t i = 0; i < input_info.data_shape.size(); ++i) {
+      slog::info << input_info.data_shape[i];
+      if (i < input_info.data_shape.size() - 1) {
+        slog::info << " ";
+      }
+    }
+    slog::info << "]" << slog::endl;
+  }
+
+  // Inputs that are neither image nor video are treated as binary inputs.
+  size_t img_input_count = net_input_im_sizes.size();
+  size_t vid_input_count = net_input_vid_sizes.size();
+  size_t bin_input_count = inputs_info.size() - img_input_count - vid_input_count;
+
+  std::vector<std::string> binary_files;
+  std::vector<std::string> image_files;
+  std::vector<std::string> video_files;
+
+  if (input_files.empty()) {
+    slog::warn << "No input files were given: all inputs will be filled with random values!" << slog::endl;
+  } else {
+    binary_files = FilterFilesByExtensions(input_files, supported_binary_extensions);
+    std::sort(std::begin(binary_files), std::end(binary_files));
+
+    auto bins_to_be_used = bin_input_count * batch_size * requests_num;
+    if (bins_to_be_used > 0 && binary_files.empty()) {
+      std::stringstream ss;
+      for (auto& ext : supported_binary_extensions) {
+        if (!ss.str().empty()) {
+          ss << ", ";
+        }
+        ss << ext;
+      }
+      slog::warn << "No supported binary inputs found! Please check your file "
+                    "extensions: "
+                 << ss.str() << slog::endl;
+    } else if (bins_to_be_used > binary_files.size()) {
+      slog::warn << "Some binary input files will be duplicated: " << bins_to_be_used << " files are required but only "
+                 << binary_files.size() << " are provided" << slog::endl;
+    } else if (bins_to_be_used < binary_files.size()) {
+      slog::warn << "Some binary input files will be ignored: only " << bins_to_be_used << " are required from "
+                 << binary_files.size() << slog::endl;
+    }
+
+    image_files = FilterFilesByExtensions(input_files, supported_image_extensions);
+    std::sort(std::begin(image_files), std::end(image_files));
+
+    auto imgs_to_be_used = img_input_count * batch_size * requests_num;
+    if (imgs_to_be_used > 0 && image_files.empty()) {
+      std::stringstream ss;
+      for (auto& ext : supported_image_extensions) {
+        if (!ss.str().empty()) {
+          ss << ", ";
+        }
+        ss << ext;
+      }
+      slog::warn << "No supported image inputs found! Please check your file "
+                    "extensions: "
+                 << ss.str() << slog::endl;
+    } else if (imgs_to_be_used > image_files.size()) {
+      slog::warn << "Some image input files will be duplicated: " << imgs_to_be_used << " files are required but only "
+                 << image_files.size() << " are provided" << slog::endl;
+    } else if (imgs_to_be_used < image_files.size()) {
+      slog::warn << "Some image input files will be ignored: only " << imgs_to_be_used << " are required from "
+                 << image_files.size() << slog::endl;
+    }
+
+    video_files = FilterFilesByExtensions(input_files, supported_video_extensions);
+    std::sort(std::begin(video_files), std::end(video_files));
+    auto vids_to_be_used = vid_input_count * requests_num;
+    if (vids_to_be_used > 0 && video_files.empty()) {
+      std::stringstream ss;
+      for (auto& ext : supported_video_extensions) {
+        if (!ss.str().empty()) {
+          ss << ", ";
+        }
+        ss << ext;
+      }
+      slog::warn << "No supported video inputs found! Please check your file extensions: " << ss.str() << slog::endl;
+    } else if (vids_to_be_used > video_files.size()) {
+      slog::warn << "Some video input files will be duplicated: " << vids_to_be_used << " files are required but only "
+                 << video_files.size() << " are provided" << slog::endl;
+    } else if (vids_to_be_used < video_files.size()) {
+      slog::warn << "Some video input files will be ignored: only " << vids_to_be_used << " are required from "
+                 << video_files.size() << slog::endl;
+    }
+  }
+
+  // Second pass: build one tensor per request per input. Any input that has no matching
+  // files (and is not image-info) falls through to the random fill at the bottom.
+  for (size_t i = 0; i < requests_num; ++i) {
+    size_t img_input_id = 0;
+    size_t bin_input_id = 0;
+    size_t vid_input_id = 0;
+
+    for (auto& item : inputs_info) {
+      const std::string& input_name = item.first;
+      const auto& input_info = item.second;
+      if (item.second.IsImage() && !is_binary_data) {
+        if (!image_files.empty()) {
+          // Fill with images
+          blobs[input_name].push_back(GetImageTensor(
+              image_files, img_input_id++, batch_size, img_input_count, i, {input_name, input_info}, resize_type_enum, bgr, verbose));
+          continue;
+        }
+      } else if (input_info.IsVideo()) {
+        if (!video_files.empty()) {
+          // Fill with videos
+          blobs[input_name].push_back(GetVideoTensor(
+              video_files, vid_input_id++, batch_size, vid_input_count, i, {input_name, input_info}, bgr, verbose));
+          continue;
+        }
+      } else {
+        if (!binary_files.empty()) {
+          // Fill with binary files
+          blobs[input_name].push_back(
+              GetBinaryTensor(binary_files, bin_input_id++, batch_size, bin_input_count, i, {input_name, input_info}, verbose));
+          continue;
+        }
+        if (input_info.IsImageInfo() && (net_input_im_sizes.size() == 1)) {
+          // Most likely it is image info: fill with image information
+          auto image_size = net_input_im_sizes.at(0);
+          blobs[input_name].push_back(GetImInfoTensor(image_size, batch_size, {input_name, input_info}));
+          continue;
+        }
+      }
+      // Fill random
+      slog::info << "No suitable input data found, filling input tensors with random data.\n";
+      blobs[input_name].push_back(GetRandomTensor({input_name, input_info}));
+    }
+  }
+
+  return blobs;
+}
+
+/**
+ * @brief Copies data from a source OpenVINO Tensor to a destination Tensor.
+ *
+ * @param dst The destination Tensor where data will be copied.
+ * @param src The source Tensor from which data will be copied.
+ * @throws std::runtime_error if the tensors differ in shape or byte size.
+ */
+void CopyTensorData(ov::Tensor& dst, const ov::Tensor& src) {
+  if (src.get_shape() != dst.get_shape() || src.get_byte_size() != dst.get_byte_size()) {
+    throw std::runtime_error(
+        "Source and destination tensors shapes and byte sizes are expected to be equal for data copying.");
+  }
+
+  // Flat byte copy; both tensors are assumed host-accessible and densely packed.
+  memcpy(dst.data(), src.data(), src.get_byte_size());
+}
diff --git a/python/openvino/runtime/dla_benchmark/inputs_filling.hpp b/python/openvino/runtime/dla_benchmark/inputs_filling.hpp
new file mode 100644
index 0000000..e392bd7
--- /dev/null
+++ b/python/openvino/runtime/dla_benchmark/inputs_filling.hpp
@@ -0,0 +1,45 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+// Description: This file defines methods to fill input data into tensors
+
+#pragma once
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include <openvino/openvino.hpp>
+#include "infer_request_wrap.hpp"
+
+/**
+ * @brief Main function used by DLA benchmark, creates input tensors based off of input files and precision
+ *
+ * Only creates static tensors (no dims of -1). Calls all other functions in this file.
+ *
+ * @param input_files vector of input file paths
+ * @param batch_size batch size of input
+ * @param app_inputs_info map of input name to InputInfo struct which contains useful input information
+ * such as precision, tensor layout
+ * @param requests_num number of infer requests
+ * @param resize_type image resize mode, either "resize" or "pad_resize"
+ * @param bgr boolean indicating if channels are reversed, corresponds to user bgr flag
+ * @param is_binary_data boolean indicating if the image data should be binary, corresponding to user binary flag
+ * @param verbose Verbosity boolean. If true, additional logs are printed
+ * @return A map of input name with tensor vectors. TensorVector being an alias of ov::Tensors where
+ * each index corresponds to the batch
+ *
+ * NOTE(review): the defining translation unit supplies default values for the trailing
+ * parameters; since this declaration omits them, callers in other translation units must
+ * pass all arguments explicitly.
+*/
+std::map<std::string, ov::TensorVector> GetStaticTensors(const std::vector<std::string>& input_files,
+                                                         const size_t& batch_size,
+                                                         dla_benchmark::InputsInfo& app_inputs_info,
+                                                         size_t requests_num,
+                                                         std::string resize_type,
+                                                         bool bgr,
+                                                         bool is_binary_data,
+                                                         bool verbose);
+/**
+ * @brief Copies data from a source OpenVINO Tensor to a destination Tensor.
+ *
+ * @param dst The destination Tensor where data will be copied.
+ * @param src The source Tensor from which data will be copied.
+ */
+void CopyTensorData(ov::Tensor& dst, const ov::Tensor& src);
diff --git a/python/openvino/runtime/dla_benchmark/main.cpp b/python/openvino/runtime/dla_benchmark/main.cpp
new file mode 100644
index 0000000..9d9055d
--- /dev/null
+++ b/python/openvino/runtime/dla_benchmark/main.cpp
@@ -0,0 +1,1575 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+// Description: Main file of DLA benchmark. Entry point of DLA for just in time, ahead of time execution
+// and any use case of DLA performing inference. This file is responsible for the end to end flow of DLA,
+// from reading user input arguments, creating input tensors, compiling models, running inference
+// dumping results. DLA benchmark is loosely based off of OpenVINO's sample benchmark app.
+// For future OpenVINO uplifts viewing their sample app is a good place to start.
+// Ref: [openvinotoolkit/openvino › samples/cpp/benchmark_app/main.cpp]
+
+#include <algorithm>
+#include <chrono>
+#include <cstddef>
+#include <map>
+#include <memory>
+#include <stdexcept>
+#include <string>
+#include <utility>
+#include <vector>
+#if defined(_WIN32) || defined(_WIN64)
+#include <io.h>
+#define NOMINMAX
+#include <Windows.h>
+#else
+#include <dirent.h>
+#include <unistd.h>
+#endif
+#include <stdio.h>
+#include <sys/stat.h>
+#include <fstream>
+#include <regex>
+
+#include <samples/args_helper.hpp>
+#include <samples/common.hpp>
+#include <samples/slog.hpp>
+
+// DLA utils
+#include "dla_stl_utils.h"
+#include "dla_defines.h"
+
+// DLA benchmark
+#include "average_precision.hpp"
+#include "dla_benchmark.hpp"
+#include "dla_plugin_config.hpp"
+#include "infer_request_wrap.hpp"
+#include "inputs_filling.hpp"
+#include "progress_bar.hpp"
+#include "statistics_report.hpp"
+#include "top1_top5.hpp"
+#include "utils.hpp"
+
+using DebugNetworkData = std::map<std::string, uint64_t>;
+using LSUCounterData = std::map<std::string, uint64_t>;
+
+static const size_t progressBarDefaultTotalCount = 1000;
+
+// Get value from env variable named 'name', if it exists.
+// If not, returns provided default value.
+// NOTE(review): if the variable is set but fails to parse as T, the failed extraction
+// value-initializes `result` (e.g. 0 for arithmetic types) instead of keeping
+// default_value -- confirm this is acceptable for all call sites.
+template <class T>
+T GetEnvOrDefault(const char* name, T default_value) {
+  char* str_val = std::getenv(name);
+  T result = default_value;
+  if (str_val != NULL) {
+    std::stringstream ss;
+    ss << str_val;
+    ss >> result;
+  }
+  return result;
+}
+
+// Returns true if a file or directory exists at 'name' (stat-based).
+bool ExistsTest(const std::string& name) {
+  struct stat buffer;
+  return (stat(name.c_str(), &buffer) == 0);
+}
+
+// Returns true if 'path' refers to an existing regular file (not a directory).
+bool isFile(const std::string& path) {
+#if defined(_WIN32) || defined(_WIN64)
+  // NOTE(review): this print looks like leftover debug logging; consider removing.
+  std::cout << "Windows-specific implementation for checking if something is a file" << std::endl;
+  // Windows-specific implementation
+  DWORD fileAttr = GetFileAttributesA(path.c_str());
+  if (fileAttr == INVALID_FILE_ATTRIBUTES) {
+    // The path does not exist or an error occurred.
+    return false;
+  }
+  // Check if it's not a directory.
+  return !(fileAttr & FILE_ATTRIBUTE_DIRECTORY);
+#else
+  // UNIX-specific implementation
+  struct stat buffer;
+  if (stat(path.c_str(), &buffer) == 0) {
+    return S_ISREG(buffer.st_mode);
+  }
+  return false;
+#endif
+}
+
+// This function appears in dla_aot_splitter/src/main.cpp too
+// Tests that 'name' is usable as an output location. Returns true when the path can be
+// opened as a directory, and also when the path does not exist at all; it throws only
+// when the path exists but is neither an openable directory nor a regular file.
+bool DirOpenTest(const std::string& name) {
+#if (!defined(_WIN32) && !defined(_WIN64))
+  // If we can open the directory then return true
+  DIR* dp = opendir(name.c_str());
+  if (dp != nullptr) {
+    closedir(dp);
+    return true;
+  }
+#endif  // !_WIN32 && !_WIN64
+  struct stat sb;
+  if (stat(name.c_str(), &sb) == 0) {
+    if ((sb.st_mode & S_IFMT) != S_IFREG) {
+      slog::err << "File " << name << " cannot be opened!" << slog::endl;
+      throw std::logic_error("File cannot be opened!");
+    }
+  }
+  return true;
+}
+
+// Define a custom comparison function to sort based on ASCII names
+// (lexicographic compare of the nodes' friendly names).
+bool CompareOutputNodeNames(const ov::Output<const ov::Node>& node1, const ov::Output<const ov::Node>& node2) {
+  return node1.get_any_name() < node2.get_any_name();
+}
+
+// copy arguments into a new array to split the '-i=<arg>' into
+// two arguments (i.e. '-i' and '<arg>') to overcome a bug in the
+// parseInputFilesArguments function where it doesn't recognize
+// the -i=<arg> format
+void ParseCommandLine(int argc, char** argv) {
+  int num_args = argc;
+  // allocated enough memory in case we needed to split the -i argument into two
+  char** arguments = new char*[num_args + 1];
+  for (int i = 0, j = 0; j < argc; ++i, ++j) {
+    if (strstr(argv[j], "-i=")) {
+      // number of arguments will increase by one after splitting
+      num_args++;
+      // 3 bytes: "-i" plus the terminating NUL
+      arguments[i] = new char[3];
+      strcpy(arguments[i++], "-i");
+      // copy the reset of the argument (i.e. post "-i=")
+      // strlen - 3 characters after "-i=", plus NUL => strlen - 2 bytes
+      arguments[i] = new char[strlen(argv[j]) - 2];
+      strcpy(arguments[i], argv[j] + 3);
+      continue;
+    }
+    arguments[i] = new char[strlen(argv[j]) + 1];
+    strcpy(arguments[i], argv[j]);
+  }
+  // the parse function is modifying the arguments point so we need to keep
+  // a copy of the original pointer value to delete it properly
+  char** orig_arg_ptr = arguments;
+  gflags::ParseCommandLineNonHelpFlags(&num_args, &arguments, true);
+  // NOTE(review): with remove_flags=true, gflags compacts recognized flags out of the
+  // array and shrinks num_args, so entries past the new num_args are never deleted --
+  // a small one-time leak at startup. Confirm and consider tracking the original count.
+  // delete the allocated memory
+  for (int i = 0; i < num_args; ++i) {
+    delete[] orig_arg_ptr[i];
+  }
+  delete[] orig_arg_ptr;
+}
+
+// Resolves FLAGS_plugins to an existing plugins xml file: honours the deprecated
+// -plugins_xml_file flag, falls back to the default runtime location, and supports the
+// "emulation" shortcut. Returns false when no usable file could be found.
+bool CheckAndSetPluginsPath(const char* coredla_root) {
+  // plugins_xml_file should probably be removed in the future
+  if (!FLAGS_plugins_xml_file.empty()) {
+    FLAGS_plugins = FLAGS_plugins_xml_file;
+    slog::warn << "====================================================================" << slog::endl;
+    slog::warn << "Warning: -plugins_xml_file option is deprecated, please use -plugins." << slog::endl;
+    slog::warn << "====================================================================" << slog::endl;
+  }
+
+  const char* coredla_work = std::getenv("COREDLA_WORK");
+  std::string coredla_root_str = coredla_root;
+  if (FLAGS_plugins.empty()) {
+    // No explicit path: default to $COREDLA_WORK/runtime/plugins.xml when COREDLA_WORK is
+    // set, otherwise $COREDLA_ROOT/runtime/plugins.xml.
+    if (coredla_work == nullptr) {
+      FLAGS_plugins = coredla_root_str + "/runtime/plugins.xml";
+    } else {
+      std::string coredla_work_str = coredla_work;
+      FLAGS_plugins = coredla_work_str + "/runtime/plugins.xml";
+    }
+
+    if (ExistsTest(FLAGS_plugins)) {
+      slog::info << "Using default plugins xml file - " << FLAGS_plugins << slog::endl;
+      return true;
+    }
+  }
+
+  if (ExistsTest(FLAGS_plugins) && isFile(FLAGS_plugins)) {
+    slog::info << "Using custom plugins xml file - " << FLAGS_plugins << slog::endl;
+    return true;
+  }
+  // Check if user wants a shortcut to software emulation xml file if a path does not exist
+  if (FLAGS_plugins.find("emulation") != std::string::npos) {
+    // Potential paths for the plugins_emulation.xml file
+    std::string deployed_loc_plugins = coredla_root_str + "/bin/plugins_emulation.xml";
+    std::string developer_loc_plugins = coredla_root_str + "/build/coredla/dla/bin/plugins_emulation.xml";
+
+    if (ExistsTest(deployed_loc_plugins))
+      FLAGS_plugins = deployed_loc_plugins;
+    else if (ExistsTest(developer_loc_plugins))
+      FLAGS_plugins = developer_loc_plugins;
+  } else {
+    // if user didn't specify emulation and user did not pass any xml file, raise an error
+    throw std::invalid_argument("Invalid argument for -plugins. Use 'emulation' or a path to custom xml file");
+  }
+
+  if (ExistsTest(FLAGS_plugins)) {
+    slog::info << "Using custom emulation xml file - " << FLAGS_plugins << slog::endl;
+    return true;
+  }
+
+  return false;
+}
+
+// Parses and validates all command-line flags, resolving derived settings (model paths,
+// plugins xml). Sets net_size to the number of graphs found in -m/-cm. Returns false when
+// the user only asked for help; throws on invalid input.
+bool ParseAndCheckCommandLine(int argc, char* argv[], size_t& net_size) {
+  // ---------------------------Parsing and validating input arguments--------------------------------------
+  slog::info << "Parsing input parameters" << slog::endl;
+
+  // Check for any flags that are missing their preceding dashes
+  // GFlags quietly ignores any flags missing their dashes, which can cause
+  // dla_benchmark to run with settings other than what the user intended
+
+  // GFlags supports two different styles of flag:
+  // 1. --<flag>
+  // 2. -<flag>
+  // It also supports two different ways of specifying values for flags which
+  // take values:
+  // 1. --<flag>=<value>
+  // 2. --<flag> <value>
+
+  // If we are not expecting a flag, we are expecting a value for the
+  // preceding flag
+  bool expecting_flag = true;
+  // Start at 1 to skip the command itself
+  for (int i = 1; i < argc; i++) {
+    if (expecting_flag) {
+      // A flag is always denoted by the first char being '-'
+      if (argv[i][0] != '-') {
+        slog::err << "Argument " << argv[i] << " is invalid. You"
+                  << " may have forgotten a preceding '-'." << slog::endl;
+        throw std::logic_error("One or more invalid arguments");
+      }
+
+      char* flag_name_start = (argv[i][1] == '-') ? &argv[i][2] : &argv[i][1];
+      std::string flag_name;
+
+      gflags::CommandLineFlagInfo flag_info;
+      // Strip a trailing "=<value>" so only the flag name is looked up.
+      if (strstr(flag_name_start, "=")) {
+        flag_name = std::string(flag_name_start, size_t(strstr(flag_name_start, "=") - flag_name_start));
+      } else {
+        flag_name = std::string(flag_name_start);
+      }
+
+      // We expect a flag in the next argv if the current flag is a bool,
+      // because bool flags do not take a value.
+      // If GetCommandLineFlagInfo returns false, we assume the current
+      // flag is a boolean because boolean flags can be specified as
+      // -no<flag>, which is equivalent to -<flag>=false, or the flag
+      // simply being omitted. However, "no<flag>" is not recognized by
+      // GetCommandLineFlagInfo.
+      // Therefore, if the name is not recognized either the flag is a
+      // boolean flag or doesn't exist. In the latter case, gflags errors
+      // when we call ParseCommandLine so we can assume here it's a bool.
+      if (!GetCommandLineFlagInfo(flag_name.c_str(), &flag_info) || strstr(argv[i], "=") || flag_info.type == "bool") {
+        expecting_flag = true;
+      } else {
+        expecting_flag = false;
+      }
+    } else {
+      // If we were expecting a value, doesn't matter what it is
+      // gflags will check all values are the correct type, and
+      // dla_benchmark checks if the values received are sane
+      expecting_flag = true;
+    }
+  }
+
+  ParseCommandLine(argc, argv);
+
+  if (FLAGS_help || FLAGS_h) {
+    ShowUsage();
+    // CoreDLA: Version 2020.3 of OpenVINO assumes that the PAC board with OPAE on it
+    // is an OpenCL/DLAv1 device. Since it is not, it then errors-out when the device
+    // does not response as expected to the OpenCL query.
+    // showAvailableDevices();
+    std::cout << "\n";
+    return false;
+  }
+
+  if (FLAGS_hidden_help) {
+    PrintHiddenHelp();
+    return false;
+  }
+
+  // Model selection: -cm (compiled model) takes precedence; otherwise exactly one of
+  // -m / --network-file must be given. Multiple graphs are separated by MULTIGRAPH_SEP.
+  if (FLAGS_cm.empty()) {
+    std::string network_file_flag;
+    if (!FLAGS_m.empty()) {
+      if (!FLAGS_network_file.empty()) {
+        throw std::invalid_argument(
+            "Both --network-file and -m are specified. Please only use one of the two arguments.");
+      }
+      network_file_flag = FLAGS_m;
+    } else if (!FLAGS_network_file.empty()) {
+      network_file_flag = FLAGS_network_file;
+    } else {
+      throw std::logic_error("Model is required but not set. Please set -m option.");
+    }
+
+    std::vector<std::string> m_paths = split(network_file_flag, MULTIGRAPH_SEP);
+    net_size = m_paths.size();
+    slog::info << "Found " << net_size << " graph" << (net_size == 1 ? "" : "s") << slog::endl;
+    for (auto& m_path : m_paths) {
+      if (!ExistsTest(m_path)) {
+        slog::err << "network file: " << m_path << " doesn't exist. Please provide a valid path with -m." << slog::endl;
+        throw std::logic_error("Model file path does not exist.");
+      }
+    }
+  } else {
+    std::vector<std::string> m_paths = split(FLAGS_cm, MULTIGRAPH_SEP);
+    net_size = m_paths.size();
+    slog::info << "Found " << net_size << " compiled graph" << (net_size == 1 ? "" : "s") << slog::endl;
+    for (auto& m_path : m_paths) {
+      if (!ExistsTest(m_path)) {
+        slog::err << "compiled model file: " << FLAGS_cm << " doesn't exist. Please provide a valid path with -cm."
+                  << slog::endl;
+        throw std::logic_error("Compiled model file path does not exist.");
+      }
+    }
+  }
+
+  if (FLAGS_api != "async" && FLAGS_api != "sync") {
+    throw std::logic_error("Incorrect API. Please set -api option to `sync` or `async` value.");
+  }
+
+  if (FLAGS_niter <= 0) {
+    throw std::logic_error("-niter is a required flag and its value must be positive");
+  }
+
+  const char* coredla_root = std::getenv("COREDLA_ROOT");
+  if (coredla_root == nullptr) {
+    slog::err << "ERROR: COREDLA_ROOT environment variable is not set." << slog::endl;
+    throw std::logic_error("Please set up correct environment variables first");
+  }
+
+  if (!CheckAndSetPluginsPath(coredla_root)) {
+    // NOTE(review): this message prints the deprecated FLAGS_plugins_xml_file value, but
+    // the path actually checked is FLAGS_plugins -- the message may be empty/misleading
+    // when only -plugins was used. Confirm which flag should be reported.
+    slog::err << "plugins_xml file: " << FLAGS_plugins_xml_file << " doesn't exist. Please provide a valid path."
+              << slog::endl;
+    throw std::logic_error("plugins_xml file path does not exist.");
+  }
+
+  // Checks required arguments for the mAP calculation subroutine.
+  if (FLAGS_enable_object_detection_ap) {
+    if (!FLAGS_yolo_version.size() || !is_yolo_supported(FLAGS_yolo_version)) {
+      slog::err << "Please specify the version of your YOLO graph by setting the -yolo_version option to "
+                   "`yolo-v3-tiny-tf` or `yolo-v3-tf` value."
+                << slog::endl;
+      throw std::logic_error("Incorrect YOLO version.");
+    }
+  }
+
+  // Checks if output directory exists and can be opened
+  if (!FLAGS_output_dir.empty()) {
+    if (!ExistsTest(FLAGS_output_dir)) {
+      slog::err << "Specified output directory: " << FLAGS_output_dir << " does not exist" << slog::endl;
+      throw std::logic_error("Output directory does not exist");
+    }
+    // Test whether the path can be opened if it's a directory
+    DirOpenTest(FLAGS_output_dir);
+  }
+
+  return true;
+}
+
+static void next_step(const std::string additional_info = "") {
+ static size_t step_id = 0;
+ static const std::map<size_t, std::string> step_names = {{1, "Parsing and validating input arguments"},
+ {2, "Loading OpenVINO Runtime"},
+ {3, "Setting device configuration"},
+ {4, "Reading the Intermediate Representation network"},
+ {5, "Resizing network to match image sizes and given batch"},
+ {6, "Configuring input of the model"},
+ {7, "Loading the model to the device"},
+ {8, "Setting optimal runtime parameters"},
+ {9, "Creating infer requests and preparing input tensors"},
+ {10, "Measuring performance"},
+ {11, "Dumping statistics report"},
+ {12, "Dumping the output values"}};
+
+ step_id++;
+ if (step_names.count(step_id) == 0)
+ THROW_IE_EXCEPTION << "Step ID " << step_id << " is out of total steps number " << step_names.size();
+
+ std::cout << "[Step " << step_id << "/" << step_names.size() << "] " << step_names.at(step_id)
+ << (additional_info.empty() ? "" : " (" + additional_info + ")") << std::endl;
+}
+
/**
 * @brief Returns the median of @p vec.
 *
 * For an odd element count the middle element of the sorted copy is
 * returned; for an even count the mean of the two middle elements is
 * computed in T (so integral T truncates, matching existing callers).
 *
 * @param vec  Input values; left unmodified (a sorted copy is made).
 * @return     The median value.
 * @throws std::invalid_argument if @p vec is empty (the previous code
 *          indexed `size/2 - 1`, which underflows to a huge index and
 *          reads out of bounds for an empty vector).
 */
template <typename T>
T GetMedianValue(const std::vector<T>& vec) {
  if (vec.empty()) {
    throw std::invalid_argument("GetMedianValue: input vector is empty");
  }
  std::vector<T> sorted_vec(vec);
  std::sort(sorted_vec.begin(), sorted_vec.end());
  const size_t mid = sorted_vec.size() / 2;
  if (sorted_vec.size() % 2 != 0) {
    return sorted_vec[mid];
  }
  return (sorted_vec[mid] + sorted_vec[mid - 1]) / static_cast<T>(2.0);
}
+
+void ReadDebugNetworkInfo(ov::Core core) {
+ if (FLAGS_debug_network) {
+ // On hardware timeout exception, fetch Debug CSR values from all modules attached to the Debug Network
+ std::vector<DebugNetworkData> debug_csr_return =
+ core.get_property("FPGA", "COREDLA_DEBUG_NETWORK_INFO").as<std::vector<DebugNetworkData>>();
+ slog::info << "Dumping Debug Network profiling counters" << slog::endl;
+ for (auto i = 0U; i < debug_csr_return.size(); i++) {
+ std::cout << "---------- CoreDLA instance " << i << " ----------" << std::endl;
+ // Print debug info for all instances
+ for (auto& instance_csr_return : debug_csr_return[i]) {
+ std::cout << instance_csr_return.first << ": " << instance_csr_return.second << std::endl;
+ }
+ }
+ }
+}
+
+void PrintLSUCounterInfo(ov::Core core) {
+ std::vector<LSUCounterData> lsu_counter_vec =
+ core.get_property("FPGA", "COREDLA_LSU_ACCESS_COUNT").as<std::vector<LSUCounterData>>();
+ slog::info << "Dumping LSU memory access counters" << slog::endl;
+ for (auto i = 0U; i < lsu_counter_vec.size(); i++) {
+ std::cout << "---------- CoreDLA instance " << i << " ----------" << std::endl;
+ for (const auto& entry : lsu_counter_vec.at(i)) {
+ std::cout << entry.first <<": " << entry.second << std::endl;
+ }
+ }
+}
+
// Returns true if the last non-whitespace character of `file` is a comma.
// Scans backwards from EOF, skipping trailing whitespace (the same six
// characters std::isspace matches in the C locale: ' ', '\t', '\n', '\v',
// '\f', '\r'). Returns false for a null stream, on seek/read failure, or
// for an empty / all-whitespace file.
// NOTE: the stream position is moved by this probe; callers here reopen
// the file before appending, so that is acceptable.
bool is_last_char_comma(FILE* file) {
  if (file == nullptr) return false;

  char last_char = '\0';
  for (long offset = -1;; --offset) {
    if (std::fseek(file, offset, SEEK_END) != 0) {
      return false;  // seeked past the start of the file (or stream is unseekable)
    }
    if (std::fread(&last_char, 1, 1, file) != 1) {
      return false;
    }
    if (!std::isspace(static_cast<unsigned char>(last_char))) {
      break;
    }
  }
  return last_char == ',';
}
+
// Returns true if `path` names an existing filesystem entry (file or
// directory), per stat(2). Takes the path by const reference so const
// strings and temporaries are accepted (the old non-const `std::string&`
// rejected them for no reason); existing callers are unaffected.
bool fileExists(const std::string& path) {
  struct stat buffer;
  return stat(path.c_str(), &buffer) == 0;
}
+
+void append_value_if_incomplete_to_csv(std::string path, double value) {
+ try {
+ if (!fileExists(path)) {
+ return;
+ }
+
+ FILE* data_file = fopen(path.c_str(), "rb");
+ if (data_file == nullptr) {
+ return;
+ }
+ bool is_comma = is_last_char_comma(data_file);
+ fclose(data_file);
+
+ if (is_comma) {
+ FILE* append_file = fopen(path.c_str(), "a");
+ if (append_file == nullptr) {
+ return;
+ }
+ fprintf(append_file, "%f\n", value);
+ fclose(append_file);
+ }
+ } catch (...) {
+ return;
+ }
+}
+
+/**
+ * @brief The entry point of the dla benchmark
+ */
+int main(int argc, char* argv[]) {
+ std::shared_ptr<StatisticsReport> statistics;
+ try {
+ // Declaring the CompiledModel object as a pointer to workaround the segfault
+ // that occurs when destructing the object. Now that it's declared as a pointer
+ // the complier won't automatically call the destructor of the object at the end
+ // of this scope and we won't delete the allocated memory either
+ std::vector<ov::CompiledModel*> compiled_models;
+ size_t net_size = 0; // parse the size of networks for arguments check
+
+ size_t return_code = 0; // universal return code, return this value after dumping out Debug info
+
+ // ----------------- 1. Parsing and validating input arguments -------------------------------------------------
+ next_step();
+
+ if (!ParseAndCheckCommandLine(argc, argv, net_size)) {
+ return 0;
+ }
+
+ bool is_model_compiled = !FLAGS_cm.empty();
+ if (is_model_compiled) {
+ slog::info << "Model is compiled" << slog::endl;
+ }
+
+ std::string arch_file_flag;
+ if (!FLAGS_arch_file.empty()) {
+ if (!FLAGS_arch.empty()) {
+ throw std::invalid_argument(
+ "Both --arch and -arch_file are specified. Please only use one of the two arguments.");
+ }
+ arch_file_flag = FLAGS_arch_file;
+ } else if (!FLAGS_arch.empty()) {
+ arch_file_flag = FLAGS_arch;
+ }
+
+ bool flag_b_default = gflags::GetCommandLineFlagInfoOrDie("b").is_default;
+ bool flag_batch_size_default = gflags::GetCommandLineFlagInfoOrDie("batch_size").is_default;
+
+ size_t batch_size_flag;
+ if (!flag_b_default) {
+ if (!flag_batch_size_default) {
+ throw std::invalid_argument(
+ "Both --batch-size and -b are specified. Please only use one of the two arguments.");
+ }
+ batch_size_flag = FLAGS_b;
+ } else {
+ batch_size_flag = FLAGS_batch_size;
+ }
+
+ if (batch_size_flag > 10000 || batch_size_flag <= 0) {
+ throw std::invalid_argument(
+ "Batch size is too big (>10000) or not a postive number (<=0). Specify the batch size within the specified "
+ "range.");
+ }
+
+ std::string network_file_flag;
+ if (!FLAGS_m.empty()) {
+ if (!FLAGS_network_file.empty()) {
+ throw std::invalid_argument(
+ "Both --network-file and -m are specified. Please only use one of the two arguments.");
+ }
+ network_file_flag = FLAGS_m;
+ } else if (!FLAGS_network_file.empty()) {
+ network_file_flag = FLAGS_network_file;
+ }
+
+ // langsu: ideally use boost to create a sub-folder for ddrfree files
+ // but ed4 toolchain doesn't have boost yet.
+ std::string output_dir;
+ std::string parameter_rom_output_dir;
+ std::string separator = dla::util::path_separator;
+ if (!FLAGS_output_dir.empty()) {
+ output_dir = FLAGS_output_dir + separator;
+ parameter_rom_output_dir = output_dir;
+ } else {
+ output_dir = "." + separator;
+ parameter_rom_output_dir = output_dir;
+ }
+
+ // The set of arguments printed is meant to be a useful summary to the
+ // user, rather than all of the arguments to dla_benchmark
+ slog::info << "Printing summary of arguments being used by dla_benchmark" << slog::endl
+ << "API (-api) ........................... " << FLAGS_api << slog::endl
+ << "Device (-d) .......................... " << FLAGS_d << slog::endl
+ << "Batch size (-b) ...................... " << batch_size_flag << slog::endl
+ << (!FLAGS_cm.empty() ? "Compiled model (-cm) ................. "
+ : "Model (-m) ........................... ")
+ << (!FLAGS_cm.empty() ? FLAGS_cm : network_file_flag) << slog::endl
+ << "Num iterations (-niter) .............. "
+ << (FLAGS_niter > 0 ? std::to_string(FLAGS_niter) : "Not specified") << slog::endl
+ << "Input images directory (-i) .......... "
+ << (!FLAGS_i.empty() ? FLAGS_i : "Not specified, will use randomly-generated images") << slog::endl
+ << "Num CPU threads (-nthreads) .......... "
+ << (FLAGS_nthreads > 0 ? std::to_string(FLAGS_nthreads) : "Not specified") << slog::endl
+ << "Architecture file (-arch_file) ....... " << arch_file_flag << slog::endl
+ << "Num inference requests (-nireq) ...... "
+ << (FLAGS_nireq > 0 ? std::to_string(FLAGS_nireq) : "Not specified") << slog::endl
+ << "Plugins file (-plugins) ..... " << FLAGS_plugins << slog::endl
+ << "Groundtruth file (-groundtruth_loc) .. "
+ << (!FLAGS_groundtruth_loc.empty() ? FLAGS_groundtruth_loc : "Not specified") << slog::endl
+ << "Reverse input image channels (-bgr) .. " << (FLAGS_bgr ? "True" : "False") << slog::endl
+ << "EA features " << (FLAGS_enable_early_access ? "enabled." : "disabled.") << slog::endl;
+
+ if (FLAGS_save_run_summary) {
+ std::vector<gflags::CommandLineFlagInfo> flags;
+ StatisticsReport::Parameters command_line_arguments;
+ gflags::GetAllFlags(&flags);
+
+ for (auto& flag : flags) {
+ if (!flag.is_default) {
+ command_line_arguments.push_back({flag.name, flag.current_value});
+ }
+ }
+
+ if (!FLAGS_pcsort.empty() &&
+ (FLAGS_pcsort != "simple_sort" && FLAGS_pcsort != "sort" && FLAGS_pcsort != "no_sort")) {
+ slog::err << "Invalid -pcsort option: " << FLAGS_pcsort << ". Please use one of sort, simple_sort, no_sort."
+ << slog::endl;
+ return 1;
+ }
+
+ statistics =
+ std::make_shared<StatisticsReport>(StatisticsReport::Config{FLAGS_save_run_summary, FLAGS_report_folder});
+ statistics->addParameters(StatisticsReport::Category::COMMAND_LINE_PARAMETERS, command_line_arguments);
+ }
+
+ /** This vector stores paths to the processed images **/
+ auto multi_input_files = VectorMap<std::vector<std::string>>(
+ SplitMultiInputFilesArguments(net_size), // get input directory list
+ [&](const std::vector<std::string>& input_args) mutable {
+ std::vector<std::string> files;
+ for (auto& input_arg : input_args) {
+ // Test if the path exists
+ if (!ExistsTest(input_arg)) {
+ slog::err << "Specified image path: " << input_arg << " does not exist" << slog::endl;
+ throw std::logic_error("Image path does not exist");
+ }
+ // Test whether the path can be opened if it's a directory
+ DirOpenTest(input_arg);
+ readInputFilesArguments(files, input_arg);
+ }
+ return files;
+ });
+
+ if (multi_input_files.size() == 0) {
+ // failed to read input files
+ slog::err << "Failed to read input files" << slog::endl;
+ return 1;
+ }
+
+ if (FLAGS_nstreams.empty()) {
+ slog::warn << "-nstreams default value is determined automatically for a device. " << slog::endl;
+ std::cout << "\tAlthough the automatic selection usually provides a reasonable performance, \n"
+ << "\tbut it still may be non-optimal for some cases, for more information look at README."
+ << std::endl;
+ }
+
+#ifdef DISABLE_JIT
+ if (!network_file_flag.empty()) {
+ slog::err << "Runtime compiled without support for Just-in-Time (JIT) execution!" << slog::endl
+ << "Either specify a compiled model using -cm <compiled_model.bin> "
+ << "or recompile the runtime without the -disable_jit flag." << slog::endl;
+ return 1;
+ }
+#endif
+
+ uint32_t num_batches = 1;
+
+ // ----------------- 2. Loading OpenVINO Runtime/Inference Engine
+ // -----------------------------------------------------------
+ next_step();
+
+ // Get optimal runtime parameters for device
+ std::string device_name = FLAGS_d;
+ if (is_model_compiled) {
+ auto compiled_graph_paths = split(FLAGS_cm, MULTIGRAPH_SEP); // separate each AOT file path
+ for (auto& compiled_graph : compiled_graph_paths) {
+ std::filebuf obj_file_buf;
+ // There does not seem to be a way to get the device from the OpenVINO executable network
+ // Instead we manually read through the xml header in the AOT graph to get the device name (an ugly hack
+ // unfortunately)
+ obj_file_buf.open(compiled_graph.c_str(), std::ios::in | std::ios::binary);
+ std::istream obj_istream(&obj_file_buf);
+ std::string xml_header, current_device;
+ getline(obj_istream, xml_header); // retrieve xml header from AOT bin file
+ if (xml_header.find("TARGET_FALLBACK") != std::string::npos) { // uses hetero plugin
+ int start_index = xml_header.find("TARGET_FALLBACK") + 24;
+ int end_index = xml_header.find("</hetero_config>") - 3;
+ current_device =
+ "HETERO:" + xml_header.substr(start_index, end_index - start_index); // get device from xml header
+ } else {
+ current_device = "FPGA";
+ }
+ if (device_name == "") { // device flag not specified in AOT flow
+ device_name = current_device;
+ } else {
+ if (current_device != device_name) { // print error for non-matching devices
+ throw std::logic_error(
+ "The AOT file does not target the expected device. "
+ "The device specified to dla_benchmark using the -d flag must be the same as the "
+ "device specified to dla_compiler using the --fplugin flag.");
+ }
+ }
+ }
+ } else {
+ if (device_name == "") device_name = "CPU"; // default device for JIT flow is CPU
+ }
+ ov::Core core(FLAGS_plugins);
+
+ if (device_name.find("CPU") != std::string::npos) {
+ core.set_property("FPGA", {{DLIAPlugin::properties::cpu_used.name(), true}});
+ }
+
+ if (arch_file_flag != "" && device_name.find("FPGA") != std::string::npos) {
+ core.set_property("FPGA", {{DLIAPlugin::properties::arch_path.name(), arch_file_flag}});
+ if (!ExistsTest(arch_file_flag)) {
+ slog::err << "architecture file: " << arch_file_flag << " doesn't exist. Please provide a valid path."
+ << slog::endl;
+ throw std::logic_error("architecture file path does not exist.");
+ }
+ if (FLAGS_encryption_key != "") {
+ core.set_property("FPGA", {{DLIAPlugin::properties::encryption_key.name(), FLAGS_encryption_key}});
+ }
+ if (FLAGS_encryption_iv != "") {
+ core.set_property("FPGA", {{DLIAPlugin::properties::encryption_iv.name(), FLAGS_encryption_iv}});
+ }
+ // If emulator is used, do not perform decryption of compiled results in the import step
+ if (FLAGS_emulator_decryption) {
+ core.set_property("FPGA", {{DLIAPlugin::properties::emulator_decryption.name(), CONFIG_VALUE(YES)}});
+ }
+ if (FLAGS_min_subgraph_layers < 1) {
+ slog::err << "-min-subgraph-layers must be >= 1" << slog::endl;
+ return 1;
+ }
+ core.set_property("FPGA", {{DLIAPlugin::properties::min_subgraph_layers.name(), FLAGS_min_subgraph_layers}});
+ }
+
+ if (device_name.find("CPU") != std::string::npos && !FLAGS_l.empty()) {
+ // CPU extensions is loaded as a shared library and passed as a pointer to base extension
+ core.add_extension(FLAGS_l);
+ slog::info << "CPU extensions is loaded " << FLAGS_l << slog::endl;
+ }
+
+ slog::info << "OpenVINO: " << ov::get_openvino_version() << slog::endl;
+ slog::info << "Device info: " << core.get_versions(device_name) << slog::endl;
+
+ // ----------------- 3. Setting device configuration -----------------------------------------------------------
+ next_step();
+
+ auto devices = ParseDevices(device_name);
+ std::map<std::string, uint32_t> device_nstreams = ParseNStreamsValuePerDevice(devices, FLAGS_nstreams);
+ for (auto& pair : device_nstreams) {
+ auto key = std::string(pair.first + "_THROUGHPUT_STREAMS");
+ std::vector<std::string> supported_config_keys =
+ core.get_property(pair.first, METRIC_KEY(SUPPORTED_CONFIG_KEYS)).as<std::vector<std::string>>();
+ if (std::find(supported_config_keys.begin(), supported_config_keys.end(), key) == supported_config_keys.end()) {
+ throw std::logic_error(
+ "Device " + pair.first + " doesn't support config key '" + key + "'! " +
+ "Please specify -nstreams for correct devices in format <dev1>:<nstreams1>,<dev2>:<nstreams2>");
+ }
+ }
+
+ // pc is for CPU only at the moment
+ bool perf_count = FLAGS_pc;
+ std::string perf_count_sort = FLAGS_pcsort;
+ for (auto& device : devices) {
+ if (device == "CPU") { // CPU supports few special performance-oriented keys
+ if (perf_count || !perf_count_sort.empty()) {
+ core.set_property("CPU", {{CONFIG_KEY(PERF_COUNT), CONFIG_VALUE(YES)}});
+ }
+ // limit threading for CPU portion of inference
+ if (FLAGS_nthreads != 0)
+ core.set_property(device, {{CONFIG_KEY(CPU_THREADS_NUM), std::to_string(FLAGS_nthreads)}});
+ core.set_property(device, {{CONFIG_KEY(CPU_BIND_THREAD), FLAGS_pin}});
+ // Set CPU to optimize throughput
+ core.set_property(device, ov::hint::performance_mode(ov::hint::PerformanceMode::THROUGHPUT));
+ // for CPU execution, more throughput-oriented execution via streams
+ if (FLAGS_api == "async") {
+ core.set_property(
+ device,
+ ov::streams::num(device_nstreams.count(device) > 0 ? ov::streams::Num(device_nstreams.at(device))
+ : ov::streams::AUTO));
+ }
+ device_nstreams[device] = core.get_property(device, ov::streams::num);
+ } else if (device == ("GPU")) {
+ if (FLAGS_api == "async") {
+ core.set_property(
+ device,
+ ov::streams::num(device_nstreams.count(device) > 0 ? ov::streams::Num(device_nstreams.at(device))
+ : ov::streams::AUTO));
+ }
+ device_nstreams[device] = core.get_property(device, ov::streams::num);
+ }
+ }
+
+ auto double_to_string = [](const double number) {
+ std::stringstream ss;
+ ss << std::fixed << std::setprecision(4) << number;
+ return ss.str();
+ };
+ auto get_total_ms_time = [](Time::time_point& start_time) {
+ return std::chrono::duration_cast<ns>(Time::now() - start_time).count() * 0.000001;
+ };
+
+ size_t batch_size = batch_size_flag;
+ std::vector<std::string> topology_names;
+ ov::element::Type precision = ov::element::undefined;
+ // Vector stores which model (multigraph), InputsInfo is a map of input names and its respctive
+ // input information
+ std::vector<dla_benchmark::InputsInfo> input_infos;
+ if (!is_model_compiled) {
+#ifndef DISABLE_JIT
+ // We choose to ifdef out this block of code because it's more readable than
+ // pulling the block in the "else" out using ifdefs
+ // ----------------- 4. Reading the Intermediate Representation network ----------------------------------------
+ next_step();
+
+ LOG_AND_PRINT(Logger::INFO, "Loading network files\n");
+
+ auto start_time_read = Time::now();
+ // get list of graphs
+ std::vector<std::shared_ptr<ov::Model>> models =
+ VectorMap<std::shared_ptr<ov::Model>>(split(network_file_flag, MULTIGRAPH_SEP), [&](const std::string& m) {
+ std::shared_ptr<ov::Model> model = core.read_model(m);
+ // Assign rt info IMMEDIATELY when DLA benchmark reads the model.
+ // Applying transformations or reshaping may change node names.
+ // Mixed Precision is an EA only feature for 2024.2
+ if (FLAGS_enable_early_access) {
+ for (auto&& node : model->get_ops()) {
+ if (dla::util::NodeTypeUsesPE(node->get_type_name())) {
+ node->get_rt_info()[DLA_PE_PRECISION_MODE] =
+ dla::util::ParseNodeForRTInfo(node->get_friendly_name(), DLA_PE_PRECISION_MODE);
+ }
+ }
+ }
+ printInputAndOutputsInfoShort(*model);
+ return model;
+ });
+
+ auto duration_ms = double_to_string(get_total_ms_time(start_time_read));
+ slog::info << "Read network(s) took " << duration_ms << " ms" << slog::endl;
+ if (statistics)
+ statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
+ {{"read network time (ms)", duration_ms}});
+
+ // ----------------- 5. Resizing network to match image sizes and given batch ----------------------------------
+ next_step();
+
+ for (size_t i = 0; i < models.size(); i++) {
+ const auto& model_inputs = std::const_pointer_cast<const ov::Model>(models[i])->inputs();
+ bool reshape = false;
+ input_infos.push_back(
+ GetInputsInfo(batch_size, model_inputs, reshape, FLAGS_bin_data, FLAGS_mean_values, FLAGS_scale_values));
+ if (reshape) {
+ dla_benchmark::PartialShapes shapes = {};
+ for (auto& item : input_infos.back()) shapes[item.first] = item.second.partial_shape;
+ slog::info << "Reshaping model to batch: " << batch_size << slog::endl;
+ models[i]->reshape(shapes);
+ }
+ topology_names.push_back(models[i]->get_friendly_name());
+ }
+
+ // ----------------- 6. Configuring input and output
+ // ----------------------------------------------------------------------
+ next_step();
+ // Set input layouts for all models and their inputs
+ size_t input_info_idx = 0;
+ for (std::shared_ptr<ov::Model> model : models) {
+ auto preproc = ov::preprocess::PrePostProcessor(model);
+ const auto& inputs = model->inputs();
+ for (size_t i = 0; i < inputs.size(); i++) {
+ ov::preprocess::InputInfo& input_info = preproc.input(i);
+ const size_t input_rank = inputs[i].get_partial_shape().size();
+ const ov::Layout& layout = ov::Layout(dla::util::getTensorLayout(input_rank));
+ const ov::element::Type_t type = input_infos[input_info_idx].at(inputs[i].get_any_name()).type;
+ input_info.tensor().set_element_type(type).set_layout(layout);
+ }
+
+ const auto& outputs = model->outputs();
+ for (size_t i = 0; i < outputs.size(); i++) {
+ const size_t output_rank = outputs[i].get_partial_shape().size();
+ const ov::Layout& layout = ov::Layout(dla::util::getTensorLayout(output_rank));
+ preproc.output(i).tensor().set_element_type(ov::element::f32).set_layout(layout);
+ }
+ // Once the build() method is called, the pre(post)processing steps
+ // for layout and precision conversions are inserted automatically
+ model = preproc.build();
+ input_info_idx++;
+ }
+ // ----------------- 7. Loading the model to the device --------------------------------------------------------
+ next_step();
+
+ // Get the value from the command line arguments (if the command line argument wasn't
+ // used by the user the default value set in dla_benchmark.hpp will be used)
+ int folding_option = FLAGS_folding_option;
+ bool fold_preprocessing = FLAGS_fold_preprocessing;
+ bool estimate_per_layer = FLAGS_estimate_per_layer_latencies;
+ bool enable_early_access = FLAGS_enable_early_access;
+ // TODO(arooney): Remove this once LT hang is fixed.
+ bool multi_infer_req = false;
+ if (FLAGS_nireq > 1 && FLAGS_api == "async") {
+ multi_infer_req = true;
+ }
+
+ core.set_property("FPGA", {{DLIAPlugin::properties::folding_option.name(), std::to_string(folding_option)}});
+ core.set_property("FPGA",
+ {{DLIAPlugin::properties::fold_preprocessing.name(), fold_preprocessing}});
+ core.set_property("FPGA",
+ {{DLIAPlugin::properties::per_layer_estimation.name(), estimate_per_layer}});
+ core.set_property("FPGA",
+ {{DLIAPlugin::properties::enable_early_access.name(), enable_early_access}});
+ core.set_property("FPGA",
+ {{DLIAPlugin::properties::multiple_inferences.name(), multi_infer_req}});
+ core.set_property("FPGA", {{DLIAPlugin::properties::streaming_input_pipe.name(), FLAGS_streaming_input_pipe}});
+
+ auto start_time = Time::now();
+ auto individual_start_time = Time::now(); // timer for each individual graph loading
+ compiled_models = VectorMap<ov::CompiledModel*>(models, [&](std::shared_ptr<ov::Model> model) {
+ // Apply Low Precision transformations to handle quantized graphs
+ // Mohamed_I: currently, this only works if the entire graph fits on the FPGA
+ // because the CPU plugin calls common_optimizations again which has some transformations
+ // that cause the graph to fail (I suspect it's the ConvolutionMultiplyFusion, but I
+ // cannot disable it from the CPU)
+
+ bool FPGA_used = device_name.find("FPGA") != std::string::npos;
+ bool CPU_used = device_name.find("CPU") != std::string::npos;
+
+ ov::AnyMap config;
+ config.emplace(DLIAPlugin::properties::cpu_used.name(), CPU_used);
+ config.emplace(DLIAPlugin::properties::export_dir.name(), output_dir);
+ config.emplace(DLIAPlugin::properties::parameter_rom_export_dir.name(), parameter_rom_output_dir);
+
+ for (auto&& node : model->get_ops()) {
+ if (std::string("FakeQuantize") == node->get_type_name()) {
+ config.emplace(DLIAPlugin::properties::apply_low_precision_transforms.name(), true);
+ if (CPU_used && FPGA_used) {
+ std::cerr << "ERROR: Quantized graphs only supported through HETERO:FPGA or CPU." << std::endl;
+ throw std::logic_error("HETERO:FPGA,CPU plugin is not supported for quantization.");
+ }
+ }
+ }
+
+ auto compiled_model = new ov::CompiledModel();
+ *compiled_model = core.compile_model(model, device_name, config);
+ duration_ms = double_to_string(get_total_ms_time(individual_start_time));
+ individual_start_time = Time::now();
+ slog::info << "Compile model ( " << model->get_friendly_name() << " ) took " << duration_ms << " ms"
+ << slog::endl;
+ return compiled_model;
+ });
+ duration_ms = double_to_string(get_total_ms_time(start_time));
+ slog::info << "Load network(s) took " << duration_ms << " ms" << slog::endl;
+ if (statistics)
+ statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
+ {{"load network time (ms)", duration_ms}});
+#endif
+ } else {
+ next_step();
+ slog::info << "Skipping the step for compiled network" << slog::endl;
+ next_step();
+ slog::info << "Skipping the step for compiled network" << slog::endl;
+ next_step();
+ slog::info << "Skipping the step for compiled network" << slog::endl;
+ // ----------------- 7. Loading the model to the device --------------------------------------------------------
+ next_step();
+ auto compiled_graph_paths = split(FLAGS_cm, MULTIGRAPH_SEP);
+ compiled_models = vectorMapWithIndex<ov::CompiledModel*>(
+ split(FLAGS_cm, MULTIGRAPH_SEP), // get a list of compiled graphs
+ [&](const std::string& compiled_graph_path, size_t index) {
+ std::stringstream generated_name;
+ generated_name << "Graph_" << index;
+ slog::info << "Importing model from " << compiled_graph_paths[index] << " to " << device_name << " as "
+ << generated_name.str() << slog::endl;
+ auto start_time = Time::now();
+ std::ifstream model_stream(compiled_graph_paths[index].c_str(), std::ios_base::in | std::ios_base::binary);
+ if (!model_stream.is_open()) {
+ throw std::runtime_error("Cannot open compiled model file: " + compiled_graph_paths[index]);
+ }
+ auto compiled_model = new ov::CompiledModel();
+ core.set_property("FPGA",
+ {{DLIAPlugin::properties::streaming_input_pipe.name(), FLAGS_streaming_input_pipe}});
+ // Import specific configs
+ ov::AnyMap config;
+ config.emplace(DLIAPlugin::properties::export_dir.name(), output_dir);
+ config.emplace(DLIAPlugin::properties::parameter_rom_export_dir.name(), parameter_rom_output_dir);
+ *compiled_model = core.import_model(model_stream, device_name, config);
+ topology_names.push_back(generated_name.str());
+ model_stream.close();
+ printInputAndOutputsInfoShort(*compiled_model);
+ auto duration_ms = double_to_string(get_total_ms_time(start_time));
+ slog::info << "Import model took " << duration_ms << " ms" << slog::endl;
+ if (statistics)
+ statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
+ {{"import model time (ms)", duration_ms}});
+ if (batch_size == 0) {
+ batch_size = 1;
+ }
+ const auto& inputs = compiled_model->inputs();
+ for (const auto& item : inputs) {
+ const auto& shape = item.get_shape();
+ if (shape[0] != batch_size) {
+ slog::err << "Batch size of the compiled model is " << shape[0] << " and batch size provided is "
+ << batch_size << slog::endl;
+ std::cout << "Set the same batch size = " << shape[0] << " when running the app" << std::endl;
+ std::cout << "Or recompile model with batch size = " << batch_size << std::endl;
+ exit(5);
+ }
+ }
+ bool reshape_required = false;
+ input_infos.push_back(GetInputsInfo(batch_size,
+ compiled_model->inputs(),
+ reshape_required,
+ FLAGS_bin_data,
+ FLAGS_mean_values,
+ FLAGS_scale_values));
+ return compiled_model;
+ });
+ }
+ // ----------------- 8. Setting optimal runtime parameters -----------------------------------------------------
+ next_step();
+
+ // Number of requests
+ uint32_t nireq = FLAGS_nireq;
+#if defined(__arm__) | defined(__aarch64__)
+ // In OpenVINO 2022.3 Arm plugin, when a AOT graph is compiled on CPU and dla_benchmark has -nireq > 1
+ // the program will be killed. We force nireq = 1 for HETERO:CPU graph only.
+ // Note: -d CPU doesn't need to be checked for AOT because dlac does not support -fplugin CPU.
+ if (device_name == "HETERO:CPU" && nireq > 1) {
+ slog::warn << "-nireq > 1 is not supported for HETERO:CPU graph. Forcing -nireq = 1" << slog::endl;
+ nireq = 1;
+ }
+
+#endif
+
+ if (nireq == 0) {
+ if (FLAGS_api == "sync") {
+ nireq = 1;
+ } else {
+ try {
+ nireq = 0;
+ for (auto& compiled_model : compiled_models) {
+ auto req = compiled_model->get_property(ov::optimal_number_of_infer_requests);
+ if (nireq == 0 || nireq > req) nireq = req;
+ }
+ } catch (const std::exception& ex) {
+ throw ov::Exception("Every device used with the dla_benchmark should support " +
+ std::string(ov::optimal_number_of_infer_requests.name()) +
+ " Failed to query the metric for the " + device_name + " with error: " + ex.what());
+ }
+ }
+ }
+#ifdef MAX_NUM_INFERENCE_REQUEST
+ if (nireq > MAX_NUM_INFERENCE_REQUEST) {
+ slog::warn << "-nireq > "<< MAX_NUM_INFERENCE_REQUEST << " is not supported for the underlying device. Forcing -nireq = 1" << slog::endl;
+ nireq = 1;
+ }
+#endif
+
+ // Iteration limit
+ uint32_t niter = FLAGS_niter;
+ if (niter > 0) {
+ // Round up niter to a multiple of nireq
+ niter = ((niter + nireq - 1) / nireq) * nireq;
+ // We previously checked that FLAGS_niter >= 0, so okay to cast to uint.
+ if (static_cast<uint32_t>(FLAGS_niter) != niter) {
+ slog::warn << "Number of iterations was aligned by request number from " << FLAGS_niter << " to " << niter
+ << " using number of requests " << nireq << slog::endl;
+ }
+ num_batches = niter;
+ } else if (niter > 0) {
+ num_batches = niter;
+ }
+
+ // Graph-request limit on device
+ if (device_name.find("FPGA") != std::string::npos) {
+ int ip_num_instances = core.get_property("FPGA", "COREDLA_NUM_INSTANCES").as<int>();
+ int numOutstandingInferRequest = nireq * net_size / ip_num_instances;
+ int maxOutstandingInferRequest = core.get_property("FPGA", "COREDLA_DMA_CSR_DESCRIPTOR_QUEUE_SIZE").as<int>();
+ if (maxOutstandingInferRequest > 0 && numOutstandingInferRequest > maxOutstandingInferRequest) {
+ slog::err << "Possible number of outstanding inference requests per instance (" << numOutstandingInferRequest
+ << ") "
+ << "exceeds the CSR descriptor queue limit (" << maxOutstandingInferRequest << ")" << slog::endl;
+ return 1;
+ }
+ }
+
+ if (statistics) {
+ for (auto& topology_name : topology_names) {
+ statistics->addParameters(StatisticsReport::Category::RUNTIME_CONFIG,
+ {
+ {"topology", topology_name},
+ {"target device", device_name},
+ {"API", FLAGS_api},
+ {"precision", std::string(precision.get_type_name())},
+ {"batch size", std::to_string(batch_size)},
+ {"number of iterations", std::to_string(niter)},
+ {"number of parallel infer requests", std::to_string(nireq)},
+ });
+ }
+ for (auto& nstreams : device_nstreams) {
+ std::stringstream ss;
+ ss << "number of " << nstreams.first << " streams";
+ statistics->addParameters(StatisticsReport::Category::RUNTIME_CONFIG,
+ {
+ {ss.str(), std::to_string(nstreams.second)},
+ });
+ }
+ }
+
+ // ----------------- 9. Creating infer requests and filling input blobs ----------------------------------------
+ next_step();
+
+ // Data structure hierarchy
+ // Outermost vec: which model it corresponds to (multigraph)
+ // Map: input/output name and its corresponding TensorVector
+ // TensorVector: An alias for vector<ov::tensor> where each vector element correspond to the batch
+ std::vector<std::map<std::string, ov::TensorVector>> input_data_tensors;
+ std::vector<std::map<std::string, ov::TensorVector>> output_tensors(compiled_models.size());
+
+ std::vector<std::unique_ptr<InferRequestsQueue>> infer_request_queues;
+ const std::string resize_type = FLAGS_resize_type.empty() ? "resize" : FLAGS_resize_type;
+ for (size_t net_idx = 0; net_idx < compiled_models.size(); net_idx++) {
+ // Handle the case that use same inputs for all networks
+ const auto& inputFiles =
+ net_idx >= multi_input_files.size() ? multi_input_files.back() : multi_input_files[net_idx];
+ input_data_tensors.push_back(GetStaticTensors(inputFiles.empty() ? std::vector<std::string>{} : inputFiles,
+ batch_size,
+ input_infos[net_idx],
+ num_batches,
+ resize_type,
+ FLAGS_bgr,
+ FLAGS_bin_data,
+ FLAGS_verbose));
+ // Use unique_ptr to create InferRequestsQueue objects and avoid copying mutex and cv
+ infer_request_queues.push_back(
+ std::move(std::unique_ptr<InferRequestsQueue>(new InferRequestsQueue(*(compiled_models[net_idx]), nireq))));
+ }
+
+ // ----------------- 10. Measuring performance ------------------------------------------------------------------
+ size_t progress_bar_total_count = progressBarDefaultTotalCount;
+
+ std::stringstream ss;
+ ss << "Start inference " << FLAGS_api << "ronously";
+ if (FLAGS_api == "async") {
+ if (!ss.str().empty()) {
+ ss << ", ";
+ }
+ ss << infer_request_queues.size() * infer_request_queues.at(0)->requests.size() << " inference requests";
+ std::stringstream device_ss;
+ for (auto& nstreams : device_nstreams) {
+ if (!device_ss.str().empty()) {
+ device_ss << ", ";
+ }
+ device_ss << nstreams.second << " streams for " << nstreams.first;
+ }
+ if (!device_ss.str().empty()) {
+ ss << " using " << device_ss.str();
+ }
+ }
+ ss << ", limits: " << niter << " iterations with each graph, " << compiled_models.size() << " graph(s)";
+ progress_bar_total_count = niter;
+ next_step(ss.str());
+
+ /** Start inference & calculate performance **/
+ /** to align the number of iterations to guarantee that last infer requests are executed in the same conditions **/
+ ProgressBar progress_bar(progress_bar_total_count, FLAGS_stream_output, FLAGS_progress);
+ std::vector<size_t> iterations(compiled_models.size(), 0);
+ try {
+ while ((niter != 0LL && iterations.back() < niter) || (FLAGS_api == "async" && iterations.back() % nireq != 0)) {
+ // set up all infer request and prep all i/o Blobs
+ for (size_t net_id = 0; net_id < compiled_models.size(); net_id++) {
+ for (size_t iireq = 0; iireq < nireq; iireq++) {
+ auto infer_request = infer_request_queues.at(net_id)->get_idle_request();
+ if (!infer_request) {
+ THROW_IE_EXCEPTION << "No idle Infer Requests!";
+ }
+
+ if (niter != 0LL) {
+ const auto& outputs = compiled_models[net_id]->outputs();
+ for (const auto& output : outputs) {
+ const std::string& name = output.get_any_name();
+ output_tensors.at(net_id)[name].emplace_back(output.get_element_type(), output.get_shape());
+ infer_request->set_tensor(output, output_tensors.at(net_id).at(name).at(iterations.at(net_id)));
+ }
+ const auto& inputs = compiled_models[net_id]->inputs();
+ for (auto& input : inputs) {
+ const std::string& name = input.get_any_name();
+ const auto& data = input_data_tensors.at(net_id).at(name)[iterations.at(net_id)];
+ infer_request->set_tensor(input, data);
+ }
+ }
+
+ // Execute one request/batch
+ if (FLAGS_api == "sync") {
+ infer_request->infer();
+ } else {
+ // As the inference request is currently idle, the wait() adds no additional overhead (and should return
+ // immediately). The primary reason for calling the method is exception checking/re-throwing. Callback,
+ // that governs the actual execution can handle errors as well, but as it uses just error codes it has no
+ // details like ‘what()’ method of `std::exception` So, rechecking for any exceptions here.
+ infer_request->wait();
+ infer_request->start_async();
+ }
+ iterations.at(net_id)++;
+ if (net_id == compiled_models.size() - 1) {
+ progress_bar.addProgress(1);
+ }
+ }
+ }
+ }
+
+ // wait the latest inference executions
+ for (auto& infer_request_queue : infer_request_queues) {
+ infer_request_queue->wait_all();
+ }
+ } catch (const std::exception& ex) {
+ slog::err << "Inference failed:" << slog::endl;
+ slog::err << ex.what() << slog::endl;
+ ReadDebugNetworkInfo(core);
+ PrintLSUCounterInfo(core);
+ // Instead of setting return_code = 1 and continuing, exit immediately.
+ // High risk of segfaulting / weird behavior when inference fails.
+ return 1;
+ }
+
+ size_t iteration = iterations.back();
+
+ std::vector<double> all_latencies;
+ auto start_time = infer_request_queues.at(0)->get_start_time();
+ auto end_time = infer_request_queues.at(0)->get_end_time();
+ for (auto& infer_request_queue : infer_request_queues) {
+ auto& latencies = infer_request_queue->get_latencies();
+ all_latencies.insert(all_latencies.end(), latencies.begin(), latencies.end());
+ start_time = std::min(start_time, infer_request_queue->get_start_time());
+ end_time = std::max(end_time, infer_request_queue->get_end_time());
+ }
+ double latency = GetMedianValue<double>(all_latencies);
+ double total_duration = std::chrono::duration_cast<ns>(end_time - start_time).count() * 0.000001;
+ double total_fps = (FLAGS_api == "sync")
+ ? compiled_models.size() * batch_size * 1000.0 / latency
+ : compiled_models.size() * batch_size * 1000.0 * iteration / total_duration;
+
+ int ip_num_instances = 0;
+ double ip_duration = 0.0;
+ double ip_fps = 0.0;
+ double ip_fps_per_fmax = 0.0;
+ double estimated_ipFps = 0.0;
+ double estimated_ipFpsPerFmax = 0.0;
+ double fmax_core = -1.0;
+ double estimated_ipFps_assumed_fmax = 0.0;
+ if (device_name.find("FPGA") != std::string::npos) {
+ ip_num_instances = core.get_property("FPGA", "COREDLA_NUM_INSTANCES").as<int>();
+ // even if hardware has 2 instances, only 1 instance actually gets used if only 1 inference is performed
+ size_t ip_num_instances_used = std::min((size_t)ip_num_instances, iteration);
+ ip_duration = core.get_property("FPGA", "IP_ACTIVE_TIME").as<double>();
+ if (ip_duration) {
+ if (ip_duration != 0.0) {
+ ip_fps = (FLAGS_api == "sync")
+ ? compiled_models.size() * batch_size * 1000.0 / latency / ip_num_instances_used
+ : compiled_models.size() * batch_size * 1000.0 * iteration / ip_duration / ip_num_instances_used;
+ }
+ fmax_core = core.get_property("FPGA", "COREDLA_CLOCK_FREQUENCY").as<double>();
+ if (fmax_core > 0.0) {
+ ip_fps_per_fmax = ip_fps / fmax_core;
+ } else {
+ slog::warn << "Warning: could not estimate clk_dla frequency on the FPGA" << slog::endl;
+ }
+ }
+
+ if (FLAGS_perf_est && (device_name.find("FPGA") != std::string::npos)) {
+ if (is_model_compiled) {
+ // Ahead of Time Flow: getting the imported, precalculated performance estimate
+ estimated_ipFps = core.get_property("FPGA", "IMPORT_PERFORMANCE_EST").as<double>();
+ if (estimated_ipFps < 0)
+ slog::warn << "Missing performance estimation from at least one of the compiled graphs" << slog::endl;
+ estimated_ipFps_assumed_fmax = core.get_property("FPGA", "IMPORT_PERFORMANCE_EST_ASSUMED_FMAX").as<double>();
+ } else {
+#ifndef DISABLE_JIT
+ // Just In Time Flow: running the performance estimate
+ if (fmax_core > 0.0) {
+#if defined(_WIN32) || defined(_WIN64)
+ _putenv_s("PERF_EST_COREDLA_FMAX", double_to_string(fmax_core).c_str());
+ _putenv_s("PERF_EST_PE_FMAX", double_to_string(fmax_core).c_str());
+#else
+ setenv("PERF_EST_COREDLA_FMAX", double_to_string(fmax_core).c_str(), true);
+ setenv("PERF_EST_PE_FMAX", double_to_string(fmax_core).c_str(), true);
+#endif
+ estimated_ipFps_assumed_fmax = fmax_core;
+ } else {
+// In case the fmax_core variable is not set, we use the estimated fmax values for AGX7 and A10.
+// This if statement is just defensive programming for a condition that should not happen.
+#ifdef DE10_AGILEX
+ estimated_ipFps_assumed_fmax = GetEnvOrDefault("PERF_EST_COREDLA_FMAX", 500); // AGX7 fMAX estimate
+#else
+ estimated_ipFps_assumed_fmax = GetEnvOrDefault("PERF_EST_COREDLA_FMAX", 265); // A10 fMAX estimate
+#endif
+ slog::warn
+ << "Warning: could not estimate clk_dla frequency on the FPGA, setting the fmax to default value."
+ << slog::endl;
+#if defined(_WIN32) || defined(_WIN64)
+ _putenv_s("PERF_EST_COREDLA_FMAX", double_to_string(estimated_ipFps_assumed_fmax).c_str());
+ _putenv_s("PERF_EST_PE_FMAX", double_to_string(estimated_ipFps_assumed_fmax).c_str());
+#else
+ setenv("PERF_EST_COREDLA_FMAX", double_to_string(estimated_ipFps_assumed_fmax).c_str(), true);
+ setenv("PERF_EST_PE_FMAX", double_to_string(estimated_ipFps_assumed_fmax).c_str(), true);
+#endif
+ }
+ estimated_ipFps = core.get_property("FPGA", "PLUGIN_PERFORMANCE_EST").as<double>();
+#endif
+ }
+ estimated_ipFpsPerFmax = estimated_ipFps / estimated_ipFps_assumed_fmax;
+ }
+ }
+
+ if (statistics) {
+ statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
+ {
+ {"total execution time (ms)", double_to_string(total_duration)},
+ {"IP active time (ms)", double_to_string(ip_duration)},
+ {"total number of iterations", std::to_string(iteration)},
+ });
+ if (device_name.find("MULTI") == std::string::npos) {
+ statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
+ {
+ {"latency (ms)", double_to_string(latency)},
+ });
+ }
+ statistics->addParameters(
+ StatisticsReport::Category::EXECUTION_RESULTS,
+ {{"throughput", double_to_string(total_fps)}, {"IP throughput", double_to_string(ip_fps)}});
+ }
+
+ progress_bar.finish();
+
+ // ----------------- 11. Dumping statistics report -------------------------------------------------------------
+ next_step();
+
+ if (perf_count || !perf_count_sort.empty()) {
+ std::vector<std::vector<ov::ProfilingInfo>> perfCounts;
+ for (size_t ireq = 0; ireq < nireq; ireq++) {
+ auto reqPerfCounts = infer_request_queues.at(0)->requests[ireq]->get_performance_counts();
+ perfCounts.push_back(reqPerfCounts);
+ }
+ if (statistics) {
+ if (perf_count_sort == "sort") {
+ statistics->printPerfCountersSort(perfCounts, "sort");
+ } else if (perf_count_sort == "simple_sort") {
+ statistics->printPerfCountersSort(perfCounts, "simple_sort");
+ } else {
+ statistics->printPerfCountersSort(perfCounts, "no_sort");
+ }
+ }
+ }
+
+ // dla_benchmark originally also implemented more detailed performance
+ // statistics via InferRequest's getPerformanceCounts function
+ // We did not support it, and removed it. If we want to re-implement it
+ // looking at the latest version of OpenVINO's benchmark_app or our git
+ // history would be a good starting point
+ if (statistics) {
+ statistics->dump();
+ }
+
+ std::cout << "count: " << iteration << " iterations" << std::endl;
+ std::cout << "system duration: " << double_to_string(total_duration) << " ms" << std::endl;
+ if (ip_duration != 0.0) std::cout << "IP duration: " << double_to_string(ip_duration) << " ms" << std::endl;
+ if (device_name.find("MULTI") == std::string::npos)
+ std::cout << "latency: " << double_to_string(latency) << " ms" << std::endl;
+ std::cout << "system throughput: " << double_to_string(total_fps) << " FPS" << std::endl;
+ if (ip_num_instances != 0) std::cout << "number of hardware instances: " << ip_num_instances << std::endl;
+ if (compiled_models.size() != 0)
+ std::cout << "number of network instances: " << compiled_models.size() << std::endl;
+ if (ip_fps != 0.0) std::cout << "IP throughput per instance: " << double_to_string(ip_fps) << " FPS" << std::endl;
+ if (ip_fps_per_fmax != 0.0)
+ std::cout << "IP throughput per fmax per instance: " << double_to_string(ip_fps_per_fmax) << " FPS/MHz"
+ << std::endl;
+ if (fmax_core > 0.0) std::cout << "IP clock frequency: " << double_to_string(fmax_core) << " MHz" << std::endl;
+ if (estimated_ipFps != 0.0)
+ std::cout << "estimated IP throughput per instance: " << double_to_string(estimated_ipFps) << " FPS ("
+ << (int)estimated_ipFps_assumed_fmax << " MHz assumed)" << std::endl;
+ if (estimated_ipFpsPerFmax != 0.0)
+ std::cout << "estimated IP throughput per fmax per instance: " << double_to_string(estimated_ipFpsPerFmax)
+ << " FPS/MHz" << std::endl;
+
+ // ----------------- 12. Dumping output values -------------------------------------------------------------
+ next_step();
+
+ if (FLAGS_dump_output) {
+ for (size_t i = 0; i < compiled_models.size(); i++) {
+ std::vector<ov::Output<const ov::Node>> output_info = compiled_models[i]->outputs();
+ // For multi-outputs: Sort to ensure the order of each tensor dump aligns with the ground truth files
+ std::sort(output_info.begin(), output_info.end(), CompareOutputNodeNames);
+ const auto& output_tensors_map = output_tensors[i];
+ // A flag regarding whether we can dump output tensor in a text file due to unsupported layout.
+ // This flag is set at first during dumping.
+ bool can_dump_txt = true;
+ bool can_dump_layout_info_in_txt = true;
+ // dump output tensor as bin, which can be loaded using Python Numpy
+ std::regex pattern("\\{batch\\}");
+ std::string results_bin_file_name = output_dir + "result_{batch}.bin";
+ // dump output tensor as text
+ // backward compatibility support for old regtests that used only one graph
+ std::string results_txt_file_name = output_dir + "result.txt";
+ std::string results_boundaries_file_name = output_dir + "result_tensor_boundaries.txt";
+ // dump inference arguments and metadata as JSON
+ std::string results_meta_file_name = output_dir + "result_meta.json";
+
+ if (compiled_models.size() > 1) {
+ results_bin_file_name = output_dir + topology_names[i] + "_result_{batch}.bin";
+ results_txt_file_name = output_dir + topology_names[i] + "_result.txt";
+ results_boundaries_file_name = output_dir + topology_names[i] + "_result_tensor_boundaries.txt";
+ results_meta_file_name = output_dir + topology_names[i] + "_result_meta.json";
+ }
+
+ slog::info << "Dumping result of " << topology_names[i]
+ << " to " << results_txt_file_name << slog::endl;
+ slog::info << "Dumping per-batch result (raw output) of " << topology_names[i]
+ << " to " << results_bin_file_name << slog::endl;
+ slog::info << "Dumping inference meta data of " << topology_names[i]
+ << " to " << results_meta_file_name << slog::endl;
+
+ std::ofstream result_txt_file(results_txt_file_name);
+ std::ofstream results_boundaries(results_boundaries_file_name);
+ std::ofstream result_meta_file(results_meta_file_name);
+
+ dla_benchmark::InferenceMetaData result_metadata;
+ result_metadata.input_files = multi_input_files.at(i); // all input files in -i
+ result_metadata.groundtruth_loc = FLAGS_groundtruth_loc;
+ result_metadata.batch_size = FLAGS_batch_size;
+ result_metadata.niter = niter;
+ result_metadata.nireq = nireq;
+ result_metadata.model_input_info = input_infos[i];
+ dla_benchmark::OutputsInfoVec model_output_info;
+
+ uint32_t current_lines = 1;
+ size_t max_allowed_megabytes_to_dump = FLAGS_max_output_file_size;
+
+ for (uint32_t batch = 0; batch < num_batches; batch++) {
+ std::string per_batch_results_bin_file_name = std::regex_replace(results_bin_file_name,
+ pattern,
+ std::to_string(batch));
+ std::ofstream per_batch_results_bin_file(per_batch_results_bin_file_name, std::ios::binary);
+
+ for (const auto& item : output_info) {
+ auto tensor = output_tensors_map.at(item.get_any_name()).at(batch);
+ unsigned int output_size = tensor.get_size() / batch_size;
+
+ const ov::Layout& layout = ov::layout::get_layout(item);
+ const auto& shape = tensor.get_shape();
+ const std::string& name = item.get_any_name();
+ size_t total_bytes_to_dump = tensor.get_size() * niter * sizeof(float);
+
+ if (can_dump_txt) {
+ // if we cannot dump as a text file, we set can_dump_txt flag to false and write the one-time message
+ if (total_bytes_to_dump > max_allowed_megabytes_to_dump * BYTE_TO_MEGABYTE) {
+ can_dump_txt = false;
+ std::string msg = "Output tensor (" + std::to_string(total_bytes_to_dump / BYTE_TO_MEGABYTE) +
+ " MB) "
+ "is too large to dump. Change environmental variable MAX_DUMP_OUTPUT_TXT (default " +
+ std::to_string(FLAGS_max_output_file_size) + " MB) to allow dumping larger tensors";
+ slog::warn << msg << slog::endl;
+ result_txt_file << msg;
+ } else {
+ if (can_dump_layout_info_in_txt && shape.size() != 2 && shape.size() != 4 && shape.size() != 5) {
+ can_dump_layout_info_in_txt = false;
+ slog::warn << "Output data tensor of rank that is not 2, 4 or 5. layout info will not be dumped in "
+ << "result.txt." << slog::endl;
+ }
+ // Otherwise, dump text and write to the result_tensor_boundaries.txt with additional information
+ // about the result.txt file
+ results_boundaries << name << ": Line " << current_lines << " to "
+ << "line " << current_lines + output_size - 1 << std::endl;
+ results_boundaries << name << " output layout: " << layout.to_string() << std::endl;
+ results_boundaries << name << " output dimension:";
+ for (unsigned int dim = 0; dim < shape.size(); dim++) {
+ results_boundaries << " " << shape[dim];
+ }
+ results_boundaries << std::endl;
+ current_lines = current_lines + output_size;
+ DumpResultTxtFile(tensor, item, output_size, result_txt_file);
+ }
+ }
+ DumpResultBinFile(tensor, per_batch_results_bin_file);
+
+ if (batch == 0) {
+ // all batches should have the same output info
+ dla_benchmark::OutputInfo output_info;
+ output_info.name = name;
+ output_info.shape = shape;
+ model_output_info.push_back(output_info);
+ }
+ }
+ per_batch_results_bin_file.close();
+ }
+
+ result_metadata.model_output_info = model_output_info;
+ DumpResultMetaJSONFile(result_metadata, result_meta_file);
+ result_txt_file.close();
+ results_boundaries.close();
+ result_meta_file.close();
+ }
+ const std::string throughput_file_name = output_dir + "throughput_report.txt";
+ std::ofstream throughput_file;
+ throughput_file.open(throughput_file_name);
+ throughput_file << "Throughput : " << total_fps << " fps" << std::endl;
+ throughput_file << "Batch Size : " << batch_size << std::endl;
+ throughput_file << "Graph number : " << compiled_models.size() << std::endl;
+ throughput_file << "Num Batches : " << num_batches << std::endl;
+ throughput_file.close();
+
+ // Append throughput to dataset
+ // Check both gz and non gz versions
+ std::string dataset_gz_file_name = "data.csv.gz";
+ append_value_if_incomplete_to_csv(dataset_gz_file_name, ip_fps);
+ std::string dataset_file_name = "data.csv";
+ append_value_if_incomplete_to_csv(dataset_file_name, ip_fps);
+ }
+
+ // Calculate top 1, top 5 results
+ if (FLAGS_groundtruth_loc != "") {
+ auto groundtruth_files = split(FLAGS_groundtruth_loc, MULTIGRAPH_SEP);
+ for (size_t i = 0; i < compiled_models.size(); i++) {
+ // This flag `FLAGS_enable_object_detection_ap` enables accuracy checking subroutine that
+ // gives the mAP and COCO AP scores. These scores are two of the main detection evaluation
+ // metrics used in the Common Objects in Context contest, https://cocodataset.org/#detection-eval.
+
+ std::vector<ov::Output<const ov::Node>> output_info = compiled_models[i]->outputs();
+ // For multi-outputs: Sort to ensure the order of each tensor dump aligns with the ground truth files
+ std::sort(output_info.begin(), output_info.end(), CompareOutputNodeNames);
+ // Run the default top-1, top-5 evaluation routine if AP scores are not required.
+ if (!FLAGS_enable_object_detection_ap) {
+ if (groundtruth_files.size() <= i) {
+ slog::warn << "Missing ground truth file for " << topology_names[i] << "! SKIPPED" << slog::endl;
+ continue; // Print warnings for all missing ground truth graphs;
+ }
+ slog::info << "Comparing ground truth file " << groundtruth_files[i] << " with network " << topology_names[i]
+ << slog::endl;
+ // captures the results in higher precision for accuracy analysis
+ std::vector<float> results;
+ const auto& output_tensors_map = output_tensors[i];
+ for (uint32_t batch = 0; batch < num_batches; batch++) {
+ for (unsigned int img = 0; img < batch_size; img++) {
+ for (const auto& item : output_info) {
+ auto tensor = output_tensors_map.at(item.get_any_name()).at(batch);
+ auto tensor_data = tensor.data<float>();
+ unsigned int output_size = tensor.get_size() / batch_size;
+ size_t offset = img * output_size;
+ for (unsigned int j = 0; j < output_size; j++) {
+ results.push_back(tensor_data[j + offset]);
+ }
+ }
+ }
+ }
+ bool passed = TopResultsAnalyser::get_top_results(groundtruth_files[i], results, batch_size * num_batches);
+ if (passed) {
+ slog::info << "Get top results for \"" << topology_names[i] << "\" graph passed" << slog::endl;
+ } else {
+ // return 4 indicates that the accuracy of the result was below the threshold
+ return_code = 4;
+ }
+ } else {
+ // Runs the accuracy checking routine if AP scores are required.
+ set_runtime(FLAGS_yolo_version, FLAGS_niter, batch_size_flag, FLAGS_i, FLAGS_groundtruth_loc);
+ std::pair<double, double> res =
+ validate_yolo_wrapper(output_tensors[i], output_info, multi_input_files.at(0));
+ std::cout << std::endl;
+ slog::info << "Batch metrics results:" << slog::endl;
+ std::cout << "Detection - mAP@0.5: " << std::setprecision(6) << res.first * 100 << "%" << std::endl;
+ std::cout << "Detection - mAP@0.5:0.95: " << std::setprecision(6) << res.second * 100 << "%" << std::endl;
+ }
+ }
+ }
+ // Output Debug Network Info if COREDLA_TEST_DEBUG_NETWORK is set
+ ReadDebugNetworkInfo(core);
+ if (FLAGS_report_lsu_counters) {
+ PrintLSUCounterInfo(core);
+ }
+ if (return_code) return return_code;
+ } catch (const std::exception& ex) {
+ slog::err << ex.what() << slog::endl;
+
+ if (statistics) {
+ statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
+ {
+ {"Error during dla_benchmark: ", ex.what()},
+ });
+ statistics->dump();
+ }
+
+ return 3;
+ }
+
+ return 0;
+ // Bypass long function lint check
+ // NOLINTNEXTLINE(readability/fn_size)
+}
diff --git a/python/openvino/runtime/dla_benchmark/progress_bar.hpp b/python/openvino/runtime/dla_benchmark/progress_bar.hpp
new file mode 100644
index 0000000..cb4459a
--- /dev/null
+++ b/python/openvino/runtime/dla_benchmark/progress_bar.hpp
@@ -0,0 +1,52 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <memory>
+
+#include <samples/console_progress.hpp>
+
+/// @brief Responsible for progress bar handling within the dla_benchmark
+class ProgressBar {
+ public:
+ explicit ProgressBar(size_t totalNum, bool streamOutput = false, bool progressEnabled = false) {
+ _bar.reset(new ConsoleProgress(totalNum, streamOutput));
+ _streamOutput = streamOutput;
+ _isFinished = true;
+ _progressEnabled = progressEnabled;
+ }
+
+ void addProgress(size_t num) {
+ _isFinished = false;
+ if (_progressEnabled) {
+ _bar->addProgress(num);
+ }
+ }
+
+ void finish(size_t num = 0) {
+ if (num > 0) {
+ addProgress(num);
+ }
+ _isFinished = true;
+ _bar->finish();
+ if (_progressEnabled) {
+ std::cout << std::endl;
+ }
+ }
+
+ void newBar(size_t totalNum) {
+ if (_isFinished) {
+ _bar.reset(new ConsoleProgress(totalNum, _streamOutput));
+ } else {
+ throw std::logic_error("Cannot create a new bar. Current bar is still in progress");
+ }
+ }
+
+ private:
+ std::unique_ptr<ConsoleProgress> _bar;
+ bool _streamOutput;
+ bool _isFinished;
+ bool _progressEnabled;
+};
diff --git a/python/openvino/runtime/dla_benchmark/shared_tensor_allocator.hpp b/python/openvino/runtime/dla_benchmark/shared_tensor_allocator.hpp
new file mode 100644
index 0000000..f97c798
--- /dev/null
+++ b/python/openvino/runtime/dla_benchmark/shared_tensor_allocator.hpp
@@ -0,0 +1,55 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+#include <algorithm>
+#include "openvino/runtime/allocator.hpp"
+
+// Modified from SharedTensorAllocator in [openvinotoolkit/openvino ›
+// samples/cpp/benchmark_app/shared_tensor_allocator.hpp]
+class SharedTensorAllocator : public ov::AllocatorImpl {
+ public:
+ SharedTensorAllocator(size_t sizeBytes) : size(sizeBytes) { data = new char[size]; }
+
+ // Copy Constructor
+ SharedTensorAllocator(const SharedTensorAllocator& other) : size(other.size) {
+ data = new char[size];
+ std::copy(other.data, other.data + size, data);
+ }
+
+ // Copy Assignment Operator
+ SharedTensorAllocator& operator=(const SharedTensorAllocator& other) {
+ if (this != &other) {
+ size = other.size;
+ delete[] data;
+ data = new char[size];
+ std::copy(other.data, other.data + size, data);
+ }
+ return *this;
+ }
+
+ ~SharedTensorAllocator() { delete[] data; }
+
+ void* allocate(const size_t bytes, const size_t) override {
+ return bytes <= this->size ? (void*)data : nullptr;
+ }
+
+ void deallocate(void* handle, const size_t bytes, const size_t) override {
+ if (handle == data) {
+ delete[] data;
+ data = nullptr;
+ }
+ }
+
+ bool is_equal(const AllocatorImpl& other) const override {
+ auto other_tensor_allocator = dynamic_cast<const SharedTensorAllocator*>(&other);
+ return other_tensor_allocator != nullptr && other_tensor_allocator == this;
+ }
+
+ char* get_buffer() { return data; }
+
+ private:
+ char* data;
+ size_t size;
+};
diff --git a/python/openvino/runtime/dla_benchmark/statistics_report.cpp b/python/openvino/runtime/dla_benchmark/statistics_report.cpp
new file mode 100644
index 0000000..ce80a2e
--- /dev/null
+++ b/python/openvino/runtime/dla_benchmark/statistics_report.cpp
@@ -0,0 +1,149 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+// Description: The file implements functions to dump inference performance statistics
+
+#include <algorithm>
+#include <map>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "statistics_report.hpp"
+
// Printable names for profiling-status values, indexed by the status enum's
// integer value; indices past this table are reported as "INVALID_STATUS"
// by printPerfCountersSort.
static const char* STATUS_NAMES[] = {"NOT_RUN", "OPTIMIZED_OUT", "EXECUTED"};
+
+void StatisticsReport::addParameters(const Category& category, const Parameters& parameters) {
+ if (_parameters.count(category) == 0)
+ _parameters[category] = parameters;
+ else
+ _parameters[category].insert(_parameters[category].end(), parameters.begin(), parameters.end());
+}
+
+void StatisticsReport::dump() {
+ CsvDumper dumper(true, _config.report_folder + _separator + "dla_benchmark_run_summary.csv");
+
+ auto dump_parameters = [&dumper](const Parameters& parameters) {
+ for (auto& parameter : parameters) {
+ dumper << parameter.first << parameter.second;
+ dumper.endLine();
+ }
+ };
+ if (_parameters.count(Category::COMMAND_LINE_PARAMETERS)) {
+ dumper << "Command line parameters";
+ dumper.endLine();
+
+ dump_parameters(_parameters.at(Category::COMMAND_LINE_PARAMETERS));
+ dumper.endLine();
+ }
+
+ if (_parameters.count(Category::RUNTIME_CONFIG)) {
+ dumper << "Configuration setup";
+ dumper.endLine();
+
+ dump_parameters(_parameters.at(Category::RUNTIME_CONFIG));
+ dumper.endLine();
+ }
+
+ if (_parameters.count(Category::EXECUTION_RESULTS)) {
+ dumper << "Execution results";
+ dumper.endLine();
+
+ dump_parameters(_parameters.at(Category::EXECUTION_RESULTS));
+ dumper.endLine();
+ }
+
+ slog::info << "Run summary is saved to " << dumper.getFilename() << slog::endl;
+}
+
+void StatisticsReport::printPerfCountersSort(const std::vector<PerformanceCounters>& perfCounts, std::string sortFlag) {
+ for (size_t ni = 0; ni < perfCounts.size(); ni++) {
+ const auto& perf_counts = perfCounts[ni];
+ double total_time(0);
+ double total_time_cpu(0);
+ std::cout << "Performance counts sorted for " << ni << "-th infer request" << std::endl;
+ for (auto&& pi : perf_counts) {
+ total_time += pi.real_time.count();
+ total_time_cpu += pi.cpu_time.count();
+ }
+ auto total_real_time_proportion = 0.0;
+ std::vector<std::vector<std::string>> total_detail_data;
+ for (auto&& pi : perf_counts) {
+ auto node_name = pi.node_name;
+ std::string layer_status_str =
+ ((int)pi.status < (int)(sizeof(STATUS_NAMES) / sizeof(STATUS_NAMES[0])) ? STATUS_NAMES[(int)pi.status]
+ : "INVALID_STATUS");
+
+ auto layer_type = pi.node_type;
+ auto real_time = pi.real_time.count();
+ auto cpu_time = pi.cpu_time.count();
+ auto real_proportion = real_time / total_time;
+ auto execType = pi.exec_type;
+ std::vector<std::string> tmp_data{node_name,
+ layer_status_str,
+ std::string(layer_type),
+ std::to_string(real_time),
+ std::to_string(cpu_time),
+ std::to_string(real_proportion),
+ std::string(execType)};
+ total_detail_data.push_back(tmp_data);
+ total_real_time_proportion += real_proportion;
+ }
+ // sorted by read_time
+ if (sortFlag == "sort") {
+ std::sort(total_detail_data.begin(), total_detail_data.end(), [](const auto& a, const auto& b) {
+ return std::stod(a[3]) > std::stod(b[3]);
+ });
+ } else if (sortFlag == "no_sort") {
+ total_detail_data = total_detail_data;
+ } else if (sortFlag == "simple_sort") {
+ std::sort(total_detail_data.begin(), total_detail_data.end(), [](const auto& a, const auto& b) {
+ return std::stod(a[3]) > std::stod(b[3]);
+ });
+ total_detail_data.erase(
+ std::remove_if(
+ total_detail_data.begin(), total_detail_data.end(), [](const auto& a) { return a[1] == "NOT_RUN"; }),
+ total_detail_data.end());
+ }
+ printDetailResult(total_detail_data);
+ // Save the current state of std::cout. This is to avoid coverity error.
+ std::ios_base::fmtflags f(std::cout.flags());
+
+ std::cout << "Total time: " << total_time / 1000 << " microseconds" << std::endl;
+ std::cout << "Total CPU time: " << total_time_cpu / 1000 << " microseconds" << std::endl;
+ std::cout << "Total proportion: " << std::fixed << std::setprecision(2) << round(total_real_time_proportion * 100)
+ << " % \n"
+ << std::endl;
+
+ // Restore the original state
+ std::cout.flags(f);
+ }
+}
+
+void StatisticsReport::printDetailResult(std::vector<std::vector<std::string>> result_list) {
+ const int max_layer_name_len = 50;
+ for (auto&& tmp_result : result_list) {
+ std::string node_name = tmp_result[0];
+ std::string node_name_truncated = node_name.substr(0, max_layer_name_len - 4);
+ if (node_name.length() >= max_layer_name_len) {
+ node_name_truncated += "...";
+ }
+ std::string layerStatus = tmp_result[1];
+ std::string layerType = tmp_result[2];
+ float real_time = std::stof(tmp_result[3]);
+ float cpu_time = std::stof(tmp_result[4]);
+ float proportion = std::stof(tmp_result[5]);
+ std::string execType = tmp_result[6];
+
+ std::printf(
+ "node: %-50s LayerStatus: %-15s LayerType: %-30s RealTime: %-20.3f CPUTime: %-20.3f Proportion: %-30.3f "
+ "ExecType: %-20s\n",
+ node_name_truncated.c_str(),
+ layerStatus.c_str(),
+ layerType.substr(0, max_layer_name_len).c_str(),
+ real_time / 1000.0, // ms
+ cpu_time / 1000.0, // ms
+ proportion * 100,
+ std::string(execType).substr(0, max_layer_name_len).c_str());
+ }
+}
diff --git a/python/openvino/runtime/dla_benchmark/statistics_report.hpp b/python/openvino/runtime/dla_benchmark/statistics_report.hpp
new file mode 100644
index 0000000..8032630
--- /dev/null
+++ b/python/openvino/runtime/dla_benchmark/statistics_report.hpp
@@ -0,0 +1,83 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+// Description: The file defines functions to dump inference performance statistics
+
+#pragma once
+
+#include <map>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include <openvino/openvino.hpp>
+#include <samples/common.hpp>
+#include <samples/csv_dumper.hpp>
+#include <samples/slog.hpp>
+#include "utils.hpp"
+#include "dla_defines.h"
+
// @brief statistics reports types
// NOTE(review): these appear to be the accepted values of a report-type
// command-line option; confirm against the flag definitions (not visible here).
static constexpr char noCntReport[] = "no_counters";
static constexpr char averageCntReport[] = "average_counters";
static constexpr char detailedCntReport[] = "detailed_counters";
+
/// @brief Responsible for collecting of statistics and dumping to .csv file
class StatisticsReport {
 public:
  // Per-request profiling data: one ov::ProfilingInfo entry per graph node.
  typedef std::vector<ov::ProfilingInfo> PerformanceCounters;
  // Ordered (name, value) pairs, emitted in insertion order by dump().
  typedef std::vector<std::pair<std::string, std::string>> Parameters;

  struct Config {
    bool save_report;           // presumably set when a CSV report was requested; not read in this class — confirm at call sites
    std::string report_folder;  // destination directory; empty means current working directory
  };

  // Report sections; dump() writes each populated category under its own heading.
  enum class Category {
    COMMAND_LINE_PARAMETERS,
    RUNTIME_CONFIG,
    EXECUTION_RESULTS,
  };

  // When no report folder is given the path separator is dropped, so the CSV
  // file is created relative to the current working directory.
  explicit StatisticsReport(Config config) : _config(std::move(config)) {
    _separator = dla::util::path_separator;
    if (_config.report_folder.empty()) _separator = "";
  }

  /// Append parameters under the given report category (lists accumulate
  /// across calls).
  void addParameters(const Category &category, const Parameters &parameters);

  /// Write all collected categories to dla_benchmark_run_summary.csv.
  void dump();

  /// print the performance counters for neural net layers executed on the CPU.
  /// @param perfCounts vector of map of layer name and InferenceEngineProfileInfo.
  /// @param sortFlag One of "sort", "no_sort", "simple_sort".
  ///                 "sort": sort by execution RealTime. Default value.
  ///                 "no_sort": no sort.
  ///                 "simple_sort": sort by execution RealTime after removing nodes with "NOT_RUN"
  ///                 status.
  void printPerfCountersSort(const std::vector<PerformanceCounters> &perfCounts, std::string sortFlag = "sort");

  /// Helper function used by printPerfCountersSort that prints a row of performance count info.
  /// prints the following info for a layer from left to right:
  /// 0. nodeName: name of the layer
  /// 1. LayerStatus: NOT_RUN, OPTIMIZED_OUT, or EXECUTED
  /// 2. LayerType: type of layer, such as Convolution.
  /// 3. RealTime (ms): The absolute time that the layer ran (in total), including CPU processing time + any potential
  ///    wait time.
  /// 4. CPUTime (ms): The net host cpu time that the layer ran, i.e. CPU processing time.
  /// 5. Proportion: RealTime of the node / RealTime in total
  /// 6. ExecType: An execution type of unit. e.g., jit_avx2_FP32 (executed using just-in-time (JIT) compilation with
  ///    AVX2 instructions for FP32 data)
  /// @param result_list vector of per-node info, where each per-node info is a vector of formatted string.
  void printDetailResult(std::vector<std::vector<std::string>> result_list);

 private:
  // configuration of current benchmark execution
  const Config _config;

  // parameters collected so far, grouped by report category
  std::map<Category, Parameters> _parameters;

  // csv separator (path separator between folder and file name; empty when
  // no report folder was configured)
  std::string _separator;
};
diff --git a/python/openvino/runtime/dla_benchmark/top1_top5.hpp b/python/openvino/runtime/dla_benchmark/top1_top5.hpp
new file mode 100644
index 0000000..4f27bb2
--- /dev/null
+++ b/python/openvino/runtime/dla_benchmark/top1_top5.hpp
@@ -0,0 +1,222 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+// Description: This file defines and implements functions to calculate top1 and top5 scores.
+
+#pragma once
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <fstream>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+#include <utility>
+#include <vector>
+
+/// Scores classifier output against a ground-truth label file and writes a
+/// per-image top-5 report to "accuracy_report.txt" (overwritten each run).
+/// NOTE(review): the two get_top_results overloads are near-duplicates
+/// (file-based vs in-memory results); consider unifying them into one
+/// implementation that takes the results vector.
+class TopResultsAnalyser {
+ public:
+  /// Loads per-class scores from a text file (one float per line) and scores
+  /// them against the ground truth.
+  /// @param groundtruth_loc path to a file with one integer label per image.
+  /// @param results_loc path to a file with batchSize * img_output_size float lines.
+  /// @param batchSize number of images covered by the results file.
+  /// @return true on success; throws std::invalid_argument on I/O failures or
+  ///         a size mismatch, and an Inference Engine exception on a bad label.
+  static bool get_top_results(const std::string groundtruth_loc, const std::string results_loc, uint32_t batchSize) {
+    // This function loads the output results from a file,
+    // The dla benchmark currently uses the get_top_results(string, vector<float>, uint)
+    // This function is kept as it can be used to assess accuracy post runtime,
+    // although it seems awfully similar to the other version of get_top_results().
+    const std::string accuracy_results_loc = "accuracy_report.txt";
+    std::ofstream accuracy_file(accuracy_results_loc);
+
+    if (!accuracy_file.is_open()) {
+      throw std::invalid_argument("Unable to open accuracy file.");
+    }
+
+    std::ifstream groundtruth_file(groundtruth_loc);
+    int groundtruth_lineno = 0;
+
+    if (!groundtruth_file.is_open()) {
+      throw std::invalid_argument("Unable to open groundtruth file.");
+    }
+
+    std::ifstream results_file(results_loc);
+
+    if (!results_file.is_open()) {
+      throw std::invalid_argument("Unable to open result file.");
+    }
+
+    // Read the whole results file: one float score per line.
+    std::string results_line;
+    std::vector<float> results;
+    while (std::getline(results_file, results_line)) {
+      const float result = std::stof(results_line);
+      results.push_back(result);
+    }
+
+    // Every image must contribute the same number of scores.
+    if (results.size() % batchSize != 0) {
+      std::cout << "Results size = " << results.size() << " Batch size = " << batchSize << std::endl;
+      throw std::invalid_argument("Results size is not a multiple of batch size");
+    }
+
+    // (category index, probability/score) pair.
+    typedef std::pair<uint64_t, float> CatProbPair;
+    const uint64_t img_output_size = results.size() / batchSize;
+    uint32_t top1_correct_guesses = 0;
+    uint32_t top5_correct_guesses = 0;
+    // Number of entries actually tracked: min(5, class count). fmin returns a
+    // double; it is only used as a loop bound below.
+    const auto top_n = fmin(5, img_output_size);
+    for (uint32_t img = 0; img < batchSize; img++) {
+      accuracy_file << "image " << img << " top 5:" << std::endl;
+
+      // Offset of this image's scores within the flat results vector.
+      const auto start_addr = img_output_size * img;
+      // Seed the running top-N with the first top_n scores.
+      std::vector<CatProbPair> top5;
+      for (int i = 0; i < top_n; i++) {
+        top5.push_back(std::make_pair(i, results[start_addr + i]));
+      }
+
+      // Scan the remaining scores, replacing the current minimum whenever a
+      // larger score is found (selection without a full sort).
+      // NOTE(review): starts at 5 rather than top_n; harmless, because when
+      // img_output_size < 5 the loop condition is immediately false.
+      for (uint64_t i = 5; i < img_output_size; i++) {
+        const auto e = results[start_addr + i];
+        auto min_ele = &top5.at(0);
+        for (size_t j = 1; j < top5.size(); j++) {
+          if (top5.at(j).second < min_ele->second) {
+            min_ele = &top5.at(j);
+          }
+        }
+        if (e > min_ele->second) {
+          *min_ele = std::make_pair(i, e);
+        }
+      }
+
+      // sort descending
+      std::sort(
+          top5.begin(), top5.end(), [](const CatProbPair& a, const CatProbPair& b) { return a.second > b.second; });
+      for (const auto& pair : top5) {
+        accuracy_file << pair.first << " : " << pair.second << std::endl;
+      }
+      // One ground-truth label per image, consumed in order.
+      std::string line;
+      std::getline(groundtruth_file, line);
+      ++groundtruth_lineno;
+      uint64_t truth;
+      try {
+        truth = std::stoi(line);
+      } catch (const std::invalid_argument& ia) {
+        // NOTE(review): THROW_IE_EXCEPTION is presumably provided by Inference
+        // Engine headers included by the translation unit using this header —
+        // this file does not include it itself; confirm.
+        THROW_IE_EXCEPTION << "Unable to parse line " << groundtruth_lineno << " "
+                           << "of the ground truth file " << groundtruth_loc;
+      }
+      accuracy_file << truth << " : truth" << std::endl;
+      top1_correct_guesses += (top5.at(0).first == truth);
+
+      // Count a top-5 hit if the truth label appears anywhere in the top-N.
+      // NOTE(review): the `i < img_output_size` guard looks vestigial — i only
+      // counts top-5 entries (<= 5); confirm the intended semantics.
+      uint64_t i = 1;
+      for (const auto& guess : top5) {
+        if (guess.first == truth && i < img_output_size) {
+          top5_correct_guesses += 1;
+          break;
+        }
+        i += 1;
+      }
+    }
+
+    // Writes "topN accuracy: X %" to the given stream.
+    const auto top_n_string = [&](std::ostream& stream, const double correct_guesses, const uint32_t N) {
+      stream << "top" << N << " accuracy: " << (correct_guesses * 100.0) / (batchSize) << " %" << std::endl;
+    };
+
+    accuracy_file << "====================" << std::endl;
+
+    // Always report top-1; report top-k (k = min(5, classes - 1)) only when
+    // there are at least 3 classes, since top-5 is meaningless otherwise.
+    top_n_string(accuracy_file, top1_correct_guesses, 1);
+    top_n_string(std::cout, top1_correct_guesses, 1);
+    if (2 < img_output_size && img_output_size < 6) {
+      top_n_string(accuracy_file, top5_correct_guesses, img_output_size - 1);
+      top_n_string(std::cout, top5_correct_guesses, img_output_size - 1);
+    } else if (6 <= img_output_size) {
+      top_n_string(accuracy_file, top5_correct_guesses, 5);
+      top_n_string(std::cout, top5_correct_guesses, 5);
+    }
+    return true;
+  }
+
+  /// Same scoring as the overload above, but takes the scores directly from
+  /// runtime in a vector instead of reading them from a file. This is the
+  /// overload the dla benchmark actually uses.
+  /// @param groundtruth_loc path to a file with one integer label per image.
+  /// @param results flat vector of batchSize * img_output_size scores.
+  /// @param batchSize number of images covered by the results vector.
+  /// @return true on success; throws on I/O or label-parse failures.
+  static bool get_top_results(const std::string groundtruth_loc, std::vector<float> results, uint32_t batchSize) {
+    // This function takes the output results directly from runtime in a vector
+    // The dla benchmark currently uses this version of get_top_results
+    const std::string accuracy_results_loc = "accuracy_report.txt";
+    std::ofstream accuracy_file(accuracy_results_loc);
+
+    if (!accuracy_file.is_open()) {
+      throw std::invalid_argument("Unable to open accuracy file.");
+    }
+
+    std::ifstream groundtruth_file(groundtruth_loc);
+    int groundtruth_lineno = 0;
+
+    if (!groundtruth_file.is_open()) {
+      throw std::invalid_argument("Unable to open groundtruth file.");
+    }
+
+    if (results.size() % batchSize != 0) {
+      std::cout << "Results size = " << results.size() << " Batch size = " << batchSize << std::endl;
+      throw std::invalid_argument("Results size is not a multiple of batch size");
+    }
+
+    // (category index, probability/score) pair. See the file-based overload
+    // above for detailed commentary on the identical logic below.
+    typedef std::pair<int, float> CatProbPair;
+    const int img_output_size = results.size() / batchSize;
+    uint32_t top1_correct_guesses = 0;
+    uint32_t top5_correct_guesses = 0;
+    const auto top_n = fmin(5, img_output_size);
+    for (uint32_t img = 0; img < batchSize; img++) {
+      accuracy_file << "image " << img << " top 5:" << std::endl;
+
+      const auto start_addr = img_output_size * img;
+      std::vector<CatProbPair> top5;
+      for (int i = 0; i < top_n; i++) {
+        top5.push_back(std::make_pair(i, results[start_addr + i]));
+      }
+
+      // Running top-N selection: replace the current minimum with any larger score.
+      for (int i = 5; i < img_output_size; i++) {
+        const auto e = results[start_addr + i];
+        auto min_ele = &top5.at(0);
+        for (size_t j = 1; j < top5.size(); j++) {
+          if (top5.at(j).second < min_ele->second) {
+            min_ele = &top5.at(j);
+          }
+        }
+        if (e > min_ele->second) {
+          *min_ele = std::make_pair(i, e);
+        }
+      }
+
+      // sort descending
+      std::sort(
+          top5.begin(), top5.end(), [](const CatProbPair& a, const CatProbPair& b) { return a.second > b.second; });
+      for (const auto& pair : top5) {
+        accuracy_file << pair.first << " : " << pair.second << std::endl;
+      }
+      std::string line;
+      std::getline(groundtruth_file, line);
+      ++groundtruth_lineno;
+      int truth;
+      try {
+        truth = std::stoi(line);
+      } catch (const std::invalid_argument& ia) {
+        THROW_IE_EXCEPTION << "Unable to parse line " << groundtruth_lineno << " "
+                           << "of the ground truth file " << groundtruth_loc;
+      }
+      accuracy_file << truth << " : truth" << std::endl;
+      top1_correct_guesses += top5.at(0).first == truth;
+
+      int i = 1;
+      for (const auto& guess : top5) {
+        if (guess.first == truth && i < img_output_size) {
+          top5_correct_guesses += 1;
+          break;
+        }
+        i += 1;
+      }
+    }
+
+    // Writes "topN accuracy: X %" to the given stream.
+    const auto top_n_string = [&](std::ostream& stream, const double correct_guesses, const uint32_t N) {
+      stream << "top" << N << " accuracy: " << (correct_guesses * 100.0) / (batchSize) << " %" << std::endl;
+    };
+
+    accuracy_file << "====================" << std::endl;
+
+    top_n_string(accuracy_file, top1_correct_guesses, 1);
+    top_n_string(std::cout, top1_correct_guesses, 1);
+    if (2 < img_output_size && img_output_size < 6) {
+      top_n_string(accuracy_file, top5_correct_guesses, img_output_size - 1);
+      top_n_string(std::cout, top5_correct_guesses, img_output_size - 1);
+    } else if (6 <= img_output_size) {
+      top_n_string(accuracy_file, top5_correct_guesses, 5);
+      top_n_string(std::cout, top5_correct_guesses, 5);
+    }
+
+    return true;
+  }
+};
diff --git a/python/openvino/runtime/dla_benchmark/utils.cpp b/python/openvino/runtime/dla_benchmark/utils.cpp
new file mode 100644
index 0000000..066d234
--- /dev/null
+++ b/python/openvino/runtime/dla_benchmark/utils.cpp
@@ -0,0 +1,689 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+// Description: Utility functions handling command line arguments and network input info for DLA's runtime.
+// Loosely based off OpenVino's benchmark_app/utils.cpp
+// [openvinotoolkit/openvino › samples/cpp/benchmark_app/utils.cpp]
+// Future OpenVino uplifts should refer to the file listed above.
+
+#include <format_reader_ptr.h>
+#include <gflags/gflags.h>
+
+#include <algorithm>
+#include <map>
+#include <string>
+#include <utility>
+#include <vector>
+#include <functional>
+#include <samples/common.hpp>
+#include <samples/slog.hpp>
+
+#include "dla_stl_utils.h"
+#include "utils.hpp"
+
+/**
+ * @brief Namespace dla_benchmark contains utility functions for working with network inputs.
+ */
+namespace dla_benchmark {
+
+/**
+ * @brief Determines whether this input looks like an image tensor.
+ *
+ * An input is treated as an image when its layout is 4-D image-like
+ * ("NCHW" or "NHWC") and it carries either 1 (greyscale) or 3 (RGB/BGR)
+ * channels.
+ *
+ * @return True if the layout is for an image, False otherwise.
+ */
+bool InputInfo::IsImage() const {
+  const bool image_like_layout = (layout == "NCHW") || (layout == "NHWC");
+  if (!image_like_layout) {
+    return false;
+  }
+  const size_t channels = GetChannels();
+  return channels == 1 || channels == 3;
+}
+
+/**
+ * @brief Determines whether this input carries image metadata rather than pixels.
+ *
+ * Image-info inputs use an "NC" layout with at least two channels
+ * (typically height, width and optionally scale).
+ *
+ * @return True if the layout is for image information, False otherwise.
+ */
+bool InputInfo::IsImageInfo() const {
+  if (layout == "NC") {
+    return GetChannels() >= 2;
+  }
+  return false;
+}
+
+/**
+ * @brief Determines whether this input looks like a video tensor.
+ *
+ * An input is treated as video when its layout is 5-D depth-bearing
+ * ("NCDHW" or "NDHWC") and it has exactly 3 channels.
+ *
+ * @return True if the layout is for video data, False otherwise.
+ */
+bool InputInfo::IsVideo() const {
+  const bool video_like_layout = (layout == "NCDHW") || (layout == "NDHWC");
+  return video_like_layout && (GetChannels() == 3);
+}
+
+// NOTE(review): the getters below index data_shape via the ov::layout::*_idx
+// helpers; presumably those helpers throw if the layout lacks the requested
+// dimension, making these unsafe to call without checking the layout first —
+// confirm against the OpenVINO ov::Layout documentation.
+/**
+ * @brief Gets the width dimension of the data shape based on the layout.
+ *
+ * @return The width dimension of the data shape.
+ */
+size_t InputInfo::GetWidth() const { return data_shape.at(ov::layout::width_idx(layout)); }
+
+/**
+ * @brief Gets the height dimension of the data shape based on the layout.
+ *
+ * @return The height dimension of the data shape.
+ */
+size_t InputInfo::GetHeight() const { return data_shape.at(ov::layout::height_idx(layout)); }
+
+/**
+ * @brief Gets the number of channels based on the layout.
+ *
+ * @return The number of channels.
+ */
+size_t InputInfo::GetChannels() const { return data_shape.at(ov::layout::channels_idx(layout)); }
+
+/**
+ * @brief Gets the batch size based on the layout.
+ *
+ * @return The batch size.
+ */
+size_t InputInfo::GetBatch() const { return data_shape.at(ov::layout::batch_idx(layout)); }
+
+/**
+ * @brief Gets the depth dimension of the data shape based on the layout.
+ *
+ * @return The depth dimension of the data shape.
+ */
+size_t InputInfo::GetDepth() const { return data_shape.at(ov::layout::depth_idx(layout)); }
+
+} // namespace dla_benchmark
+
+/**
+ * @brief Parses number of streams for each device from a string argument.
+ *
+ * Accepts either a bare value applied to every device (e.g. "4") or a
+ * per-device list (e.g. "FPGA:4,CPU:2").
+ *
+ * @param devices vector of supported DLA devices, ie FPGA, CPU
+ * @param values_string string arg of the format: <device1>:<value1>,<device2>:<value2>
+ * @return A map of device : number of streams
+ * @throws std::logic_error if an entry names a device not in @p devices.
+ * @throws std::runtime_error if an entry contains more than one ':'.
+ * @throws std::invalid_argument (from std::stoi) if a value is not numeric.
+ */
+std::map<std::string, uint32_t> ParseNStreamsValuePerDevice(const std::vector<std::string>& devices,
+                                                            const std::string& values_string) {
+  // Fix: dropped the misleading `values_string_upper` copy — it was never
+  // uppercased, so it only obscured that the raw argument is parsed as-is.
+  std::map<std::string, uint32_t> result;
+  auto device_value_strings = split(values_string, ',');
+  for (auto& device_value_string : device_value_strings) {
+    auto device_value_vec = split(device_value_string, ':');
+    if (device_value_vec.size() == 2) {
+      // "<device>:<nstreams>" form: value applies to exactly one device.
+      const auto& device_name = device_value_vec.at(0);
+      const auto& nstreams = device_value_vec.at(1);
+      auto it = std::find(devices.begin(), devices.end(), device_name);
+      if (it != devices.end()) {
+        result[device_name] = std::stoi(nstreams);
+      } else {
+        // nstreams is already a std::string; no temporary construction needed.
+        throw std::logic_error("Can't set nstreams value " + nstreams + " for device '" + device_name +
+                               "'! Incorrect device name!");
+      }
+    } else if (device_value_vec.size() == 1) {
+      // Bare "<nstreams>" form: the same value applies to every supported device.
+      uint32_t value = std::stoi(device_value_vec.at(0));
+      for (auto& device : devices) {
+        result[device] = value;
+      }
+    } else if (device_value_vec.size() != 0) {
+      throw std::runtime_error("Unknown string format: " + values_string);
+    }
+  }
+  return result;
+}
+
+/**
+ * @brief Parses CLI flag args -mean_values or -scale_values. Helper to GetInputsInfo()
+ *
+ * Parsing example: -mean_values data[255,255,255] is stored as data as the key, and a vector of 3 floats as the value
+ *
+ * The argument may contain several comma-separated entries, e.g.
+ * "in1[1,2,3],in2[4,5,6]". An entry with an empty name ("[1,2,3]") applies to
+ * every image input. Nested or unmatched brackets are not supported.
+ *
+ * @param arg raw string from CLI in the form of the example above
+ * @param inputs_info struct used to check that the input name exists in the graph
+ * @returns a map of input name and its respective mean/scale value vector
+ * @throws std::logic_error if leftover text cannot be parsed as an entry.
+ */
+std::map<std::string, std::vector<float>> ParseScaleOrMeanValues(const std::string& arg,
+                                                                 const dla_benchmark::InputsInfo& inputs_info) {
+  std::map<std::string, std::vector<float>> return_value;
+  // Create a copy of the input string for processing
+  std::string search_string = arg;
+  // Find the first '[' character in the string
+  auto start_pos = search_string.find_first_of('[');
+
+  // Each iteration consumes one "name[v1,v2,...]" entry from the front of
+  // search_string; the loop exits when no bracketed entry remains.
+  while (start_pos != std::string::npos) {
+    // Find the matching ']' character
+    auto end_pos = search_string.find_first_of(']');
+    if (end_pos == std::string::npos) break;
+    // Extract the input name and value string between '[' and ']'
+    const std::string input_name = search_string.substr(0, start_pos);
+    const std::string input_value_string = search_string.substr(start_pos + 1, end_pos - start_pos - 1);
+    // Split the input value string into a vector of floats using a custom function SplitFloat
+    // NOTE(review): SplitFloat uses std::stof, so non-numeric values throw
+    // std::invalid_argument rather than being reported as a parse error here.
+    std::vector<float> input_value = SplitFloat(input_value_string, ',');
+    if (!input_name.empty()) {
+      // If the input name is not empty and exists in the inputs_info map, store the value
+      if (inputs_info.count(input_name)) {
+        return_value[input_name] = input_value;
+      } else {
+        // Ignore wrong input names but gives a warning
+        std::string network_input_names = "";
+        for (auto it = inputs_info.begin(); it != inputs_info.end(); ++it) {
+          network_input_names += it->first;
+          if (std::next(it) != inputs_info.end()) {
+            network_input_names += ", ";
+          }
+        }
+        slog::warn << "Scale values or mean values are applied to '" << input_name << "' but '" << input_name
+                   << "' does not exist in network inputs. The available network inputs are: " << network_input_names
+                   << slog::endl;
+      }
+    } else {
+      // If the input name is empty, apply the value to all image inputs in inputs_info
+      // (nameless form is terminal: anything after it is ignored).
+      for (auto& item : inputs_info) {
+        if (item.second.IsImage()) return_value[item.first] = input_value;
+      }
+      // Clear the search string and exit the loop
+      search_string.clear();
+      break;
+    }
+    // Remove processed substring from the search string
+    search_string = search_string.substr(end_pos + 1);
+    // If the string is empty or doesn't start with a comma, exit the loop
+    if (search_string.empty() || search_string.front() != ',') {
+      break;
+    }
+    // Remove the leading comma and search for the next '[' character
+    search_string = search_string.substr(1);
+    start_pos = search_string.find_first_of('[');
+  }
+  // If there are remaining characters in the search string, it's an error
+  if (!search_string.empty()) {
+    throw std::logic_error("Can't parse input parameter string: " + arg);
+  }
+
+  return return_value;
+}
+
+/**
+ * @brief Splits command-line input arguments containing multiple image file paths
+ * into separate vectors based on a specified separator.
+ * Modified from parseInputFilesArguments() in [openvinotoolkit/openvino ›
+ * inference-engine/samples/common/utils/src/args_helper.cpp]
+ *
+ * @param net_size The number of networks (multigraph functionality).
+ * @return A vector of vectors, where each inner vector contains image file paths
+ * corresponding to a specific network graph. Contains a single empty vector when
+ * no -i/--images argument was given; empty on a size mismatch (after logging).
+ */
+std::vector<std::vector<std::string>> SplitMultiInputFilesArguments(size_t net_size) {
+  std::vector<std::vector<std::string>> paths;
+  std::vector<std::string> args = gflags::GetArgvs();
+  const auto is_image_arg = [](const std::string& s) { return s == "-i" || s == "--images"; };
+  // Fix: guard against empty argv entries — calling front() on an empty
+  // std::string is undefined behavior.
+  const auto is_arg = [](const std::string& s) { return !s.empty() && s.front() == '-'; };
+  const auto img_start = std::find_if(begin(args), end(args), is_image_arg);  // first `-i` or `--images` arg
+  if (img_start == end(args)) {
+    // By default: if no -i argument is specified, then we should generate random
+    // input image data. The fillBlobs() function will do that later when it sees
+    // an empty vector for its current network.
+    paths.push_back(std::vector<std::string>());
+    return paths;
+  }
+  const auto img_begin = std::next(img_start);
+  // Everything up to the next flag-like token belongs to the image list.
+  const auto img_end = std::find_if(img_begin, end(args), is_arg);
+  for (auto img = img_begin; img != img_end; ++img) {
+    auto multiFiles = split(*img, MULTIGRAPH_SEP);  // split this images arguments
+
+    // Each token must supply either one path (shared) or exactly one per graph.
+    if (multiFiles.size() != 1 && multiFiles.size() != net_size) {
+      slog::err << "Size of Input argument " << multiFiles.size() << " mismatch graph size " << net_size << " : "
+                << *img << slog::endl;
+      paths.clear();
+      break;
+    }
+    for (size_t i = 0; i < multiFiles.size(); i++)
+      slog::info << "Reading " << multiFiles[i] << " for graph index " << i << slog::endl;
+    // Grow the per-graph buckets lazily to the widest token seen so far.
+    while (paths.size() < multiFiles.size()) paths.push_back(std::vector<std::string>());
+
+    for (size_t i = 0; i < multiFiles.size(); i++) {
+      paths[i].push_back(multiFiles[i]);
+    }
+  }
+  return paths;
+}
+
+/**
+ * @brief Returns the stem of a file path.
+ *
+ * The stem is the base file name with its extension removed: directory
+ * components (everything up to the last '/') are stripped first, then
+ * everything from the last '.' onwards is dropped.
+ *
+ * @param path The input file path.
+ * @return The stem of the file, excluding the extension.
+ */
+std::string GetStem(std::string path) {
+  // Strip any directory components ('/'-separated).
+  const auto slash_pos = path.rfind('/');
+  if (slash_pos != std::string::npos) {
+    path.erase(0, slash_pos + 1);
+  }
+  // Drop the extension: everything from the last '.' to the end.
+  const auto dot_pos = path.rfind('.');
+  if (dot_pos != std::string::npos) {
+    path.erase(dot_pos);
+  }
+  return path;
+}
+
+/**
+ * @brief Splits a string into substrings using a specified delimiter.
+ *
+ * Consecutive delimiters yield empty tokens; a trailing delimiter does not.
+ *
+ * @param s The input string to be split.
+ * @param delim The delimiter character used to separate the substrings.
+ * @return A vector of strings containing the substrings from the input string.
+ */
+std::vector<std::string> split(const std::string& s, char delim) {
+  std::vector<std::string> tokens;
+  std::stringstream token_stream(s);
+  for (std::string token; std::getline(token_stream, token, delim);) {
+    tokens.push_back(token);
+  }
+  return tokens;
+}
+
+/**
+ * @brief Splits a string of floats into floats using a specified delimiter.
+ *
+ * @param s The input string to be split.
+ * @param delim The delimiter character used to separate the floats.
+ * @return A vector of floats containing the floats from the input string.
+ * @throws std::invalid_argument (from std::stof) if a token is not numeric.
+ */
+std::vector<float> SplitFloat(const std::string& s, char delim) {
+  std::vector<float> values;
+  std::stringstream token_stream(s);
+  for (std::string token; std::getline(token_stream, token, delim);) {
+    values.push_back(std::stof(token));
+  }
+  return values;
+}
+
+/**
+ * @brief Parses a list of devices from a string
+ *
+ * Drops an optional "<prefix>:" (e.g. "HETERO:FPGA,CPU" -> "FPGA,CPU"), then
+ * splits on ',' and strips per-device suffixes such as ".0" or "(...)".
+ *
+ * @param device_string The input string to be split. The delimiter is ':'
+ * @return A vector of strings containing the devices
+ */
+std::vector<std::string> ParseDevices(const std::string& device_string) {
+  std::string device_list = device_string;
+  const auto colon_pos = device_list.find(':');
+  if (colon_pos != std::string::npos) {
+    device_list = device_list.substr(colon_pos + 1);
+  }
+  auto devices = split(device_list, ',');
+  for (auto& device : devices) {
+    device = device.substr(0, device.find_first_of(".("));
+  }
+  return devices;
+}
+
+/**
+ * @brief Gets information about a network's inputs.
+ *
+ * Reads all input nodes from a network, determines tensor layout, shapes, precision, etc.
+ * Saves into dla::benchmark::InputsInfo which maps each input info struct to an input name.
+ *
+ * @param batch_size Network batch size from the user via the batch size flag (0 = leave unchanged)
+ * @param input_info Vector of input nodes. Obtained from ov::Model.inputs() or ov::CompiledModel.inputs()
+ * @param reshape_required out-param set true when the model must be reshaped to honor batch_size
+ * @param is_binary_data User flag indicating that the data is binary data and not image data
+ *        (currently unused; see the precision note in the body)
+ * @param mean_string CLI arg specifying image mean value. Example: input[255,255,255]. (Optional)
+ * @param scale_string CLI arg specifying image scale value. Example: input[255,255,255]. (Optional)
+ * @return dla::benchmark::InputsInfo which is a map of input names and its respective input information
+ * @throws std::runtime_error on dynamic shapes or an unsatisfiable -b request.
+ * @throws std::logic_error on an image input with unsupported channel count.
+ */
+dla_benchmark::InputsInfo GetInputsInfo(const size_t batch_size,
+                                        const std::vector<ov::Output<const ov::Node>>& input_info,
+                                        bool& reshape_required,
+                                        const bool is_binary_data,
+                                        const std::string& mean_string = "",
+                                        const std::string& scale_string = "") {
+  reshape_required = false;
+  dla_benchmark::InputsInfo info_map;
+
+  bool is_there_at_least_one_batch_dim = false;
+  for (auto& item : input_info) {
+    dla_benchmark::InputInfo info;
+    const std::string& name = item.get_any_name();
+
+    // Layout, as set through the model optimizer (may be empty).
+    info.layout = dynamic_cast<const ov::op::v0::Parameter&>(*item.get_node()).get_layout();
+
+    // Calculating default layout values if needed.
+    // (Fix: removed a dead outer `newLayout` variable that was immediately
+    // shadowed by the declaration inside this block.)
+    if (info.layout.empty()) {
+      const size_t rank = item.get_partial_shape().size();
+      const std::string newLayout = dla::util::getTensorLayout(rank);
+      if (newLayout != "") {
+        info.layout = ov::Layout(newLayout);
+        slog::warn << name << ": layout is not set explicitly through model optimizer"
+                   << (newLayout != "" ? std::string(", so it is defaulted to ") + newLayout : "")
+                   << ". It is recommended to explicity set layout via model optmizer." << slog::endl;
+      }
+    }
+
+    // Partial Shape
+    info.partial_shape = item.get_partial_shape();
+    info.data_shape = info.partial_shape.get_shape();
+
+    // DLA only supports static shapes
+    if (info.partial_shape.is_dynamic()) {
+      throw std::runtime_error(
+          "DLA only supports static shapes. Check your model and make sure all shapes are defined (No dims of -1).");
+    }
+
+    // Precision
+    // Edwinzha: setting input data to u8 for image data instead of the defined precision in .xml
+    // leads to accuracy loss that didn't exist prior to API 2.0. Should investigate or remove this condition.
+    // info.IsImage() && !is_binary_data ? ov::element::u8 : item.get_element_type();
+    info.type = item.get_element_type();
+
+    // Update shape with batch if needed (only in static shape case)
+    // Update blob shape only not affecting network shape to trigger dynamic batch size case
+    if (batch_size != 0) {
+      if (ov::layout::has_batch(info.layout)) {
+        std::size_t batch_index = ov::layout::batch_idx(info.layout);
+        if (info.data_shape.at(batch_index) != batch_size) {
+          info.partial_shape[batch_index] = batch_size;
+          info.data_shape[batch_index] = batch_size;
+          reshape_required = true;
+          is_there_at_least_one_batch_dim = true;
+        }
+      } else {
+        slog::warn << "Input '" << name
+                   << "' doesn't have batch dimension in layout. -b option will be ignored for this input."
+                   << slog::endl;
+      }
+    }
+    info_map[name] = info;
+  }
+
+  if (batch_size > 1 && !is_there_at_least_one_batch_dim) {
+    throw std::runtime_error(
+        "-b option is provided in command line, but there's no inputs with batch(B) "
+        "dimension in input layout, so batch cannot be set. "
+        "You may specify layout explicitly using -layout option.");
+  }
+
+  // Update scale and mean
+  std::map<std::string, std::vector<float>> scale_map = ParseScaleOrMeanValues(scale_string, info_map);
+  std::map<std::string, std::vector<float>> mean_map = ParseScaleOrMeanValues(mean_string, info_map);
+
+  // Default scale/mean are identity; CLI-provided values override per input.
+  for (auto& item : info_map) {
+    dla_benchmark::InputInfo& info = item.second;
+    if (info.IsImage()) {
+      if (info.GetChannels() == 3) {  // Image is RGB or BGR
+        info.scale_values.assign({1, 1, 1});
+        info.mean_values.assign({0, 0, 0});
+      } else if (info.GetChannels() == 1) {  // Image is greyscale
+        info.scale_values.assign({1});
+        info.mean_values.assign({0});
+      } else {
+        std::string err =
+            "Input is image but is not of 3 channels (RGB, BGR) or 1 channel (Greyscale). Cannot assign mean and/or "
+            "scale values";
+        throw std::logic_error(err);
+      }
+      if (scale_map.count(item.first)) {
+        info.scale_values = scale_map.at(item.first);
+      }
+      if (mean_map.count(item.first)) {
+        info.mean_values = mean_map.at(item.first);
+      }
+    }
+  }
+  return info_map;
+}
+
+/**
+ * @brief Gets information about a network's inputs (AOT flow).
+ *
+ * Thin convenience wrapper over the full GetInputsInfo overload for the AOT
+ * flow, where reshaping is handled by the compiler and the reshape flag is
+ * therefore irrelevant to the caller.
+ *
+ * @param batch_size Network batch size from the user via the batch size flag
+ * @param input_info Vector of input nodes. Obtained from ov::Model.inputs() or ov::CompiledModel.inputs()
+ * @param is_binary_data User flag indicating that the data is binary data and not image data
+ * @return dla::benchmark::InputsInfo which is a map of input names and its respective input information
+ */
+dla_benchmark::InputsInfo GetInputsInfo(const size_t batch_size,
+                                        const std::vector<ov::Output<const ov::Node>>& input_info,
+                                        const bool is_binary_data) {
+  bool ignored_reshape_flag = false;  // discarded: AOT flow never reshapes here
+  return GetInputsInfo(batch_size, input_info, ignored_reshape_flag, is_binary_data);
+}
+
+/**
+ * @brief Extracts the file extension from a given file name.
+ *
+ * The extension is everything after the last '.' in the name.
+ *
+ * @param name The file name from which to extract the extension.
+ * @return The file extension as a string, or an empty string if no extension is found.
+ */
+std::string GetExtension(const std::string& name) {
+  const auto dot_pos = name.rfind('.');
+  if (dot_pos == std::string::npos) {
+    return "";
+  }
+  return name.substr(dot_pos + 1);
+}
+
+/**
+ * @brief Filters a list of file paths by specified file extensions (case insensitive).
+ *
+ * @param file_paths A vector of file paths to be filtered.
+ * @param extensions A vector of file extensions to filter by (expected lowercase).
+ * @return A vector of filtered file paths that match the specified extensions.
+ */
+std::vector<std::string> FilterFilesByExtensions(const std::vector<std::string>& file_paths,
+                                                 const std::vector<std::string>& extensions) {
+  std::vector<std::string> matches;
+  for (const auto& path : file_paths) {
+    // Lowercase the extension so the comparison is case-insensitive.
+    std::string ext = GetExtension(path);
+    std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower);
+    const bool is_supported = std::find(extensions.begin(), extensions.end(), ext) != extensions.end();
+    if (is_supported) {
+      matches.push_back(path);
+    }
+  }
+  return matches;
+}
+
+/**
+ * @brief Dumps output tensor into result.txt. Mainly used for regtesting, only runs with -dump_output flag
+ *
+ * Each line holds one value (fixed, 6 decimal places) followed by its layout
+ * and multi-dimensional index; tensors of unrecognized rank get a flat index.
+ *
+ * @param output_tensor Output tensor to dump (assumed float32 — data<float>() below; TODO confirm callers never pass other precisions)
+ * @param output_node Output node corresponding to the output tensor to dump
+ * @param output_size Size of the output tensor
+ * @param result_file ofstream object corresponding to result.txt
+ */
+void DumpResultTxtFile(const ov::Tensor& output_tensor,
+                       const ov::Output<const ov::Node>& output_node,
+                       const unsigned int output_size,
+                       std::ofstream& result_file) {
+  // Dimension sizes default to 1 so the NCDHW index math below works for
+  // 2-D (NC), 4-D (NCHW) and 5-D (NCDHW) shapes alike.
+  size_t C = 1;
+  size_t H = 1;
+  size_t W = 1;
+  size_t D = 1;
+
+  // allow dumping the data as txt for all layouts, but not dumping layout if it's unknown
+  bool unknown_layout = false;
+
+  const ov::Layout& layout = ov::layout::get_layout(output_node);
+  const ov::Shape& shape = output_tensor.get_shape();
+  const std::string& name = output_node.get_any_name();
+  const size_t num_dims = shape.size();
+  // Total element count, computed from the shape (product of all dims).
+  const size_t tensor_size = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<size_t>());
+  if (num_dims == 2) {
+    C = shape[1];
+  } else if (num_dims == 4) {
+    C = shape[1];
+    H = shape[2];
+    W = shape[3];
+  } else if (num_dims == 5) {
+    C = shape[1];
+    D = shape[2];
+    H = shape[3];
+    W = shape[4];
+  } else {
+    unknown_layout = true;
+  }
+
+  const auto* data = output_tensor.data<float>();
+  if (data == nullptr) {
+    throw std::runtime_error("Unable to dump result tensors because tensor data is NULL");
+  }
+  if (!result_file.is_open()) {
+    // Fix coverity, this should always be open from dla_benchmark/main.cpp
+    throw std::runtime_error("Unable to dump result tensors due to result ofstream not being open!");
+  }
+  // Save the original formatting flags for coverity
+  std::ios_base::fmtflags original_flags = result_file.flags();
+
+  for (size_t idx = 0; idx < tensor_size; ++idx) {
+    // Explicity set precision for coverity
+    result_file << std::fixed << std::setprecision(6) << data[idx] << std::defaultfloat;
+    if (!unknown_layout) {
+      // Decompose the flat index into NCDHW coordinates (unused dims are 1).
+      size_t n = idx / (C * D * H * W);
+      size_t c = (idx / (D * H * W)) % C;
+      size_t d = (idx / (H * W)) % D;
+      size_t h = (idx / W) % H;
+      size_t w = idx % W;
+      result_file <<" # Layout: " << layout.to_string() << "; ";
+      result_file << "Index: " << n << " " << c;
+      if (num_dims == 4) {
+        result_file << " " << h << " " << w;
+      }
+      if (num_dims == 5) {
+        result_file << " " << d << " " << h << " " << w;
+      }
+    } else {
+      result_file << " # Index: " << idx;
+    }
+
+    // NOTE(review): the loop bound is the computed tensor_size but the "end of"
+    // marker compares against the output_size parameter; presumably callers
+    // always pass output_size == tensor_size — confirm, otherwise the end
+    // marker is misplaced or never printed.
+    if (idx == 0) {
+      result_file << " start of " << name;
+    } else if (idx == output_size - 1) {
+      result_file << " end of " << name << ", see result_tensor_boundaries.txt for details";
+    }
+    result_file << std::endl;
+  }
+  // restore orginal formatting flags
+  result_file.flags(original_flags);
+}
+
+/**
+ * @brief Dumps output tensor as binaries into result.bin.
+ * Can be useful in postprocessing of the result tensor using Python numpy,
+ * or when the tensor layout is not supported by DLA.
+ *
+ * @param output_tensor Output tensor to dump (assumed float32 — data<float>() below; TODO confirm callers never pass other precisions)
+ * @param result_file ofstream object corresponding to result.bin
+ * @throws std::runtime_error if the tensor data pointer is NULL.
+ */
+void DumpResultBinFile(const ov::Tensor& output_tensor,
+                       std::ofstream& result_file) {
+  const ov::Shape& shape = output_tensor.get_shape();
+  // size_t{1} init avoids int overflow in the product for very large tensors.
+  const size_t total_size = std::accumulate(shape.begin(), shape.end(), size_t{1}, std::multiplies<size_t>());
+  const auto* data = output_tensor.data<float>();
+  if (data == nullptr) {
+    throw std::runtime_error("Unable to dump result tensors because tensor data is NULL");
+  }
+  // Write the whole tensor with a single bulk write instead of one stream
+  // operation per float: identical bytes on disk, far less overhead.
+  result_file.write(reinterpret_cast<const char*>(data), static_cast<std::streamsize>(total_size * sizeof(float)));
+}
+
+/**
+ * @brief Dumps inference metadata as a JSON file into result_meta.json
+ * Useful for postprocessing and reviewing inference arguments
+ *
+ * NOTE(review): string fields are emitted verbatim, so names/paths containing
+ * '"' or '\' would produce invalid JSON — confirm inputs are always benign.
+ *
+ * @param metadata Meta data to dump
+ * @param result_file ofstream object corresponding to result_meta.json
+ * @throws std::logic_error if scale_values and mean_values sizes differ.
+ */
+void DumpResultMetaJSONFile(const dla_benchmark::InferenceMetaData& metadata,
+                            std::ofstream& result_file) {
+  result_file << "{\n";
+  // batch size
+  result_file << "\t\"batch_size\": " << metadata.batch_size << ",\n";
+
+  // niter
+  result_file << "\t\"niter\": " << metadata.niter << ",\n";
+
+  // nireq
+  result_file << "\t\"nireq\": " << metadata.nireq << ",\n";
+
+  // groundtruth loc
+  result_file << "\t\"groundtruth_loc\": \"" << metadata.groundtruth_loc << "\",\n";
+
+  // input info: model_input_info
+  result_file << "\t\"input_info\": [\n";
+  long unsigned int idx = 0;
+  for (const auto &name_input_pair : metadata.model_input_info) {
+    // to collect scale_values and mean_values
+    std::ostringstream oss_scale_vals, oss_mean_vals;
+    unsigned int scale_values_size = name_input_pair.second.scale_values.size();
+    if (scale_values_size != name_input_pair.second.mean_values.size()) {
+      throw std::logic_error("scale_values and mean_values should always have the same size");
+    }
+    oss_scale_vals << "[";
+    oss_mean_vals << "[";
+    for (long unsigned int i = 0; i < scale_values_size; i++) {
+      if (i > 0) {
+        oss_scale_vals << ",";
+        oss_mean_vals << ",";
+      }
+      oss_scale_vals << name_input_pair.second.scale_values[i];
+      oss_mean_vals << name_input_pair.second.mean_values[i];
+    }
+    // Bug fix: close the brackets unconditionally. The original emitted "]"
+    // only from inside the loop, so empty scale/mean vectors produced an
+    // unterminated "[" and malformed JSON.
+    oss_scale_vals << "]";
+    oss_mean_vals << "]";
+    result_file << "\t\t{\"name\": \"" << name_input_pair.first << "\", \"shape\": \""
+                << name_input_pair.second.data_shape.to_string() << "\", \"scale_values\": \""
+                << oss_scale_vals.str() << "\", \"mean_values\": \""
+                << oss_mean_vals.str() << "\", \"layout\": \""
+                << name_input_pair.second.layout.to_string() << "\"}";
+    // No trailing comma after the last entry (JSON forbids it).
+    if (idx == metadata.model_input_info.size() - 1) {
+      result_file << "\n";
+    } else {
+      result_file << ",\n";
+    }
+    idx += 1;
+  }
+  result_file << "\t],\n";
+
+  // output info: model_output_info preserves the order multi-tensor output
+  result_file << "\t\"output_info\": [\n";
+  for (long unsigned int i = 0; i < metadata.model_output_info.size(); i++) {
+    dla_benchmark::OutputInfo info = metadata.model_output_info[i];
+    result_file << "\t\t{\"name\": \"" << info.name << "\", \"shape\": \"" << info.shape.to_string() << "\"}";
+    if (i == metadata.model_output_info.size() - 1) {
+      result_file << "\n";
+    } else {
+      result_file << ",\n";
+    }
+  }
+  result_file << "\t],\n";
+
+  // input files
+  result_file << "\t\"input_files\": [\n";
+  for (long unsigned int i = 0; i < metadata.input_files.size(); i++) {
+    std::string input_file = metadata.input_files[i];
+    result_file << "\t\t\"" << input_file << "\"";
+    if (i == metadata.input_files.size() - 1) {
+      result_file << "\n";
+    } else {
+      result_file << ",\n";
+    }
+  }
+  result_file << "\t]\n";
+
+  result_file << "}\n";
+}
diff --git a/python/openvino/runtime/dla_benchmark/utils.hpp b/python/openvino/runtime/dla_benchmark/utils.hpp
new file mode 100644
index 0000000..5ca7834
--- /dev/null
+++ b/python/openvino/runtime/dla_benchmark/utils.hpp
@@ -0,0 +1,249 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+// Description: Utility functions handling command line arguments and network input info for DLA's runtime.
+// Loosely based off OpenVino's benchmark_app/utils.hpp
+// [openvinotoolkit/openvino › samples/cpp/benchmark_app/utils.hpp]
+// Future OpenVino uplifts should refer to the file listed above.
+
+#pragma once
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "dla_runtime_log.h"
+
+#define MULTIGRAPH_SEP ',' /* separator used in the argument line when multigraph is activated */
+
+// Constants
+constexpr size_t BYTE_TO_MEGABYTE = 1024 * 1024;  // bytes-per-megabyte conversion factor
+constexpr size_t MAX_COUT_WITHOUT_VERBOSE = 20;  // How many couts can be printed w/o VERBOSE=1
+
+// Clock/duration aliases used for timing measurements.
+// NOTE(review): std::chrono is used here but <chrono> is not included directly in this
+// header — it relies on a transitive include; consider adding #include <chrono>.
+typedef std::chrono::high_resolution_clock Time;
+typedef std::chrono::nanoseconds ns;
+
+#ifdef USE_OPENCV
+// This is the full list of image extensions supported by the opencv reader:
+// "bmp", "dib", "jpeg", "jpg", "jpe",
+// "jp2", "png", "pbm", "pgm", "ppm",
+// "sr", "ras", "tiff", "tif"
+// However, the ones in the list below are already
+// tested using the synthetic graphs infrastructure.
+// Only jpeg, jpg, jpe extensions of very high quality
+// and with subsampling disabled were tested.
+// TODO(meldafra): Check why the remaining extensions are not passing and fix them
+static const std::vector<std::string> supported_image_extensions = {
+ "bmp", "png", "pbm", "pgm", "ppm", "jpeg", "jpg", "jpe"};
+
+#else
+static const std::vector<std::string> supported_image_extensions = {"bmp"};
+#endif
+static const std::vector<std::string> supported_binary_extensions = {"bin"};
+static const std::vector<std::string> supported_video_extensions = {"mp4", "gif"};
+
+/**
+ * @brief Namespace dla_benchmark contains utility functions for working with network inputs.
+ */
+namespace dla_benchmark {
+// Aggregated metadata describing a single model input tensor.
+struct InputInfo {
+  ov::element::Type type;           // Element precision of the input tensor
+  ov::PartialShape partial_shape;   // Shape as declared by the model (may contain dynamic dims)
+  ov::Shape data_shape;             // Concrete shape used for inference
+  ov::Layout layout;                // Tensor layout, e.g. NCHW/NHWC
+  std::vector<float> scale_values;  // Per-channel scale values (divisors) applied to input data
+  std::vector<float> mean_values;   // Per-channel mean values subtracted from input data
+  bool IsImage() const;
+  bool IsImageInfo() const;
+  bool IsVideo() const;
+  size_t GetWidth() const;
+  size_t GetHeight() const;
+  size_t GetChannels() const;
+  size_t GetBatch() const;
+  size_t GetDepth() const;
+};
+
+// Name and shape of a single model output tensor.
+struct OutputInfo {
+  std::string name;
+  ov::Shape shape;
+};
+
+using InputsInfo = std::map<std::string, InputInfo>;
+using OutputsInfoVec = std::vector<OutputInfo>;
+using PartialShapes = std::map<std::string, ngraph::PartialShape>;
+
+// Everything needed to describe one inference run; serialized to result_meta.json.
+struct InferenceMetaData {
+  std::vector<std::string> input_files;  // Input files used for inferencing
+  std::string groundtruth_loc;  // The directory that contains the groundtruth files
+  unsigned int batch_size;  // The batch size used in the inference
+  unsigned int niter;  // The number of iterations set by -niter in dla_benchmark
+  unsigned int nireq;  // The number of inference requests set by -nireq in dla_benchmark
+  dla_benchmark::InputsInfo model_input_info;  // the metadata of the model input
+  dla_benchmark::OutputsInfoVec model_output_info;  // the metadata of the model output
+};
+}  // namespace dla_benchmark
+
+/**
+ * @brief Parses number of streams for each device from a string argument.
+ *
+ * @param devices vector of supported DLA devices, ie FPGA, CPU
+ * @param values_string string arg of the format: <device1>:<value1>,<device2>:<value2>
+ * @return A map of device : number of streams
+ */
+std::map<std::string, uint32_t> ParseNStreamsValuePerDevice(const std::vector<std::string>& devices,
+ const std::string& values_string);
+/**
+ * @brief Splits a string into substrings using a specified delimiter.
+ *
+ * @param s The input string to be split.
+ * @param delim The delimiter character used to separate the substrings.
+ * @return A vector of strings containing the substrings from the input string.
+ */
+std::vector<std::string> split(const std::string& s, char delim);
+
+/**
+ * @brief Splits a string of floats into floats using a specified delimiter.
+ *
+ * @param s The input string to be split.
+ * @param delim The delimiter character used to separate the floats.
+ * @return A vector of floats containing the floats from the input string.
+ */
+std::vector<float> SplitFloat(const std::string& s, char delim);
+
// To enable multigraph operations on all CNNNetworks, inputs are mutable.
/**
 * @brief Applies `fn` to every element of `inputs` and collects the results.
 *
 * @tparam T Result element type (must be given explicitly).
 * @tparam S Source element type (deduced).
 * @tparam Functor Callable taking S& and returning a value convertible to T.
 * @param inputs Vector whose elements may be mutated by `fn`.
 * @param fn Transformation applied to each element.
 * @return Vector of transformed results, in input order.
 */
template <typename T, typename S, typename Functor>
inline std::vector<T> VectorMap(std::vector<S>& inputs, Functor fn) {
  std::vector<T> results;
  results.reserve(inputs.size());  // one allocation instead of repeated growth
  for (auto& input : inputs) results.push_back(fn(input));
  return results;
}
+
// Supports temporary object or constant expression.
/**
 * @brief Const overload of VectorMap: applies `fn` to each element of `inputs`
 *        without mutating them and collects the results.
 *
 * @tparam T Result element type (must be given explicitly).
 * @tparam S Source element type (deduced).
 * @tparam Functor Callable taking const S& and returning a value convertible to T.
 * @param inputs Read-only source vector.
 * @param fn Transformation applied to each element.
 * @return Vector of transformed results, in input order.
 */
template <typename T, typename S, typename Functor>
inline std::vector<T> VectorMap(const std::vector<S>& inputs, Functor fn) {
  std::vector<T> results;
  results.reserve(inputs.size());  // one allocation instead of repeated growth
  for (auto& input : inputs) results.push_back(fn(input));
  return results;
}
+
/**
 * @brief Applies `fn(element, index)` to each element of `inputs` and collects
 *        the results; the index is the element's 0-based position.
 *
 * @tparam T Result element type (must be given explicitly).
 * @tparam S Source element type (deduced).
 * @tparam Functor Callable taking (const S&, uint32_t) and returning a value convertible to T.
 * @param inputs Read-only source vector.
 * @param fn Transformation applied to each (element, index) pair.
 * @return Vector of transformed results, in input order.
 */
template <typename T, typename S, typename Functor>
inline std::vector<T> vectorMapWithIndex(const std::vector<S>& inputs, Functor fn) {
  std::vector<T> results;
  results.reserve(inputs.size());  // one allocation instead of repeated growth
  uint32_t index = 0;
  for (auto& input : inputs) results.push_back(fn(input, index++));
  return results;
}
+
+/**
+ * @brief Splits command-line input arguments containing multiple image file paths
+ * into separate vectors based on a specified separator.
+ * Modified from parseInputFilesArguments() in [openvinotoolkit/openvino ›
+ * inference-engine/samples/common/utils/src/args_helper.cpp]
+ *
+ * @param net_size The number of networks (multigraph functionality).
+ * @return A vector of vectors, where each inner vector contains image file paths
+ * corresponding to a specific network graph.
+ */
+std::vector<std::vector<std::string>> SplitMultiInputFilesArguments(size_t net_size);
+
+/**
+ * @brief Returns the stem of a file path.
+ *
+ * The stem is the base name of the file without its extension. This function
+ * takes a file path as input and extracts the stem, which is the part of the
+ * file name before the last period ('.') character.
+ *
+ * @param path The input file path.
+ * @return The stem of the file, excluding the extension.
+ */
+std::string GetStem(std::string path);
+
+/**
+ * @brief Extracts the file extension from a given file name.
+ *
+ * @param name The file name from which to extract the extension.
+ * @return The file extension as a string, or an empty string if no extension is found.
+ */
+std::string GetExtension(const std::string& path);
+
+/**
+ * @brief Parses a list of devices from a string
+ *
+ * @param device_string The input string to be split. The delimiter is ':'
+ * @return A vector of strings containing the devices
+ */
+std::vector<std::string> ParseDevices(const std::string& device_string);
+
+/**
+ * @brief Gets information about a network's inputs.
+ *
+ * Reads all input nodes from a network, determines tensor layout, shapes, precision, etc.
+ * Saves into dla::benchmark::InputsInfo which maps each input info struct to an input name.
+ *
+ * @param batch_size Network batch size from the user via the batch size flag
+ * @param input_info Vector of input nodes. Obtained from ov::Model.inputs() or ov::CompiledModel.inputs()
+ * @param reshape_required boolean flag indicating that the model needs to be reshaped according to the batch size flag
+ * @param is_binary_data User flag indicating that the data is binary data and not image data
+ * @param mean_string CLI arg specifying image mean value. Example: input[255,255,255]. (Optional)
+ * @param scale_string CLI arg specifying image scale value. Example: input[255,255,255]. (Optional)
+ * @return dla::benchmark::InputsInfo which is a map of input names and its respective input information
+ */
+dla_benchmark::InputsInfo GetInputsInfo(const size_t batch_size,
+ const std::vector<ov::Output<const ov::Node>>& input_info,
+ bool& reshape_required,
+ const bool is_binary_data,
+ const std::string& mean_string,
+ const std::string& scale_string);
+
+/**
+ * @brief Gets information about a network's inputs.
+ *
+ * Reads all input nodes from a network, determines tensor layout, shapes, precision, etc.
+ * Saves into dla::benchmark::InputsInfo which maps each input info struct to an input name.
+ * Used in AOT flow where reshaping is not required (Handled by compiler)
+ *
+ * @param batch_size Network batch size from the user via the batch size flag
+ * @param input_info Vector of input nodes. Obtained from ov::Model.inputs() or ov::CompiledModel.inputs()
+ * @param is_binary_data User flag indicating that the data is binary data and not image data
+ * @return dla::benchmark::InputsInfo which is a map of input names and its respective input information
+ */
+dla_benchmark::InputsInfo GetInputsInfo(const size_t batch_size,
+ const std::vector<ov::Output<const ov::Node>>& input_info,
+ const bool isBinaryData);
+
+/**
+ * @brief Filters a list of file paths by specified file extensions (case insensitive).
+ *
+ * @param file_paths A vector of file paths to be filtered.
+ * @param extensions A vector of file extensions to filter by.
+ * @return A vector of filtered file paths that match the specified extensions.
+ */
+std::vector<std::string> FilterFilesByExtensions(const std::vector<std::string>& file_paths,
+ const std::vector<std::string>& extensions);
+
+// Helper function to dump result.txt with tensor indicies
+void DumpResultTxtFile(const ov::Tensor& output_tensor,
+ const ov::Output<const ov::Node>& output_node,
+ const unsigned int output_size,
+ std::ofstream& result_file);
+
+// Helper function to dump the output tensor as binaries in result.bin
+void DumpResultBinFile(const ov::Tensor& output_tensor,
+ std::ofstream& result_file);
+
+// Helper function to dump the inference metadata into result_meta.json
+void DumpResultMetaJSONFile(const dla_benchmark::InferenceMetaData& metadata,
+ std::ofstream& result_file);
+
+/**
+ * @brief Gets the appropriate DLA supported tensor layout from a node.
+ *
+ * @param node Node to determine the tensor layout. Obtained from ov::Model.inputs()/outputs()
+ * or ov::CompiledModel.inputs()/outputs()
+ * @param allow_partial_defined Whether to allow a partially defined layout. When set true, DLA tolerates
+ * dumping custom layouts e.g., when the rank of shape is 3. The layout will have ? in
+ * all dimensions. e.g., [???].
+ * This param should ONLY be used when dumping graph output of irregular layout.
+ * @return OpenVino's tensor layout object.
+ */
+ov::Layout GetTensorLayout(const ov::Output<ov::Node>& node, const bool allow_partial_defined = false);
diff --git a/python/openvino/runtime/fpga_jtag_reprogram/CMakeLists.txt b/python/openvino/runtime/fpga_jtag_reprogram/CMakeLists.txt
new file mode 100644
index 0000000..6d7b051
--- /dev/null
+++ b/python/openvino/runtime/fpga_jtag_reprogram/CMakeLists.txt
@@ -0,0 +1,18 @@
cmake_minimum_required(VERSION 3.10)

set(TARGET_NAME fpga_jtag_reprogram)

# MSVC builds this tool as C++14; all other compilers as C++11.
if (CMAKE_CXX_COMPILER_ID MATCHES MSVC)
  set(CMAKE_CXX_STANDARD 14)
else()
  set(CMAKE_CXX_STANDARD 11)
endif()

file(GLOB SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp)
add_executable(${TARGET_NAME} ${SOURCES})

if (NOT WIN32)
  # Fix: -pthread/-lrt were previously appended to CMAKE_CXX_FLAGS, which puts a
  # linker library on the compile line. -pthread is a compile-and-link option;
  # rt is a link library, so attach both to the target directly.
  target_compile_options(${TARGET_NAME} PRIVATE -pthread)
  target_link_libraries(${TARGET_NAME} pthread rt)
endif()
target_link_libraries(${TARGET_NAME} ${CMAKE_DL_LIBS} de10_agilex_mmd)

install(TARGETS ${TARGET_NAME} RUNTIME DESTINATION "dla/bin" COMPONENT COREDLA)
diff --git a/python/openvino/runtime/fpga_jtag_reprogram/main.cpp b/python/openvino/runtime/fpga_jtag_reprogram/main.cpp
new file mode 100644
index 0000000..c80d38a
--- /dev/null
+++ b/python/openvino/runtime/fpga_jtag_reprogram/main.cpp
@@ -0,0 +1,101 @@
+// Copyright 2021-2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+// The purpose of this utility is to full-chip program an FPGA using JTAG.
+// Avoid calling quartus_pgm directly since programming an FPGA means the PCIe
+// device disappears from the CPU's perspective, first need to mask the surprise
+// down error. The newly added function aocl_mmd_program_sof is basically the
+// same as the existing one from the mmd but without trying to handshake with
+// some registers that no longer exist in the CoreDLA version of the DE10
+// Agilex BSP.
+
+#include <stdlib.h>
+#include <fstream> // ifstream
+#include <iostream>
+#include <stdexcept> // std::runtime_error
+#include <string> // std::string
+#include "aocl_mmd.h"
+
+// helper functions
// Returns true when `filename` can be opened for reading.
bool file_exists(std::string filename) {
  std::ifstream probe{filename};
  return probe.good();
}
+
// Returns true when `str` ends with `suffix` (true for equal strings and an
// empty suffix, false when the suffix is longer than the string).
bool string_ends_with(std::string str, std::string suffix) {
  if (suffix.size() > str.size()) {
    return false;
  }
  return std::equal(suffix.rbegin(), suffix.rend(), str.rbegin());
}
+
+// wrapper around mmd functions to reprogram the fpga
+// Throws std::runtime_error if the board cannot be queried, opened, or
+// programmed; on success returns the status of aocl_mmd_close().
+int reprogram(std::string sof_filename) {
+  // open the mmd
+  constexpr size_t MAX_BOARD_NAMES_LEN = 4096;
+  char name[MAX_BOARD_NAMES_LEN];
+  size_t sz;
+  int status = aocl_mmd_get_offline_info(AOCL_MMD_BOARD_NAMES, MAX_BOARD_NAMES_LEN, name, &sz);
+  if (status) {
+    std::string msg = "Failed to query a board name from MMD. Perhaps no FPGA device is available?";
+    throw std::runtime_error(msg);
+  }
+  // NOTE(review): AOCL_MMD_BOARD_NAMES may report more than one board; this
+  // passes the buffer straight to open — confirm the single-device assumption.
+  int handle = aocl_mmd_open(name);
+  if (handle < 0) {
+    std::string msg = "Failed to open MMD";
+    throw std::runtime_error(msg);
+  }
+
+  // When COREDLA_JTAG_PID is set in the environment, ask the MMD to skip its
+  // save/restore step during programming (presumably handled externally in
+  // that flow) — TODO confirm intended semantics.
+  char *COREDLA_JTAG_PID = getenv("COREDLA_JTAG_PID");
+  bool skipSaveRestore = false;
+  if (COREDLA_JTAG_PID) {
+    skipSaveRestore = true;
+  }
+
+  // reprogram the fpga using a sof file
+  // BEWARE this invalidates the handle from the MMD
+  status = aocl_mmd_program_sof(handle, sof_filename.c_str(), skipSaveRestore);
+  if (status) {
+    std::string msg = "Failed to reprogram the FPGA";
+    throw std::runtime_error(msg);
+  }
+
+  return aocl_mmd_close(handle);
+}
+
+// Entry point: validates the single .sof path argument, then performs a
+// full-chip JTAG reprogram via reprogram(). Returns 0 on success, -1 on any
+// validation or MMD error (message printed to stderr).
+int main(int argc, char **argv) {
+  try {
+    // use the first command line arg as the sof filename
+    if (argc != 2) {
+      std::string msg = "usage: fpga_jtag_reprogram </path/to/sof/filename.sof>";
+      throw std::runtime_error(msg);
+    }
+
+    // check that file exists
+    std::string sof_filename = argv[1];
+    if (!file_exists(sof_filename)) {
+      std::string msg = "Error: cannot open file " + sof_filename;
+      throw std::runtime_error(msg);
+    }
+
+    // check that file name ends in .sof
+    if (!string_ends_with(sof_filename, ".sof")) {
+      std::string msg = "Error: file name does not end with .sof";
+      throw std::runtime_error(msg);
+    }
+
+    // reprogram the fpga using jtag
+    int exitcode = reprogram(argv[1]);
+    return exitcode;
+  } catch (std::runtime_error &e) {
+    // All failure paths above funnel through here.
+    std::cerr << e.what() << std::endl;
+    return -1;
+  }
+}
diff --git a/python/openvino/runtime/object_detection_demo/CMakeLists.txt b/python/openvino/runtime/object_detection_demo/CMakeLists.txt
new file mode 100644
index 0000000..f88d1d7
--- /dev/null
+++ b/python/openvino/runtime/object_detection_demo/CMakeLists.txt
@@ -0,0 +1,60 @@
+# Copyright (C) 2018-2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# Build configuration for the CoreDLA object detection demo executable.
+set (CMAKE_CXX_STANDARD 11)
+set (CMAKE_CXX_STANDARD_REQUIRED ON)
+# NOTE(review): the explicit -std=c++11 below duplicates CMAKE_CXX_STANDARD
+# above; harmless but redundant.
+if (NOT("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel"))
+    set (CMAKE_CXX_FLAGS "-std=c++11 ${CMAKE_CXX_FLAGS}")
+endif()
+
+set (TARGET_NAME "object_detection_demo")
+
+file (GLOB MAIN_SRC
+    ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp
+)
+
+file (GLOB MAIN_HEADERS
+    # Add headers if any
+)
+
+source_group("src" FILES ${MAIN_SRC})
+source_group("include" FILES ${MAIN_HEADERS})
+
+# Find OpenCV components if exist
+find_package(OpenCV COMPONENTS core highgui imgcodecs imgproc videoio REQUIRED)
+
+# Create the demo executable from sources.
+add_executable(${TARGET_NAME} ${MAIN_SRC} ${MAIN_HEADERS})
+
+# Demo utility/monitor headers come from the CoreDLA runtime tree.
+target_include_directories(${TARGET_NAME} PRIVATE
+    "$ENV{COREDLA_ROOT}/runtime/common/demo_utils/include/utils"
+    "$ENV{COREDLA_ROOT}/runtime/common/monitors/include"
+)
+
+if (NOT WIN32)
+    set (LIB_DL dl)
+endif()
+
+target_link_libraries(${TARGET_NAME}
+    openvino::runtime
+    openvino_dev_api
+    ie_samples_utils
+    ${OpenCV_LIBRARIES}
+    models
+    monitors
+    pipelines
+    utils
+    coreDLAHeteroPlugin
+)
+
+if(NOT WIN32)
+    target_link_libraries(${TARGET_NAME} ${LIB_DL} pthread)
+endif()
+
+set_target_properties(${TARGET_NAME} PROPERTIES INSTALL_RPATH "\$ORIGIN/../lib")
+
+# For libcoreDlaRuntimePlugin.so - typically specified by $COREDLA_ROOT/runtime/plugins.xml
+set_target_properties(${TARGET_NAME} PROPERTIES BUILD_RPATH "\$ORIGIN/..")
+
+# Installed into both the shipped and not_shipped layouts.
+install(TARGETS ${TARGET_NAME} RUNTIME DESTINATION "dla/bin" COMPONENT DEMO)
+install(TARGETS ${TARGET_NAME} RUNTIME DESTINATION "dla/not_shipped/bin" COMPONENT NOT_SHIPPED)
diff --git a/python/openvino/runtime/object_detection_demo/CMakeLists.txt.orig b/python/openvino/runtime/object_detection_demo/CMakeLists.txt.orig
new file mode 100755
index 0000000..5fba764
--- /dev/null
+++ b/python/openvino/runtime/object_detection_demo/CMakeLists.txt.orig
@@ -0,0 +1,11 @@
+# Copyright (C) 2018-2019 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+
+file(GLOB SRC_FILES ./*.cpp)
+file(GLOB H_FILES ./*.h)
+
+add_demo(NAME object_detection_demo
+ SOURCES ${SRC_FILES}
+ HEADERS ${H_FILES}
+ DEPENDENCIES monitors models pipelines)
diff --git a/python/openvino/runtime/object_detection_demo/README.md b/python/openvino/runtime/object_detection_demo/README.md
new file mode 100644
index 0000000..7e7a90d
--- /dev/null
+++ b/python/openvino/runtime/object_detection_demo/README.md
@@ -0,0 +1,15 @@
+# Object Detection YOLO* V3 C++ Demo, Async API Performance Showcase
+
+### Running with CoreDLA
+In addition to the options described below, include the arguments:
+
+- `-plugins=<path to the plugins.xml>`, using the path to [plugins.xml](../plugins.xml)
+- `-d HETERO:FPGA,CPU`
+- `-arch_file <path to arch file>`, using the path to the architecture used when creating the FPGA bitstream
+
+Use the -build_demo option of the runtime/build_runtime.sh script to build the demos.
+
+See the documentation that is included with the example design.
+
+For detailed information on the OpenVINO C++ Object Detection Demo, please see the [README](https://github.com/openvinotoolkit/open_model_zoo/blob/2023.3.0/demos/object_detection_demo/cpp/README.md) in the OpenVINO repository. Make sure to match the git tag with your installed version of OpenVINO for compatibility.
+
diff --git a/python/openvino/runtime/object_detection_demo/main.cpp b/python/openvino/runtime/object_detection_demo/main.cpp
new file mode 100644
index 0000000..f787504
--- /dev/null
+++ b/python/openvino/runtime/object_detection_demo/main.cpp
@@ -0,0 +1,598 @@
+/*
+// Copyright (C) 2018-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include <stddef.h>
+
+#include <algorithm>
+#include <chrono>
+#include <cmath>
+#include <cstdint>
+#include <exception>
+#include <iomanip>
+#include <iostream>
+#include <iterator>
+#include <limits>
+#include <memory>
+#include <random>
+#include <stdexcept>
+#include <string>
+#include <typeinfo>
+#include <utility>
+#include <vector>
+
+#include <gflags/gflags.h>
+#include <opencv2/core.hpp>
+#include <opencv2/highgui.hpp>
+#include <opencv2/imgproc.hpp>
+#include <openvino/openvino.hpp>
+#include <sys/stat.h>
+
+#include <models/detection_model.h>
+#include <models/detection_model_centernet.h>
+#include <models/detection_model_faceboxes.h>
+#include <models/detection_model_retinaface.h>
+#include <models/detection_model_retinaface_pt.h>
+#include <models/detection_model_ssd.h>
+#include <models/detection_model_yolo.h>
+#include <models/detection_model_yolov3_onnx.h>
+#include <models/detection_model_yolox.h>
+#include <models/input_data.h>
+#include <models/model_base.h>
+#include <models/results.h>
+#include <monitors/presenter.h>
+#include <pipelines/async_pipeline.h>
+#include <pipelines/metadata.h>
+#include <utils/args_helper.hpp>
+#include <utils/common.hpp>
+#include <utils/config_factory.h>
+#include <utils/default_flags.hpp>
+#include <utils/images_capture.h>
+#include <utils/ocv_common.hpp>
+#include <utils/performance_metrics.hpp>
+#include <utils/slog.hpp>
+
+DEFINE_INPUT_FLAGS
+DEFINE_OUTPUT_FLAGS
+
+static const char help_message[] = "Print a usage message.";
+static const char at_message[] =
+ "Required. Architecture type: centernet, faceboxes, retinaface, retinaface-pytorch, ssd, yolo, yolov3-onnx or yolox";
+static const char model_message[] = "Required. Path to an .xml file with a trained model.";
+static const char target_device_message[] =
+ "Optional. Specify the target device to infer on (the list of available devices is shown below). "
+ "Default value is CPU. Use \"-d HETERO:<comma-separated_devices_list>\" format to specify HETERO plugin. "
+ "The demo will look for a suitable plugin for a specified device.";
+static const char labels_message[] = "Optional. Path to a file with labels mapping.";
+static const char layout_message[] = "Optional. Specify inputs layouts."
+ " Ex. NCHW or input0:NCHW,input1:NC in case of more than one input.";
+static const char thresh_output_message[] = "Optional. Probability threshold for detections.";
+static const char raw_output_message[] = "Optional. Inference results as raw values.";
+static const char input_resizable_message[] =
+ "Optional. Enables resizable input with support of ROI crop & auto resize.";
+static const char nireq_message[] = "Optional. Number of infer requests. If this option is omitted, number of infer "
+ "requests is determined automatically.";
+static const char num_threads_message[] = "Optional. Number of threads.";
+static const char num_streams_message[] = "Optional. Number of streams to use for inference on the CPU or/and GPU in "
+ "throughput mode (for HETERO and MULTI device cases use format "
+ "<device1>:<nstreams1>,<device2>:<nstreams2> or just <nstreams>)";
+static const char no_show_message[] = "Optional. Don't show output.";
+static const char utilization_monitors_message[] = "Optional. List of monitors to show initially.";
+static const char iou_thresh_output_message[] =
+ "Optional. Filtering intersection over union threshold for overlapping boxes.";
+static const char yolo_af_message[] = "Optional. Use advanced postprocessing/filtering algorithm for YOLO.";
+static const char output_resolution_message[] =
+ "Optional. Specify the maximum output window resolution "
+ "in (width x height) format. Example: 1280x720.";
+static const char input_resolution_message[] =
+ "Optional. Specify the maximum input video capturing resolution "
+ "in (width x height) format. Example: 640x640. The input frame size used by default is 1280x720.";
+static const char anchors_message[] = "Optional. A comma separated list of anchors. "
+ "By default used default anchors for model. Only for YOLOV4 architecture type.";
+static const char masks_message[] = "Optional. A comma separated list of mask for anchors. "
+ "By default used default masks for model. Only for YOLOV4 architecture type.";
+static const char reverse_input_channels_message[] = "Optional. Switch the input channels order from BGR to RGB.";
+static const char mean_values_message[] =
+ "Optional. Normalize input by subtracting the mean values per channel. Example: \"255.0 255.0 255.0\"";
+static const char scale_values_message[] = "Optional. Divide input by scale values per channel. Division is applied "
+ "after mean values subtraction. Example: \"255.0 255.0 255.0\"";
+
+// @brief message for performance counters option
+static const char plugins_message[] = "Optional. Select a custom plugins_xml file to use.";
+// @brief message for architecture .arch file
+static const char arch_file_message[] = "Optional. Provide a path for the architecture .arch file.";
+
+DEFINE_bool(h, false, help_message);
+DEFINE_string(at, "", at_message);
+DEFINE_string(m, "", model_message);
+DEFINE_string(d, "CPU", target_device_message);
+DEFINE_string(labels, "", labels_message);
+DEFINE_string(layout, "", layout_message);
+DEFINE_bool(r, false, raw_output_message);
+DEFINE_double(t, 0.5, thresh_output_message);
+DEFINE_double(iou_t, 0.5, iou_thresh_output_message);
+DEFINE_bool(auto_resize, false, input_resizable_message);
+DEFINE_int32(nireq, 0, nireq_message);
+DEFINE_int32(nthreads, 0, num_threads_message);
+DEFINE_string(nstreams, "", num_streams_message);
+DEFINE_bool(no_show, false, no_show_message);
+DEFINE_string(u, "", utilization_monitors_message);
+DEFINE_bool(yolo_af, true, yolo_af_message);
+DEFINE_string(input_resolution, "", input_resolution_message);
+DEFINE_string(output_resolution, "", output_resolution_message);
+DEFINE_string(anchors, "", anchors_message);
+DEFINE_string(masks, "", masks_message);
+DEFINE_bool(reverse_input_channels, false, reverse_input_channels_message);
+DEFINE_string(mean_values, "", mean_values_message);
+DEFINE_string(scale_values, "", scale_values_message);
+
+/// @brief Path to a plugins_xml file
+DEFINE_string(plugins, "", plugins_message);
+/// @brief Path to arch file
+DEFINE_string(arch_file, "", arch_file_message);
+
+/**
+ * \brief This function shows a help message
+ */
+static void showUsage() {
+ std::cout << std::endl;
+ std::cout << "object_detection_demo [OPTION]" << std::endl;
+ std::cout << "Options:" << std::endl;
+ std::cout << std::endl;
+ std::cout << " -h " << help_message << std::endl;
+ std::cout << " -at \"<type>\" " << at_message << std::endl;
+ std::cout << " -i " << input_message << std::endl;
+ std::cout << " -m \"<path>\" " << model_message << std::endl;
+ std::cout << " -o \"<path>\" " << output_message << std::endl;
+ std::cout << " -limit \"<num>\" " << limit_message << std::endl;
+ std::cout << " -d \"<device>\" " << target_device_message << std::endl;
+ std::cout << " -labels \"<path>\" " << labels_message << std::endl;
+ std::cout << " -layout \"<string>\" " << layout_message << std::endl;
+ std::cout << " -r " << raw_output_message << std::endl;
+ std::cout << " -t " << thresh_output_message << std::endl;
+ std::cout << " -iou_t " << iou_thresh_output_message << std::endl;
+ std::cout << " -auto_resize " << input_resizable_message << std::endl;
+ std::cout << " -nireq \"<integer>\" " << nireq_message << std::endl;
+ std::cout << " -nthreads \"<integer>\" " << num_threads_message << std::endl;
+ std::cout << " -nstreams " << num_streams_message << std::endl;
+ std::cout << " -loop " << loop_message << std::endl;
+ std::cout << " -no_show " << no_show_message << std::endl;
+ std::cout << " -input_resolution " << input_resolution_message << std::endl;
+ std::cout << " -output_resolution " << output_resolution_message << std::endl;
+ std::cout << " -u " << utilization_monitors_message << std::endl;
+ std::cout << " -yolo_af " << yolo_af_message << std::endl;
+ std::cout << " -anchors " << anchors_message << std::endl;
+ std::cout << " -masks " << masks_message << std::endl;
+ std::cout << " -reverse_input_channels " << reverse_input_channels_message << std::endl;
+ std::cout << " -mean_values " << mean_values_message << std::endl;
+ std::cout << " -scale_values " << scale_values_message << std::endl;
+}
+
+// Generates `n` visually distinct colors by repeatedly picking, from a batch of
+// random HSV candidates, the one farthest from its nearest already-chosen color
+// (a farthest-point/max-min selection), then converting to RGB via OpenCV.
+class ColorPalette {
+private:
+    std::vector<cv::Scalar> palette;
+
+    // Uniform random double in [a, b]; the static engine is default-seeded, so
+    // the generated palette is deterministic across runs.
+    static double getRandom(double a = 0.0, double b = 1.0) {
+        static std::default_random_engine e;
+        std::uniform_real_distribution<> dis(a, std::nextafter(b, std::numeric_limits<double>::max()));
+        return dis(e);
+    }
+
+    // Squared distance in HSV space; the hue term wraps around (circular hue)
+    // and is weighted by the factor of 2.
+    static double distance(const cv::Scalar& c1, const cv::Scalar& c2) {
+        auto dh = std::fmin(std::fabs(c1[0] - c2[0]), 1 - fabs(c1[0] - c2[0])) * 2;
+        auto ds = std::fabs(c1[1] - c2[1]);
+        auto dv = std::fabs(c1[2] - c2[2]);
+
+        return dh * dh + ds * ds + dv * dv;
+    }
+
+    // Returns the candidate whose distance to its nearest member of colorSet is
+    // largest (max-min criterion).
+    static cv::Scalar maxMinDistance(const std::vector<cv::Scalar>& colorSet,
+                                     const std::vector<cv::Scalar>& colorCandidates) {
+        std::vector<double> distances;
+        distances.reserve(colorCandidates.size());
+        for (auto& c1 : colorCandidates) {
+            auto min =
+                *std::min_element(colorSet.begin(), colorSet.end(), [&c1](const cv::Scalar& a, const cv::Scalar& b) {
+                    return distance(c1, a) < distance(c1, b);
+                });
+            distances.push_back(distance(c1, min));
+        }
+        auto max = std::max_element(distances.begin(), distances.end());
+        return colorCandidates[std::distance(distances.begin(), max)];
+    }
+
+    // Converts a single HSV scalar (OpenCV 8-bit ranges) to an RGB scalar.
+    static cv::Scalar hsv2rgb(const cv::Scalar& hsvColor) {
+        cv::Mat rgb;
+        cv::Mat hsv(1, 1, CV_8UC3, hsvColor);
+        cv::cvtColor(hsv, rgb, cv::COLOR_HSV2RGB);
+        return cv::Scalar(rgb.data[0], rgb.data[1], rgb.data[2]);
+    }
+
+public:
+    // Builds a palette of `n` colors; the first color is fixed at HSV(1,1,1),
+    // the rest are chosen greedily from 100 random candidates per step.
+    explicit ColorPalette(size_t n) {
+        palette.reserve(n);
+        std::vector<cv::Scalar> hsvColors(1, {1., 1., 1.});
+        std::vector<cv::Scalar> colorCandidates;
+        size_t numCandidates = 100;
+
+        hsvColors.reserve(n);
+        colorCandidates.resize(numCandidates);
+        for (size_t i = 1; i < n; ++i) {
+            std::generate(colorCandidates.begin(), colorCandidates.end(), []() {
+                return cv::Scalar{getRandom(), getRandom(0.8, 1.0), getRandom(0.5, 1.0)};
+            });
+            hsvColors.push_back(maxMinDistance(hsvColors, colorCandidates));
+        }
+
+        for (auto& hsv : hsvColors) {
+            // Convert to OpenCV HSV format
+            hsv[0] *= 179;
+            hsv[1] *= 255;
+            hsv[2] *= 255;
+
+            palette.push_back(hsv2rgb(hsv));
+        }
+    }
+
+    // Wraps the index so any label ID maps onto the palette.
+    const cv::Scalar& operator[](size_t index) const {
+        return palette[index % palette.size()];
+    }
+};
+
// Returns true when a filesystem entry named `name` exists (file or directory).
bool exists_test (const std::string& name) {
    struct stat info;
    return stat(name.c_str(), &info) == 0;
}
+
/**
 * @brief Validates a resolution string of the form "<width>x<height>".
 *
 * Bug fix: the original ignored its `resolution` parameter and always
 * validated FLAGS_input_resolution, so the caller's -output_resolution
 * value was never actually checked.
 *
 * @param resolution Candidate resolution string, e.g. "1280x720".
 * @return true when the string parses as two positive integers separated by 'x'.
 */
bool is_valid_resolution(const std::string& resolution) {
    const size_t pos = resolution.find('x');
    if (pos == std::string::npos) {
        return false;
    }
    try {
        const int width = std::stoi(resolution.substr(0, pos));
        const int height = std::stoi(resolution.substr(pos + 1));
        return width > 0 && height > 0;
    } catch (...) {
        // std::stoi throws on empty/non-numeric/out-of-range components.
        return false;
    }
}
+
+bool ParseAndCheckCommandLine(int argc, char* argv[]) {  // returns false when only help was requested; throws on invalid args
+    // ---------------------------Parsing and validation of input args--------------------------------------
+    gflags::ParseCommandLineNonHelpFlags(&argc, &argv, true);
+    if (FLAGS_h) {
+        showUsage();
+        // showAvailableDevices();
+        return false;
+    }
+
+    if (FLAGS_i.empty()) {
+        throw std::logic_error("Parameter -i is not set");
+    }
+
+    if (FLAGS_m.empty()) {
+        throw std::logic_error("Parameter -m is not set");
+    }
+
+    if (FLAGS_at.empty()) {
+        throw std::logic_error("Parameter -at is not set");
+    }
+
+    if (!FLAGS_input_resolution.empty() && !is_valid_resolution(FLAGS_input_resolution)) {
+        throw std::logic_error("Correct format of -input_resolution parameter is \"width\"x\"height\".");
+    }
+
+    if (!FLAGS_output_resolution.empty() && !is_valid_resolution(FLAGS_output_resolution)) {  // fixed: was re-checking FLAGS_input_resolution
+        throw std::logic_error("Correct format of -output_resolution parameter is \"width\"x\"height\".");
+    }
+
+    if(!FLAGS_plugins.empty()) {
+        std::cout << "Using custom plugins xml file - " << FLAGS_plugins << std::endl;
+    }
+
+    if (!exists_test(FLAGS_plugins)) {  // NOTE(review): also fires for an empty -plugins value — confirm the flag is meant to be mandatory
+        std::cout << "Error: plugins_xml file: " << FLAGS_plugins << " doesn't exist. Please provide a valid path." << std::endl;
+        throw std::logic_error("plugins_xml file path does not exist.");
+    }
+    return true;
+}
+
+// Input image is stored inside metadata, as we put it there during submission stage
+cv::Mat renderDetectionData(DetectionResult& result, const ColorPalette& palette, OutputTransform& outputTransform) {  // draws boxes/labels on the frame stored in result's metadata and returns it
+    if (!result.metaData) {
+        throw std::invalid_argument("Renderer: metadata is null");
+    }
+
+    auto outputImg = result.metaData->asRef<ImageMetaData>().img;  // frame that was stored at submission time
+
+    if (outputImg.empty()) {
+        throw std::invalid_argument("Renderer: image provided in metadata is empty");
+    }
+    outputTransform.resize(outputImg);
+    // Visualizing result data over source image
+    if (FLAGS_r) {  // -r: also dump raw detections to the debug log
+        slog::debug << " -------------------- Frame # " << result.frameId << "--------------------" << slog::endl;
+        slog::debug << " Class ID | Confidence | XMIN | YMIN | XMAX | YMAX " << slog::endl;
+    }
+
+    for (auto& obj : result.objects) {
+        if (FLAGS_r) {
+            slog::debug << " " << std::left << std::setw(9) << obj.label << " | " << std::setw(10) << obj.confidence
+                        << " | " << std::setw(4) << int(obj.x) << " | " << std::setw(4) << int(obj.y) << " | "
+                        << std::setw(4) << int(obj.x + obj.width) << " | " << std::setw(4) << int(obj.y + obj.height)
+                        << slog::endl;
+        }
+        outputTransform.scaleRect(obj);  // map the box into output-resolution coordinates before drawing
+        std::ostringstream conf;
+        conf << ":" << std::fixed << std::setprecision(1) << obj.confidence * 100 << '%';
+        const auto& color = palette[obj.labelID];  // per-class color (palette wraps via modulo)
+        putHighlightedText(outputImg,
+                           obj.label + conf.str(),
+                           cv::Point2f(obj.x, obj.y - 5),
+                           cv::FONT_HERSHEY_COMPLEX_SMALL,
+                           1,
+                           color,
+                           2);
+        cv::rectangle(outputImg, obj, color, 2);
+    }
+
+    try {
+        for (auto& lmark : result.asRef<RetinaFaceDetectionResult>().landmarks) {  // only RetinaFace results carry landmarks
+            outputTransform.scaleCoord(lmark);
+            cv::circle(outputImg, lmark, 2, cv::Scalar(0, 255, 255), -1);
+        }
+    } catch (const std::bad_cast&) {}  // not a RetinaFace result — no landmarks to draw
+
+    return outputImg;
+}
+
+int main(int argc, char* argv[]) {
+    try {
+        PerformanceMetrics metrics;  // end-to-end metrics, timed from frame capture to render
+
+        // ------------------------------ Parsing and validation of input args ---------------------------------
+        if (!ParseAndCheckCommandLine(argc, argv)) {
+            return 0;  // -h was given; usage already printed
+        }
+
+        const auto& strAnchors = split(FLAGS_anchors, ',');
+        const auto& strMasks = split(FLAGS_masks, ',');
+
+        std::vector<float> anchors;
+        std::vector<int64_t> masks;
+        try {
+            for (auto& str : strAnchors) {
+                anchors.push_back(std::stof(str));
+            }
+        } catch (...) { throw std::runtime_error("Invalid anchors list is provided."); }
+
+        try {
+            for (auto& str : strMasks) {
+                masks.push_back(std::stoll(str));
+            }
+        } catch (...) { throw std::runtime_error("Invalid masks list is provided."); }
+
+        //------------------------------- Preparing Input ------------------------------------------------------
+        cv::Size inputResolution;
+        if (!FLAGS_input_resolution.empty()) {
+            size_t pos = FLAGS_input_resolution.find("x");  // format already validated during argument parsing
+            inputResolution = cv::Size{
+                std::stoi(FLAGS_input_resolution.substr(0, pos)),
+                std::stoi(FLAGS_input_resolution.substr(pos + 1, FLAGS_input_resolution.length()))};
+            slog::info << "Using custom input resolution of " << FLAGS_input_resolution << slog::endl;
+        } else {
+            inputResolution = cv::Size{1280, 720};
+            slog::info << "Using default input resolution of 1280x720." << slog::endl;
+        }
+
+        auto cap = openImagesCapture(FLAGS_i,
+                                     FLAGS_loop,
+                                     FLAGS_nireq == 1 ? read_type::efficient : read_type::safe,  // single request can reuse the frame buffer
+                                     0,
+                                     std::numeric_limits<size_t>::max(),
+                                     inputResolution);
+
+        cv::Mat curr_frame;
+
+        //------------------------------ Running Detection routines ----------------------------------------------
+        std::vector<std::string> labels;
+        if (!FLAGS_labels.empty())
+            labels = DetectionModel::loadLabels(FLAGS_labels);
+        ColorPalette palette(labels.size() > 0 ? labels.size() : 100);  // fall back to 100 colors when no labels file is given
+
+        std::unique_ptr<ModelBase> model;  // concrete wrapper chosen by the -at architecture type
+        if (FLAGS_at == "centernet") {
+            model.reset(new ModelCenterNet(FLAGS_m, static_cast<float>(FLAGS_t), labels, FLAGS_layout));
+        } else if (FLAGS_at == "faceboxes") {
+            model.reset(new ModelFaceBoxes(FLAGS_m,
+                                           static_cast<float>(FLAGS_t),
+                                           FLAGS_auto_resize,
+                                           static_cast<float>(FLAGS_iou_t),
+                                           FLAGS_layout));
+        } else if (FLAGS_at == "retinaface") {
+            model.reset(new ModelRetinaFace(FLAGS_m,
+                                            static_cast<float>(FLAGS_t),
+                                            FLAGS_auto_resize,
+                                            static_cast<float>(FLAGS_iou_t),
+                                            FLAGS_layout));
+        } else if (FLAGS_at == "retinaface-pytorch") {
+            model.reset(new ModelRetinaFacePT(FLAGS_m,
+                                              static_cast<float>(FLAGS_t),
+                                              FLAGS_auto_resize,
+                                              static_cast<float>(FLAGS_iou_t),
+                                              FLAGS_layout));
+        } else if (FLAGS_at == "ssd") {
+            model.reset(new ModelSSD(FLAGS_m, static_cast<float>(FLAGS_t), FLAGS_auto_resize, labels, FLAGS_layout));
+        } else if (FLAGS_at == "yolo") {
+            model.reset(new ModelYolo(FLAGS_m,
+                                      static_cast<float>(FLAGS_t),
+                                      FLAGS_auto_resize,
+                                      FLAGS_yolo_af,
+                                      static_cast<float>(FLAGS_iou_t),
+                                      labels,
+                                      anchors,
+                                      masks,
+                                      FLAGS_layout));
+        } else if (FLAGS_at == "yolov3-onnx") {
+            model.reset(new ModelYoloV3ONNX(FLAGS_m,
+                                            static_cast<float>(FLAGS_t),
+                                            labels,
+                                            FLAGS_layout));
+        } else if (FLAGS_at == "yolox") {
+            model.reset(new ModelYoloX(FLAGS_m,
+                                       static_cast<float>(FLAGS_t),
+                                       static_cast<float>(FLAGS_iou_t),
+                                       labels,
+                                       FLAGS_layout));
+        } else {
+            slog::err << "No model type or invalid model type (-at) provided: " + FLAGS_at << slog::endl;
+            return -1;
+        }
+        model->setInputsPreprocessing(FLAGS_reverse_input_channels, FLAGS_mean_values, FLAGS_scale_values);
+        slog::info << ov::get_openvino_version() << slog::endl;
+
+        ov::Core core(FLAGS_plugins);  // Core built from the user-supplied plugins xml (path validated earlier)
+
+        AsyncPipeline pipeline(std::move(model),
+                               ConfigFactory::getUserConfig(FLAGS_d, FLAGS_nireq, FLAGS_nstreams, FLAGS_nthreads, FLAGS_arch_file),
+                               core);
+        Presenter presenter(FLAGS_u);
+
+        bool keepRunning = true;
+        int64_t frameNum = -1;  // id of the most recently submitted frame; -1 until the first submission
+        std::unique_ptr<ResultBase> result;
+        uint32_t framesProcessed = 0;
+
+        LazyVideoWriter videoWriter{FLAGS_o, cap->fps(), static_cast<unsigned int>(FLAGS_limit)};
+
+        PerformanceMetrics renderMetrics;  // rendering-only latency, separate from end-to-end metrics
+
+        cv::Size outputResolution;
+        OutputTransform outputTransform = OutputTransform();
+        size_t found = FLAGS_output_resolution.find("x");  // npos means no custom output resolution was requested
+
+        while (keepRunning) {
+            if (pipeline.isReadyToProcess()) {
+                auto startTime = std::chrono::steady_clock::now();
+
+                //--- Capturing frame
+                curr_frame = cap->read();
+
+                if (curr_frame.empty()) {
+                    // Input stream is over
+                    break;
+                }
+
+                frameNum = pipeline.submitData(ImageInputData(curr_frame),
+                                               std::make_shared<ImageMetaData>(curr_frame, startTime));
+            }
+
+            if (frameNum == 0) {  // first submitted frame fixes the output resolution for the whole run
+                if (found == std::string::npos) {
+                    outputResolution = curr_frame.size();
+                } else {
+                    outputResolution = cv::Size{
+                        std::stoi(FLAGS_output_resolution.substr(0, found)),
+                        std::stoi(FLAGS_output_resolution.substr(found + 1, FLAGS_output_resolution.length()))};
+                    outputTransform = OutputTransform(curr_frame.size(), outputResolution);
+                    outputResolution = outputTransform.computeResolution();
+                }
+            }
+
+            //--- Waiting for free input slot or output data available. Function will return immediately if any of them
+            // are available.
+            pipeline.waitForData();
+
+            //--- Checking for results and rendering data if it's ready
+            //--- If you need just plain data without rendering - cast result's underlying pointer to DetectionResult*
+            // and use your own processing instead of calling renderDetectionData().
+            while (keepRunning && (result = pipeline.getResult())) {
+                auto renderingStart = std::chrono::steady_clock::now();
+                cv::Mat outFrame = renderDetectionData(result->asRef<DetectionResult>(), palette, outputTransform);
+
+                //--- Showing results and device information
+                presenter.drawGraphs(outFrame);
+                renderMetrics.update(renderingStart);
+                metrics.update(result->metaData->asRef<ImageMetaData>().timeStamp,
+                               outFrame,
+                               {10, 22},
+                               cv::FONT_HERSHEY_COMPLEX,
+                               0.65);
+
+                videoWriter.write(outFrame);
+                framesProcessed++;
+
+                if (!FLAGS_no_show) {
+                    cv::imshow("Detection Results", outFrame);
+                    //--- Processing keyboard events
+                    int key = cv::waitKey(1);
+                    if (27 == key || 'q' == key || 'Q' == key) {  // Esc
+                        keepRunning = false;
+                    } else {
+                        presenter.handleKey(key);
+                    }
+                }
+            }
+        }  // while(keepRunning)
+
+        // ------------ Waiting for completion of data processing and rendering the rest of results ---------
+        pipeline.waitForTotalCompletion();
+
+        for (; framesProcessed <= frameNum; framesProcessed++) {  // drain results that were still in flight
+            result = pipeline.getResult();
+            if (result != nullptr) {
+                auto renderingStart = std::chrono::steady_clock::now();
+                cv::Mat outFrame = renderDetectionData(result->asRef<DetectionResult>(), palette, outputTransform);
+                //--- Showing results and device information
+                presenter.drawGraphs(outFrame);
+                renderMetrics.update(renderingStart);
+                metrics.update(result->metaData->asRef<ImageMetaData>().timeStamp,
+                               outFrame,
+                               {10, 22},
+                               cv::FONT_HERSHEY_COMPLEX,
+                               0.65);
+                videoWriter.write(outFrame);
+                if (!FLAGS_no_show) {
+                    cv::imshow("Detection Results", outFrame);
+                    //--- Updating output window
+                    cv::waitKey(1);
+                }
+            }
+        }
+
+        slog::info << "Metrics report:" << slog::endl;
+        metrics.logTotal();
+        logLatencyPerStage(cap->getMetrics().getTotal().latency,
+                           pipeline.getPreprocessMetrics().getTotal().latency,
+                           pipeline.getInferenceMetircs().getTotal().latency,
+                           pipeline.getPostprocessMetrics().getTotal().latency,
+                           renderMetrics.getTotal().latency);
+        slog::info << presenter.reportMeans() << slog::endl;
+    } catch (const std::exception& error) {
+        slog::err << error.what() << slog::endl;
+        return 1;
+    } catch (...) {
+        slog::err << "Unknown/internal exception happened." << slog::endl;
+        return 1;
+    }
+
+    return 0;
+}
diff --git a/python/openvino/runtime/object_detection_demo/models.lst b/python/openvino/runtime/object_detection_demo/models.lst
new file mode 100644
index 0000000..4b2e8a8
--- /dev/null
+++ b/python/openvino/runtime/object_detection_demo/models.lst
@@ -0,0 +1,55 @@
+# This file can be used with the --list option of the model downloader.
+# For -at centernet
+ctdet_coco_dlav0_512
+# For -at faceboxes
+faceboxes-pytorch
+# For -at retinaface-pytorch
+retinaface-resnet50-pytorch
+# For -at ssd
+efficientdet-d0-tf
+efficientdet-d1-tf
+face-detection-????
+face-detection-adas-????
+face-detection-retail-????
+faster-rcnn-resnet101-coco-sparse-60-0001
+faster_rcnn_inception_resnet_v2_atrous_coco
+faster_rcnn_resnet50_coco
+pedestrian-and-vehicle-detector-adas-????
+pedestrian-detection-adas-????
+pelee-coco
+person-detection-????
+person-detection-retail-0013
+person-vehicle-bike-detection-????
+product-detection-0001
+rfcn-resnet101-coco-tf
+retinanet-tf
+ssd300
+ssd512
+ssd-resnet34-1200-onnx
+ssd_mobilenet_v1_coco
+ssd_mobilenet_v1_fpn_coco
+ssdlite_mobilenet_v2
+vehicle-detection-????
+vehicle-detection-adas-????
+vehicle-license-plate-detection-barrier-????
+# For -at yolo
+mobilenet-yolo-v4-syg
+person-vehicle-bike-detection-crossroad-yolov3-1020
+yolo-v1-tiny-tf
+yolo-v2-ava-0001
+yolo-v2-ava-sparse-??-0001
+yolo-v2-tiny-ava-0001
+yolo-v2-tiny-ava-sparse-??-0001
+yolo-v2-tiny-vehicle-detection-0001
+yolo-v2-tf
+yolo-v2-tiny-tf
+yolo-v3-tf
+yolo-v3-tiny-tf
+yolo-v4-tf
+yolo-v4-tiny-tf
+yolof
+# For -at yolov3-onnx
+yolo-v3-onnx
+yolo-v3-tiny-onnx
+# For -at yolox
+yolox-tiny
diff --git a/python/openvino/runtime/patches/computelibrary.patch b/python/openvino/runtime/patches/computelibrary.patch
new file mode 100644
index 0000000..1fc0dd3
--- /dev/null
+++ b/python/openvino/runtime/patches/computelibrary.patch
@@ -0,0 +1,47 @@
+diff --git a/SConstruct b/SConstruct
+index 68c518a4a0..6ecfb05672 100644
+--- a/SConstruct
++++ b/SConstruct
+@@ -109,7 +109,7 @@ vars.AddVariables(
+ BoolVariable("cppthreads", "Enable C++11 threads backend", True),
+ PathVariable("build_dir", "Specify sub-folder for the build", ".", PathVariable.PathAccept),
+ PathVariable("install_dir", "Specify sub-folder for the install", "", PathVariable.PathAccept),
+- BoolVariable("exceptions", "Enable/disable C++ exception support", True),
++ BoolVariable("exceptions", "Enable/disable C++ exception support", False),
+ BoolVariable("high_priority", "Generate a library containing only the high priority operators", False),
+ PathVariable("linker_script", "Use an external linker script", "", PathVariable.PathAccept),
+ PathVariable("external_tests_dir", """Add examples, benchmarks and tests to the tests suite from an external path. In order to use this option, the external tests directory must have the following structure:
+@@ -324,11 +324,14 @@ if env['multi_isa']:
+ else: # NONE "multi_isa" builds
+
+ if 'v7a' in env['arch']:
+- env.Append(CXXFLAGS = ['-march=armv7-a', '-mthumb', '-mfpu=neon'])
+- if (env['os'] == 'android' or env['os'] == 'tizen') and not 'hf' in env['arch']:
+- env.Append(CXXFLAGS = ['-mfloat-abi=softfp'])
++ if ('-march' in env['extra_cxx_flags']) or ('-mcpu' in env['extra_cxx_flags']):
++ print("INFO: Re-use march/mcpu settings")
+ else:
+- env.Append(CXXFLAGS = ['-mfloat-abi=hard'])
++ env.Append(CXXFLAGS = ['-march=armv7-a', '-mthumb', '-mfpu=neon'])
++ if env['os'] == 'android' or env['os'] == 'tizen':
++ env.Append(CXXFLAGS = ['-mfloat-abi=softfp'])
++ else:
++ env.Append(CXXFLAGS = ['-mfloat-abi=hard'])
+ elif 'v8.6-a' in env['arch']:
+ if 'armv8.6-a-sve2' in env['arch']:
+ env.Append(CXXFLAGS = ['-march=armv8.6-a+sve2'])
+@@ -649,7 +652,7 @@ if env['exceptions']:
+ if env['os'] == 'bare_metal' and env['arch'] == 'armv7a':
+ print("WARNING: Building tests for bare metal and armv7a is not supported")
+ Return()
+- SConscript('./tests/SConscript', variant_dir='%s/tests' % build_path, duplicate=0)
++ # SConscript('./tests/SConscript', variant_dir='%s/tests' % build_path, duplicate=0)
+
+ # Unknown variables are not allowed
+ # Note: we must delay the call of UnknownVariables until after
+@@ -657,4 +660,4 @@ if env['exceptions']:
+ unknown = vars.UnknownVariables()
+ if unknown:
+ print("Unknown variables: %s" % " ".join(unknown.keys()))
+- Exit(1)
++ # Exit(1)
diff --git a/python/openvino/runtime/patches/flags.patch b/python/openvino/runtime/patches/flags.patch
new file mode 100644
index 0000000..5c663aa
--- /dev/null
+++ b/python/openvino/runtime/patches/flags.patch
@@ -0,0 +1,76 @@
+diff --git a/cmake/developer_package/target_flags.cmake b/cmake/developer_package/target_flags.cmake
+index 29f23e713e..84d32e6633 100644
+--- a/cmake/developer_package/target_flags.cmake
++++ b/cmake/developer_package/target_flags.cmake
+@@ -113,36 +113,38 @@ endif()
+
+ get_property(OV_GENERATOR_MULTI_CONFIG GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
+
+-function(ov_glibc_version)
+- # cmake needs to look at glibc version only when we build for Linux on Linux
+- if(LINUX)
+- function(ov_get_definition definition var)
+- execute_process(COMMAND echo "#include <errno.h>"
+- COMMAND "${CMAKE_CXX_COMPILER}" -xc - -E -dM
+- COMMAND grep -E "^#define ${definition} "
+- OUTPUT_VARIABLE glibc_version_component
+- ERROR_VARIABLE error_message
+- RESULT_VARIABLE exit_code
+- OUTPUT_STRIP_TRAILING_WHITESPACE)
+-
+- if(NOT exit_code EQUAL 0)
+- message(FATAL_ERROR "Failed to detect glibc version: ${error_message}\n${glibc_version_component}")
+- endif()
+-
+- if(glibc_version_component MATCHES "^#define ${definition} ([0-9]+)")
+- set("${var}" "${CMAKE_MATCH_1}" PARENT_SCOPE)
+- else()
+- message(FATAL_ERROR "Internal error: failed to parse ${definition} from '${glibc_version_component}'")
+- endif()
+- endfunction()
+-
+- ov_get_definition("__GLIBC__" _ov_glibc_major)
+- ov_get_definition("__GLIBC_MINOR__" _ov_glibc_minor)
+-
+- set(OV_GLIBC_VERSION "${_ov_glibc_major}.${_ov_glibc_minor}" PARENT_SCOPE)
+- else()
+- set(OV_GLIBC_VERSION "0.0" PARENT_SCOPE)
+- endif()
+-endfunction()
+-
+-ov_glibc_version()
++if(FALSE)
++ function(ov_glibc_version)
++ # cmake needs to look at glibc version only when we build for Linux on Linux
++ if(LINUX)
++ function(ov_get_definition definition var)
++ execute_process(COMMAND echo "#include <errno.h>"
++ COMMAND "${CMAKE_CXX_COMPILER}" -xc - -E -dM
++ COMMAND grep -E "^#define ${definition} "
++ OUTPUT_VARIABLE glibc_version_component
++ ERROR_VARIABLE error_message
++ RESULT_VARIABLE exit_code
++ OUTPUT_STRIP_TRAILING_WHITESPACE)
++
++ if(NOT exit_code EQUAL 0)
++ message(FATAL_ERROR "Failed to detect glibc version: ${error_message}\n${glibc_version_component}")
++ endif()
++
++ if(glibc_version_component MATCHES "^#define ${definition} ([0-9]+)")
++ set("${var}" "${CMAKE_MATCH_1}" PARENT_SCOPE)
++ else()
++ message(FATAL_ERROR "Internal error: failed to parse ${definition} from '${glibc_version_component}'")
++ endif()
++ endfunction()
++
++ ov_get_definition("__GLIBC__" _ov_glibc_major)
++ ov_get_definition("__GLIBC_MINOR__" _ov_glibc_minor)
++
++ set(OV_GLIBC_VERSION "${_ov_glibc_major}.${_ov_glibc_minor}" PARENT_SCOPE)
++ else()
++ set(OV_GLIBC_VERSION "0.0" PARENT_SCOPE)
++ endif()
++ endfunction()
++
++ ov_glibc_version()
++endif()
diff --git a/python/openvino/runtime/patches/openvino_5cee8bbf29797f4544b343e803de957e9f041f92_gcc11.3.0.patch b/python/openvino/runtime/patches/openvino_5cee8bbf29797f4544b343e803de957e9f041f92_gcc11.3.0.patch
new file mode 100644
index 0000000..97fbc4d
--- /dev/null
+++ b/python/openvino/runtime/patches/openvino_5cee8bbf29797f4544b343e803de957e9f041f92_gcc11.3.0.patch
@@ -0,0 +1,37 @@
+diff --git a/src/core/src/type/bfloat16.cpp b/src/core/src/type/bfloat16.cpp
+index 6e612b0cfe..dee498d795 100644
+--- a/src/core/src/type/bfloat16.cpp
++++ b/src/core/src/type/bfloat16.cpp
+@@ -61,6 +61,23 @@ size_t bfloat16::size() const {
+ # pragma GCC diagnostic ignored "-Wuninitialized"
+ #endif
+
++#if 1
++// GCC 11 fails due to the reinterpret_cast violating alaising rules
++union bfloat16_uint32
++{
++ float f;
++ uint32_t v;
++};
++
++bfloat16::operator float() const
++{
++ uint32_t tmp = (static_cast<uint32_t>(m_value) << 16);
++ union bfloat16_uint32 fv;
++ fv.v = tmp;
++
++ return fv.f;
++}
++#else
+ bfloat16::operator float() const {
+ uint32_t tmp = 0;
+ uint32_t* ptmp = &tmp;
+@@ -68,7 +85,7 @@ bfloat16::operator float() const {
+ const float* f = reinterpret_cast<const float*>(ptmp);
+ return *f;
+ }
+-
++#endif
+ #if defined __GNUC__ && __GNUC__ == 11
+ # pragma GCC diagnostic pop
+ #endif
diff --git a/python/openvino/runtime/plugins.xml b/python/openvino/runtime/plugins.xml
new file mode 100644
index 0000000..c866727
--- /dev/null
+++ b/python/openvino/runtime/plugins.xml
@@ -0,0 +1,18 @@
+<ie>
+ <plugins>
+ <plugin name="GNA" location="libopenvino_intel_gna_plugin.so">
+ </plugin>
+ <plugin name="HETERO" location="libcoreDLAHeteroPlugin.so">
+ </plugin>
+ <plugin name="CPU" location="libopenvino_intel_cpu_plugin.so">
+ </plugin>
+ <plugin name="MULTI" location="libopenvino_auto_plugin.so">
+ </plugin>
+ <plugin name="GPU" location="libopenvino_intel_gpu_plugin.so">
+ </plugin>
+ <plugin name="MYRIAD" location="libopenvino_intel_myriad_plugin.so">
+ </plugin>
+ <plugin name="FPGA" location="libcoreDlaRuntimePlugin.so">
+ </plugin>
+ </plugins>
+</ie>
diff --git a/python/openvino/runtime/plugins_win.xml b/python/openvino/runtime/plugins_win.xml
new file mode 100755
index 0000000..f88c9e9
--- /dev/null
+++ b/python/openvino/runtime/plugins_win.xml
@@ -0,0 +1,22 @@
+<ie>
+ <plugins>
+ <plugin name="AUTO" location="openvino_auto_plugin.dll">
+ </plugin>
+ <plugin name="BATCH" location="openvino_auto_batch_plugin.dll">
+ </plugin>
+ <plugin name="CPU" location="openvino_intel_cpu_plugin.dll">
+ </plugin>
+ <plugin name="GNA" location="openvino_intel_gna_plugin.dll">
+ </plugin>
+ <plugin name="GPU" location="openvino_intel_gpu_plugin.dll">
+ </plugin>
+ <plugin name="HETERO" location="coreDLAHeteroPlugin.dll">
+ </plugin>
+ <plugin name="MULTI" location="openvino_auto_plugin.dll">
+ </plugin>
+ <plugin name="MYRIAD" location="openvino_intel_myriad_plugin.dll">
+ </plugin>
+ <plugin name="FPGA" location="coreDlaRuntimePlugin.dll">
+ </plugin>
+ </plugins>
+</ie>
diff --git a/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/README.md b/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/README.md
new file mode 100644
index 0000000..7613e82
--- /dev/null
+++ b/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/README.md
@@ -0,0 +1,6 @@
+### OpenVINO Benchmark Tool
+---
+
+For detailed information on the OpenVINO Benchmark Tool, please see the [README](https://github.com/openvinotoolkit/openvino/tree/2023.3.0/tools/benchmark_tool) in the OpenVINO repository. Make sure to match the git tag with your installed version of OpenVINO for compatibility.
+
+If you need examples of how to use the Benchmark Tool, check the [README](../README.md) in the parent directory for sample commands.
diff --git a/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/benchmark.patch b/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/benchmark.patch
new file mode 100644
index 0000000..6696804
--- /dev/null
+++ b/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/benchmark.patch
@@ -0,0 +1,78 @@
+--- /nfs/site/disks/swip_dla_1/resources/inference_engine/2023.3.0_with_dev_tools/1/linux64/suse12/python/openvino/tools/benchmark/benchmark.py 2024-03-01 14:01:50.443877000 -0500
++++ benchmark.py 2024-04-01 10:06:18.751566000 -0400
+@@ -1,14 +1,15 @@
+-# Copyright (C) 2018-2023 Intel Corporation
++# Copyright (C) 2018-2022 Intel Corporation
+ # SPDX-License-Identifier: Apache-2.0
+
+ import os
+ from datetime import datetime
+ from math import ceil
++import warnings
+ from openvino.runtime import Core, get_version, AsyncInferQueue
+
+-from .utils.constants import GPU_DEVICE_NAME, XML_EXTENSION, BIN_EXTENSION
+-from .utils.logging import logger
+-from .utils.utils import get_duration_seconds
++from openvino.tools.benchmark.utils.constants import GPU_DEVICE_NAME, XML_EXTENSION, BIN_EXTENSION
++from openvino.tools.benchmark.utils.logging import logger
++from openvino.tools.benchmark.utils.utils import get_duration_seconds
+
+ def percentile(values, percent):
+ return values[ceil(len(values) * percent / 100) - 1]
+@@ -17,7 +18,17 @@
+ def __init__(self, device: str, number_infer_requests: int = 0, number_iterations: int = None,
+ duration_seconds: int = None, api_type: str = 'async', inference_only = None):
+ self.device = device
+- self.core = Core()
++ dla_plugins = os.environ.get('DLA_PLUGINS', default='')
++ if dla_plugins == '':
++ # Backwards compatability for old DLA_PLUGINS_XML_FILE
++ warnings.warn("DLA_PLUGINS_XML_FILE option is deprecated as of 2024.1, Please use DLA_PLUGINS")
++ dla_plugins = os.environ.get('DLA_PLUGINS_XML_FILE', default='')
++ self.core = Core(dla_plugins)
++ if "FPGA" in self.device:
++ dla_arch_file = os.environ.get('DLA_ARCH_FILE')
++ if dla_arch_file is None:
++ raise Exception(f"To use FPGA, you need to specify the path to an arch_file!")
++ self.core.set_property(device_name="FPGA", properties={"ARCH_PATH": dla_arch_file})
+ self.nireq = number_infer_requests if api_type == 'async' else 1
+ self.niter = number_iterations
+ self.duration_seconds = get_duration_seconds(duration_seconds, self.niter, self.device)
+@@ -59,6 +70,9 @@
+ def set_cache_dir(self, cache_dir: str):
+ self.core.set_property({'CACHE_DIR': cache_dir})
+
++ def set_allow_auto_batching(self, flag: bool):
++ self.core.set_property({'ALLOW_AUTO_BATCHING': flag})
++
+ def read_model(self, path_to_model: str):
+ model_filename = os.path.abspath(path_to_model)
+ head, ext = os.path.splitext(model_filename)
+@@ -110,7 +124,7 @@
+ (self.duration_seconds and exec_time < self.duration_seconds) or \
+ (iteration % self.nireq):
+ idle_id = infer_queue.get_idle_request_id()
+- if idle_id in in_fly:
++ if idle_id in in_fly: # Is this check neccessary?
+ times.append(infer_queue[idle_id].latency)
+ else:
+ in_fly.add(idle_id)
+@@ -162,7 +176,6 @@
+ def main_loop(self, requests, data_queue, batch_size, latency_percentile, pcseq):
+ if self.api_type == 'sync':
+ times, total_duration_sec, iteration = self.sync_inference(requests[0], data_queue)
+- fps = len(batch_size) * iteration / total_duration_sec
+ elif self.inference_only:
+ times, total_duration_sec, iteration = self.async_inference_only(requests)
+ fps = len(batch_size) * iteration / total_duration_sec
+@@ -175,6 +188,9 @@
+ min_latency_ms = times[0]
+ max_latency_ms = times[-1]
+
++ if self.api_type == 'sync':
++ fps = len(batch_size) * 1000 / median_latency_ms
++
+ if pcseq:
+ for group in self.latency_groups:
+ if group.times:
diff --git a/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/benchmark.py b/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/benchmark.py
new file mode 100644
index 0000000..a98b82a
--- /dev/null
+++ b/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/benchmark.py
@@ -0,0 +1,202 @@
+# Copyright (C) 2018-2022 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+from datetime import datetime
+from math import ceil
+import warnings
+from openvino.runtime import Core, get_version, AsyncInferQueue
+
+from openvino.tools.benchmark.utils.constants import GPU_DEVICE_NAME, XML_EXTENSION, BIN_EXTENSION
+from openvino.tools.benchmark.utils.logging import logger
+from openvino.tools.benchmark.utils.utils import get_duration_seconds
+
+def percentile(values, percent):
+    return values[ceil(len(values) * percent / 100) - 1]  # nearest-rank percentile; callers pass `values` pre-sorted
+
+class Benchmark:
+    def __init__(self, device: str, number_infer_requests: int = 0, number_iterations: int = None,
+                 duration_seconds: int = None, api_type: str = 'async', inference_only = None):
+        self.device = device
+        dla_plugins = os.environ.get('DLA_PLUGINS', default='')  # optional plugins xml path for the Core
+        if dla_plugins == '':
+            # Backwards compatibility for old DLA_PLUGINS_XML_FILE
+            warnings.warn("DLA_PLUGINS_XML_FILE option is deprecated as of 2024.1, Please use DLA_PLUGINS")
+            dla_plugins = os.environ.get('DLA_PLUGINS_XML_FILE', default='')
+        self.core = Core(dla_plugins)
+        if "FPGA" in self.device:  # FPGA additionally requires an architecture file
+            dla_arch_file = os.environ.get('DLA_ARCH_FILE')
+            if dla_arch_file is None:
+                raise Exception(f"To use FPGA, you need to specify the path to an arch_file!")  # NOTE(review): f-string has no placeholders
+            self.core.set_property(device_name="FPGA", properties={"ARCH_PATH": dla_arch_file})
+        self.nireq = number_infer_requests if api_type == 'async' else 1  # sync mode always uses a single request
+        self.niter = number_iterations
+        self.duration_seconds = get_duration_seconds(duration_seconds, self.niter, self.device)
+        self.api_type = api_type
+        self.inference_only = inference_only
+        self.latency_groups = []
+
+    def __del__(self):
+        del self.core  # drop the Core reference explicitly during teardown
+
+    def add_extension(self, path_to_extensions: str=None, path_to_cldnn_config: str=None):
+        if path_to_cldnn_config:  # GPU custom-kernel config file
+            self.core.set_property(GPU_DEVICE_NAME, {'CONFIG_FILE': path_to_cldnn_config})
+            logger.info(f'GPU extensions is loaded {path_to_cldnn_config}')
+
+        if path_to_extensions:  # comma-separated list of extension libraries
+            for extension in path_to_extensions.split(","):
+                logger.info(f"Loading extension {extension}")
+                self.core.add_extension(extension)
+
+    def print_version_info(self) -> None:
+        version = get_version()
+        logger.info('OpenVINO:')
+        logger.info(f"{'Build ':.<39} {version}")
+        logger.info("")
+
+        logger.info("Device info:")
+        for device, version in self.core.get_versions(self.device).items():
+            logger.info(f"{device}")
+            logger.info(f"{'Build ':.<39} {version.build_number}")
+
+        logger.info("")
+        logger.info("")
+
+    def set_config(self, config = {}):  # config maps device name -> property dict; default dict is read-only here
+        for device in config.keys():
+            self.core.set_property(device, config[device])
+
+    def set_cache_dir(self, cache_dir: str):
+        self.core.set_property({'CACHE_DIR': cache_dir})
+
+    def set_allow_auto_batching(self, flag: bool):
+        self.core.set_property({'ALLOW_AUTO_BATCHING': flag})
+
+    def read_model(self, path_to_model: str):
+        model_filename = os.path.abspath(path_to_model)
+        head, ext = os.path.splitext(model_filename)
+        weights_filename = os.path.abspath(head + BIN_EXTENSION) if ext == XML_EXTENSION else ""  # IR .xml implies a sibling .bin
+        return self.core.read_model(model_filename, weights_filename)
+
+    def create_infer_requests(self, compiled_model):
+        if self.api_type == 'sync':
+            requests = [compiled_model.create_infer_request()]
+        else:
+            requests = AsyncInferQueue(compiled_model, self.nireq)  # nireq == 0 lets the queue choose its own size
+            self.nireq = len(requests)
+        return requests
+
+    def first_infer(self, requests):  # warm-up inference; its latency is reported separately
+        if self.api_type == 'sync':
+            requests[0].infer()
+            return requests[0].latency
+        else:
+            id = requests.get_idle_request_id()
+            requests.start_async()
+            requests.wait_all()
+            return requests[id].latency
+
+    def sync_inference(self, request, data_queue):
+        exec_time = 0
+        iteration = 0
+        times = []
+        start_time = datetime.utcnow()
+        while (self.niter and iteration < self.niter) or \
+              (self.duration_seconds and exec_time < self.duration_seconds):
+            if self.inference_only == False:  # explicit False check: inference_only may be None
+                request.set_input_tensors(data_queue.get_next_input())
+            request.infer()
+            times.append(request.latency)
+            iteration += 1
+
+            exec_time = (datetime.utcnow() - start_time).total_seconds()
+        total_duration_sec = (datetime.utcnow() - start_time).total_seconds()
+        return sorted(times), total_duration_sec, iteration
+
+    def async_inference_only(self, infer_queue):
+        exec_time = 0
+        iteration = 0
+        times = []
+        in_fly = set()  # request ids currently carrying a submitted job
+        start_time = datetime.utcnow()
+        while (self.niter and iteration < self.niter) or \
+              (self.duration_seconds and exec_time < self.duration_seconds) or \
+              (iteration % self.nireq):  # keep going until a whole multiple of nireq iterations
+            idle_id = infer_queue.get_idle_request_id()
+            if idle_id in in_fly: # Is this check necessary?
+                times.append(infer_queue[idle_id].latency)
+            else:
+                in_fly.add(idle_id)
+            infer_queue.start_async()
+            iteration += 1
+
+            exec_time = (datetime.utcnow() - start_time).total_seconds()
+        infer_queue.wait_all()
+        total_duration_sec = (datetime.utcnow() - start_time).total_seconds()
+        for infer_request_id in in_fly:  # collect latencies of the final in-flight batch
+            times.append(infer_queue[infer_request_id].latency)
+        return sorted(times), total_duration_sec, iteration
+
+    def async_inference_full_mode(self, infer_queue, data_queue, pcseq):
+        processed_frames = 0
+        exec_time = 0
+        iteration = 0
+        times = []
+        num_groups = len(self.latency_groups)
+        start_time = datetime.utcnow()
+        in_fly = set()
+        while (self.niter and iteration < self.niter) or \
+              (self.duration_seconds and exec_time < self.duration_seconds) or \
+              (iteration % num_groups):  # finish on a whole number of group cycles
+            processed_frames += data_queue.get_next_batch_size()
+            idle_id = infer_queue.get_idle_request_id()
+            if idle_id in in_fly:
+                times.append(infer_queue[idle_id].latency)
+                if pcseq:  # per-shape-group latency bookkeeping
+                    self.latency_groups[infer_queue.userdata[idle_id]].times.append(infer_queue[idle_id].latency)
+            else:
+                in_fly.add(idle_id)
+            group_id = data_queue.current_group_id
+            infer_queue[idle_id].set_input_tensors(data_queue.get_next_input())
+            infer_queue.start_async(userdata=group_id)
+            iteration += 1
+
+            exec_time = (datetime.utcnow() - start_time).total_seconds()
+        infer_queue.wait_all()
+        total_duration_sec = (datetime.utcnow() - start_time).total_seconds()
+
+        for infer_request_id in in_fly:  # collect latencies of the final in-flight batch
+            times.append(infer_queue[infer_request_id].latency)
+            if pcseq:
+                self.latency_groups[infer_queue.userdata[infer_request_id]].times.append(infer_queue[infer_request_id].latency)
+
+        return sorted(times), total_duration_sec, processed_frames, iteration
+
+    def main_loop(self, requests, data_queue, batch_size, latency_percentile, pcseq):
+        if self.api_type == 'sync':
+            times, total_duration_sec, iteration = self.sync_inference(requests[0], data_queue)  # sync fps computed after median latency below
+        elif self.inference_only:
+            times, total_duration_sec, iteration = self.async_inference_only(requests)
+            fps = len(batch_size) * iteration / total_duration_sec
+        else:
+            times, total_duration_sec, processed_frames, iteration = self.async_inference_full_mode(requests, data_queue, pcseq)
+            fps = processed_frames / total_duration_sec
+
+        median_latency_ms = percentile(times, latency_percentile)
+        avg_latency_ms = sum(times) / len(times)
+        min_latency_ms = times[0]
+        max_latency_ms = times[-1]
+
+        if self.api_type == 'sync':  # sync fps derived from median latency rather than wall-clock duration
+            fps = len(batch_size) * 1000 / median_latency_ms
+
+        if pcseq:  # finalize per-group statistics
+            for group in self.latency_groups:
+                if group.times:
+                    group.times.sort()
+                    group.median = percentile(group.times, latency_percentile)
+                    group.avg = sum(group.times) / len(group.times)
+                    group.min = group.times[0]
+                    group.max = group.times[-1]
+        return fps, median_latency_ms, avg_latency_ms, min_latency_ms, max_latency_ms, total_duration_sec, iteration
diff --git a/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/benchmark_app.patch b/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/benchmark_app.patch
new file mode 100644
index 0000000..4a003ad
--- /dev/null
+++ b/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/benchmark_app.patch
@@ -0,0 +1,14 @@
+--- /p/psg/swip/dla/resources/inference_engine/2022.3.0/centos7/openvino_2022/openvino_env/bin/benchmark_app 2023-02-07 15:01:24.336634000 -0500
++++ benchmark_app.py 2023-05-03 12:01:20.435826000 -0400
+@@ -1,8 +1,8 @@
+-#!/nfs/site/disks/swip_dla_1/resources/inference_engine/2022.3.0/centos7/openvino_2022/openvino_env/bin/python
++#!/usr/bin/python3
+ # -*- coding: utf-8 -*-
+ import re
+ import sys
+-from openvino.tools.benchmark.main import main
++import main
+ if __name__ == '__main__':
+ sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
+- sys.exit(main())
++ sys.exit(main.main())
diff --git a/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/benchmark_app.py b/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/benchmark_app.py
new file mode 100644
index 0000000..d5b9c9a
--- /dev/null
+++ b/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/benchmark_app.py
@@ -0,0 +1,8 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+import re
+import sys
+import main
+if __name__ == '__main__':
+ sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
+ sys.exit(main.main())
diff --git a/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/main.patch b/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/main.patch
new file mode 100644
index 0000000..99afb40
--- /dev/null
+++ b/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/main.patch
@@ -0,0 +1,106 @@
+--- /nfs/site/disks/swip_dla_1/resources/inference_engine/2023.3.0_with_dev_tools/1/linux64/suse12/python/openvino/tools/benchmark/main.py 2024-03-01 14:01:50.466871000 -0500
++++ main.py 2024-10-29 11:10:06.569928000 -0400
+@@ -7,11 +7,11 @@
+
+ from openvino.runtime import Dimension,properties
+
+-from openvino.tools.benchmark.benchmark import Benchmark
++import benchmark as openvino_benchmark
+ from openvino.tools.benchmark.parameters import parse_args
+ from openvino.tools.benchmark.utils.constants import MULTI_DEVICE_NAME, \
+ CPU_DEVICE_NAME, GPU_DEVICE_NAME, \
+- BLOB_EXTENSION, AUTO_DEVICE_NAME
++ BIN_EXTENSION, AUTO_DEVICE_NAME
+ from openvino.tools.benchmark.utils.inputs_filling import get_input_data
+ from openvino.tools.benchmark.utils.logging import logger
+ from openvino.tools.benchmark.utils.utils import next_step, get_number_iterations, pre_post_processing, \
+@@ -41,13 +41,13 @@
+ if args.report_type == "average_counters" and MULTI_DEVICE_NAME in args.target_device:
+ raise Exception("only detailed_counters report type is supported for MULTI device")
+
+- _, ext = os.path.splitext(args.path_to_model)
+- is_network_compiled = True if ext == BLOB_EXTENSION else False
+- is_precisiton_set = not (args.input_precision == "" and args.output_precision == "" and args.input_output_precision == "")
++ if args.number_infer_requests != 1 and "FPGA" in args.target_device:
++ logger.warning(f"If the target FPGA design uses JTAG to access the CSRs on the FPGA AI Suite IP "\
++ "(e.g. the Agilex 5E Premium Development Kit JTAG Design Example), "\
++ "then the number of inference request must be 1.")
+
+- if is_network_compiled and is_precisiton_set:
+- raise Exception("Cannot set precision for a compiled model. " \
+- "Please re-compile your model with required precision.")
++ _, ext = os.path.splitext(args.path_to_model)
++ is_network_compiled = True if ext == BIN_EXTENSION else False
+
+ return args, is_network_compiled
+
+@@ -84,7 +84,7 @@
+ # ------------------------------ 2. Loading OpenVINO Runtime -------------------------------------------
+ next_step(step_id=2)
+
+- benchmark = Benchmark(args.target_device, args.number_infer_requests,
++ benchmark = openvino_benchmark.Benchmark(args.target_device, args.number_infer_requests,
+ args.number_iterations, args.time, args.api_type, args.inference_only)
+
+ if args.extensions:
+@@ -166,8 +166,11 @@
+ supported_properties = benchmark.core.get_property(device, properties.supported_properties())
+ if device not in config.keys():
+ config[device] = {}
+-
+ ## high-level performance modes
++            # The original OV 2022.3 Python API fails with the pc flag, so we comment it out
++ # for both the HETERO and FPGA devices in our patched version of the Python demos
++ if device in ['HETERO', 'FPGA']:
++ continue
+ set_performance_hint(device)
+
+ if is_flag_set_in_command_line('nireq'):
+@@ -429,16 +432,21 @@
+ next_step()
+
+ start_time = datetime.utcnow()
+- compiled_model = benchmark.core.import_model(args.path_to_model, benchmark.device, device_config)
+- duration_ms = f"{(datetime.utcnow() - start_time).total_seconds() * 1000:.2f}"
+- logger.info(f"Import model took {duration_ms} ms")
+- if statistics:
+- statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS,
+- [
+- ('import model time (ms)', duration_ms)
+- ])
+- app_inputs_info, _ = get_inputs_info(args.shape, args.data_shape, args.layout, args.batch_size, args.scale_values, args.mean_values, compiled_model.inputs)
+- batch_size = get_network_batch_size(app_inputs_info)
++ try:
++ with open(args.path_to_model, "rb") as model_stream:
++ model_bytes = model_stream.read()
++ compiled_model = benchmark.core.import_model(model_bytes, device_name)
++ duration_ms = f"{(datetime.utcnow() - start_time).total_seconds() * 1000:.2f}"
++ logger.info(f"Import model took {duration_ms} ms")
++ if statistics:
++ statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS,
++ [
++ ('import model time (ms)', duration_ms)
++ ])
++ app_inputs_info, _ = get_inputs_info(args.shape, args.data_shape, args.layout, args.batch_size, args.scale_values, args.mean_values, compiled_model.inputs)
++ batch_size = get_network_batch_size(app_inputs_info)
++ except Exception as e:
++ raise RuntimeError(f"Cannot open or import compiled model file: {args.path_to_model}. Error: {str(e)}")
+
+ # --------------------- 8. Querying optimal runtime parameters --------------------------------------------------
+ next_step()
+@@ -653,7 +661,7 @@
+ exeDevice = compiled_model.get_property("EXECUTION_DEVICES")
+ logger.info(f'Execution Devices:{exeDevice}')
+ except:
+- pass
++ exeDevice = None
+ logger.info(f'Count: {iteration} iterations')
+ logger.info(f'Duration: {get_duration_in_milliseconds(total_duration_sec):.2f} ms')
+ if MULTI_DEVICE_NAME not in device_name:
+@@ -692,4 +700,4 @@
+ [('error', str(e))]
+ )
+ statistics.dump()
+- sys.exit(1)
++ sys.exit(1)
+\ No newline at end of file
diff --git a/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/main.py b/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/main.py
new file mode 100644
index 0000000..e11daec
--- /dev/null
+++ b/python/openvino/runtime/python_demos/OpenVINO_benchmark_app/main.py
@@ -0,0 +1,703 @@
+# Copyright (C) 2018-2023 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+import sys
+from datetime import datetime
+
+from openvino.runtime import Dimension,properties
+
+import benchmark as openvino_benchmark
+from openvino.tools.benchmark.parameters import parse_args
+from openvino.tools.benchmark.utils.constants import MULTI_DEVICE_NAME, \
+ CPU_DEVICE_NAME, GPU_DEVICE_NAME, \
+ BIN_EXTENSION, AUTO_DEVICE_NAME
+from openvino.tools.benchmark.utils.inputs_filling import get_input_data
+from openvino.tools.benchmark.utils.logging import logger
+from openvino.tools.benchmark.utils.utils import next_step, get_number_iterations, pre_post_processing, \
+ process_help_inference_string, print_perf_counters, print_perf_counters_sort, dump_exec_graph, get_duration_in_milliseconds, \
+ get_command_line_arguments, parse_value_per_device, parse_devices, get_inputs_info, \
+ print_inputs_and_outputs_info, get_network_batch_size, load_config, dump_config, get_latency_groups, \
+ check_for_static, can_measure_as_static, parse_value_for_virtual_device, is_virtual_device, is_virtual_device_found
+from openvino.tools.benchmark.utils.statistics_report import StatisticsReport, JsonStatisticsReport, CsvStatisticsReport, \
+ averageCntReport, detailedCntReport
+
+def parse_and_check_command_line():
+ def arg_not_empty(arg_value,empty_value):
+ return not arg_value is None and not arg_value == empty_value
+
+ parser = parse_args()
+ args = parser.parse_args()
+
+ if args.latency_percentile < 1 or args.latency_percentile > 100:
+ parser.print_help()
+ raise RuntimeError("The percentile value is incorrect. The applicable values range is [1, 100].")
+
+ if not args.perf_hint == "none" and (arg_not_empty(args.number_streams, "") or arg_not_empty(args.number_threads, 0) or arg_not_empty(args.infer_threads_pinning, "")):
+ raise Exception("-nstreams, -nthreads and -pin options are fine tune options. To use them you " \
+ "should explicitely set -hint option to none. This is not OpenVINO limitation " \
+ "(those options can be used in OpenVINO together), but a benchmark_app UI rule.")
+
+ if args.report_type == "average_counters" and MULTI_DEVICE_NAME in args.target_device:
+ raise Exception("only detailed_counters report type is supported for MULTI device")
+
+ if args.number_infer_requests != 1 and "FPGA" in args.target_device:
+ logger.warning(f"If the target FPGA design uses JTAG to access the CSRs on the FPGA AI Suite IP "\
+ "(e.g. the Agilex 5E Premium Development Kit JTAG Design Example), "\
+ "then the number of inference request must be 1.")
+
+ _, ext = os.path.splitext(args.path_to_model)
+ is_network_compiled = True if ext == BIN_EXTENSION else False
+
+ return args, is_network_compiled
+
+def main():
+ statistics = None
+ try:
+ # ------------------------------ 1. Parsing and validating input arguments ------------------------------
+ next_step()
+ logger.info("Parsing input parameters")
+ args, is_network_compiled = parse_and_check_command_line()
+
+ command_line_arguments = get_command_line_arguments(sys.argv)
+ if args.report_type:
+ _statistics_class = JsonStatisticsReport if args.json_stats else CsvStatisticsReport
+ statistics = _statistics_class(StatisticsReport.Config(args.report_type, args.report_folder))
+ statistics.add_parameters(StatisticsReport.Category.COMMAND_LINE_PARAMETERS, command_line_arguments)
+
+ def is_flag_set_in_command_line(flag):
+ return any(x.strip('-') == flag for x, y in command_line_arguments)
+
+ device_name = args.target_device
+
+ devices = parse_devices(device_name)
+ device_number_streams = parse_value_per_device(devices, args.number_streams, "nstreams")
+ device_infer_precision = parse_value_per_device(devices, args.infer_precision, "infer_precision")
+
+ config = {}
+ if args.load_config:
+ load_config(args.load_config, config)
+
+ if is_network_compiled:
+ logger.info("Model is compiled")
+
+ # ------------------------------ 2. Loading OpenVINO Runtime -------------------------------------------
+ next_step(step_id=2)
+
+ benchmark = openvino_benchmark.Benchmark(args.target_device, args.number_infer_requests,
+ args.number_iterations, args.time, args.api_type, args.inference_only)
+
+ if args.extensions:
+ benchmark.add_extension(path_to_extensions=args.extensions)
+
+ ## GPU (clDNN) Extensions
+ if GPU_DEVICE_NAME in device_name and args.path_to_cldnn_config:
+ if GPU_DEVICE_NAME not in config.keys():
+ config[GPU_DEVICE_NAME] = {}
+ config[GPU_DEVICE_NAME]['CONFIG_FILE'] = args.path_to_cldnn_config
+
+ if GPU_DEVICE_NAME in config.keys() and 'CONFIG_FILE' in config[GPU_DEVICE_NAME].keys():
+ cldnn_config = config[GPU_DEVICE_NAME]['CONFIG_FILE']
+ benchmark.add_extension(path_to_cldnn_config=cldnn_config)
+
+ benchmark.print_version_info()
+
+ # --------------------- 3. Setting device configuration --------------------------------------------------------
+ next_step()
+
+ def set_performance_hint(device):
+ perf_hint = properties.hint.PerformanceMode.UNDEFINED
+ supported_properties = benchmark.core.get_property(device, properties.supported_properties())
+ if properties.hint.performance_mode() in supported_properties:
+ if is_flag_set_in_command_line('hint'):
+ if args.perf_hint == "throughput" or args.perf_hint == "tput":
+ perf_hint = properties.hint.PerformanceMode.THROUGHPUT
+ elif args.perf_hint == "latency":
+ perf_hint = properties.hint.PerformanceMode.LATENCY
+ elif args.perf_hint == "cumulative_throughput" or args.perf_hint == "ctput":
+ perf_hint = properties.hint.PerformanceMode.CUMULATIVE_THROUGHPUT
+ elif args.perf_hint=='none':
+ perf_hint = properties.hint.PerformanceMode.UNDEFINED
+ else:
+ raise RuntimeError("Incorrect performance hint. Please set -hint option to"
+ "`throughput`(tput), `latency', 'cumulative_throughput'(ctput) value or 'none'.")
+ else:
+ perf_hint = properties.hint.PerformanceMode.THROUGHPUT if benchmark.api_type == "async" else properties.hint.PerformanceMode.LATENCY
+ logger.warning(f"Performance hint was not explicitly specified in command line. " +
+ f"Device({device}) performance hint will be set to {perf_hint}.")
+ if perf_hint != properties.hint.PerformanceMode.UNDEFINED:
+ config[device][properties.hint.performance_mode()] = perf_hint
+ else:
+ logger.warning(f"Device {device} does not support performance hint property(-hint).")
+
+
+ def get_device_type_from_name(name) :
+ new_name = str(name)
+ new_name = new_name.split(".", 1)[0]
+ new_name = new_name.split("(", 1)[0]
+ return new_name
+
+ ## Set default values from dumped config
+ default_devices = set()
+ for device in devices:
+ device_type = get_device_type_from_name(device)
+ if device_type in config and device not in config:
+ config[device] = config[device_type].copy()
+ default_devices.add(device_type)
+
+ for def_device in default_devices:
+ config.pop(def_device)
+
+ perf_counts = False
+ # check if using the virtual device
+ hw_devices_list = devices.copy()
+ # Remove the hardware devices if AUTO/MULTI/HETERO appears in the devices list.
+ is_virtual = is_virtual_device_found(devices)
+ if is_virtual:
+ devices.clear()
+            # Parse out the correct virtual device as the target device.
+ virtual_device = device_name.partition(":")[0]
+ hw_devices_list.remove(virtual_device)
+ devices.append(virtual_device)
+ parse_value_for_virtual_device(virtual_device, device_number_streams)
+ parse_value_for_virtual_device(virtual_device, device_infer_precision)
+
+ for device in devices:
+ supported_properties = benchmark.core.get_property(device, properties.supported_properties())
+ if device not in config.keys():
+ config[device] = {}
+ ## high-level performance modes
+            # The original OV 2022.3 Python API fails with the pc flag, so we comment it out
+ # for both the HETERO and FPGA devices in our patched version of the Python demos
+ if device in ['HETERO', 'FPGA']:
+ continue
+ set_performance_hint(device)
+
+ if is_flag_set_in_command_line('nireq'):
+ config[device][properties.hint.num_requests()] = str(args.number_infer_requests)
+
+ ## Set performance counter
+ if is_flag_set_in_command_line('pc'):
+ ## set to user defined value
+ config[device][properties.enable_profiling()] = True if args.perf_counts else False
+ elif properties.enable_profiling() in config[device].keys() and config[device][properties.enable_profiling()] == True:
+ logger.warning(f"Performance counters for {device} device is turned on. " +
+ "To print results use -pc option.")
+ elif args.report_type in [ averageCntReport, detailedCntReport ]:
+ logger.warning(f"Turn on performance counters for {device} device " +
+ f"since report type is {args.report_type}.")
+ config[device][properties.enable_profiling()] = True
+ elif args.exec_graph_path is not None:
+ logger.warning(f"Turn on performance counters for {device} device " +
+ "due to execution graph dumping.")
+ config[device][properties.enable_profiling()] = True
+ elif is_flag_set_in_command_line('pcsort'):
+ ## set to default value
+ logger.warning(f"Turn on performance counters for {device} device " +
+ f"since pcsort value is {args.perf_counts_sort}.")
+ config[device][properties.enable_profiling()] = True if args.perf_counts_sort else False
+ else:
+ ## set to default value
+ config[device][properties.enable_profiling()] = args.perf_counts
+ perf_counts = True if config[device][properties.enable_profiling()] == True else perf_counts
+
+ ## insert or append property into hw device properties list
+ def update_configs(hw_device, property_name, property_value):
+ (key, value) = properties.device.properties({hw_device:{property_name:property_value}})
+ # add property into hw device properties list.
+ if key not in config[device].keys():
+ config[device][key] = value
+ else:
+ current_config = config[device][key].get()
+ if hw_device not in current_config.keys():
+ current_config.update(value.get())
+ else:
+ current_device_config = current_config[hw_device]
+ for prop in value.get().items():
+ current_device_config.update(prop[1])
+ current_config[hw_device].update(current_device_config)
+ config[device][key].set(current_config)
+
+ def update_device_config_for_virtual_device(value, config, key):
+ # check if the element contains the hardware device property
+ if len(value.split(':')) == 1:
+ config[device][key] = device_infer_precision[device]
+ else:
+ # set device nstreams properties in the AUTO/MULTI plugin
+ value_vec = value[value.find('{') + 1:value.rfind('}')].split(',')
+ device_properties = {value_vec[i].split(':')[0] : value_vec[i].split(':')[1] for i in range(0, len(value_vec))}
+ for hw_device in device_properties.keys():
+ update_configs(hw_device, key, device_properties[hw_device])
+
+ ## infer precision
+ def set_infer_precision():
+ key = properties.hint.inference_precision()
+ if device in device_infer_precision.keys():
+ ## set to user defined value
+ if key in supported_properties:
+ config[device][key] = device_infer_precision[device]
+ elif is_virtual_device(device):
+ update_device_config_for_virtual_device(device_infer_precision[device], config, key)
+ else:
+ raise Exception(f"Device {device} doesn't support config key INFERENCE_PRECISION_HINT!" \
+ " Please specify -infer_precision for correct devices in format" \
+ " <dev1>:<infer_precision1>,<dev2>:<infer_precision2> or via configuration file.")
+ return
+
+ ## the rest are individual per-device settings (overriding the values the device will deduce from perf hint)
+ def set_throughput_streams():
+ key = get_device_type_from_name(device) + "_THROUGHPUT_STREAMS"
+ if device in device_number_streams.keys():
+ ## set to user defined value
+ if key in supported_properties:
+ config[device][key] = device_number_streams[device]
+ elif properties.streams.num() in supported_properties:
+ key = properties.streams.num()
+ config[device][key] = device_number_streams[device]
+ elif is_virtual_device(device):
+ key = properties.streams.num()
+ update_device_config_for_virtual_device(device_number_streams[device], config, key)
+ else:
+ raise Exception(f"Device {device} doesn't support config key '{key}'! " +
+ "Please specify -nstreams for correct devices in format <dev1>:<nstreams1>,<dev2>:<nstreams2>")
+ elif key not in config[device].keys() and args.api_type == "async" and key not in config[device].keys() \
+ and 'PERFORMANCE_HINT' in config[device].keys() and config[device]['PERFORMANCE_HINT'] == '':
+ ## set the _AUTO value for the #streams
+ logger.warning(f"-nstreams default value is determined automatically for {device} device. " +
+ "Although the automatic selection usually provides a reasonable performance, "
+ "but it still may be non-optimal for some cases, for more information look at README.")
+ if key in supported_properties:
+ config[device][key] = get_device_type_from_name(device) + "_THROUGHPUT_AUTO"
+ elif properties.streams.Num() in supported_properties:
+ key = properties.streams.Num()
+ config[device][key] = "-1" # Set AUTO mode for streams number
+ elif is_virtual_device(device):
+ # Set nstreams to default value auto if no nstreams specified from cmd line.
+ for hw_device in hw_devices_list:
+ hw_supported_properties = benchmark.core.get_property(hw_device, properties.supported_properties())
+ key = get_device_type_from_name(hw_device) + "_THROUGHPUT_STREAMS"
+ value = get_device_type_from_name(hw_device) + "_THROUGHPUT_AUTO"
+ if key not in hw_supported_properties:
+ key = properties.streams.Num()
+ value = properties.streams.Num.AUTO
+ if key in hw_supported_properties:
+ update_configs(hw_device, key, value)
+ if key in config[device].keys():
+ device_number_streams[device] = config[device][key]
+ return
+
+ def set_nthreads_pin(property_name, property_value):
+ if property_name == properties.affinity():
+ if property_value == "YES":
+ property_value = properties.Affinity.CORE
+ elif property_value == "NO":
+ property_value = properties.Affinity.NONE
+ if property_name in supported_properties or device_name == AUTO_DEVICE_NAME:
+ # create nthreads/pin primary property for HW device or AUTO if -d is AUTO directly.
+ config[device][property_name] = property_value
+ elif is_virtual:
+ # Create secondary property of -nthreads/-pin only for CPU if CPU device appears in the devices
+ # list specified by -d.
+ if CPU_DEVICE_NAME in hw_devices_list:
+ update_configs(CPU_DEVICE_NAME, property_name, property_value)
+ return
+
+ if args.number_threads and is_flag_set_in_command_line("nthreads"):
+ # limit threading for CPU portion of inference
+ set_nthreads_pin(properties.inference_num_threads(), str(args.number_threads))
+
+ if is_flag_set_in_command_line('pin'):
+ ## set for CPU to user defined value
+ set_nthreads_pin(properties.affinity(), args.infer_threads_pinning)
+
+ set_throughput_streams()
+ set_infer_precision()
+
+ if is_virtual_device(device):
+ if device in device_number_streams.keys():
+ del device_number_streams[device]
+
+ device_config = {}
+ for device in config:
+ if benchmark.device.find(device) == 0:
+ device_config = config[device]
+ if args.cache_dir:
+ benchmark.set_cache_dir(args.cache_dir)
+
+ ## If set batch size, disable the auto batching
+ if args.batch_size:
+ logger.warning("Batch size is set. Auto batching will be disabled")
+ device_config["ALLOW_AUTO_BATCHING"] = False
+
+ topology_name = ""
+ load_from_file_enabled = is_flag_set_in_command_line('load_from_file') or is_flag_set_in_command_line('lfile')
+ if load_from_file_enabled and not is_network_compiled:
+ if args.mean_values or args.scale_values:
+ raise RuntimeError("--mean_values and --scale_values aren't supported with --load_from_file. "
+ "The values can be set via model_optimizer while generating xml")
+ next_step()
+ print("Skipping the step for loading model from file")
+ next_step()
+ print("Skipping the step for loading model from file")
+ next_step()
+ print("Skipping the step for loading model from file")
+
+ # --------------------- 7. Loading the model to the device -------------------------------------------------
+ next_step()
+
+ start_time = datetime.utcnow()
+ compiled_model = benchmark.core.compile_model(args.path_to_model, benchmark.device, device_config)
+ duration_ms = f"{(datetime.utcnow() - start_time).total_seconds() * 1000:.2f}"
+ logger.info(f"Compile model took {duration_ms} ms")
+ if statistics:
+ statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS,
+ [
+ ('compile model time (ms)', duration_ms)
+ ])
+ app_inputs_info, _ = get_inputs_info(args.shape, args.data_shape, args.layout, args.batch_size, args.scale_values, args.mean_values, compiled_model.inputs)
+ batch_size = get_network_batch_size(app_inputs_info)
+ elif not is_network_compiled:
+ # --------------------- 4. Read the Intermediate Representation of the network -----------------------------
+ next_step()
+
+ logger.info("Loading model files")
+
+ start_time = datetime.utcnow()
+ model = benchmark.read_model(args.path_to_model)
+ topology_name = model.get_name()
+ duration_ms = f"{(datetime.utcnow() - start_time).total_seconds() * 1000:.2f}"
+ logger.info(f"Read model took {duration_ms} ms")
+ logger.info("Original model I/O parameters:")
+ print_inputs_and_outputs_info(model)
+
+ if statistics:
+ statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS,
+ [
+ ('read model time (ms)', duration_ms)
+ ])
+
+ # --------------------- 5. Resizing network to match image sizes and given batch ---------------------------
+ next_step()
+
+ app_inputs_info, reshape = get_inputs_info(args.shape, args.data_shape, args.layout, args.batch_size, args.scale_values, args.mean_values, model.inputs)
+
+ # use batch size according to provided layout and shapes
+ batch_size = get_network_batch_size(app_inputs_info)
+ logger.info(f'Model batch size: {batch_size}')
+
+ if reshape:
+ start_time = datetime.utcnow()
+ shapes = { info.name : info.partial_shape for info in app_inputs_info }
+ logger.info(
+ 'Reshaping model: {}'.format(', '.join("'{}': {}".format(k, str(v)) for k, v in shapes.items())))
+ model.reshape(shapes)
+ duration_ms = f"{(datetime.utcnow() - start_time).total_seconds() * 1000:.2f}"
+ logger.info(f"Reshape model took {duration_ms} ms")
+ if statistics:
+ statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS,
+ [
+ ('reshape model time (ms)', duration_ms)
+ ])
+
+ # --------------------- 6. Configuring inputs and outputs of the model --------------------------------------------------
+ next_step()
+
+ pre_post_processing(model, app_inputs_info, args.input_precision, args.output_precision, args.input_output_precision)
+ print_inputs_and_outputs_info(model)
+
+ # --------------------- 7. Loading the model to the device -------------------------------------------------
+ next_step()
+ start_time = datetime.utcnow()
+ compiled_model = benchmark.core.compile_model(model, benchmark.device, device_config)
+
+ duration_ms = f"{(datetime.utcnow() - start_time).total_seconds() * 1000:.2f}"
+ logger.info(f"Compile model took {duration_ms} ms")
+ if statistics:
+ statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS,
+ [
+ ('compile model time (ms)', duration_ms)
+ ])
+ else:
+ if args.mean_values or args.scale_values:
+ raise RuntimeError("--mean_values and --scale_values aren't supported for compiled model. "
+ "The values can be set via model_optimizer while generating xml")
+ next_step()
+ print("Skipping the step for compiled model")
+ next_step()
+ print("Skipping the step for compiled model")
+ next_step()
+ print("Skipping the step for compiled model")
+
+ # --------------------- 7. Loading the model to the device -------------------------------------------------
+ next_step()
+
+ start_time = datetime.utcnow()
+ try:
+ with open(args.path_to_model, "rb") as model_stream:
+ model_bytes = model_stream.read()
+ compiled_model = benchmark.core.import_model(model_bytes, device_name)
+ duration_ms = f"{(datetime.utcnow() - start_time).total_seconds() * 1000:.2f}"
+ logger.info(f"Import model took {duration_ms} ms")
+ if statistics:
+ statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS,
+ [
+ ('import model time (ms)', duration_ms)
+ ])
+ app_inputs_info, _ = get_inputs_info(args.shape, args.data_shape, args.layout, args.batch_size, args.scale_values, args.mean_values, compiled_model.inputs)
+ batch_size = get_network_batch_size(app_inputs_info)
+ except Exception as e:
+ raise RuntimeError(f"Cannot open or import compiled model file: {args.path_to_model}. Error: {str(e)}")
+
+ # --------------------- 8. Querying optimal runtime parameters --------------------------------------------------
+ next_step()
+
+ ## actual device-deduced settings
+ keys = compiled_model.get_property(properties.supported_properties())
+ logger.info("Model:")
+ for k in keys:
+ skip_keys = ('SUPPORTED_METRICS', 'SUPPORTED_CONFIG_KEYS', properties.supported_properties())
+ if k not in skip_keys:
+ value = compiled_model.get_property(k)
+ if k == properties.device.properties():
+ for device_key in value.keys():
+ logger.info(f' {device_key}:')
+ for k2, value2 in value.get(device_key).items():
+ if k2 not in skip_keys:
+ logger.info(f' {k2}: {value2}')
+ else:
+ logger.info(f' {k}: {value}')
+
+ # Update number of streams
+ for device in device_number_streams.keys():
+ try:
+ key = get_device_type_from_name(device) + '_THROUGHPUT_STREAMS'
+ device_number_streams[device] = compiled_model.get_property(key)
+ except:
+ key = 'NUM_STREAMS'
+ device_number_streams[device] = compiled_model.get_property(key)
+
+ # ------------------------------------ 9. Creating infer requests and preparing input data ----------------------
+ next_step()
+
+ # Create infer requests
+ requests = benchmark.create_infer_requests(compiled_model)
+
+ # Prepare input data
+ paths_to_input = list()
+ if args.paths_to_input:
+ for path in args.paths_to_input:
+ if ":" in next(iter(path), ""):
+ paths_to_input.extend(path)
+ else:
+ paths_to_input.append(os.path.abspath(*path))
+
+ data_queue = get_input_data(paths_to_input, app_inputs_info)
+
+ static_mode = check_for_static(app_inputs_info)
+ allow_inference_only_or_sync = can_measure_as_static(app_inputs_info)
+ if not allow_inference_only_or_sync and benchmark.api_type == 'sync':
+ raise Exception("Benchmarking of the model with dynamic shapes is available for async API only. "
+ "Please use -api async -hint latency -nireq 1 to emulate sync behavior.")
+
+ if benchmark.inference_only == None:
+ if static_mode:
+ benchmark.inference_only = True
+ else:
+ benchmark.inference_only = False
+ elif benchmark.inference_only and not allow_inference_only_or_sync:
+ raise Exception("Benchmarking dynamic model available with input filling in measurement loop only!")
+
+ # update batch size in case dynamic network with one data_shape
+ if allow_inference_only_or_sync and batch_size.is_dynamic:
+ batch_size = Dimension(data_queue.batch_sizes[data_queue.current_group_id])
+
+ benchmark.latency_groups = get_latency_groups(app_inputs_info)
+
+ if len(benchmark.latency_groups) > 1:
+ logger.info(f"Defined {len(benchmark.latency_groups)} tensor groups:")
+ for group in benchmark.latency_groups:
+ logger.info(f"\t{str(group)}")
+
+ # Iteration limit
+ benchmark.niter = get_number_iterations(benchmark.niter, benchmark.nireq, max(len(info.shapes) for info in app_inputs_info), benchmark.api_type)
+
+ # Set input tensors before first inference
+ for request in requests:
+ data_tensors = data_queue.get_next_input()
+ for port, data_tensor in data_tensors.items():
+ input_tensor = request.get_input_tensor(port)
+ if not static_mode:
+ input_tensor.shape = data_tensor.shape
+ if not len(input_tensor.shape):
+ input_tensor.data.flat[:] = data_tensor.data
+ else:
+ input_tensor.data[:] = data_tensor.data
+
+ if statistics:
+ statistics.add_parameters(StatisticsReport.Category.RUNTIME_CONFIG,
+ [
+ ('topology', topology_name),
+ ('target device', device_name),
+ ('API', args.api_type),
+ ('inference_only', benchmark.inference_only),
+ ('precision', "UNSPECIFIED"),
+ ('batch size', str(batch_size)),
+ ('number of iterations', str(benchmark.niter)),
+ ('number of parallel infer requests', str(benchmark.nireq)),
+ ('duration (ms)', str(get_duration_in_milliseconds(benchmark.duration_seconds))),
+ ])
+
+ for nstreams in device_number_streams.items():
+ statistics.add_parameters(StatisticsReport.Category.RUNTIME_CONFIG,
+ [
+ (f"number of {nstreams[0]} streams", str(nstreams[1])),
+ ])
+
+ # ------------------------------------ 10. Measuring performance -----------------------------------------------
+
+ output_string = process_help_inference_string(benchmark, device_number_streams)
+
+ next_step(additional_info=output_string)
+
+ if benchmark.inference_only:
+ logger.info("Benchmarking in inference only mode (inputs filling are not included in measurement loop).")
+ else:
+ logger.info("Benchmarking in full mode (inputs filling are included in measurement loop).")
+ duration_ms = f"{benchmark.first_infer(requests):.2f}"
+ logger.info(f"First inference took {duration_ms} ms")
+ if statistics:
+ statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS,
+ [
+ ('first inference time (ms)', duration_ms)
+ ])
+
+ pcseq = args.pcseq
+ if static_mode or len(benchmark.latency_groups) == 1:
+ pcseq = False
+
+ fps, median_latency_ms, avg_latency_ms, min_latency_ms, max_latency_ms, total_duration_sec, iteration = benchmark.main_loop(requests, data_queue, batch_size, args.latency_percentile, pcseq)
+
+ # ------------------------------------ 11. Dumping statistics report -------------------------------------------
+ next_step()
+
+ if args.dump_config:
+ dump_config(args.dump_config, config)
+ logger.info(f"OpenVINO configuration settings were dumped to {args.dump_config}")
+
+ if args.exec_graph_path:
+ dump_exec_graph(compiled_model, args.exec_graph_path)
+
+ if perf_counts:
+ perfs_count_list = []
+ for request in requests:
+ perfs_count_list.append(request.profiling_info)
+
+ if args.perf_counts_sort:
+ total_sorted_list = print_perf_counters_sort(perfs_count_list,sort_flag=args.perf_counts_sort)
+ if statistics:
+ statistics.dump_performance_counters_sorted(total_sorted_list)
+
+ elif args.perf_counts:
+ print_perf_counters(perfs_count_list)
+
+ if statistics:
+ # if not args.perf_counts_sort:
+ statistics.dump_performance_counters(perfs_count_list)
+
+ if statistics:
+ statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS,
+ [
+ ('total execution time (ms)', f'{get_duration_in_milliseconds(total_duration_sec):.2f}'),
+ ('total number of iterations', str(iteration)),
+ ])
+ if MULTI_DEVICE_NAME not in device_name:
+ latency_prefix = None
+ if args.latency_percentile == 50:
+ latency_prefix = 'latency (ms)'
+ elif args.latency_percentile != 50:
+ latency_prefix = 'latency (' + str(args.latency_percentile) + ' percentile) (ms)'
+ if latency_prefix:
+ statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS,
+ [
+ (latency_prefix, f'{median_latency_ms:.2f}'),
+ ])
+ statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS,
+ [
+ ("avg latency", f'{avg_latency_ms:.2f}'),
+ ])
+ statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS,
+ [
+ ("min latency", f'{min_latency_ms:.2f}'),
+ ])
+ statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS,
+ [
+ ("max latency", f'{max_latency_ms:.2f}'),
+ ])
+ if pcseq:
+ for group in benchmark.latency_groups:
+ statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS,
+ [
+ ("group", str(group)),
+ ])
+ statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS,
+ [
+ ("avg latency", f'{group.avg:.2f}'),
+ ])
+ statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS,
+ [
+ ("min latency", f'{group.min:.2f}'),
+ ])
+ statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS,
+ [
+ ("max latency", f'{group.max:.2f}'),
+ ])
+ statistics.add_parameters(StatisticsReport.Category.EXECUTION_RESULTS,
+ [
+ ('throughput', f'{fps:.2f}'),
+ ])
+ statistics.dump()
+
+ try:
+ exeDevice = compiled_model.get_property("EXECUTION_DEVICES")
+ logger.info(f'Execution Devices:{exeDevice}')
+ except:
+ exeDevice = None
+ logger.info(f'Count: {iteration} iterations')
+ logger.info(f'Duration: {get_duration_in_milliseconds(total_duration_sec):.2f} ms')
+ if MULTI_DEVICE_NAME not in device_name:
+ logger.info('Latency:')
+ if args.latency_percentile == 50:
+ logger.info(f' Median: {median_latency_ms:.2f} ms')
+ elif args.latency_percentile != 50:
+ logger.info(f' {args.latency_percentile} percentile: {median_latency_ms:.2f} ms')
+ logger.info(f' Average: {avg_latency_ms:.2f} ms')
+ logger.info(f' Min: {min_latency_ms:.2f} ms')
+ logger.info(f' Max: {max_latency_ms:.2f} ms')
+
+ if pcseq:
+ logger.info("Latency for each data shape group:")
+ for idx,group in enumerate(benchmark.latency_groups):
+ logger.info(f"{idx+1}.{str(group)}")
+ if args.latency_percentile == 50:
+ logger.info(f' Median: {group.median:.2f} ms')
+ elif args.latency_percentile != 50:
+ logger.info(f' {args.latency_percentile} percentile: {group.median:.2f} ms')
+ logger.info(f' Average: {group.avg:.2f} ms')
+ logger.info(f' Min: {group.min:.2f} ms')
+ logger.info(f' Max: {group.max:.2f} ms')
+
+ logger.info(f'Throughput: {fps:.2f} FPS')
+
+ del compiled_model
+
+ next_step.step_id = 0
+ except Exception as e:
+ logger.exception(e)
+
+ if statistics:
+ statistics.add_parameters(
+ StatisticsReport.Category.EXECUTION_RESULTS,
+ [('error', str(e))]
+ )
+ statistics.dump()
+ sys.exit(1) \ No newline at end of file
diff --git a/python/openvino/runtime/python_demos/OpenVINO_classification_sample_async/README.md b/python/openvino/runtime/python_demos/OpenVINO_classification_sample_async/README.md
new file mode 100644
index 0000000..0b021b9
--- /dev/null
+++ b/python/openvino/runtime/python_demos/OpenVINO_classification_sample_async/README.md
@@ -0,0 +1,6 @@
+### OpenVINO Image Classification Async Python Sample
+---
+
+For detailed information on the OpenVINO Classification Sample Async Demo, please see the [README](https://github.com/openvinotoolkit/openvino/tree/2023.3.0/samples/python/classification_sample_async) in the OpenVINO repository. Make sure to match the git tag with your installed version of OpenVINO for compatibility.
+
+If you need examples of how to use the demo, check the [README](../README.md) in the parent directory for sample commands.
diff --git a/python/openvino/runtime/python_demos/OpenVINO_classification_sample_async/classification_sample_async.patch b/python/openvino/runtime/python_demos/OpenVINO_classification_sample_async/classification_sample_async.patch
new file mode 100644
index 0000000..28ae75c
--- /dev/null
+++ b/python/openvino/runtime/python_demos/OpenVINO_classification_sample_async/classification_sample_async.patch
@@ -0,0 +1,116 @@
+--- /nfs/site/disks/swip_dla_1/resources/inference_engine/2023.3.0_with_dev_tools/1/linux64/suse12/samples/python/classification_sample_async/classification_sample_async.py 2024-03-01 14:01:24.460131000 -0500
++++ ./runtime/python_demos/OpenVINO_classification_sample_async/classification_sample_async.py 2024-04-16 10:33:28.810439000 -0400
+@@ -1,15 +1,18 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+-# Copyright (C) 2018-2023 Intel Corporation
++# Copyright (C) 2018-2022 Intel Corporation
+ # SPDX-License-Identifier: Apache-2.0
+
+ import argparse
+ import logging as log
++import os
+ import sys
++import warnings
+
+ import cv2
+ import numpy as np
+-import openvino as ov
++from openvino.preprocess import PrePostProcessor
++from openvino.runtime import AsyncInferQueue, Core, InferRequest, Layout, Type
+
+
+ def parse_args() -> argparse.Namespace:
+@@ -24,14 +27,14 @@
+ args.add_argument('-i', '--input', type=str, required=True, nargs='+',
+ help='Required. Path to an image file(s).')
+ args.add_argument('-d', '--device', type=str, default='CPU',
+- help='Optional. Specify the target device to infer on; CPU, GPU, GNA or HETERO: '
++ help='Optional. Specify the target device to infer on; CPU, GPU, MYRIAD, HDDL or HETERO: '
+ 'is acceptable. The sample will look for a suitable plugin for device specified. '
+ 'Default value is CPU.')
+ # fmt: on
+ return parser.parse_args()
+
+
+-def completion_callback(infer_request: ov.InferRequest, image_path: str) -> None:
++def completion_callback(infer_request: InferRequest, image_path: str) -> None:
+ predictions = next(iter(infer_request.results.values()))
+
+ # Change a shape of a numpy.ndarray with results to get another one with one dimension
+@@ -60,7 +63,17 @@
+
+ # --------------------------- Step 1. Initialize OpenVINO Runtime Core ------------------------------------------------
+ log.info('Creating OpenVINO Runtime Core')
+- core = ov.Core()
++ dla_plugins = os.environ.get('DLA_PLUGINS', default='')
++ if dla_plugins == '':
++ # Backwards compatibility for old DLA_PLUGINS_XML_FILE
++ warnings.warn("DLA_PLUGINS_XML_FILE option is deprecated as of 2024.1, Please use DLA_PLUGINS")
++ dla_plugins = os.environ.get('DLA_PLUGINS_XML_FILE', default='')
++ core = Core(dla_plugins)
++ if "FPGA" in args.device:
++ dla_arch_file = os.environ.get('DLA_ARCH_FILE')
++ if dla_arch_file is None:
++ raise Exception(f"To use FPGA, you need to specify the path to an arch_file!")
++ core.set_property(device_name="FPGA", properties={"ARCH_PATH": dla_arch_file})
+
+ # --------------------------- Step 2. Read a model --------------------------------------------------------------------
+ log.info(f'Reading the model: {args.model}')
+@@ -80,29 +93,38 @@
+ images = [cv2.imread(image_path) for image_path in args.input]
+
+ # Resize images to model input dims
+- _, _, h, w = model.input().shape
++ # Assuming we always have w=h, we will
++ # figure out the layout from the dimensions
++ # start with the assumption of NHWC (TF)
++ _, h, w, c = model.input().shape
++
++ if h != w:
++ c = h
++ h = w
++
+ resized_images = [cv2.resize(image, (w, h)) for image in images]
+
+ # Add N dimension
+ input_tensors = [np.expand_dims(image, 0) for image in resized_images]
+
++ # Transpose from NHWC to NCHW
++ input_tensors = [np.transpose(tensor, (0, 3, 1, 2)) for tensor in input_tensors]
++
+ # --------------------------- Step 4. Apply preprocessing -------------------------------------------------------------
+- ppp = ov.preprocess.PrePostProcessor(model)
++ ppp = PrePostProcessor(model)
+
+ # 1) Set input tensor information:
+ # - input() provides information about a single model input
+- # - precision of tensor is supposed to be 'u8'
+- # - layout of data is 'NHWC'
+- ppp.input().tensor() \
+- .set_element_type(ov.Type.u8) \
+- .set_layout(ov.Layout('NHWC')) # noqa: N400
++ # - layout of data is 'NCHW'
++ ppp.input().tensor().set_layout(Layout('NCHW')) # noqa: N400
+
+ # 2) Here we suppose model has 'NCHW' layout for input
+- ppp.input().model().set_layout(ov.Layout('NCHW'))
++ # DLA --> We let the demo select the layout based on the model
++ # ppp.input().model().set_layout(Layout('NCHW'))
+
+ # 3) Set output tensor information:
+ # - precision of tensor is supposed to be 'f32'
+- ppp.output().tensor().set_element_type(ov.Type.f32)
++ ppp.output().tensor().set_element_type(Type.f32)
+
+ # 4) Apply preprocessing modifing the original 'model'
+ model = ppp.build()
+@@ -114,7 +136,7 @@
+ # --------------------------- Step 6. Create infer request queue ------------------------------------------------------
+ log.info('Starting inference in asynchronous mode')
+ # create async queue with optimal number of infer requests
+- infer_queue = ov.AsyncInferQueue(compiled_model)
++ infer_queue = AsyncInferQueue(compiled_model)
+ infer_queue.set_callback(completion_callback)
+
+ # --------------------------- Step 7. Do inference --------------------------------------------------------------------
diff --git a/python/openvino/runtime/python_demos/OpenVINO_classification_sample_async/classification_sample_async.py b/python/openvino/runtime/python_demos/OpenVINO_classification_sample_async/classification_sample_async.py
new file mode 100755
index 0000000..339c942
--- /dev/null
+++ b/python/openvino/runtime/python_demos/OpenVINO_classification_sample_async/classification_sample_async.py
@@ -0,0 +1,153 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# Copyright (C) 2018-2022 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+import argparse
+import logging as log
+import os
+import sys
+import warnings
+
+import cv2
+import numpy as np
+from openvino.preprocess import PrePostProcessor
+from openvino.runtime import AsyncInferQueue, Core, InferRequest, Layout, Type
+
+
+def parse_args() -> argparse.Namespace:
+ """Parse and return command line arguments."""
+ parser = argparse.ArgumentParser(add_help=False)
+ args = parser.add_argument_group('Options')
+ # fmt: off
+ args.add_argument('-h', '--help', action='help',
+ help='Show this help message and exit.')
+ args.add_argument('-m', '--model', type=str, required=True,
+ help='Required. Path to an .xml or .onnx file with a trained model.')
+ args.add_argument('-i', '--input', type=str, required=True, nargs='+',
+ help='Required. Path to an image file(s).')
+ args.add_argument('-d', '--device', type=str, default='CPU',
+ help='Optional. Specify the target device to infer on; CPU, GPU, MYRIAD, HDDL or HETERO: '
+ 'is acceptable. The sample will look for a suitable plugin for device specified. '
+ 'Default value is CPU.')
+ # fmt: on
+ return parser.parse_args()
+
+
+def completion_callback(infer_request: InferRequest, image_path: str) -> None:
+ predictions = next(iter(infer_request.results.values()))
+
+ # Change a shape of a numpy.ndarray with results to get another one with one dimension
+ probs = predictions.reshape(-1)
+
+ # Get an array of 10 class IDs in descending order of probability
+ top_10 = np.argsort(probs)[-10:][::-1]
+
+ header = 'class_id probability'
+
+ log.info(f'Image path: {image_path}')
+ log.info('Top 10 results: ')
+ log.info(header)
+ log.info('-' * len(header))
+
+ for class_id in top_10:
+ probability_indent = ' ' * (len('class_id') - len(str(class_id)) + 1)
+ log.info(f'{class_id}{probability_indent}{probs[class_id]:.7f}')
+
+ log.info('')
+
+
+def main() -> int:
+ log.basicConfig(format='[ %(levelname)s ] %(message)s', level=log.INFO, stream=sys.stdout)
+ args = parse_args()
+
+# --------------------------- Step 1. Initialize OpenVINO Runtime Core ------------------------------------------------
+ log.info('Creating OpenVINO Runtime Core')
+ dla_plugins = os.environ.get('DLA_PLUGINS', default='')
+ if dla_plugins == '':
+ # Backwards compatibility for old DLA_PLUGINS_XML_FILE
+ warnings.warn("DLA_PLUGINS_XML_FILE option is deprecated as of 2024.1, Please use DLA_PLUGINS")
+ dla_plugins = os.environ.get('DLA_PLUGINS_XML_FILE', default='')
+ core = Core(dla_plugins)
+ if "FPGA" in args.device:
+ dla_arch_file = os.environ.get('DLA_ARCH_FILE')
+ if dla_arch_file is None:
+ raise Exception(f"To use FPGA, you need to specify the path to an arch_file!")
+ core.set_property(device_name="FPGA", properties={"ARCH_PATH": dla_arch_file})
+
+# --------------------------- Step 2. Read a model --------------------------------------------------------------------
+ log.info(f'Reading the model: {args.model}')
+ # (.xml and .bin files) or (.onnx file)
+ model = core.read_model(args.model)
+
+ if len(model.inputs) != 1:
+ log.error('Sample supports only single input topologies')
+ return -1
+
+ if len(model.outputs) != 1:
+ log.error('Sample supports only single output topologies')
+ return -1
+
+# --------------------------- Step 3. Set up input --------------------------------------------------------------------
+ # Read input images
+ images = [cv2.imread(image_path) for image_path in args.input]
+
+ # Resize images to model input dims
+ # Assuming we always have w=h, we will
+ # figure out the layout from the dimensions
+ # start with the assumption of NHWC (TF)
+ _, h, w, c = model.input().shape
+
+ if h != w:
+ c = h
+ h = w
+
+ resized_images = [cv2.resize(image, (w, h)) for image in images]
+
+ # Add N dimension
+ input_tensors = [np.expand_dims(image, 0) for image in resized_images]
+
+ # Transpose from NHWC to NCHW
+ input_tensors = [np.transpose(tensor, (0, 3, 1, 2)) for tensor in input_tensors]
+
+# --------------------------- Step 4. Apply preprocessing -------------------------------------------------------------
+ ppp = PrePostProcessor(model)
+
+ # 1) Set input tensor information:
+ # - input() provides information about a single model input
+ # - layout of data is 'NCHW'
+ ppp.input().tensor().set_layout(Layout('NCHW')) # noqa: N400
+
+ # 2) Here we suppose model has 'NCHW' layout for input
+ # DLA --> We let the demo select the layout based on the model
+ # ppp.input().model().set_layout(Layout('NCHW'))
+
+ # 3) Set output tensor information:
+ # - precision of tensor is supposed to be 'f32'
+ ppp.output().tensor().set_element_type(Type.f32)
+
+ # 4) Apply preprocessing modifing the original 'model'
+ model = ppp.build()
+
+# --------------------------- Step 5. Loading model to the device -----------------------------------------------------
+ log.info('Loading the model to the plugin')
+ compiled_model = core.compile_model(model, args.device)
+
+# --------------------------- Step 6. Create infer request queue ------------------------------------------------------
+ log.info('Starting inference in asynchronous mode')
+ # create async queue with optimal number of infer requests
+ infer_queue = AsyncInferQueue(compiled_model)
+ infer_queue.set_callback(completion_callback)
+
+# --------------------------- Step 7. Do inference --------------------------------------------------------------------
+ for i, input_tensor in enumerate(input_tensors):
+ infer_queue.start_async({0: input_tensor}, args.input[i])
+
+ infer_queue.wait_all()
+# ----------------------------------------------------------------------------------------------------------------------
+ log.info('This sample is an API example, for any performance measurements please use the dedicated benchmark_app tool\n')
+ return 0
+
+
+if __name__ == '__main__':
+ sys.exit(main())
diff --git a/python/openvino/runtime/python_demos/README.md b/python/openvino/runtime/python_demos/README.md
new file mode 100644
index 0000000..2cf080b
--- /dev/null
+++ b/python/openvino/runtime/python_demos/README.md
@@ -0,0 +1,184 @@
+# CoreDLA Python API Usage
+
+This README.md documents how to use OpenVINO's Python API with FPGA AI Suite.
+
+## OpenVINO Benchmark Python Tool (Just In Time Flow)
+
+A port of the OpenVINO Python benchmark_app is included in this directory. For more details on OpenVINO Python benchmark_app, see [README.md](./OpenVINO_benchmark_app/README.md). Note that this OpenVINO Python benchmark_app has slightly lower performance than the DLA C++ dla_benchmark in `runtime/dla_benchmark`.
+
+To run this Python implementation of benchmark_app:
+
+1. Follow instructions in the *FPGA AI Suite: Getting Started Guide* to program the bitstream onto the FPGA device.
+
+2. `export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$COREDLA_ROOT/lib:$COREDLA_WORK/runtime/build_Release`
+ - `$COREDLA_ROOT/lib` is needed to find `libcoreDLAHeteroPlugin.so`
+ - `$COREDLA_WORK/runtime/build_Release` is needed to find `libcoreDLARuntimePlugin.so`
+
+3. This step assumes that $curarch specifies the .arch file corresponding to the bitstream currently
+programmed onto the FPGA board (as is done in the FPGA AI Suite Getting Started Guide).
+```bash
+imagedir=$COREDLA_WORK/demo/sample_images
+xmldir=$COREDLA_WORK/demo/models/public/
+DLA_PLUGINS=$COREDLA_WORK/runtime/plugins.xml \
+ DLA_ARCH_FILE=$curarch \
+ python $COREDLA_WORK/runtime/python_demos/OpenVINO_benchmark_app/benchmark_app.py \
+ -b=1 \
+ -m $xmldir/resnet-50-tf/FP32/resnet-50-tf.xml \
+ -d=HETERO:FPGA,CPU \
+ -niter=8 \
+ -api=async \
+ -nireq=4 \
+ -i $imagedir \
+ -ip=f32 \
+```
+
+ which will estimate the latency and throughput for resnet-50.
+
+Below is a fragment of sample output for HETERO:FPGA,CPU:
+
+```text
+[Step 10/11] Measuring performance (Start inference asynchronously, 4 inference requests using 4 streams for CPU, limits: 8 iterations)
+[ INFO ] First inference took <number> ms
+[Step 11/11] Dumping statistics report
+Count: 8 iterations
+Duration: <Duration> ms
+Latency: <Latency> ms
+Throughput: <Throughput> FPS
+```
+**Note**: When the target FPGA design uses JTAG to access the CSRs on the FPGA AI Suite IP (e.g. the Agilex 5E Premium Development Kit JTAG Design Example), the only supported value of *nireq* is 1.
+
+## OpenVINO Benchmark Python Tool (Ahead Of Time Flow)
+
+A port of the OpenVINO Python benchmark_app is included in this directory. For more details on OpenVINO Python benchmark_app, see [README.md](./OpenVINO_benchmark_app/README.md). Note that this OpenVINO Python benchmark_app has slightly lower performance than the DLA C++ dla_benchmark in `runtime/dla_benchmark`.
+
+To run this Python implementation of benchmark_app:
+
+1. Follow instructions in the *FPGA AI Suite: Getting Started Guide* to generate an AOT file. The architecture used should correspond to the same bitstream programmed in step 4.
+
+2. Follow instructions in the *FPGA AI Suite: Getting Started Guide* to program the bitstream onto the FPGA device.
+
+3. `export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$COREDLA_ROOT/lib:$COREDLA_WORK/runtime/build_Release`
+ - `$COREDLA_ROOT/lib` is needed to find `libcoreDLAHeteroPlugin.so`
+ - `$COREDLA_WORK/runtime/build_Release` is needed to find `libcoreDLARuntimePlugin.so`
+
+4. This step assumes that:
+ - `$curarch` specifies the .arch file corresponding to the bitstream currently programmed onto the FPGA board (as is done in the FPGA AI Suite Getting Started Guide).
+ - `graph.bin` is the compiled graph from step 1.
+```bash
+imagedir=$COREDLA_WORK/demo/sample_images
+xmldir=$COREDLA_WORK/demo/models/public/
+DLA_PLUGINS=$COREDLA_WORK/runtime/plugins.xml \
+ DLA_ARCH_FILE=$curarch \
+ python $COREDLA_WORK/runtime/python_demos/OpenVINO_benchmark_app/benchmark_app.py \
+ -b=1 \
+ -m $COREDLA_WORK/graph.bin \
+ -d=HETERO:FPGA,CPU \
+ -niter=8 \
+ -api=async \
+ -nireq=4 \
+ -i $imagedir \
+ -ip=f32 \
+```
+
+ which will estimate the latency and throughput for resnet-50.
+
+Below is a fragment of sample output for HETERO:FPGA,CPU:
+
+```text
+[Step 10/11] Measuring performance (Start inference asynchronously, 4 inference requests using 4 streams for CPU, limits: 8 iterations)
+[ INFO ] First inference took <number> ms
+[Step 11/11] Dumping statistics report
+Count: 8 iterations
+Duration: <Duration> ms
+Latency: <Latency> ms
+Throughput: <Throughput> FPS
+```
+
+## OpenVINO Benchmark Python Tool Precision (AOT and JIT)
+
+The OpenVINO Python application supports various input tensor precisions. For compatibility with the FPGA AI Suite, which only supports f16 and f32 precisions in the input transformations module, please specify the desired precision using the `-ip` (or `--input_precision`) flag.
+
+## OpenVINO Image Classification Async Python Sample
+
+Another example is a port of OpenVINO Image Classification Async Python Sample. For more details, see its [README.md](./OpenVINO_classification_sample_async/README.md).
+
+To run this demo, follow step 1 and 2 above in the previous section and run
+
+```bash
+imagedir=$COREDLA_WORK/demo/sample_images
+xmldir=$COREDLA_WORK/demo/models/public/
+DLA_PLUGINS=$COREDLA_WORK/runtime/plugins.xml \
+ DLA_ARCH_FILE=$curarch \
+ python $COREDLA_WORK/runtime/python_demos/OpenVINO_classification_sample_async/classification_sample_async.py \
+ -m $xmldir/resnet-50-tf/FP32/resnet-50-tf.xml \
+ -d=HETERO:FPGA,CPU \
+ -i $imagedir/val_00000000.bmp $imagedir/val_00000001.bmp
+```
+
+Below is a fragment of the output:
+
+```txt
+[ INFO ] Starting inference in asynchronous mode
+[ INFO ] Infer request 0 returned 0
+[ INFO ] Image path: /absolute/path/of/demo/sample_images/val_00000000.bmp
+[ INFO ] Top 10 results:
+[ INFO ] classid probability
+[ INFO ] -------------------
+[ INFO ] 872 0.9995117
+[ INFO ] 999 0.0000000
+[ INFO ] 327 0.0000000
+[ INFO ] 340 0.0000000
+[ INFO ] 339 0.0000000
+[ INFO ] 338 0.0000000
+[ INFO ] 337 0.0000000
+[ INFO ] 336 0.0000000
+[ INFO ] 335 0.0000000
+[ INFO ] 334 0.0000000
+[ INFO ]
+[ INFO ] Infer request 1 returned 0
+[ INFO ] Image path: /absolute/path/of/demo/sample_images/val_00000001.bmp
+[ INFO ] Top 10 results:
+[ INFO ] classid probability
+[ INFO ] -------------------
+[ INFO ] 769 0.9672852
+[ INFO ] 845 0.0292053
+[ INFO ] 778 0.0005350
+[ INFO ] 798 0.0005350
+[ INFO ] 710 0.0003245
+[ INFO ] 767 0.0002230
+[ INFO ] 418 0.0001737
+[ INFO ] 587 0.0001533
+[ INFO ] 542 0.0000820
+[ INFO ] 600 0.0000820
+```
+
+## Instructions on how to run the software emulator model
+
+1. All steps are the same as above except `DLA_PLUGINS` should be set to $COREDLA_ROOT/bin/plugins_emulation.xml (`DLA_PLUGINS=$COREDLA_ROOT/bin/plugins_emulation.xml`)
+
+**NOTE**: The software emulator model is much slower than a hardware run, so it is highly recommended to run the commands above with `DLA_PLUGINS` set to the emulation plugins file and with `-niter=1` and `-nireq=1`.
+
+## Modifications Needed
+
+OpenVINO's Python demos and benchmark_app require slight modifications to work with CoreDLA.
+
+Please see the `.patch` file for the exact changes applied to port the OpenVINO Python benchmark_app to the FPGA AI Suite.
+
+These patches are created using
+
+- `cd $COREDLA_WORK/runtime/python_demos/OpenVINO_benchmark_app/`
+- `diff -u $INTEL_OPENVINO_DIR/python/openvino/tools/benchmark/benchmark.py benchmark.py > benchmark.patch`
+- `diff -u $INTEL_OPENVINO_DIR/python/openvino/tools/benchmark/main.py main.py > main.patch`
+- `diff -u $INTEL_OPENVINO_DIR/samples/python/classification_sample_async/classification_sample_async.py classification_sample_async.py > classification_sample_async.patch`
+
+To run these demos and benchmark_app, pass the absolute path of the plugin file and arch file as environment variables as shown in the example above.
+
+---
+
+**IMPORTANT**: OpenVINO's sample applications, tools, and demos are designed to work with images in BGR channel order by default. If your model was trained using images in RGB channel order, you will need to take additional steps to ensure compatibility:
+
+1. **Modify the Application**: Update the channel order within the sample or demo application code to match the RGB order expected by your model.
+
+2. **Convert the Model**: Alternatively, you can convert your trained model to expect BGR input by using the Model Optimizer tool. When doing so, include the `--reverse_input_channels` flag to adjust the channel order. For detailed guidance on this flag, consult the Model Optimizer documentation or run `mo --help` in your command line for assistance.
+
+---
diff --git a/python/openvino/runtime/scripts/hps/create_toolchain_file.sh b/python/openvino/runtime/scripts/hps/create_toolchain_file.sh
new file mode 100755
index 0000000..cb624c7
--- /dev/null
+++ b/python/openvino/runtime/scripts/hps/create_toolchain_file.sh
@@ -0,0 +1,108 @@
+#!/bin/bash
+# Copyright (C) 2018-2020 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+
+# Script to create a toolchain file based off the environment
+# which would have been setup by Yocto SDK env script
+
+# Check we have at least one parameter
+if [ $# != 1 ]; then
+ echo "./create_toolchain_file.sh <SDK Dir>"
+ exit 1
+fi
+
+SDK_DIR=$1
+if [ ! -e ${SDK_DIR} ]; then
+ echo "SDK Dir does not exist."
+ exit 1
+fi
+
+mkdir -p ${SDK_DIR}/cmake
+CMAKE_FILE=${SDK_DIR}/cmake/embedded.arm.cmake
+
+# Source the Yocto environment to get the setup
+source ${SDK_DIR}/environment-setup-*
+#############################################################
+
+echo "# Copyright (C) 2018-2020 Intel Corporation" > ${CMAKE_FILE}
+echo "# SPDX-License-Identifier: Apache-2.0" >> ${CMAKE_FILE}
+echo "#" >> ${CMAKE_FILE}
+echo "" >> ${CMAKE_FILE}
+
+#############################################################
+# Setup OS and Processor
+echo "set(CMAKE_SYSTEM_NAME Linux)" >> ${CMAKE_FILE}
+# Use the OECORE_TARGET_ARCH for the SYSTEM PROCESSOR
+if [ "$OECORE_TARGET_ARCH" == "arm" ]; then
+ echo "set(CMAKE_SYSTEM_PROCESSOR armv7l)" >> ${CMAKE_FILE}
+else
+ echo "set(CMAKE_SYSTEM_PROCESSOR \"$OECORE_TARGET_ARCH\")" >> ${CMAKE_FILE}
+fi
+echo "" >> ${CMAKE_FILE}
+
+#############################################################
+# Setup the TOOLCHAIN
+TOOLCHAIN_PREFIX=${OECORE_NATIVE_SYSROOT}/${CROSS_COMPILE}
+echo "set(TOOLCHAIN_PREFIX \"$TOOLCHAIN_PREFIX\")" >> ${CMAKE_FILE}
+
+#############################################################
+# Extract the link flags
+IFS='\ ' read -r -a array <<< "${LD}"
+unset "array[0]"
+LINK_FLAGS="${array[@]}"
+
+#############################################################
+# Setup the CC Compiler
+
+# Split the CC to get compiler name and flags in an array
+IFS='\ ' array=($CC)
+#Compiler is the first entry
+C_COMPILER=`which ${array[0]}`
+echo "set(CMAKE_C_COMPILER \"${C_COMPILER}\")" >> ${CMAKE_FILE}
+# Remove the first entry
+unset "array[0]"
+
+echo "set(CMAKE_C_FLAGS \"\${CMAKE_C_FLAGS} ${array[@]} ${CFLAGS}\")" >> ${CMAKE_FILE}
+
+echo "set(CMAKE_C_LINK_FLAGS \"\${CMAKE_C_LINK_FLAGS} ${LINK_FLAGS}\")" >> ${CMAKE_FILE}
+
+echo "set(CMAKE_C_FLAGS \"\${CMAKE_C_FLAGS} -Wno-error=array-bounds\")" >> ${CMAKE_FILE}
+#############################################################
+# Setup the CXX Compiler
+
+# Split the CXX to get compiler name and flags in an array
+IFS='\ ' array=(${CXX})
+
+#Compiler is the first entry
+CXX_COMPILER=`which ${array[0]}`
+echo "set(CMAKE_CXX_COMPILER \"${CXX_COMPILER}\")" >> ${CMAKE_FILE}
+# Remove the first entry
+unset "array[0]"
+
+echo "set(CMAKE_CXX_FLAGS \"\${CMAKE_CXX_FLAGS} ${OECORE_TUNE_CCARGS} ${KCFLAGS} -Wno-psabi\")" >> ${CMAKE_FILE}
+
+echo "set(CMAKE_CXX_LINK_FLAGS \"\${CMAKE_CXX_LINK_FLAGS} ${LINK_FLAGS}\")" >> ${CMAKE_FILE}
+
+# Add -Wno-error=array-bounds due to a gcc 11.3 compile error
+echo "set(CMAKE_CXX_FLAGS \"\${CMAKE_CXX_FLAGS} -Wno-error=array-bounds\")" >> ${CMAKE_FILE}
+
+# Add -Wno-error=narrowing due to a gcc 12.2 compile error for OpenVINO
+echo "set(CMAKE_CXX_FLAGS \"\${CMAKE_CXX_FLAGS} -Wno-error=narrowing\")" >> ${CMAKE_FILE}
+
+
+CXXFLAGS_DEBUG=${CXXFLAGS/-O2/-O0}
+echo "set(CMAKE_CXX_FLAGS_DEBUG \"${CXXFLAGS_DEBUG}\")" >> ${CMAKE_FILE}
+echo "set(CMAKE_CXX_FLAGS_RELEASE \"${CXXFLAGS}\")" >> ${CMAKE_FILE}
+
+################################################################
+echo "set(ENV{CFLAGS} \${CMAKE_C_FLAGS})" >> ${CMAKE_FILE}
+echo "set(ENV{CXXFLAGS} \${CMAKE_CXX_FLAGS})" >> ${CMAKE_FILE}
+echo "set(ENV{CC} \${CMAKE_C_COMPILER})" >> ${CMAKE_FILE}
+echo "set(ENV{CXX} \${CMAKE_CXX_COMPILER})" >> ${CMAKE_FILE}
+echo "set(ENV{LDFLAGS} \${LINK_FLAGS})" >> ${CMAKE_FILE}
+
+echo "set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)" >> ${CMAKE_FILE}
+echo "set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)" >> ${CMAKE_FILE}
+echo "set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)" >> ${CMAKE_FILE}
+echo "set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)" >> ${CMAKE_FILE}
diff --git a/python/openvino/runtime/scripts/hps/setup_toolchain.sh b/python/openvino/runtime/scripts/hps/setup_toolchain.sh
new file mode 100755
index 0000000..f88c001
--- /dev/null
+++ b/python/openvino/runtime/scripts/hps/setup_toolchain.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+
+# Script to unpack the Yocto SDK and setup a toolchain file
+unset LD_LIBRARY_PATH
+
+SCRIPT_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+TOOLCHAIN_DIR=`pwd`/embedded_arm_sdk
+TOOLCHAIN_FILEDIR=${TOOLCHAIN_DIR}/cmake
+TOOLCHAIN_FILE=${TOOLCHAIN_FILEDIR}/embedded.arm.cmake
+
+# If we have a parameter then use as the poky install script
+POKY_FILE=`pwd`/poky*.sh
+if [ $# -gt 0 ]; then
+ POKY_FILE=$1
+fi
+
+###########################################################
+# If the toolchain file already exists then do nothing
+# If you want to recreate then delete ${TOOLCHAIN_DIR}
+if [ -e ${TOOLCHAIN_DIR} ]; then
+ echo "Toolchain file already exists. ${TOOLCHAIN_DIR}"
+ exit 0
+fi
+
+# Install the Yocto SDK
+./$POKY_FILE -y -d ${TOOLCHAIN_DIR}
+if [ $? != 0 ]; then
+ echo "Failed to install Yocto SDK"
+ exit 1
+fi
+
+# Create the Toolchain file
+${SCRIPT_PATH}/create_toolchain_file.sh ${TOOLCHAIN_DIR}
+exit $?
diff --git a/python/openvino/runtime/segmentation_demo/CMakeLists.txt b/python/openvino/runtime/segmentation_demo/CMakeLists.txt
new file mode 100644
index 0000000..d845d30
--- /dev/null
+++ b/python/openvino/runtime/segmentation_demo/CMakeLists.txt
@@ -0,0 +1,64 @@
+# Copyright (C) 2018-2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# Build configuration for the segmentation_demo executable.  Links against
+# OpenCV, the OpenVINO runtime, the CoreDLA plugin and the common helper
+# libraries (monitors/utils/models/pipelines) provided by the enclosing build.
+
+set (CMAKE_CXX_STANDARD 11)
+set (CMAKE_CXX_STANDARD_REQUIRED ON)
+# Non-Intel compilers also get -std=c++11 passed explicitly, in addition to
+# CMAKE_CXX_STANDARD above.
+if (NOT("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel"))
+    set (CMAKE_CXX_FLAGS "-std=c++11 ${CMAKE_CXX_FLAGS}")
+endif()
+
+set (TARGET_NAME "segmentation_demo")
+
+file (GLOB MAIN_SRC
+        ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp
+)
+
+file (GLOB MAIN_HEADERS
+        # Add header files here
+)
+
+source_group("src" FILES ${MAIN_SRC})
+source_group("include" FILES ${MAIN_HEADERS})
+
+# Find OpenCV components if exist
+find_package(OpenCV COMPONENTS core highgui imgcodecs imgproc videoio REQUIRED)
+
+# Create library file from sources.
+add_executable(${TARGET_NAME} ${MAIN_SRC} ${MAIN_HEADERS})
+
+# CoreDLA plugin headers come from the COREDLA_ROOT environment variable.
+target_include_directories(${TARGET_NAME} PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}/../common/monitors/include
+    $ENV{COREDLA_ROOT}/dla_plugin/inc/
+)
+
+if (NOT WIN32)
+    set (LIB_DL dl)
+endif()
+
+# ie_samples_utils must already have been defined by the enclosing build.
+if (NOT TARGET ie_samples_utils)
+    message (FATAL_ERROR "ie_samples_utils missing")
+endif()
+
+target_link_libraries(${TARGET_NAME}
+    openvino::runtime
+    coreDLAHeteroPlugin
+    openvino_dev_api
+    ie_samples_utils
+    ${OpenCV_LIBRARIES}
+    monitors
+    utils
+    models
+    pipelines
+)
+
+if(NOT WIN32)
+    target_link_libraries(${TARGET_NAME} ${LIB_DL} pthread)
+endif()
+
+set_target_properties(${TARGET_NAME} PROPERTIES INSTALL_RPATH "\$ORIGIN/../lib")
+
+# For libcoreDlaRuntimePlugin.so - typically specified by $COREDLA_ROOT/runtime/plugins.xml
+set_target_properties(${TARGET_NAME} PROPERTIES BUILD_RPATH "\$ORIGIN/..")
+
+install(TARGETS ${TARGET_NAME} RUNTIME DESTINATION "dla/bin" COMPONENT DEMO)
+install(TARGETS ${TARGET_NAME} RUNTIME DESTINATION "dla/not_shipped/bin" COMPONENT NOT_SHIPPED)
diff --git a/python/openvino/runtime/segmentation_demo/README.md b/python/openvino/runtime/segmentation_demo/README.md
new file mode 100644
index 0000000..4c750e1
--- /dev/null
+++ b/python/openvino/runtime/segmentation_demo/README.md
@@ -0,0 +1,16 @@
+# Image Segmentation C++ Demo
+
+### Running with CoreDLA
+In addition to the options described below, include the arguments:
+
+- `-plugins=<path to the plugins.xml>`, using the path to [plugins.xml](../plugins.xml)
+- `-d HETERO:FPGA,CPU`
+- `-arch_file <path to arch file>`, using the path to the architecture used when creating the FPGA bitstream
+
+Use the `-build_demo` option to the runtime/build_runtime.sh script to build the demos.
+
+See the documentation that is included with the example design.
+
+Use the `unet_camvid_onnx_0001` with the segmentation demo. The `semantic-segmentation-adas-0001` graph is not supported in the current release of the Intel FPGA AI Suite and will not work.
+
+For detailed information on the OpenVINO C++ Segmentation Demo, please see the [README](https://github.com/openvinotoolkit/open_model_zoo/blob/2023.3.0/demos/segmentation_demo/cpp/README.md) in the OpenVINO repository. Make sure to match the git tag with your installed version of OpenVINO for compatibility.
diff --git a/python/openvino/runtime/segmentation_demo/main.cpp b/python/openvino/runtime/segmentation_demo/main.cpp
new file mode 100644
index 0000000..1631e29
--- /dev/null
+++ b/python/openvino/runtime/segmentation_demo/main.cpp
@@ -0,0 +1,445 @@
+/*
+// Copyright (C) 2018-2022 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <chrono>
+#include <exception>
+#include <initializer_list>
+#include <iomanip>
+#include <iostream>
+#include <memory>
+#include <random>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include <gflags/gflags.h>
+#include <opencv2/core.hpp>
+#include <opencv2/highgui.hpp>
+#include <opencv2/imgproc.hpp>
+#include <openvino/openvino.hpp>
+
+#include <models/input_data.h>
+#include <models/model_base.h>
+#include <models/results.h>
+#include <models/segmentation_model.h>
+#include <monitors/presenter.h>
+#include <pipelines/async_pipeline.h>
+#include <pipelines/metadata.h>
+#include <utils/common.hpp>
+#include <utils/config_factory.h>
+#include <utils/default_flags.hpp>
+#include <utils/images_capture.h>
+#include <utils/ocv_common.hpp>
+#include <utils/performance_metrics.hpp>
+#include <utils/slog.hpp>
+
+#include <sys/stat.h>
+
+// Standard -i/-o/-limit/-loop flags; the macros are presumably declared in
+// utils/default_flags.hpp (included above) -- confirm there.
+DEFINE_INPUT_FLAGS
+DEFINE_OUTPUT_FLAGS
+
+// Help strings shared by the gflags definitions below and showUsage().
+static const char help_message[] = "Print a usage message.";
+static const char model_message[] = "Required. Path to an .xml file with a trained model.";
+static const char target_device_message[] =
+    "Optional. Specify the target device to infer on (the list of available devices is shown below). "
+    "Default value is CPU. Use \"-d HETERO:<comma-separated_devices_list>\" format to specify HETERO plugin. "
+    "The demo will look for a suitable plugin for a specified device.";
+static const char labels_message[] = "Optional. Path to a file with labels mapping.";
+static const char layout_message[] = "Optional. Specify inputs layouts."
+                                     " Ex. NCHW or input0:NCHW,input1:NC in case of more than one input.";
+static const char raw_output_message[] = "Optional. Output inference results as mask histogram.";
+static const char nireq_message[] = "Optional. Number of infer requests. If this option is omitted, number of infer "
+                                    "requests is determined automatically.";
+static const char input_resizable_message[] =
+    "Optional. Enables resizable input with support of ROI crop & auto resize.";
+static const char num_threads_message[] = "Optional. Number of threads.";
+static const char num_streams_message[] = "Optional. Number of streams to use for inference on the CPU or/and GPU in "
+                                          "throughput mode (for HETERO and MULTI device cases use format "
+                                          "<device1>:<nstreams1>,<device2>:<nstreams2> or just <nstreams>)";
+static const char no_show_message[] = "Optional. Don't show output.";
+static const char utilization_monitors_message[] = "Optional. List of monitors to show initially.";
+static const char output_resolution_message[] =
+    "Optional. Specify the maximum output window resolution "
+    "in (width x height) format. Example: 1280x720. Input frame size used by default.";
+static const char only_masks_message[] = "Optional. Display only masks. Could be switched by TAB key.";
+
+// CoreDLA-specific options: custom plugins.xml and FPGA architecture file.
+static const char plugins_message[] = "Optional. Select a custom plugins_xml file to use.";
+static const char arch_file_message[] = "Optional. Provide a path for the architecture .arch file.";
+
+DEFINE_bool(h, false, help_message);
+DEFINE_string(m, "", model_message);
+DEFINE_string(d, "CPU", target_device_message);
+DEFINE_string(labels, "", labels_message);
+DEFINE_string(layout, "", layout_message);
+DEFINE_bool(r, false, raw_output_message);
+DEFINE_int32(nireq, 0, nireq_message);
+DEFINE_bool(auto_resize, false, input_resizable_message);
+DEFINE_int32(nthreads, 0, num_threads_message);
+DEFINE_string(nstreams, "", num_streams_message);
+DEFINE_bool(no_show, false, no_show_message);
+DEFINE_string(u, "", utilization_monitors_message);
+DEFINE_string(output_resolution, "", output_resolution_message);
+DEFINE_bool(only_masks, false, only_masks_message);
+
+DEFINE_string(plugins, "", plugins_message);
+DEFINE_string(arch_file, "", arch_file_message);
+
+/**
+ * \brief Prints the command-line usage message.
+ *
+ * Fix: the help text previously omitted the CoreDLA-specific -plugins and
+ * -arch_file options even though both flags are defined above; they are now
+ * listed alongside the standard options.
+ */
+static void showUsage() {
+    std::cout << std::endl;
+    std::cout << "segmentation_demo [OPTION]" << std::endl;
+    std::cout << "Options:" << std::endl;
+    std::cout << std::endl;
+    std::cout << "    -h                        " << help_message << std::endl;
+    std::cout << "    -i                        " << input_message << std::endl;
+    std::cout << "    -m \"<path>\"               " << model_message << std::endl;
+    std::cout << "    -o \"<path>\"               " << output_message << std::endl;
+    std::cout << "    -limit \"<num>\"            " << limit_message << std::endl;
+    std::cout << "    -d \"<device>\"             " << target_device_message << std::endl;
+    std::cout << "    -labels \"<path>\"          " << labels_message << std::endl;
+    std::cout << "    -layout \"<string>\"        " << layout_message << std::endl;
+    std::cout << "    -r                        " << raw_output_message << std::endl;
+    std::cout << "    -nireq \"<integer>\"        " << nireq_message << std::endl;
+    std::cout << "    -auto_resize              " << input_resizable_message << std::endl;
+    std::cout << "    -nthreads \"<integer>\"     " << num_threads_message << std::endl;
+    std::cout << "    -nstreams                 " << num_streams_message << std::endl;
+    std::cout << "    -loop                     " << loop_message << std::endl;
+    std::cout << "    -no_show                  " << no_show_message << std::endl;
+    std::cout << "    -output_resolution        " << output_resolution_message << std::endl;
+    std::cout << "    -u                        " << utilization_monitors_message << std::endl;
+    std::cout << "    -only_masks               " << only_masks_message << std::endl;
+    std::cout << "    -plugins \"<path>\"         " << plugins_message << std::endl;
+    std::cout << "    -arch_file \"<path>\"       " << arch_file_message << std::endl;
+}
+
+// Returns true when a file or directory exists at the given path.
+bool exists_test (const std::string& name) {
+    struct stat path_info;
+    const int rc = stat(name.c_str(), &path_info);
+    return rc == 0;
+}
+
+/**
+ * \brief Parses gflags command-line arguments and validates required options.
+ * \return false when -h was given (usage already printed); true otherwise.
+ * \throws std::logic_error on missing/invalid required parameters.
+ *
+ * Fix: the plugins-file existence check previously ran unconditionally, so
+ * omitting the documented-Optional -plugins flag always threw (stat("")
+ * fails).  The check is now applied only when a path was actually supplied;
+ * an empty path is forwarded to ov::Core, which then loads its default
+ * plugins.
+ */
+bool ParseAndCheckCommandLine(int argc, char* argv[]) {
+    // ---------------------------Parsing and validation of input args--------------------------------------
+    gflags::ParseCommandLineNonHelpFlags(&argc, &argv, true);
+    if (FLAGS_h) {
+        showUsage();
+        showAvailableDevices();
+        return false;
+    }
+
+    if (FLAGS_i.empty()) {
+        throw std::logic_error("Parameter -i is not set");
+    }
+
+    if (FLAGS_m.empty()) {
+        throw std::logic_error("Parameter -m is not set");
+    }
+
+    // -plugins is optional: only announce and validate the path when given.
+    if (!FLAGS_plugins.empty()) {
+        std::cout << "Using custom plugins xml file - " << FLAGS_plugins << std::endl;
+        if (!exists_test(FLAGS_plugins)) {
+            std::cout << "Error: plugins_xml file: " << FLAGS_plugins << " doesn't exist. Please provide a valid path." << std::endl;
+            throw std::logic_error("plugins_xml file path does not exist.");
+        }
+    }
+
+    if (!FLAGS_output_resolution.empty() && FLAGS_output_resolution.find("x") == std::string::npos) {
+        throw std::logic_error("Correct format of -output_resolution parameter is \"width\"x\"height\".");
+    }
+    return true;
+}
+
+// TODO: replace with cv::applyColorMap() after OpenCV3 is dropped
+// Adapter that wraps a std::function so it can be passed where OpenCV
+// expects a cv::ParallelLoopBody subclass (i.e. to cv::parallel_for_).
+class ParallelLoopBodyLambda : public cv::ParallelLoopBody {
+    std::function<void(const cv::Range &)> f;  // the wrapped per-range callback
+public:
+    explicit ParallelLoopBodyLambda(std::function<void(const cv::Range &)> f): f{f} {}
+    void operator()(const cv::Range& range) const override {f(range);}
+};
+
+// Applies a 256-entry lookup table (_lut, CV_8UC1 or CV_8UC3) to 'src':
+// src is reduced to a single gray channel (BGR->GRAY if needed) and each
+// pixel value indexes _lut to produce dstMat.  Rows are processed in
+// packets of at least 2^12 pixels via cv::parallel_for_.
+// NOTE(review): for LUT types other than CV_8UC1/CV_8UC3 dstMat is
+// allocated but never filled -- confirm callers only pass those two types.
+void applyColorMapOpenCV(const cv::Mat& src, cv::Mat& dstMat, const cv::Mat& _lut) {
+    const int lut_type = _lut.type();
+
+    cv::Mat srcGray;
+    if (src.channels() == 1)
+        srcGray = src;
+    else
+        cv::cvtColor(src, srcGray, cv::COLOR_BGR2GRAY);//BGR because of historical cv::LUT() usage
+
+    dstMat.create(src.size(), lut_type);
+
+    //we do not use cv::LUT() which requires src.channels() == dst.channels()
+    const int rows = srcGray.rows;
+    const int cols = srcGray.cols;
+    // Split work so each parallel packet covers at least 2^12 pixels.
+    const int minimalPixelsPerPacket = 1<<12;
+    const int rowsPerPacket = std::max(1, minimalPixelsPerPacket/cols);
+    const int rowsPacketsCount = (rows+rowsPerPacket-1)/rowsPerPacket;
+    const cv::Range all(0, rows);
+
+    if (lut_type == CV_8UC1) {
+        typedef unsigned char lut_pixel_t;
+        const lut_pixel_t* srcLUT = _lut.ptr<lut_pixel_t>(0);
+        ParallelLoopBodyLambda body([&, cols](const cv::Range& range) -> void {
+            for(int row = range.start ; row<range.end ; ++row) {
+                const unsigned char* srcRow = srcGray.ptr<unsigned char>(row);
+                lut_pixel_t* dstRow = dstMat.ptr<lut_pixel_t>(row);
+                for(int col = 0 ; col<cols ; ++col)
+                    *dstRow++ = srcLUT[*srcRow++];
+            }
+        });
+        cv::parallel_for_(all, body, rowsPacketsCount);
+    }
+    else if (lut_type == CV_8UC3) {
+        typedef cv::Vec3b lut_pixel_t;
+        const lut_pixel_t* srcLUT = _lut.ptr<lut_pixel_t>(0);
+        ParallelLoopBodyLambda body([&, cols](const cv::Range& range) -> void {
+            for(int row = range.start ; row<range.end ; ++row) {
+                const unsigned char* srcRow = srcGray.ptr<unsigned char>(row);
+                lut_pixel_t* dstRow = dstMat.ptr<lut_pixel_t>(row);
+                for(int col = 0 ; col<cols ; ++col)
+                    *dstRow++ = srcLUT[*srcRow++];
+            }
+        });
+        cv::parallel_for_(all, body, rowsPacketsCount);
+    }
+}
+
+// Builds (lazily, on first call) a 256-entry BGR lookup table -- the 21
+// PASCAL VOC class colors followed by random colors -- and applies it to
+// the class-index image 'input' to produce a color visualization.
+cv::Mat applyColorMap(cv::Mat input) {
+    // Initializing colors array if needed
+    static const Color PASCAL_VOC_COLORS[] = {
+        {0, 0, 0}, {128, 0, 0}, {0, 128, 0}, {128, 128, 0}, {0, 0, 128}, {128, 0, 128}, {0, 128, 128},
+        {128, 128, 128}, {64, 0, 0}, {192, 0, 0}, {64, 128, 0}, {192, 128, 0}, {64, 0, 128}, {192, 0, 128},
+        {64, 128, 128}, {192, 128, 128}, {0, 64, 0}, {128, 64, 0}, {0, 192, 0}, {128, 192, 0}, {0, 64, 128}};
+    static cv::Mat colors;
+    static std::mt19937 rng;
+    static std::uniform_int_distribution<int> distr(0, 255);
+
+    if (colors.empty()) {
+        colors = cv::Mat(256, 1, CV_8UC3);
+        std::size_t i = 0;
+        for (; i < arraySize(PASCAL_VOC_COLORS); ++i) {
+            colors.at<cv::Vec3b>(i, 0) = {PASCAL_VOC_COLORS[i].blue(),
+                                          PASCAL_VOC_COLORS[i].green(),
+                                          PASCAL_VOC_COLORS[i].red()};
+        }
+        // BUGFIX: the LUT is a 256x1 matrix, so iterate over rows (256),
+        // not cols (1).  Bounding the loop by cols meant it never ran,
+        // leaving LUT entries 21..255 uninitialized and producing garbage
+        // colors for class IDs above 20.
+        for (; i < (std::size_t)colors.rows; ++i) {
+            colors.at<cv::Vec3b>(i, 0) = cv::Vec3b(distr(rng), distr(rng), distr(rng));
+        }
+    }
+
+    // Converting class to color
+    cv::Mat out;
+    applyColorMapOpenCV(input, out, colors);
+    return out;
+}
+
+// Renders a segmentation result: either the color mask alone (masks_only)
+// or a 50/50 blend of the mask over the source frame stored in the result's
+// metadata.  The rendered frame is resized in place via outputTransform.
+// Throws std::invalid_argument when metadata or its image is missing.
+cv::Mat renderSegmentationData(const ImageResult& result, OutputTransform& outputTransform, bool masks_only) {
+    if (!result.metaData) {
+        throw std::invalid_argument("Renderer: metadata is null");
+    }
+
+    // Input image is stored inside metadata, as we put it there during submission stage
+    auto inputImg = result.metaData->asRef<ImageMetaData>().img;
+
+    if (inputImg.empty()) {
+        throw std::invalid_argument("Renderer: image provided in metadata is empty");
+    }
+
+    // Visualizing result data over source image
+    cv::Mat output =
+        masks_only ? applyColorMap(result.resultImage) : inputImg / 2 + applyColorMap(result.resultImage) / 2;
+    outputTransform.resize(output);
+    return output;
+}
+
+// Logs (at debug level) a per-class histogram of the segmentation mask:
+// class label (or "#<id>" when no label is available), pixel count, and
+// percentage of the frame.  Classes with zero pixels are skipped.
+void printRawResults(const ImageResult& result, std::vector<std::string> labels) {
+    slog::debug << " --------------- Frame # " << result.frameId << " ---------------" << slog::endl;
+    slog::debug << "     Class ID     | Pixels | Percentage " << slog::endl;
+
+    // Highest class index present determines the histogram bin count.
+    double min_val, max_val;
+    cv::minMaxLoc(result.resultImage, &min_val, &max_val);
+    int max_classes = static_cast<int>(max_val) + 1;  // We use +1 for only background case
+    const float range[] = {0, static_cast<float>(max_classes)};
+    const float* ranges[] = {range};
+    cv::Mat histogram;
+    cv::calcHist(&result.resultImage, 1, 0, cv::Mat(), histogram, 1, &max_classes, ranges);
+
+    const double all = result.resultImage.cols * result.resultImage.rows;
+    for (int i = 0; i < max_classes; ++i) {
+        const int value = static_cast<int>(histogram.at<float>(i));
+        if (value > 0) {
+            std::string label = (size_t)i < labels.size() ? labels[i] : "#" + std::to_string(i);
+            slog::debug << " " << std::setw(16) << std::left << label << " | " << std::setw(6) << value << " | "
+                        << std::setw(5) << std::setprecision(2) << std::fixed << std::right << value / all * 100 << "%"
+                        << slog::endl;
+        }
+    }
+}
+
+// Demo entry point: parse flags, feed frames from the input source into the
+// asynchronous segmentation pipeline, render/display/record the results, and
+// report latency metrics at the end.
+int main(int argc, char* argv[]) {
+    try {
+        PerformanceMetrics metrics, renderMetrics;
+
+        // ------------------------------ Parsing and validation of input args ---------------------------------
+        if (!ParseAndCheckCommandLine(argc, argv)) {
+            return 0;
+        }
+
+        //------------------------------- Preparing Input ------------------------------------------------------
+        auto cap = openImagesCapture(FLAGS_i, FLAGS_loop, FLAGS_nireq == 1 ? read_type::efficient : read_type::safe);
+        cv::Mat curr_frame;
+
+        //------------------------------ Running Segmentation routines ----------------------------------------------
+        slog::info << ov::get_openvino_version() << slog::endl;
+
+        // NOTE(review): FLAGS_plugins is passed straight to ov::Core; an empty
+        // path relies on the Core constructor's default-plugins behavior.
+        ov::Core core(FLAGS_plugins);
+        AsyncPipeline pipeline(
+            std::unique_ptr<SegmentationModel>(new SegmentationModel(FLAGS_m, FLAGS_auto_resize, FLAGS_layout)),
+            ConfigFactory::getUserConfig(FLAGS_d, FLAGS_nireq, FLAGS_nstreams, FLAGS_nthreads, FLAGS_arch_file),
+            core);
+        Presenter presenter(FLAGS_u);
+
+        std::vector<std::string> labels;
+        if (!FLAGS_labels.empty()) {
+            labels = SegmentationModel::loadLabels(FLAGS_labels);
+        }
+
+        bool keepRunning = true;
+        int64_t frameNum = -1;          // id returned by submitData for the last submitted frame
+        std::unique_ptr<ResultBase> result;
+        uint32_t framesProcessed = 0;   // number of results rendered so far
+        LazyVideoWriter videoWriter{FLAGS_o, cap->fps(), static_cast<unsigned int>(FLAGS_limit)};
+
+        cv::Size outputResolution;
+        OutputTransform outputTransform = OutputTransform();
+        size_t found = FLAGS_output_resolution.find("x");
+
+        // Mask-only display mode; toggled at runtime with the TAB key.
+        bool only_masks = FLAGS_only_masks;
+
+        while (keepRunning) {
+            if (pipeline.isReadyToProcess()) {
+                auto startTime = std::chrono::steady_clock::now();
+
+                //--- Capturing frame
+                curr_frame = cap->read();
+
+                if (curr_frame.empty()) {
+                    // Input stream is over
+                    break;
+                }
+
+                frameNum = pipeline.submitData(ImageInputData(curr_frame),
+                                               std::make_shared<ImageMetaData>(curr_frame, startTime));
+            }
+
+            // First submitted frame: fix the output resolution, either from
+            // -output_resolution ("WxH") or from the frame size itself.
+            if (frameNum == 0) {
+                if (found == std::string::npos) {
+                    outputResolution = curr_frame.size();
+                } else {
+                    outputResolution = cv::Size{
+                        std::stoi(FLAGS_output_resolution.substr(0, found)),
+                        std::stoi(FLAGS_output_resolution.substr(found + 1, FLAGS_output_resolution.length()))};
+                    outputTransform = OutputTransform(curr_frame.size(), outputResolution);
+                    outputResolution = outputTransform.computeResolution();
+                }
+            }
+
+            //--- Waiting for free input slot or output data available. Function will return immediately if any of them
+            // are available.
+            pipeline.waitForData();
+
+            //--- Checking for results and rendering data if it's ready
+            //--- If you need just plain data without rendering - cast result's underlying pointer to ImageResult*
+            // and use your own processing instead of calling renderSegmentationData().
+            while (keepRunning && (result = pipeline.getResult())) {
+                auto renderingStart = std::chrono::steady_clock::now();
+                cv::Mat outFrame = renderSegmentationData(result->asRef<ImageResult>(), outputTransform, only_masks);
+                //--- Showing results and device information
+                if (FLAGS_r) {
+                    printRawResults(result->asRef<ImageResult>(), labels);
+                }
+                presenter.drawGraphs(outFrame);
+                renderMetrics.update(renderingStart);
+                metrics.update(result->metaData->asRef<ImageMetaData>().timeStamp,
+                               outFrame,
+                               {10, 22},
+                               cv::FONT_HERSHEY_COMPLEX,
+                               0.65);
+                videoWriter.write(outFrame);
+                framesProcessed++;
+                if (!FLAGS_no_show) {
+                    cv::imshow("Segmentation Results", outFrame);
+
+                    //--- Processing keyboard events
+                    auto key = cv::waitKey(1);
+                    if (27 == key || 'q' == key || 'Q' == key) {  // Esc
+                        keepRunning = false;
+                    } else if (9 == key) {
+                        // TAB toggles mask-only display
+                        only_masks = !only_masks;
+                    } else {
+                        presenter.handleKey(key);
+                    }
+                }
+            }
+        }  // while(keepRunning)
+
+        // ------------ Waiting for completion of data processing and rendering the rest of results ---------
+        pipeline.waitForTotalCompletion();
+
+        // Drain and render results for frames still in flight when the input ended.
+        for (; framesProcessed <= frameNum; framesProcessed++) {
+            result = pipeline.getResult();
+            if (result != nullptr) {
+                cv::Mat outFrame = renderSegmentationData(result->asRef<ImageResult>(), outputTransform, only_masks);
+                //--- Showing results and device information
+                if (FLAGS_r) {
+                    printRawResults(result->asRef<ImageResult>(), labels);
+                }
+                presenter.drawGraphs(outFrame);
+                metrics.update(result->metaData->asRef<ImageMetaData>().timeStamp,
+                               outFrame,
+                               {10, 22},
+                               cv::FONT_HERSHEY_COMPLEX,
+                               0.65);
+                videoWriter.write(outFrame);
+                if (!FLAGS_no_show) {
+                    cv::imshow("Segmentation Results", outFrame);
+                    //--- Updating output window
+                    cv::waitKey(1);
+                }
+            }
+        }
+
+        slog::info << "Metrics report:" << slog::endl;
+        metrics.logTotal();
+        logLatencyPerStage(cap->getMetrics().getTotal().latency,
+                           pipeline.getPreprocessMetrics().getTotal().latency,
+                           pipeline.getInferenceMetircs().getTotal().latency,
+                           pipeline.getPostprocessMetrics().getTotal().latency,
+                           renderMetrics.getTotal().latency);
+        slog::info << presenter.reportMeans() << slog::endl;
+    } catch (const std::exception& error) {
+        slog::err << error.what() << slog::endl;
+        return 1;
+    } catch (...) {
+        slog::err << "Unknown/internal exception happened." << slog::endl;
+        return 1;
+    }
+
+    return 0;
+}
diff --git a/python/openvino/runtime/segmentation_demo/models.lst b/python/openvino/runtime/segmentation_demo/models.lst
new file mode 100755
index 0000000..773f23a
--- /dev/null
+++ b/python/openvino/runtime/segmentation_demo/models.lst
@@ -0,0 +1,14 @@
+# This file can be used with the --list option of the model downloader.
+deeplabv3
+drn-d-38
+erfnet
+fastseg-large
+fastseg-small
+hrnet-v2-c1-segmentation
+icnet-camvid-ava-????
+icnet-camvid-ava-sparse-??-????
+ocrnet-hrnet-w48-paddle
+pspnet-pytorch
+road-segmentation-adas-????
+semantic-segmentation-adas-????
+unet-camvid-onnx-????
diff --git a/python/openvino/runtime/streaming/ed0_streaming_example/README.md b/python/openvino/runtime/streaming/ed0_streaming_example/README.md
new file mode 100644
index 0000000..1cc241a
--- /dev/null
+++ b/python/openvino/runtime/streaming/ed0_streaming_example/README.md
@@ -0,0 +1,14 @@
+This directory contains an example system-console tcl script for the hostless
+streaming example design on the Agilex 7 I-series Development Kit.
+
+The system-console tcl script does the following:
+ 1. Initialize path to JTAG Avalon Master IP
+   2. Initiates a reset via the source/probe IP
+ 3. Writes to coreDLA's CSR registers to prime for inference
+ 4. Streams input data (img.bin) into on-chip memory via JTAG
+ 5. Writes a descriptor into egress DMA (coreDLA -> on-chip memory)
+ 6. Writes a descriptor into ingress DMA - beginning streaming process
+ from on-chip memory to DLA
+   7. Streams output from on-chip memory to output.bin via JTAG
+
+This tcl script serves as an example for a specific CNN model. To understand how this "runtime" script can be extended to support your graph, please consult the Getting Started Guide.
diff --git a/python/openvino/runtime/streaming/ed0_streaming_example/system_console_script.tcl b/python/openvino/runtime/streaming/ed0_streaming_example/system_console_script.tcl
new file mode 100644
index 0000000..ab78d2e
--- /dev/null
+++ b/python/openvino/runtime/streaming/ed0_streaming_example/system_console_script.tcl
@@ -0,0 +1,365 @@
+# This design example only supports an AXI Width of 128 bits = 16 bytes
+variable AXI_STREAM_DATA_WIDTH_BYTES 16
+# This design example has a limit to ingress on-chip memory size in bytes
+variable INGRESS_ON_CHIP_MEMORY_SIZE_BYTES 524288
+# This design example has a limit to egress on-chip memory size in bytes
+variable EGRESS_ON_CHIP_MEMORY_SIZE_BYTES 131072
+
+# DDR-Free ED Address Map Constants
+# (addresses below are in the JTAG Avalon master's address space)
+variable DLA_IP_0_CSR_ADDR 0x00038000
+variable INGRESS_SGDMA_CSR_ADDR 0x00030000
+variable INGRESS_SGDMA_DESCRIPTOR_ADDR 0x00030020
+variable EGRESS_SGDMA_CSR_ADDR 0x00030040
+variable EGRESS_SGDMA_DESCRIPTOR_ADDR 0x00030060
+
+
+# Process to validate arguments to script
+# Validates the inference count and input file; returns the input file size
+# in bytes on success, exits the script on fatal errors.
+proc validate_args {input_file num_inferences} {
+    global INGRESS_ON_CHIP_MEMORY_SIZE_BYTES
+    global AXI_STREAM_DATA_WIDTH_BYTES
+    # Make sure user requested number of inferences is valid.
+    # Fix: use <= 0 so a request for zero inferences is rejected too,
+    # matching the error message below (the old test was "< 0").
+    if {$num_inferences <= 0} {
+        puts "Number of inferences must be greater than 0."
+        exit 1
+    }
+
+    # Check if the file exists
+    if {![file exists $input_file]} {
+        puts "Error: The file '$input_file' does not exist."
+        exit 1
+    }
+
+    # Get the size of the file in bytes
+    set file_size [file size $input_file]
+
+    # Make sure the input file can fit into on-chip memory; when it cannot,
+    # the caller partitions it into chunks, so this is just a warning.
+    if {$file_size > $INGRESS_ON_CHIP_MEMORY_SIZE_BYTES} {
+        puts "Input file '$input_file' is too large to fully fit into on-chip memory of size
+    $INGRESS_ON_CHIP_MEMORY_SIZE_BYTES bytes. Input file will be partitioned and transferred partially.\n"
+    }
+
+    # Make sure the input file is aligned to the mSGDMA/FPGA AI Suite stream width
+    if {[expr {$file_size % $AXI_STREAM_DATA_WIDTH_BYTES}] != 0} {
+        puts "Error: this design example only supports input sizes aligned to 128 bits. Please pad accordingly."
+        exit 1
+    }
+
+    # (the unused local hex formatting of file_size was removed)
+    return $file_size
+}
+
+
+# Process to calculate # of AXI transfers that will be sent out of output streamer
+# The output streamer will send out a number of AXI transfers based on the output shape
+# H, W, C and AXI stream data width
+# Returns the expected egress byte count; exits the script on invalid shapes.
+# NOTE(review): the proc name keeps its original spelling ("calulate...")
+# because the call site in main depends on it.
+proc calulate_egress_axi_transfers {C H W} {
+    global EGRESS_ON_CHIP_MEMORY_SIZE_BYTES
+    global AXI_STREAM_DATA_WIDTH_BYTES
+
+    # Calculation for # of AXI transfers from output streamer
+    # # of transfers in bytes = H * W * ceil(C/8)*16
+    set output_streamer_transfers_bytes [expr {
+        $H * $W * (int(($C + 7) / 8) * 16)
+    }]
+
+    # Make sure output streamer # of transfer bytes is aligned to AXI_STREAM_DATA_WIDTH
+    if {$output_streamer_transfers_bytes <=0 || [expr {$output_streamer_transfers_bytes % $AXI_STREAM_DATA_WIDTH_BYTES}] != 0} {
+        puts "Error with egress AXI transfer calculation. Please check your output shape size arguments (C H W)"
+        exit 1
+    }
+
+    # Ensure output inference result can fit into on-chip memory
+    # (only a warning: the caller transfers the output in partitions)
+    if {$output_streamer_transfers_bytes > $EGRESS_ON_CHIP_MEMORY_SIZE_BYTES} {
+        puts "Output inference results is too large to fully fit into on-chip memory of size
+    $EGRESS_ON_CHIP_MEMORY_SIZE_BYTES bytes. Output inference results will be partitioned and transferred partially.\n"
+    }
+    # Format input file size into hex representation
+    set output_streamer_transfers_hex [format "0x%X" $output_streamer_transfers_bytes]
+    puts "Expecting $output_streamer_transfers_hex bytes to be transferred by FPGA AI Suite output streamer"
+
+    return $output_streamer_transfers_bytes
+}
+
+
+# Initiate reset via source/probe IP: claim the first ISSP service instance
+# and pulse the source data 0 -> 1 to generate a reset edge.
+proc assert_reset {} {
+    set issp [lindex [get_service_paths issp] 0]
+    set claimed_issp [claim_service issp $issp mylib]
+    # Drive 0 then 1 (the unused issp_index local was removed)
+    issp_write_source_data $claimed_issp 0x0
+    issp_write_source_data $claimed_issp 0x1
+}
+
+
+# Initializing coreDLA (register map: fpga/csr/rtl/inc/dla_csr_constants.svh)
+# Arms the DLA CSR registers for a new inference run and resets both
+# streaming mSGDMA engines.
+# NOTE(review): register offsets/values (0x220, 0x204, 0x200=3, 0x22c=1)
+# are taken from the CSR map referenced above -- confirm their meanings there.
+proc initialize_coredla {master_path} {
+    global DLA_IP_0_CSR_ADDR
+    global INGRESS_SGDMA_CSR_ADDR
+    global EGRESS_SGDMA_CSR_ADDR
+
+    set csr_register_addr [expr {$DLA_IP_0_CSR_ADDR + 0x220}]
+    master_write_32 $master_path $csr_register_addr 0
+
+    set csr_register_addr [expr {$DLA_IP_0_CSR_ADDR + 0x204}]
+    master_write_32 $master_path $csr_register_addr 0
+
+    set csr_register_addr [expr {$DLA_IP_0_CSR_ADDR + 0x200}]
+    master_write_32 $master_path $csr_register_addr 3
+
+    # Writing 0x1 to this register will instruct DLA to accept input until register is cleared
+    set csr_register_addr [expr {$DLA_IP_0_CSR_ADDR + 0x22c}]
+    master_write_32 $master_path $csr_register_addr 1
+
+    # Reset egress SGDMA
+    set csr_register_addr [expr {$EGRESS_SGDMA_CSR_ADDR + 0x4}]
+    master_write_32 $master_path $csr_register_addr 0x2
+
+    # Reset ingress SGDMA
+    set csr_register_addr [expr {$INGRESS_SGDMA_CSR_ADDR + 0x4}]
+    master_write_32 $master_path $csr_register_addr 0x2
+}
+
+
+# Copy the input binary (or one partition of it) into ingress on-chip
+# memory at address 0x00200000 via the JTAG master.
+proc stage_input {input_file master_path} {
+    # Initializing rom with input image
+    master_write_from_file $master_path $input_file 0x00200000
+}
+
+
+# Adding descriptor to egress streaming mSGDMA
+# Queues one egress (coreDLA -> on-chip memory) transfer of 'size' bytes
+# into memory at 0x00280000, then sets the descriptor GO bit.
+proc queue_egress_descriptor {master_path size} {
+    global EGRESS_SGDMA_DESCRIPTOR_ADDR
+
+    # Destination addr
+    set csr_register_addr [expr {$EGRESS_SGDMA_DESCRIPTOR_ADDR + 0x4}]
+    master_write_32 $master_path $csr_register_addr 0x00280000
+
+    # Length should be 128 bit aligned
+    set csr_register_addr [expr {$EGRESS_SGDMA_DESCRIPTOR_ADDR + 0x8}]
+    master_write_32 $master_path $csr_register_addr $size
+
+    # Queue descriptor (Writing 0x8000_0000)
+    set csr_register_addr [expr {$EGRESS_SGDMA_DESCRIPTOR_ADDR + 0xc}]
+    master_write_32 $master_path $csr_register_addr 0x80000000
+}
+
+
+# Adding descriptor to ingress streaming mSGDMA
+# Queues one ingress (on-chip memory at 0x00200000 -> coreDLA) transfer of
+# 'size' bytes, then sets the descriptor GO bit -- this starts streaming.
+proc queue_ingress_descriptor {master_path size} {
+    global INGRESS_SGDMA_DESCRIPTOR_ADDR
+
+    # Source addr
+    master_write_32 $master_path $INGRESS_SGDMA_DESCRIPTOR_ADDR 0x00200000
+
+    # Transfer length in bytes (input size)
+    set csr_register_addr [expr {$INGRESS_SGDMA_DESCRIPTOR_ADDR + 0x8}]
+    master_write_32 $master_path $csr_register_addr $size
+
+    # Queue descriptor
+    set csr_register_addr [expr {$INGRESS_SGDMA_DESCRIPTOR_ADDR + 0xc}]
+    master_write_32 $master_path $csr_register_addr 0x80000000
+}
+
+
+# Read output from on-chip memory
+# Dumps 'size' bytes of inference output (from 0x00280000) to output_file.
+proc read_output {master_path output_file size} {
+    master_read_to_file $master_path $output_file 0x00280000 $size
+}
+
+
+# Verify the DLA completion counter CSR (offset 0x224) equals the expected
+# iteration count; raises a Tcl error on mismatch.
+# (The previous comment here, "Read output from on-chip memory", was a
+# copy-paste from read_output above and did not describe this proc.)
+proc check_inference_count {master_path iteration} {
+    global DLA_IP_0_CSR_ADDR
+    # Completion counter assert from index
+    set completion_counter_assert 0x00000000
+    set completion_counter_assert [expr {$completion_counter_assert + $iteration}]
+    set formatted_counter_assert [format "0x%08X" $completion_counter_assert]
+
+    # Check what completion counter CSR in HW is set to
+    set csr_register_addr [expr {$DLA_IP_0_CSR_ADDR + 0x224}]
+    set completion_counter_result [master_read_32 $master_path $csr_register_addr 1]
+    puts "Completion counter from HW: $completion_counter_result"
+    if {$completion_counter_result != $formatted_counter_assert} {
+        error "Error: completion counter should be equal to $formatted_counter_assert but instead is $completion_counter_result"
+    }
+}
+
+
+# Write one partition of input data to a temporary binary file named
+# "chunk_<index>.bin" and return that file name.
+proc create_input_bin {partition_data index} {
+    set chunk_name "chunk_$index.bin"
+    set out [open $chunk_name "wb"]
+    fconfigure $out -translation binary
+    puts -nonewline $out $partition_data
+    close $out
+    return $chunk_name
+}
+
+
+# This process polls a register and returns if assertion is true within a timeout window
+# Reads the register every 10 ms, up to 3000 attempts (~30 s).  On timeout
+# the last value read is printed and the script exits with status 1.
+proc poll_register {master_path register_addr register_val_assert} {
+    # Set timeout to be 30 seconds (in centi-seconds)
+    set timeout_count 3000
+    while {$timeout_count > 0} {
+        set register_val [master_read_32 $master_path $register_addr 1]
+        if {$register_val == $register_val_assert} {
+            break
+        }
+        set timeout_count [expr {$timeout_count - 1}]
+        after 10
+    }
+    if {$timeout_count == 0} {
+        puts "Register polling timeout. CSR addr: $register_addr = $register_val \nRegister should be = $register_val_assert"
+        exit 1
+    }
+}
+
+
+# Printing usage process
+# Prints the expected command line and exits with status 1.
+proc print_usage {} {
+    puts "Usage: system-console --script system_console_script.tcl <input.bin file> <# of inferences>
+    <output channels> <output height> <output width>"
+    exit 1
+}
+
+
+# Main function: validate arguments, stream the input image into ingress
+# on-chip memory (chunked to INGRESS_ON_CHIP_MEMORY_SIZE_BYTES), run the
+# requested number of inferences, and drain each result from egress
+# on-chip memory into output<i>.bin.
+proc main {argc argv} {
+ global INGRESS_ON_CHIP_MEMORY_SIZE_BYTES
+ global EGRESS_ON_CHIP_MEMORY_SIZE_BYTES
+ global AXI_STREAM_DATA_WIDTH_BYTES
+ global INGRESS_SGDMA_DESCRIPTOR_ADDR
+ global EGRESS_SGDMA_DESCRIPTOR_ADDR
+ global INGRESS_SGDMA_CSR_ADDR
+ global EGRESS_SGDMA_CSR_ADDR
+
+ # Check if the script should display help information
+ if {$argc > 0} {
+ set firstArg [lindex $argv 0]
+ if {[string equal $firstArg "help"] || [string equal $firstArg "--help"] || [string equal $firstArg "-help"]} {
+ print_usage
+ }
+ }
+
+ # Check the total number of arguments
+ if {$argc != 5} {
+ print_usage
+ }
+
+ # Setting script arguments to variables
+ set input_file [lindex $argv 0]
+ set num_inferences [lindex $argv 1]
+ set C [lindex $argv 2]
+ set H [lindex $argv 3]
+ set W [lindex $argv 4]
+
+ # Validating script arguments. Return input file size in bytes
+ set file_size [validate_args $input_file $num_inferences]
+ set file_size_hex [format "0x%X" $file_size]
+
+ # Calculate # of AXI transfers from FPGA AI Suite IP output streamer in bytes
+ # NOTE(review): the helper's name is spelled "calulate_..." where it is defined.
+ set output_streamer_transfers [calulate_egress_axi_transfers $C $H $W]
+
+ puts "\nInput file provided: $input_file and is of size $file_size_hex bytes"
+ puts "Number of inferences: $num_inferences"
+
+ # Claim service path to System Console
+ set mpath [lindex [get_service_paths master] 0]
+ set master_path [claim_service master $mpath ""]
+
+ puts "\n________________________________________________________________________________"
+ puts " STARTING FPGA AI SUITE INFERENCE "
+ puts "________________________________________________________________________________\n"
+
+ # Assert resetn using source/probe IP
+ assert_reset
+ # Initialize coreDLA's CSR registers
+ initialize_coredla $master_path
+
+ # Run num_inferences passes; the input file is re-read on every pass.
+
+ for {set i 1} {$i <= $num_inferences} {incr i} {
+ # Open input file per iteration due to the potential partitioning in the case where input file > INGRESS_ON_CHIP_MEMORY_SIZE_BYTES.
+ set input_fh [open $input_file "rb"]
+ fconfigure $input_fh -translation binary
+
+ # Create an output file every iteration of inferences
+ set combined_fh [open "output$i.bin" "wb"]
+ fconfigure $combined_fh -translation binary
+
+ # Logic to ensure input image can fully fit into ingress on-chip memory
+ # If not, must partition input data into chunks at a time. This allows us to queue
+ # descriptors for partial input sizes.
+ set num_input_partition [expr {int(($file_size + $INGRESS_ON_CHIP_MEMORY_SIZE_BYTES - 1) / $INGRESS_ON_CHIP_MEMORY_SIZE_BYTES)}]
+ for {set j 0} {$j < $num_input_partition} {incr j} {
+ set offset [expr {$j * $INGRESS_ON_CHIP_MEMORY_SIZE_BYTES}]
+ set size [
+ expr {($file_size - $offset) < $INGRESS_ON_CHIP_MEMORY_SIZE_BYTES ? ($file_size - $offset) : $INGRESS_ON_CHIP_MEMORY_SIZE_BYTES}
+ ]
+
+ # Seek to the offset and read the chunk
+ # Need to catch an error if offset > file size
+ if {[catch {seek $input_fh $offset} err]} {
+ puts "Error seeking to offset $offset: $err"
+ close $input_fh
+ exit 1
+ }
+
+ # Begin partitioning the input data to INGRESS_ON_CHIP_MEMORY_SIZE_BYTES chunks
+ set partition_data [read $input_fh $size]
+ set partition_data_file_name [create_input_bin $partition_data $j]
+ stage_input $partition_data_file_name $master_path
+ queue_ingress_descriptor $master_path $size
+ file delete $partition_data_file_name
+
+ # Poll SGDMA register to check if input data streaming is complete
+ set sgdma_csr_assert 0x00000002
+ poll_register $master_path $INGRESS_SGDMA_CSR_ADDR $sgdma_csr_assert
+ }
+
+ close $input_fh
+
+ # Logic to ensure output inference results can fully fit into egress on-chip memory
+ # If not, must partition output data into chunks at a time. This allows us to queue
+ # descriptors for partial output sizes.
+ set num_output_partition [expr {int(($output_streamer_transfers + $EGRESS_ON_CHIP_MEMORY_SIZE_BYTES - 1) / $EGRESS_ON_CHIP_MEMORY_SIZE_BYTES)}]
+ for {set j 0} {$j < $num_output_partition} {incr j} {
+ set offset [expr {$j * $EGRESS_ON_CHIP_MEMORY_SIZE_BYTES}]
+ set size [
+ expr {($output_streamer_transfers - $offset) < $EGRESS_ON_CHIP_MEMORY_SIZE_BYTES ? ($output_streamer_transfers - $offset) : $EGRESS_ON_CHIP_MEMORY_SIZE_BYTES}
+ ]
+ # Queue chunks of EGRESS_ON_CHIP_MEMORY_SIZE_BYTES at a time to ensure a fit in egress on-chip memory
+ queue_egress_descriptor $master_path $size
+
+ # Poll SGDMA register to check if output data streaming is complete
+ set sgdma_csr_assert 0x00000002
+ poll_register $master_path $EGRESS_SGDMA_CSR_ADDR $sgdma_csr_assert
+
+ # Write a partition of the inference result to the partition file
+ set output_file "partition_out_$j.bin"
+ read_output $master_path $output_file $size
+
+ # Open partitioned output inference result
+ set bin_fh [open $output_file "rb"]
+ fconfigure $bin_fh -translation binary
+ set bin_data [read $bin_fh]
+
+ # Append smaller partition of inference result to larger output$i.bin file for inference iteration
+ puts -nonewline $combined_fh $bin_data
+ close $bin_fh
+ file delete $output_file
+ }
+ # Ensure inference count has gone up
+ check_inference_count $master_path $i
+ close $combined_fh
+ }
+
+ puts "\n$num_inferences inferences successfully completed"
+}
+
+# Script entry point: forward System Console's argc/argv to main.
+main $argc $argv \ No newline at end of file
diff --git a/python/openvino/runtime/streaming/ed0_streaming_example/system_console_script_perf.tcl b/python/openvino/runtime/streaming/ed0_streaming_example/system_console_script_perf.tcl
new file mode 100644
index 0000000..f0cd5f7
--- /dev/null
+++ b/python/openvino/runtime/streaming/ed0_streaming_example/system_console_script_perf.tcl
@@ -0,0 +1,190 @@
+# Pulse the design's reset via the source/probe (ISSP) IP: drive the
+# source output low, then high (the 0 -> 1 sequence releases the design
+# from reset -- presumably resetn is active-low; confirm against the IP).
+# Removed the unused issp_index local.
+proc assert_reset {} {
+    set issp [lindex [get_service_paths issp] 0]
+    set claimed_issp [claim_service issp $issp mylib]
+    set source_data 0x0
+    issp_write_source_data $claimed_issp $source_data
+    set source_data 0x1
+    issp_write_source_data $claimed_issp $source_data
+}
+
+# Initializing coreDLA (register map: fpga/csr/rtl/inc/dla_csr_constants.svh)
+# NOTE(review): the register offsets below come from the CSR map cited
+# above -- re-verify them whenever the IP configuration changes.
+proc initialize_coredla {master_path} {
+ master_write_32 $master_path 0x00038220 0
+ master_write_32 $master_path 0x00038204 0
+ master_write_32 $master_path 0x00038200 3
+ # Writing 0x1 to this register will instruct DLA to accept input until register is cleared
+ master_write_32 $master_path 0x0003822c 1
+
+ # Reset egress descriptor
+ master_write_32 $master_path 0x00030044 0x2
+ # Stop the descriptor
+ master_write_32 $master_path 0x00030044 0x20
+
+ # Reset ingress descriptor
+ master_write_32 $master_path 0x00030004 0x2
+ # Stop the descriptor (cleared again by start_stream)
+ master_write_32 $master_path 0x00030004 0x20
+}
+
+# Start both streaming mSGDMA dispatchers by clearing their control
+# registers (removes the 0x20 stop bit written by initialize_coredla).
+proc start_stream {master_path} {
+ # Start the egress descriptor
+ master_write_32 $master_path 0x00030044 0x00
+
+ # Start the ingress descriptor
+ master_write_32 $master_path 0x00030004 0x00
+}
+
+# Raise an error if either the egress or the ingress descriptor FIFO
+# reports full (bit 0x4 of its status register). Both registers are read
+# before either is tested, matching the original read ordering.
+proc check_descriptor_buffer_full {master_path} {
+    set egress_status [master_read_32 $master_path 0x00030040 1]
+    set ingress_status [master_read_32 $master_path 0x00030000 1]
+
+    foreach status [list $egress_status $ingress_status] side {Egress Ingress} {
+        if {[expr {$status & 0x4}] != 0} {
+            error "$side descriptor is full."
+        }
+    }
+}
+
+# Stage the input image: copy input_file into the ingress on-chip memory
+# at 0x00200000 (the source address used by queue_ingress_descriptor).
+proc stage_input {input_file master_path} {
+ # Initializing rom with input image
+ master_write_from_file $master_path $input_file 0x00200000
+}
+
+# Adding descriptor to egress streaming mSGDMA
+# NOTE(review): 0xA800 is this example's hard-coded output size in bytes;
+# it must stay in sync with the read size in read_last_output.
+proc queue_egress_descriptor {master_path} {
+ # Destination addr
+ master_write_32 $master_path 0x00030064 0x00280000
+ # Length should be 128 bit aligned
+ master_write_32 $master_path 0x00030068 0xA800
+ # Queue descriptor
+ master_write_32 $master_path 0x0003006c 0x80000000
+}
+
+# Adding descriptor to ingress streaming mSGDMA
+# NOTE(review): 0x17A00 is this example's hard-coded input size in bytes;
+# it must stay in sync with copy_input_for_validation and the staged file.
+proc queue_ingress_descriptor {master_path} {
+ # Source addr (where stage_input placed the image)
+ master_write_32 $master_path 0x00030020 0x00200000
+ # Transfer length in bytes (input size)
+ master_write_32 $master_path 0x00030028 0x17A00
+ # Queue descriptor
+ master_write_32 $master_path 0x0003002c 0x80000000
+}
+
+# Read the staged input back out of on-chip memory into input.bin so it
+# can be compared against the original file for validation.
+proc copy_input_for_validation {master_path} {
+ master_read_to_file $master_path input.bin 0x00200000 0x17A00
+}
+
+# Read the inference cycle counters and report throughput and latency.
+# There is an assumption here that the clk_ddr is attached to 100MHz.
+proc get_performance {master_path num_inferences} {
+    # Each 64-bit counter is exposed as a lo/hi pair of 32-bit CSR reads.
+    set active_clk_lo [master_read_32 $master_path 0x00038240 1]
+    set active_clk_hi [master_read_32 $master_path 0x00038244 1]
+    set total_active_clk_count [expr { $active_clk_lo | ($active_clk_hi << 32) }]
+    set active_clk_count_per_inference [expr {$total_active_clk_count / $num_inferences}]
+    # BUGFIX: the counter is a plain decimal Tcl integer; prefixing it with
+    # a literal "0x" printed a bogus value. Format it as hex explicitly.
+    puts "Total active clk cycles: [format 0x%llX $total_active_clk_count]"
+
+    set all_active_clk_lo [master_read_32 $master_path 0x00038248 1]
+    set all_active_clk_hi [master_read_32 $master_path 0x0003824c 1]
+    set all_active_clk_count [expr { $all_active_clk_lo | ($all_active_clk_hi << 32) }]
+    set all_active_clk_count_per_inference [expr {$all_active_clk_count / $num_inferences}]
+
+    set core_active_clk_lo [master_read_32 $master_path 0x0003827c 1]
+    set core_active_clk_hi [master_read_32 $master_path 0x00038280 1]
+    set total_core_active_clk_count [expr { $core_active_clk_lo | ($core_active_clk_hi << 32) }]
+    puts "Total core active clk cycles (without input and output streamer): [format 0x%llX $total_core_active_clk_count]"
+
+    set clk_period [expr { 1.0 / 100000000.0 }]
+    set final_fps [expr { 1 / ($clk_period * $active_clk_count_per_inference) }]
+    # BUGFIX: latency is time per inference (cycles * period); the original
+    # computed the reciprocal (a rate) and never printed it.
+    set final_latency_ms [expr { $clk_period * $all_active_clk_count_per_inference * 1000.0 }]
+
+    puts "--------------------------------------------------------"
+    puts "Final Throughput: $final_fps fps assuming 100MHz clk_ddr"
+    puts "Average Latency: $final_latency_ms ms assuming 100MHz clk_ddr"
+}
+
+# Block until the hardware completion counter at CSR 0x00038224 equals
+# num_inferences, polling every 10 ms. 3000 polls x 10 ms = 30 s timeout.
+proc wait_for_completion_counter {master_path num_inferences} {
+    for {set remaining 3000} {$remaining > 0} {incr remaining -1} {
+        if {[master_read_32 $master_path 0x00038224 1] == $num_inferences} {
+            return
+        }
+        after 10
+    }
+    error "Timeout hit at 30 seconds. Increase the timeout if the inference is expected to take longer."
+}
+
+# Read output from on-chip memory into output0.bin, after verifying the
+# hardware completion counter matches the expected inference count.
+proc read_last_output {master_path num_inference} {
+ # Completion counter assert value formed from the inference index
+ set completion_counter_assert 0x00000000
+ set completion_counter_assert [expr {$completion_counter_assert + $num_inference}]
+ set formatted_counter_assert [format "0x%08X" $completion_counter_assert]
+
+ # Check what completion counter CSR in HW is set to
+ set completion_counter_result [master_read_32 $master_path 0x00038224 1]
+ puts "Completion counter from HW: $completion_counter_result"
+ # Tcl's == compares numerically, so the 0x-formatted assert value still
+ # matches a decimal counter reading.
+ if {$completion_counter_result == $formatted_counter_assert} {
+ # 0xA800 must match the egress length in queue_egress_descriptor
+ master_read_to_file $master_path output0.bin 0x00280000 0xA800
+ } else {
+ error "Error: completion counter should be equal to $formatted_counter_assert but instead is $completion_counter_result"
+ }
+}
+# This design example has a limit to ingress on-chip memory size in bytes
+set INGRESS_ON_CHIP_MEMORY_SIZE_BYTES 524288
+
+# Main Function: single argument is the input .bin file
+if {$argc != 1} {
+ error "Usage: system-console --script system_console_script_perf.tcl <input.bin file>"
+}
+set input_file [lindex $argv 0]
+puts "Input file provided: $input_file"
+
+set file_size [file size $input_file]
+
+# Make sure the input file can fit into on-chip memory
+if {$file_size > $INGRESS_ON_CHIP_MEMORY_SIZE_BYTES} {
+ puts "Input file '$input_file' is too large to fully fit into on-chip memory of size
+ $INGRESS_ON_CHIP_MEMORY_SIZE_BYTES bytes.\nThe `system_console_script.tcl` file will
+ partition the input file for partial transfers to solve this problem but it should not
+ be used for performance testing. Please increase the on-chip memory size for performance
+ testing.\n"
+ exit 1
+}
+
+# Claim the first master service path from System Console
+set mpath [lindex [get_service_paths master] 0]
+set master_path [claim_service master $mpath ""]
+
+# Assert resetn using source/probe IP
+assert_reset
+# Stage input file into on-chip memory
+stage_input $input_file $master_path
+# Initialize coreDLA's CSR registers
+initialize_coredla $master_path
+
+# Number of inferences cannot exceed the descriptor queue FIFO size
+set num_inferences 32
+for {set i 1} {$i <= $num_inferences} {incr i} {
+ check_descriptor_buffer_full $master_path
+ # Queue egress descriptor into mSGDMA
+ queue_egress_descriptor $master_path
+ # Queue ingress descriptor into mSGDMA
+ queue_ingress_descriptor $master_path
+}
+
+start_stream $master_path
+wait_for_completion_counter $master_path $num_inferences
+get_performance $master_path $num_inferences
+read_last_output $master_path $num_inferences
+
+puts "\n$num_inferences inferences successfully completed"
diff --git a/python/openvino/runtime/streaming/image_streaming_app/CMakeLists.txt b/python/openvino/runtime/streaming/image_streaming_app/CMakeLists.txt
new file mode 100644
index 0000000..4ac6627
--- /dev/null
+++ b/python/openvino/runtime/streaming/image_streaming_app/CMakeLists.txt
@@ -0,0 +1,36 @@
+# Copyright 2023 Intel Corporation
+
+# This software and the related documents are Intel copyrighted materials,
+# and your use of them is governed by the express license under which they
+# were provided to you ("License"). Unless the License provides otherwise,
+# you may not use, modify, copy, publish, distribute, disclose or transmit
+# this software or the related documents without Intel's prior written
+# permission.
+
+# This software and the related documents are provided as is, with no express
+# or implied warranties, other than those that are expressly stated in the
+# License.
+
+project(image_streaming_app)
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED True)
+
+# Sources for the streaming demo application.
+# NOTE(review): ${header_files} is not defined in this file -- presumably
+# inherited from the parent scope; confirm it is set before this
+# directory is added.
+set(all_files ${header_files}
+ command_line.cpp
+ command_line.h
+ image_streaming_app.cpp
+ image_streaming_app.h
+ raw_image.cpp
+ raw_image.h
+ bmp_file.cpp
+ bmp_file.h
+ float16.h)
+
+# Targets
+add_executable(${PROJECT_NAME} ${all_files})
+
+add_subdirectory(uio)
+add_subdirectory(layout_transform)
+
+# NOTE(review): only layout_transform is linked; uio is built above but
+# not linked here -- presumably pulled in transitively; verify.
+target_link_libraries(${PROJECT_NAME} layout_transform)
diff --git a/python/openvino/runtime/streaming/image_streaming_app/bmp_file.cpp b/python/openvino/runtime/streaming/image_streaming_app/bmp_file.cpp
new file mode 100644
index 0000000..6502897
--- /dev/null
+++ b/python/openvino/runtime/streaming/image_streaming_app/bmp_file.cpp
@@ -0,0 +1,277 @@
+// Copyright 2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#include "bmp_file.h"
+#include <cstdlib>
+#include <cstring>
+#include <fstream>
+
+// Construct and immediately load the file. The success flag returned by
+// LoadFile is intentionally discarded: callers detect failure by the
+// resulting data being empty.
+BmpFile::BmpFile(const std::string& filename, bool planarBGR) {
+    static_cast<void>(LoadFile(filename, planarBGR));
+}
+
+// Load a BMP file into _data as BGR pixel data.
+//
+// Handles 1/8/16/24/32 bpp and palettized images: 16 bpp (X1R5G5B5) and
+// 1 bpp are expanded to 32 bpp, palette indices are expanded through the
+// palette to 32 bpp, and bottom-up images are flipped to top-down. When
+// planarBGR is true the output is three planes (all B, all G, all R);
+// otherwise it is 4 bytes per pixel with byte order reversed to A,R,G,B
+// (alpha forced to 0 when expanding from 24 bpp).
+//
+// Returns false on any I/O or format error; _data is then unspecified.
+bool BmpFile::LoadFile(const std::string& filename, bool planarBGR) {
+    std::ifstream inputFile(filename, std::fstream::binary);
+    // BUGFIX: a failed open sets failbit, not badbit, so bad() let a
+    // nonexistent file slip through. Check the whole stream state.
+    if (!inputFile) {
+        return false;
+    }
+
+    // Read signature ("BM", little-endian)
+    uint16_t fileSignature = 0;
+    if (!inputFile.read((char*)&fileSignature, sizeof(fileSignature))) {
+        return false;
+    }
+
+    if (fileSignature != 0x4d42) {
+        return false;
+    }
+
+    // Read file size
+    uint32_t fileSize = 0;
+    if (!inputFile.read((char*)&fileSize, sizeof(fileSize))) {
+        return false;
+    }
+
+    if (fileSize > (8192 * 4320 * 3)) {  // Check excessive file size
+        return false;
+    }
+
+    // Reserved
+    uint32_t unused = 0;
+    if (!inputFile.read((char*)&unused, sizeof(unused))) {
+        return false;
+    }
+
+    // Read data offset
+    uint32_t dataOffset = 0;
+    if (!inputFile.read((char*)&dataOffset, sizeof(dataOffset))) {
+        return false;
+    }
+
+    if ((dataOffset >= fileSize) or (dataOffset == 0)) {
+        return false;
+    }
+
+    // Read bitmap header
+    BitmapHeader infoHeader{};
+    if (!inputFile.read((char*)&infoHeader, sizeof(infoHeader))) {
+        return false;
+    }
+
+    // Accept the classic info header plus V4/V5 (their extra fields are
+    // skipped by seeking to dataOffset below).
+    uint32_t headerSize = sizeof(infoHeader);
+    uint32_t header4Size = 108;  // sizeof(BITMAPV4HEADER);
+    uint32_t header5Size = 124;  // sizeof(BITMAPV5HEADER);
+    if ((infoHeader._size != headerSize) and (infoHeader._size != header4Size) and (infoHeader._size != header5Size)) {
+        return false;
+    }
+
+    // Palette size: explicit count, or 2^bpp for sub-16 bpp images that
+    // leave _colorUsed at 0 (1 bpp is expanded separately below).
+    int palletteSize = infoHeader._colorUsed;
+    std::vector<uint32_t> pallette;
+    if ((infoHeader._bitCount < 16) and (infoHeader._colorUsed == 0) and (infoHeader._bitCount != 1)) {
+        palletteSize = 1 << infoHeader._bitCount;
+    }
+
+    if (palletteSize > 0) {
+        // V3 palette follows the header, 4 bytes per entry.
+        pallette.resize(palletteSize);
+        // BUGFIX: read the palette size in BYTES. pallette.size() is the
+        // entry count, so only a quarter of the palette was read before.
+        if (!inputFile.read((char*)pallette.data(), pallette.size() * sizeof(uint32_t))) {
+            return false;
+        }
+    }
+
+    inputFile.seekg(dataOffset);
+
+    uint32_t height = static_cast<uint32_t>(std::abs(infoHeader._height));
+    size_t dataSize = static_cast<size_t>(infoHeader._sizeImage);
+    uint32_t nPixels = height * static_cast<uint32_t>(infoHeader._width);
+
+    if (infoHeader._bitCount == 32) {
+        dataSize = height * infoHeader._width * 4;
+    } else if (infoHeader._bitCount == 16) {
+        dataSize = height * infoHeader._width * 2;
+    } else if (infoHeader._bitCount == 8) {
+        if (dataSize == 0) dataSize = height * infoHeader._width;  // 8 bit data - through pallette
+    } else {
+        // 24 bpp rows are padded to a 4-byte boundary.
+        // BUGFIX: the pad must be applied to the row size in BYTES
+        // ((width*3 + 3) & ~3); rounding the PIXEL count up with
+        // (width + 4) & ~3 over-estimated each row and made the data
+        // read below fail for valid images whose width % 4 != 0.
+        uint32_t line_bytes = infoHeader._width * 3;
+        if ((infoHeader._bitCount == 24) and ((line_bytes % 4) != 0)) {
+            line_bytes = (line_bytes + 3) & ~3;
+        }
+        dataSize = height * line_bytes;
+    }
+
+    std::vector<uint8_t> temporaryBuffer;
+    bool useTemporaryBuffer = (infoHeader._bitCount == 16) or (infoHeader._bitCount == 1) or (palletteSize > 0);
+
+    if (useTemporaryBuffer) {
+        temporaryBuffer.resize(dataSize);
+        if (!inputFile.read((char*)temporaryBuffer.data(), dataSize)) return false;
+    } else {
+        _data.resize(dataSize);
+        if (!inputFile.read((char*)_data.data(), dataSize)) return false;
+    }
+
+    if (infoHeader._bitCount == 16) {
+        // Expand X1R5G5B5 to 32 bpp; each 5-bit channel is scaled by 8.
+        int inputStride = infoHeader._sizeImage / height;
+
+        dataSize = nPixels * 4;
+        _data.resize(dataSize);
+        uint32_t* pOutputScan = reinterpret_cast<uint32_t*>(_data.data());
+
+        for (uint32_t y = 0; y < height; y++) {
+            uint8_t* pInputLineStart = temporaryBuffer.data() + (y * inputStride);
+            uint16_t* pInputScan = (uint16_t*)pInputLineStart;
+
+            for (int x = 0; x < infoHeader._width; x++) {
+                uint16_t inputValue = *pInputScan++;
+                uint32_t r = ((inputValue & 0x7C00) >> 10) * 8;
+                uint32_t g = ((inputValue & 0x3E0) >> 5) * 8;
+                uint32_t b = ((inputValue & 0x1f) * 8);
+
+                *pOutputScan++ = 0xff000000 | r << 16 | g << 8 | b;
+            }
+        }
+
+        infoHeader._bitCount = 32;
+    } else if (infoHeader._bitCount == 1) {
+        // Expand 1 bpp (MSB-first) to 32 bpp black/white.
+        int inputStride = infoHeader._sizeImage / height;
+
+        dataSize = nPixels * 4;
+        _data.resize(dataSize);
+        uint32_t* pOutputScan = reinterpret_cast<uint32_t*>(_data.data());
+
+        for (uint32_t y = 0; y < height; y++) {
+            uint8_t* pInputLineStart = temporaryBuffer.data() + (y * inputStride);
+            uint8_t* pInputScan = pInputLineStart;
+
+            uint16_t inputValue = *pInputScan++;
+            for (int x = 0; x < infoHeader._width; x++) {
+                int bit = x % 8;
+                if (bit == 0) {
+                    inputValue = *pInputScan++;
+                }
+
+                int bit_mask = 1 << (7 - bit);
+
+                if ((inputValue & bit_mask) == 0)
+                    *pOutputScan++ = 0xff000000;
+                else
+                    *pOutputScan++ = 0xffffffff;
+            }
+        }
+
+        infoHeader._bitCount = 32;
+    }
+
+    if (palletteSize > 0) {
+        // Palettized image: expand each index through the palette.
+        _data.resize(dataSize * sizeof(uint32_t));
+        uint32_t* pOutputScan = reinterpret_cast<uint32_t*>(_data.data());
+        infoHeader._bitCount = 32;  // pretend we're now 32 bits as that is the palette's format
+        for (size_t i = 0; i < dataSize; i++) {
+            // NOTE(review): indices are not range-checked against the
+            // palette size -- a malformed file can read out of bounds.
+            *pOutputScan++ = pallette[temporaryBuffer[i]];
+        }
+    }
+
+    _height = height;
+    _width = infoHeader._width;
+    _bitsPerPixel = infoHeader._bitCount;
+
+    uint32_t lineLengthBytes = (_width * _bitsPerPixel) / 8;
+
+    // Row stride: pad 24 bpp rows to 4 bytes. (+4 & ~3 equals +3 & ~3
+    // here because the guard guarantees lineLengthBytes % 4 != 0.)
+    if ((_bitsPerPixel == 24) and ((lineLengthBytes % 4) != 0)) {
+        _stride = (lineLengthBytes + 4) & ~3;
+    } else {
+        _stride = lineLengthBytes;
+    }
+
+    _upsideDown = (infoHeader._height > 0);
+
+    // BMP channel order is BGR, as required by ResNet; rows are stored
+    // bottom-up when _height is positive, so flip to top-down.
+    if (_upsideDown) {
+        std::vector<uint8_t> flippedData(_data.size());
+        for (uint32_t y = 0; y < _height; y++) {
+            uint8_t* pDestinationLine = flippedData.data() + (y * _stride);
+            uint8_t* pSourceLine = _data.data() + ((_height - y - 1) * _stride);
+
+            std::memcpy(pDestinationLine, pSourceLine, _stride);
+        }
+
+        _data = flippedData;
+    }
+
+    if (planarBGR) {
+        // Split interleaved BGR(A) into B, G and R planes.
+        // NOTE(review): this walk assumes rows carry no padding bytes;
+        // for padded 24 bpp rows (width % 4 != 0) the pad is consumed as
+        // pixel data -- confirm callers only feed unpadded/32 bpp images.
+        uint32_t channelSize = _width * _height;
+        std::vector<uint8_t> planarData(_data.size());
+        uint8_t* pBPlane = planarData.data();
+        uint8_t* pGPlane = pBPlane + channelSize;
+        uint8_t* pRPlane = pGPlane + channelSize;
+        uint8_t* pInputBGR = _data.data();
+
+        for (uint32_t i = 0; i < channelSize; i++) {
+            *pBPlane++ = *pInputBGR++;
+            *pGPlane++ = *pInputBGR++;
+            *pRPlane++ = *pInputBGR++;
+
+            // Skip alpha channel
+            if (infoHeader._bitCount == 32) {
+                pInputBGR++;
+            }
+        }
+
+        _data = planarData;
+    } else {
+        uint32_t channelSize = _width * _height;
+
+        // Must be 32bpp
+        if (infoHeader._bitCount == 32) {
+            // Swap endianness: B,G,R,A -> A,R,G,B in place.
+            uint8_t* pInputBGR = _data.data();
+
+            for (uint32_t i = 0; i < channelSize; i++) {
+                uint8_t b = pInputBGR[0];
+                uint8_t g = pInputBGR[1];
+                uint8_t r = pInputBGR[2];
+                uint8_t a = pInputBGR[3];
+
+                pInputBGR[0] = a;
+                pInputBGR[1] = r;
+                pInputBGR[2] = g;
+                pInputBGR[3] = b;
+
+                pInputBGR += 4;
+            }
+        } else {
+            // Expand 24 bpp to 4 bytes/pixel with zero alpha, in the same
+            // reversed A,R,G,B byte order as above.
+            std::vector<uint8_t> newData(channelSize * 4);
+            uint8_t* pInputBGR = _data.data();
+            uint8_t* pOutputBGRA = newData.data();
+            for (uint32_t i = 0; i < channelSize; i++) {
+                uint8_t b = pInputBGR[0];
+                uint8_t g = pInputBGR[1];
+                uint8_t r = pInputBGR[2];
+
+                pOutputBGRA[0] = 0;
+                pOutputBGRA[1] = r;
+                pOutputBGRA[2] = g;
+                pOutputBGRA[3] = b;
+
+                pInputBGR += 3;
+                pOutputBGRA += 4;
+            }
+
+            _data = newData;
+        }
+    }
+
+    return true;
+}
diff --git a/python/openvino/runtime/streaming/image_streaming_app/bmp_file.h b/python/openvino/runtime/streaming/image_streaming_app/bmp_file.h
new file mode 100644
index 0000000..95ab306
--- /dev/null
+++ b/python/openvino/runtime/streaming/image_streaming_app/bmp_file.h
@@ -0,0 +1,47 @@
+// Copyright 2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#pragma once
+#include <cstdint>
+#include <string>
+#include <vector>
+
+// On-disk BMP info header (BITMAPINFOHEADER layout). It is read directly
+// from the file, so field order and widths must match the file format.
+struct BitmapHeader {
+ uint32_t _size;  // header size in bytes; 40, 108 (V4) or 124 (V5) accepted
+ int32_t _width;
+ int32_t _height;  // positive => rows stored bottom-up (flipped on load)
+ uint16_t _planes;
+ uint16_t _bitCount;  // bits per pixel; 1/8/16/24/32 handled by LoadFile
+ uint32_t _compression;
+ uint32_t _sizeImage;  // pixel data size in bytes; may be 0 for 8 bpp
+ int32_t _xPixelsPerMeter;
+ int32_t _yPixelsPerMeter;
+ uint32_t _colorUsed;  // palette entries; 0 => 2^bitCount for < 16 bpp
+ uint32_t _colorImportant;
+};
+
+// Loads a BMP image at construction time and exposes the converted BGR
+// pixel data (see LoadFile in bmp_file.cpp for the conversions applied).
+class BmpFile {
+ public:
+  // Load `filename`; if planarBGR the pixels are split into B/G/R
+  // planes, otherwise they are emitted 4 bytes per pixel. A load
+  // failure leaves the object empty (GetData().empty()).
+  BmpFile(const std::string& filename, bool planarBGR);
+  std::vector<uint8_t>& GetData() { return _data; }
+  // Width * height in pixels (0 until a file loads successfully).
+  uint32_t GetNumPixels() const { return (_width * _height); }
+
+ private:
+  bool LoadFile(const std::string& filename, bool planarBGR);
+  std::vector<uint8_t> _data;
+  uint32_t _width = 0;
+  uint32_t _height = 0;
+  uint32_t _bitsPerPixel = 0;
+  uint32_t _stride = 0;
+  bool _upsideDown = false;
+};
diff --git a/python/openvino/runtime/streaming/image_streaming_app/command_line.cpp b/python/openvino/runtime/streaming/image_streaming_app/command_line.cpp
new file mode 100644
index 0000000..794310b
--- /dev/null
+++ b/python/openvino/runtime/streaming/image_streaming_app/command_line.cpp
@@ -0,0 +1,72 @@
+// Copyright 2021-2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#include "command_line.h"
+#include <algorithm>
+
+// Strip leading and trailing whitespace (space, LF, CR, tab) in place.
+// A string containing only whitespace becomes empty.
+static void TrimString(std::string& trimString) {
+    const char* whitespace = " \n\r\t";
+    trimString.erase(0, trimString.find_first_not_of(whitespace));
+    trimString.erase(trimString.find_last_not_of(whitespace) + 1);
+}
+
+// Lower-case stringValue in place.
+// BUGFIX: passing a possibly-negative char straight to ::tolower is
+// undefined behavior (e.g. UTF-8 bytes where char is signed); route the
+// value through unsigned char first.
+static void MakeLower(std::string& stringValue) {
+    std::transform(stringValue.begin(), stringValue.end(), stringValue.begin(),
+                   [](unsigned char ch) { return static_cast<char>(::tolower(ch)); });
+}
+
+// Program -option=value
+// Parse argv into the option->value map. Arguments must start with '-'
+// or '/'; anything else is ignored. Option names are trimmed and
+// lower-cased, values only trimmed; a later duplicate overwrites.
+CommandLine::CommandLine(int argumentCount, char* argumentValues[]) {
+    if (argumentCount > 0) _executableName = argumentValues[0];
+
+    for (int argIndex = 1; argIndex < argumentCount; argIndex++) {
+        std::string token(argumentValues[argIndex]);
+        std::string lead = token.substr(0, 1);
+        if ((lead != "-") and (lead != "/")) {
+            continue;
+        }
+
+        token = token.substr(1);
+        std::string option = token;
+        std::string value;
+        size_t equalsPos = token.find("=");
+        if (equalsPos != std::string::npos) {
+            option = token.substr(0, equalsPos);
+            value = token.substr(equalsPos + 1);
+        }
+
+        TrimString(option);
+        TrimString(value);
+        MakeLower(option);
+        _optionMap[option] = value;
+    }
+}
+
+// Return the value stored for optionName, or "" when the option is absent.
+std::string CommandLine::GetOptionValue(const char* optionName) {
+    auto match = _optionMap.find(optionName);
+    return (match == _optionMap.end()) ? std::string{} : match->second;
+}
+
+// True when optionName was supplied on the command line.
+bool CommandLine::HaveOption(const char* optionName) {
+    return _optionMap.count(optionName) != 0;
+}
+
+// Copy optionName's value into optionValue; returns false (leaving
+// optionValue untouched) when the option is absent.
+bool CommandLine::GetOption(const char* optionName, std::string& optionValue) {
+    auto match = _optionMap.find(optionName);
+    if (match == _optionMap.end()) {
+        return false;
+    }
+    optionValue = match->second;
+    return true;
+}
+
+// Number of parsed options (the executable name is not counted).
+size_t CommandLine::NumOptions() {
+    return _optionMap.size();
+}
diff --git a/python/openvino/runtime/streaming/image_streaming_app/command_line.h b/python/openvino/runtime/streaming/image_streaming_app/command_line.h
new file mode 100644
index 0000000..41b12f0
--- /dev/null
+++ b/python/openvino/runtime/streaming/image_streaming_app/command_line.h
@@ -0,0 +1,31 @@
+// Copyright 2021-2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#pragma once
+#include <string>
+#include <unordered_map>
+
+// Parses "-option=value" style command-line arguments into a lookup map.
+// Option names are lower-cased when stored, so queries should use
+// lower-case names (see command_line.cpp for the parsing rules).
+class CommandLine {
+ public:
+ CommandLine(int argumentCount, char* argumentValues[]);
+
+ // Value for optionName, or "" when absent.
+ std::string GetOptionValue(const char* optionName);
+ // Copies the value into optionValue; returns false when absent.
+ bool GetOption(const char* optionName, std::string& optionValue);
+ // True when the option was supplied.
+ bool HaveOption(const char* optionName);
+ std::string GetExecutableName() { return _executableName; }
+ size_t NumOptions();
+
+ private:
+ std::string _executableName;
+ std::unordered_map<std::string, std::string> _optionMap;
+};
diff --git a/python/openvino/runtime/streaming/image_streaming_app/float16.h b/python/openvino/runtime/streaming/image_streaming_app/float16.h
new file mode 100644
index 0000000..07b8ece
--- /dev/null
+++ b/python/openvino/runtime/streaming/image_streaming_app/float16.h
@@ -0,0 +1,204 @@
+// Copyright 2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+/*
+Copyright (C) 2009 Apple Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/*
+ * This file helps with conversion to/from fp16. The contents of this file are duplicated
+ * in at least four places, just to ensure they are not accidentally lost:
+ *
+ * 1) runtime/plugin/io_transformations/dlia_fp.hpp.
+ * 2) hps/apps/coredla-test-harness/float16.h (Intel internal)
+ * 3) runtime/streaming/image_streaming_app/float16.h
+ * 4) util/inc/dla_element.h (internal)
+ *
+ * The algorithm for conversion to fp16 is described in this 2008 paper by Jeroen van der Zijp
+ * http://www.fox-toolkit.org/ftp/fasthalffloatconversion.pdf
+ *
+ * We received this code, I am told, by way of OpenVINO. As best as I can infer from
+ * the copyright header and available bits of discussion on the web, OpenVINO seems to have
+ * copied this from Webkit.
+ */
+
+#pragma once
+
+#include <cstdint>
+#include <cstring>
+
// Fast float32 -> float16 converters.
//
// Float16(float) uses Jeroen van der Zijp's table-based scheme
// (http://www.fox-toolkit.org/ftp/fasthalffloatconversion.pdf);
// f32tof16_OpenVino() is the bitwise routine used by OpenVINO.
// The two are NOT bit-identical for every input (rounding differs).
class Float16 {
 public:
  // Table-based conversion; the result is stored in _uintValue and readable
  // via operator uint16_t().
  Float16(float value) {
    // This method is used by the CoreDLA plugin, it is not binary equivalent
    // to f32tof16_OpenVino (below) used by OpenVino.
    // memcpy is the well-defined way to read a float's bit pattern (reading
    // the inactive member of a union is formally undefined behavior).
    uint32_t floatBits = 0;
    std::memcpy(&floatBits, &value, sizeof(floatBits));

    // Sign + exponent (top 9 bits) select the table entries; the mantissa is
    // shifted into the f16 mantissa field.
    uint32_t shiftIndex = (floatBits >> 23) & 0x1ff;
    _uintValue = base[shiftIndex] + ((floatBits & 0x007fffff) >> shift[shiftIndex]);
  }

  Float16() = default;
  operator uint16_t() const { return _uintValue; }
  uint16_t _uintValue = 0;  // zero-initialized (was indeterminate when default-constructed)

  // The lookup tables are static so a Float16 is 2 bytes instead of each
  // instance carrying its own ~1.5 KB copy of immutable data.
  static constexpr uint16_t base[512] = {
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 1, 2, 4, 8, 16, 32, 64, 128, 256,
      512, 1024, 2048, 3072, 4096, 5120, 6144, 7168, 8192, 9216, 10240, 11264, 12288, 13312, 14336, 15360,
      16384, 17408, 18432, 19456, 20480, 21504, 22528, 23552, 24576, 25600, 26624, 27648, 28672, 29696, 30720, 31744,
      31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744,
      31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744,
      31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744,
      31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744,
      31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744,
      31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744,
      31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744, 31744,
      32768, 32768, 32768, 32768, 32768, 32768, 32768, 32768, 32768, 32768, 32768, 32768, 32768, 32768, 32768, 32768,
      32768, 32768, 32768, 32768, 32768, 32768, 32768, 32768, 32768, 32768, 32768, 32768, 32768, 32768, 32768, 32768,
      32768, 32768, 32768, 32768, 32768, 32768, 32768, 32768, 32768, 32768, 32768, 32768, 32768, 32768, 32768, 32768,
      32768, 32768, 32768, 32768, 32768, 32768, 32768, 32768, 32768, 32768, 32768, 32768, 32768, 32768, 32768, 32768,
      32768, 32768, 32768, 32768, 32768, 32768, 32768, 32768, 32768, 32768, 32768, 32768, 32768, 32768, 32768, 32768,
      32768, 32768, 32768, 32768, 32768, 32768, 32768, 32768, 32768, 32768, 32768, 32768, 32768, 32768, 32768, 32768,
      32768, 32768, 32768, 32768, 32768, 32768, 32768, 32769, 32770, 32772, 32776, 32784, 32800, 32832, 32896, 33024,
      33280, 33792, 34816, 35840, 36864, 37888, 38912, 39936, 40960, 41984, 43008, 44032, 45056, 46080, 47104, 48128,
      49152, 50176, 51200, 52224, 53248, 54272, 55296, 56320, 57344, 58368, 59392, 60416, 61440, 62464, 63488, 64512,
      64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512,
      64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512,
      64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512,
      64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512,
      64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512,
      64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512,
      64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512, 64512};

  static constexpr uint8_t shift[512] = {
      24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
      24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
      24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
      24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 23, 22, 21, 20, 19,
      18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
      13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
      24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
      24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
      24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
      24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
      24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
      24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
      24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
      24, 24, 24, 24, 24, 24, 24, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13,
      13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24,
      24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
      24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
      24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
      24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13};

// Function to convert F32 into F16
// F32: exp_bias:127 SEEEEEEE EMMMMMMM MMMMMMMM MMMMMMMM.
// F16: exp_bias:15 SEEEEEMM MMMMMMMM
#define EXP_MASK_F32 0x7F800000U
#define EXP_MASK_F16 0x7C00U

  // small helper function to represent uint32_t value as float32
  // (static: uses no instance state)
  static inline float asfloat(uint32_t v) {
    // memcpy instead of a union: defined behavior, identical codegen.
    float f = 0.0f;
    std::memcpy(&f, &v, sizeof(f));
    return f;
  }

  // OpenVINO-compatible conversion: round-to-nearest, NaN/Inf preserved,
  // values above the f16 range clamp to max f16, denormal results flush
  // to signed zero. Static: uses no instance state.
  static uint16_t f32tof16_OpenVino(float x) {
    // create minimal positive normal f16 value in f32 format
    // exp:-14,mantissa:0 -> 2^-14 * 1.0
    static float min16 = asfloat((127 - 14) << 23);

    // create maximal positive normal f16 value in f32 and f16 formats
    // exp:15,mantissa:11111 -> 2^15 * 1.(11111)
    static float max16 = asfloat(((127 + 15) << 23) | 0x007FE000);
    static uint32_t max16f16 = ((15 + 15) << 10) | 0x3FF;

    // define and declare variable for intermediate and output result
    // the union is used to simplify representation changing
    union {
      float f;
      uint32_t u;
    } v;
    v.f = x;

    // get sign in 16bit format
    uint32_t s = (v.u >> 16) & 0x8000;  // sign 16: 00000000 00000000 10000000 00000000

    // make it abs
    v.u &= 0x7FFFFFFF;  // abs mask: 01111111 11111111 11111111 11111111

    // check NAN and INF
    if ((v.u & EXP_MASK_F32) == EXP_MASK_F32) {
      if (v.u & 0x007FFFFF) {
        return s | (v.u >> (23 - 10)) | 0x0200;  // return NAN f16
      } else {
        return s | EXP_MASK_F16;  // return INF f16
      }
    }

    // to make f32 round to nearest f16
    // create halfULP for f16 and add it to origin value
    float halfULP = asfloat(v.u & EXP_MASK_F32) * asfloat((127 - 11) << 23);
    v.f += halfULP;

    // if input value is not fit normalized f16 then return 0
    // denormals are not covered by this code and just converted to 0
    if (v.f < min16 * 0.5F) {
      return s;
    }

    // if input value between min16/2 and min16 then return min16
    if (v.f < min16) {
      return s | (1 << 10);
    }

    // if input value more than maximal allowed value for f16
    // then return this maximal value
    if (v.f >= max16) {
      return max16f16 | s;
    }

    // change exp bias from 127 to 15
    v.u -= ((127 - 15) << 23);

    // round to f16
    v.u >>= (23 - 10);

    return v.u | s;
  }
};
diff --git a/python/openvino/runtime/streaming/image_streaming_app/image_streaming_app.cpp b/python/openvino/runtime/streaming/image_streaming_app/image_streaming_app.cpp
new file mode 100644
index 0000000..bf4cb2b
--- /dev/null
+++ b/python/openvino/runtime/streaming/image_streaming_app/image_streaming_app.cpp
@@ -0,0 +1,306 @@
+// Copyright 2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
#include "image_streaming_app.h"

#include <fcntl.h>
#include <signal.h>
#include <unistd.h>

#include <algorithm>
#include <filesystem>
#include <fstream>
#include <iostream>
#include <string>
#include <thread>

#include "raw_image.h"
+
+int main(int numParams, char* paramValues[]) {
+ ImageStreamingApp imageStreamingApp(numParams, paramValues);
+ imageStreamingApp.Run();
+ return 0;
+}
+
// Set by the SIGINT handler (and by Run() when done) to stop all loops.
// NOTE(review): volatile is not a synchronization primitive; for a flag
// written from a signal handler, std::sig_atomic_t or std::atomic<bool>
// would be the safe type - confirm before relying on cross-thread ordering.
volatile bool ImageStreamingApp::_shutdownEvent;
+
+ImageStreamingApp::ImageStreamingApp(int numParams, char* paramValues[]) : _commandLine(numParams, paramValues) {
+ std::string imagesFolder;
+ if (_commandLine.GetOption("images_folder", imagesFolder))
+ _imageFilesFolder = imagesFolder;
+ else
+ _imageFilesFolder = "./";
+
+ std::string imageFile;
+ if (_commandLine.GetOption("image", imageFile)) {
+ _numToSend = 1;
+ _imageFile = imageFile;
+ }
+
+ std::string nSendStr;
+ if (_commandLine.GetOption("send", nSendStr)) _numToSend = std::strtoul(nSendStr.c_str(), 0, 0);
+
+ std::string rateStr;
+ if (_commandLine.GetOption("rate", rateStr)) _sendRate = std::strtoul(rateStr.c_str(), 0, 0);
+
+ _dumpTransformedImages = _commandLine.HaveOption("dump");
+ _runLayoutTransform = _commandLine.HaveOption("layout_transform");
+
+ _ltConfiguration._width = GetUintOption("width", 224);
+ _ltConfiguration._height = GetUintOption("height", 224);
+ _ltConfiguration._cVector = GetUintOption("c_vector", 32);
+ _ltConfiguration._blueVariance = GetFloatOption("blue_variance", 1.0f);
+ _ltConfiguration._greenVariance = GetFloatOption("green_variance", 1.0f);
+ _ltConfiguration._redVariance = GetFloatOption("red_variance", 1.0f);
+ _ltConfiguration._blueShift = GetFloatOption("blue_shift", -103.94f);
+ _ltConfiguration._greenShift = GetFloatOption("green_shift", -116.78f);
+ _ltConfiguration._redShift = GetFloatOption("red_shift", -123.68f);
+
+ signal(SIGINT, SigIntHandler);
+}
+
// Top-level flow: print help, program the layout-transform hardware, load
// images (or just dump their transformed form), wait for the inference app
// to become ready, then stream images at the configured rate until the send
// count is reached or Ctrl+C sets _shutdownEvent.
void ImageStreamingApp::Run() {
  // NOTE(review): the help lookup keeps one leading dash ("-help"), which
  // suggests CommandLine strips a single dash and this matches "--help";
  // confirm against CommandLine's constructor.
  if (_commandLine.HaveOption("-help")) {
    std::cout << "Usage:\n";
    std::cout << " image_streaming_app [Options]\n";
    std::cout << "\nOptions:\n";
    std::cout << "-images_folder=folder Location of bitmap files. Defaults to working folder.\n";
    std::cout << "-image=path Location of a single bitmap file for single inference.\n";
    std::cout << "-send=n Number of images to stream. Default is 1 if -image is set, otherwise infinite.\n";
    std::cout << "-rate=n Rate to stream images, in Hz. n is an integer. Default is 30.\n";
    std::cout << "-width=n Image width in pixels, default = 224\n";
    std::cout << "-height=n Image height in pixels, default = 224\n";
    std::cout << "-c_vector=n C vector size, default = 32\n";
    std::cout << "-blue_variance=n Blue variance, default = 1.0\n";
    std::cout << "-green_variance=n Green variance, default = 1.0\n";
    std::cout << "-red_variance=n Red variance, default = 1.0\n";
    std::cout << "-blue_shift=n Blue shift, default = -103.94\n";
    std::cout << "-green_shift=n Green shift, default -116.78\n";
    std::cout << "-red_shift=n Red shift, default = -123.68\n";
    return;
  }

  // Program the transform hardware before any image is prepared.
  if (not ProgramLayoutTransform()) {
    return;
  }

  if (not LoadImageFiles(_dumpTransformedImages)) {
    return;
  }

  // -dump only writes transformed images to disk; nothing is streamed.
  if (_dumpTransformedImages) {
    return;
  }

  // Do not start streaming until the consumer signals readiness.
  if (not WaitForInferenceApp())
    return;

  // Start event signal thread
  auto sendImageEventThreadCB = [this]() { RunSendImageSignalThread(); };
  std::thread sendImageEventThread(sendImageEventThreadCB);
  uint32_t sentCount = 0;

  while (not _shutdownEvent) {
    // Wait for the send image event
    _sendNextImageEvent.Wait();

    if (not SendNextImage()) {
      // A failed write aborts the whole session.
      _shutdownEvent = true;
      break;
    }
    sentCount++;

    // _numToSend == 0 means stream until interrupted.
    if ((_numToSend > 0) and (sentCount >= _numToSend)) {
      _shutdownEvent = true;
      break;
    }
  }

  // Wait for signalling thread to finish
  sendImageEventThread.join();
}
+
+bool ImageStreamingApp::LoadImageFiles(bool dumpLayoutTransform) {
+ if (not _imageFile.empty()) {
+ std::filesystem::path filePath(_imageFile);
+ std::string extension = filePath.extension();
+ std::transform(extension.begin(), extension.end(), extension.begin(), ::tolower);
+ if ((extension == ".bmp") or (extension == ".raw") or (extension == ".lt")) {
+ auto spRawImage = std::make_shared<RawImage>(filePath, _runLayoutTransform, _ltConfiguration);
+ if (spRawImage->IsValid()) {
+ _images.push_back(spRawImage);
+
+ if (dumpLayoutTransform and _runLayoutTransform) {
+ spRawImage->DumpLayoutTransform();
+ }
+ } else {
+ std::cout << "Unsupported image: " << filePath << '\n';
+ }
+ }
+ } else {
+ for (const auto& entry : std::filesystem::directory_iterator(_imageFilesFolder)) {
+ std::string filename = entry.path();
+ std::filesystem::path filePath(filename);
+ std::string extension = filePath.extension();
+ std::transform(extension.begin(), extension.end(), extension.begin(), ::tolower);
+ if ((extension == ".bmp") or (extension == ".raw") or (extension == ".lt")) {
+ auto spRawImage = std::make_shared<RawImage>(filePath, _runLayoutTransform, _ltConfiguration);
+ _images.push_back(spRawImage);
+
+ if (dumpLayoutTransform and _runLayoutTransform) {
+ spRawImage->DumpLayoutTransform();
+ }
+
+ // Don't load any more than we need to send
+ if (_images.size() == _numToSend) {
+ break;
+ }
+ }
+ }
+ }
+
+ std::cout << "Loaded " << _images.size() << " image";
+ if (_images.size() > 1) {
+ std::cout << "s";
+ }
+ std::cout << '\n';
+ return not _images.empty();
+}
+
+bool ImageStreamingApp::OpenMsgDmaStream() {
+ if (_msgDmaStream) {
+ return true;
+ }
+
+ constexpr const char* msgdmaFilename = "/dev/msgdma_stream0";
+ _msgDmaStream = ::fopen(msgdmaFilename, "w+");
+ if (_msgDmaStream == NULL) {
+ std::cout << "Failed to open" << '\n';
+ return false;
+ }
+
+ // Turn off output buffering
+ setvbuf(_msgDmaStream, NULL, _IONBF, 0);
+
+ return true;
+}
+
+void ImageStreamingApp::CloseMsgDmaStream() {
+ if (_msgDmaStream) {
+ ::fclose(_msgDmaStream);
+ _msgDmaStream = nullptr;
+ }
+}
+
+bool ImageStreamingApp::SendNextImage() {
+ size_t nImages = _images.size();
+ if (nImages == 0) {
+ return false;
+ }
+
+ if (not _msgDmaStream) {
+ if (not OpenMsgDmaStream()) {
+ return false;
+ }
+ }
+
+ std::shared_ptr<RawImage> uploadImage = _images[_nextImageIndex];
+
+ // Move to next index for next time
+ _nextImageIndex = (_nextImageIndex + 1) % nImages;
+ _sentCount++;
+
+ char* pBuffer = reinterpret_cast<char*>(uploadImage->GetData());
+ size_t bufferSize = uploadImage->GetSize();
+
+ std::cout << _sentCount << " Send image " << uploadImage->Filename() << " size = " << bufferSize;
+
+ size_t nWritten = ::fwrite(pBuffer, 1, bufferSize, _msgDmaStream);
+ bool ok = (nWritten == bufferSize);
+ if (ok) {
+ std::cout << '\n';
+ } else {
+ std::cout << " failed\n";
+ }
+
+ return ok;
+}
+
+void ImageStreamingApp::RunSendImageSignalThread() {
+ int64_t microSeconds = 1000000 / _sendRate;
+ if (_sendRate == 59) {
+ microSeconds = 16683; // 59.94 Hz
+ }
+
+ while (not _shutdownEvent) {
+ std::this_thread::sleep_for(std::chrono::microseconds(microSeconds));
+ _sendNextImageEvent.Set();
+ }
+}
+
+bool ImageStreamingApp::ProgramLayoutTransform() {
+ auto spLayoutTransform = ILayoutTransform::Create();
+ spLayoutTransform->SetConfiguration(_ltConfiguration);
+ return true;
+}
+
+uint32_t ImageStreamingApp::GetUintOption(const char* optionName, uint32_t defaultValue) {
+ std::string optionValue;
+ if (_commandLine.GetOption(optionName, optionValue)) {
+ return std::strtoul(optionValue.c_str(), nullptr, 0);
+ } else {
+ return defaultValue;
+ }
+}
+
+float ImageStreamingApp::GetFloatOption(const char* optionName, float defaultValue) {
+ std::string optionValue;
+ if (_commandLine.GetOption(optionName, optionValue)) {
+ return std::strtof(optionValue.c_str(), nullptr);
+ } else {
+ return defaultValue;
+ }
+}
+
+void ImageStreamingApp::SigIntHandler(int) {
+ std::cout << "\nShutting down application\n";
+ _shutdownEvent = true;
+}
+
+bool ImageStreamingApp::WaitForInferenceApp() {
+ bool isReady = false;
+ bool firstTime = true;
+ sem_t* pSemaphore = ::sem_open("/CoreDLA_ready_for_streaming", O_CREAT, 0644, 0);
+ if (!pSemaphore) {
+ return isReady;
+ }
+
+ while (not _shutdownEvent) {
+ // Don't use a wait timeout because we need to break
+ // if the user presses Ctrl+C
+ timespec waitTimeout = {};
+ int r = ::sem_timedwait(pSemaphore, &waitTimeout);
+ if (r == 0) {
+ isReady = true;
+ ::sem_post(pSemaphore);
+ break;
+ }
+
+ if (firstTime) {
+ firstTime = false;
+ std::cout << "Waiting for streaming_inference_app to become ready." << std::endl;
+ }
+
+ std::this_thread::sleep_for(std::chrono::milliseconds(100));
+ }
+
+ ::sem_close(pSemaphore);
+
+ return isReady;
+}
diff --git a/python/openvino/runtime/streaming/image_streaming_app/image_streaming_app.h b/python/openvino/runtime/streaming/image_streaming_app/image_streaming_app.h
new file mode 100644
index 0000000..0693ef8
--- /dev/null
+++ b/python/openvino/runtime/streaming/image_streaming_app/image_streaming_app.h
@@ -0,0 +1,79 @@
+// Copyright 2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#pragma once
+#include <condition_variable>
+#include <filesystem>
+#include <vector>
+#include <semaphore.h>
+#include "ILayoutTransform.h"
+#include "command_line.h"
+
+using namespace std::chrono_literals;
+
+class RawImage;
+
// Auto-reset event: Set() latches a signal that releases one current or
// future Wait() call.
//
// Fix: the original waited on the condition variable with no predicate, so
// a Set() issued before Wait() was lost (potential deadlock) and a spurious
// wakeup released Wait() early; IsSignalled() likewise could not observe a
// prior Set(). A flag guarded by the mutex fixes all three.
class Event {
 public:
  // Block until the event is signalled, then consume the signal.
  void Wait() {
    std::unique_lock<std::mutex> lock(_signalMutex);
    _conditionVariable.wait(lock, [this] { return _signalled; });
    _signalled = false;  // auto-reset: each Set() releases one Wait()
  }

  // Signal the event, releasing a pending (or the next) Wait().
  void Set() {
    {
      std::lock_guard<std::mutex> lock(_signalMutex);
      _signalled = true;
    }
    _conditionVariable.notify_all();
  }

  // Non-blocking query: signalled and not yet consumed by a Wait()?
  bool IsSignalled() {
    std::lock_guard<std::mutex> lock(_signalMutex);
    return _signalled;
  }

 private:
  std::mutex _signalMutex;
  std::condition_variable _conditionVariable;
  bool _signalled = false;  // guarded by _signalMutex
};
+
// Streams pre-loaded images to the CoreDLA inference pipeline over the
// mSGDMA character device at a fixed rate, driven by command-line options.
class ImageStreamingApp {
 public:
  ImageStreamingApp(int numParams, char* paramValues[]);
  // Runs the full streaming session; returns when done or after Ctrl+C.
  void Run();

 private:
  // Programs the layout-transform hardware with _ltConfiguration.
  bool ProgramLayoutTransform();
  // Writes the next image (round-robin) to the mSGDMA stream.
  bool SendNextImage();
  // Loads .bmp/.raw/.lt images from -image or -images_folder.
  bool LoadImageFiles(bool dumpLayoutTransform);
  // Thread body: signals _sendNextImageEvent at _sendRate Hz.
  void RunSendImageSignalThread();
  // SIGINT handler: requests shutdown via _shutdownEvent.
  static void SigIntHandler(int);
  // Command-line helpers returning defaultValue when the option is absent.
  uint32_t GetUintOption(const char* optionName, uint32_t defaultValue);
  float GetFloatOption(const char* optionName, float defaultValue);
  // Lazy open/close of the mSGDMA character device.
  bool OpenMsgDmaStream();
  void CloseMsgDmaStream();
  // Blocks until the inference app posts its "ready" named semaphore.
  bool WaitForInferenceApp();

  CommandLine _commandLine;
  std::filesystem::path _imageFilesFolder;  // folder scanned for images
  std::string _imageFile;                   // single image (-image), if any
  std::vector<std::shared_ptr<RawImage>> _images;
  Event _sendNextImageEvent;  // paced by RunSendImageSignalThread
  // Written from the SIGINT handler; see the note at its definition.
  static volatile bool _shutdownEvent;
  size_t _nextImageIndex = 0;
  uint32_t _numToSend = 0;  // 0 = stream indefinitely
  uint32_t _sendRate = 30;  // images per second
  uint32_t _sentCount = 0;
  bool _dumpTransformedImages = false;
  bool _runLayoutTransform = true;
  FILE* _msgDmaStream = nullptr;  // opened lazily by OpenMsgDmaStream
  ILayoutTransform::Configuration _ltConfiguration = {};
};
diff --git a/python/openvino/runtime/streaming/image_streaming_app/layout_transform/CMakeLists.txt b/python/openvino/runtime/streaming/image_streaming_app/layout_transform/CMakeLists.txt
new file mode 100644
index 0000000..ecbffca
--- /dev/null
+++ b/python/openvino/runtime/streaming/image_streaming_app/layout_transform/CMakeLists.txt
@@ -0,0 +1,35 @@
+# Copyright 2023 Intel Corporation
+#
+# This software and the related documents are Intel copyrighted materials,
+# and your use of them is governed by the express license under which they
+# were provided to you ("License"). Unless the License provides otherwise,
+# you may not use, modify, copy, publish, distribute, disclose or transmit
+# this software or the related documents without Intel's prior written
+# permission.
+#
+# This software and the related documents are provided as is, with no express
+# or implied warranties, other than those that are expressly stated in the
+# License.
+
# Static library wrapping the layout-transform hardware (UIO) driver.
project(layout_transform)

set(header_files "")
set(source_files "")

# specify the C++ standard
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED True)

# Header files (listed so IDE project generators display them)
set(header_files ${header_files} "include/ILayoutTransform.h")
set(header_files ${header_files} "source/LayoutTransform.h")

# Source files
set(source_files ${source_files} "source/LayoutTransform.cpp")

set(all_files ${header_files} ${source_files})

add_library(${PROJECT_NAME} STATIC ${all_files})
# Only include/ is public; source/ headers stay internal.
target_include_directories(${PROJECT_NAME} PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include")
# uio provides the device register access used by LayoutTransform.cpp.
target_link_libraries(${PROJECT_NAME} uio)
+
diff --git a/python/openvino/runtime/streaming/image_streaming_app/layout_transform/include/ILayoutTransform.h b/python/openvino/runtime/streaming/image_streaming_app/layout_transform/include/ILayoutTransform.h
new file mode 100644
index 0000000..f54f9d5
--- /dev/null
+++ b/python/openvino/runtime/streaming/image_streaming_app/layout_transform/include/ILayoutTransform.h
@@ -0,0 +1,38 @@
+// Copyright 2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+
// Abstract interface to the layout-transform hardware block that converts
// raw images into the channel layout the DLA expects.
class ILayoutTransform {
 public:
  // Values programmed into the transform hardware: output geometry, channel
  // vectorization, and per-channel scale (variance) / offset (shift).
  // Fix: members now carry zero defaults so a default-constructed
  // Configuration is fully determinate instead of holding garbage that
  // could be written to device registers.
  class Configuration {
   public:
    uint32_t _width = 0;
    uint32_t _height = 0;
    uint32_t _cVector = 0;
    float _blueVariance = 0.0f;
    float _greenVariance = 0.0f;
    float _redVariance = 0.0f;
    float _blueShift = 0.0f;
    float _greenShift = 0.0f;
    float _redShift = 0.0f;
  };

  virtual ~ILayoutTransform() {}
  // Applies the configuration (the concrete class programs the device).
  virtual void SetConfiguration(Configuration& configuration) = 0;

  // Factory returning the concrete hardware-backed implementation.
  static std::shared_ptr<ILayoutTransform> Create();
};
diff --git a/python/openvino/runtime/streaming/image_streaming_app/layout_transform/source/LayoutTransform.cpp b/python/openvino/runtime/streaming/image_streaming_app/layout_transform/source/LayoutTransform.cpp
new file mode 100644
index 0000000..a045b6a
--- /dev/null
+++ b/python/openvino/runtime/streaming/image_streaming_app/layout_transform/source/LayoutTransform.cpp
@@ -0,0 +1,51 @@
+// Copyright 2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
#include "LayoutTransform.h"

#include <cstring>
#include <thread>
+
+std::shared_ptr<ILayoutTransform> ILayoutTransform::Create() { return std::make_shared<LayoutTransform>(); }
+
+LayoutTransform::LayoutTransform() { _spUioDevice = UIO::IDevice::Load("layout_transform"); }
+
// Reinterpret a float's bit pattern as a uint32_t for register programming.
// Fix: memcpy is the well-defined way to type-pun in C++ (reading the
// inactive member of a union is undefined behavior); compilers fold the
// call to a single move instruction.
static uint32_t FloatToUint32(float value) {
  static_assert(sizeof(uint32_t) == sizeof(float), "bit-pattern copy requires equal sizes");
  uint32_t bits = 0;
  std::memcpy(&bits, &value, sizeof(bits));
  return bits;
}
+
// Cache the configuration and program it into the device registers. A null
// UIO handle (device not found at construction) makes this a silent no-op.
void LayoutTransform::SetConfiguration(Configuration& configuration) {
  _configuration = configuration;

  if (_spUioDevice) {
    // Pulse the reset register before reprogramming; the 1 ms pause
    // presumably gives the hardware time to settle - TODO confirm against
    // the block's reset timing requirements.
    _spUioDevice->Write((uint32_t)RegisterMap::RESET, 1u);
    std::this_thread::sleep_for(std::chrono::milliseconds(1));
    _spUioDevice->Write((uint32_t)RegisterMap::RESET, 0u);

    // Geometry registers.
    _spUioDevice->Write((uint32_t)RegisterMap::C_VECT, _configuration._cVector);
    _spUioDevice->Write((uint32_t)RegisterMap::WIDTH, _configuration._width);
    _spUioDevice->Write((uint32_t)RegisterMap::HEIGHT, _configuration._height);

    // Per-channel scale factors as raw float bit patterns, order B, G, R.
    _spUioDevice->Write((uint32_t)RegisterMap::VARIANCES + 0, FloatToUint32(_configuration._blueVariance));
    _spUioDevice->Write((uint32_t)RegisterMap::VARIANCES + 1, FloatToUint32(_configuration._greenVariance));
    _spUioDevice->Write((uint32_t)RegisterMap::VARIANCES + 2, FloatToUint32(_configuration._redVariance));

    // Per-channel offsets, same order.
    _spUioDevice->Write((uint32_t)RegisterMap::SHIFTS + 0, FloatToUint32(_configuration._blueShift));
    _spUioDevice->Write((uint32_t)RegisterMap::SHIFTS + 1, FloatToUint32(_configuration._greenShift));
    _spUioDevice->Write((uint32_t)RegisterMap::SHIFTS + 2, FloatToUint32(_configuration._redShift));
  }
}
diff --git a/python/openvino/runtime/streaming/image_streaming_app/layout_transform/source/LayoutTransform.h b/python/openvino/runtime/streaming/image_streaming_app/layout_transform/source/LayoutTransform.h
new file mode 100644
index 0000000..3d67a5d
--- /dev/null
+++ b/python/openvino/runtime/streaming/image_streaming_app/layout_transform/source/LayoutTransform.h
@@ -0,0 +1,38 @@
+// Copyright 2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#pragma once
+#include <memory>
+#include "ILayoutTransform.h"
+#include "IUioDevice.h"
+
// 32-bit register offsets within the layout-transform UIO device.
// NOTE(review): semantics inferred from names and usage in
// LayoutTransform.cpp - confirm against the hardware register map.
enum class RegisterMap : uint32_t {
  RESET = 0,
  C_VECT,
  WIDTH,
  HEIGHT,
  VARIANCES = 0x10,  // to 0x1f (B, G, R written at +0, +1, +2)
  SHIFTS = 0x20,     // to 0x2f (B, G, R written at +0, +1, +2)
};
+
// Concrete ILayoutTransform backed by the "layout_transform" UIO device.
class LayoutTransform : public ILayoutTransform {
 public:
  // Looks up the UIO device; _spUioDevice stays null when it is absent.
  LayoutTransform();

  // ILayoutTransform interface
  // Caches the configuration and programs the device registers
  // (no-op when the UIO device was not found).
  void SetConfiguration(Configuration& configuration) override;

 private:
  ILayoutTransform::Configuration _configuration = {};
  std::shared_ptr<UIO::IDevice> _spUioDevice;  // null if device missing
};
diff --git a/python/openvino/runtime/streaming/image_streaming_app/raw_image.cpp b/python/openvino/runtime/streaming/image_streaming_app/raw_image.cpp
new file mode 100644
index 0000000..bd5658b
--- /dev/null
+++ b/python/openvino/runtime/streaming/image_streaming_app/raw_image.cpp
@@ -0,0 +1,225 @@
+// Copyright 2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#include "raw_image.h"
+#include <algorithm>
+#include <filesystem>
+#include <fstream>
+
+std::vector<int32_t> RawImage::_indexes;
+
+RawImage::RawImage(std::filesystem::path filePath,
+ bool runLayoutTransform,
+ const ILayoutTransform::Configuration& ltConfiguration)
+ : _filePath(filePath), _ltConfiguration(ltConfiguration) {
+ std::string extension = filePath.extension();
+ std::transform(extension.begin(), extension.end(), extension.begin(), ::tolower);
+ if (extension == ".lt") {
+ uintmax_t fileSize = std::filesystem::file_size(filePath);
+ std::ifstream layoutFile(filePath, std::fstream::binary);
+ _layoutTransformData.resize(fileSize / sizeof(uint16_t));
+ layoutFile.read((char*)_layoutTransformData.data(), fileSize);
+ } else {
+ bool planar = runLayoutTransform;
+ _spBmpFile = std::make_shared<BmpFile>(filePath, planar);
+ if (_runLayoutTransform) LayoutTransform(_ltConfiguration);
+ }
+}
+
+uint8_t* RawImage::GetData() {
+ if (_runLayoutTransform)
+ return reinterpret_cast<uint8_t*>(_layoutTransformData.data());
+ else
+ return _spBmpFile->GetData().data();
+}
+
+size_t RawImage::GetSize() {
+ if (_runLayoutTransform)
+ return _layoutTransformData.size() * sizeof(uint16_t);
+ else
+ return _spBmpFile->GetData().size();
+}
+
+bool RawImage::IsValid()
+{
+ constexpr size_t dlaImageSize = 224 * 224 * 4;
+ return (GetSize() == dlaImageSize);
+}
+
+std::vector<uint16_t> RawImage::LayoutTransform(uint32_t width,
+ uint32_t height,
+ const std::vector<uint8_t>& sourceData,
+ const ILayoutTransform::Configuration& ltConfiguration) {
+ uint32_t numPixels = width * height;
+ std::vector<uint16_t> layoutTransformData = LayoutTransform(sourceData, numPixels, ltConfiguration);
+ return layoutTransformData;
+}
+
+void RawImage::LayoutTransform(const ILayoutTransform::Configuration& ltConfiguration) {
+ const std::vector<uint8_t>& sourceData = _spBmpFile->GetData();
+ uint32_t numPixels = _spBmpFile->GetNumPixels();
+ _layoutTransformData = LayoutTransform(sourceData, numPixels, ltConfiguration);
+}
+
// Core software layout transform: takes planar B/G/R 8-bit data, applies the
// per-channel shifts, converts each sample to fp16, then scatters the samples
// into the DLA's expected memory layout using the cached _indexes table.
// sourceData must be planar B,G,R with numPixels samples per plane.
std::vector<uint16_t> RawImage::LayoutTransform(const std::vector<uint8_t>& sourceData,
                                                uint32_t numPixels,
                                                const ILayoutTransform::Configuration& ltConfiguration) {
    // The index table is built once and reused for every image; assumes all
    // images share one configuration (TODO confirm — a config change after
    // the first call is silently ignored).
    if (_indexes.empty()) GenerateLayoutIndexes(ltConfiguration);

    uint32_t numChannels = 3;
    uint32_t numSamples = numPixels * numChannels;

    std::vector<uint16_t> meanAdjustedData(numSamples);
    const uint8_t* pSourceData = sourceData.data();

    // Source planes are laid out back-to-back: B, then G, then R.
    const uint8_t* pBlueSourcePlane = pSourceData;
    const uint8_t* pGreenSourcePlane = pBlueSourcePlane + numPixels;
    const uint8_t* pRedSourcePlane = pGreenSourcePlane + numPixels;

    // First adjust by adding the per-channel shifts (presumably negated
    // channel means — confirm against the model's preprocessing).
    std::vector<float> meanAdjustedFloat32(numSamples);
    float* pBlueFloat32 = &meanAdjustedFloat32[0];
    float* pGreenFloat32 = pBlueFloat32 + numPixels;
    float* pRedFloat32 = pGreenFloat32 + numPixels;

    for (uint32_t i = 0; i < numPixels; i++) {
        *pBlueFloat32++ = static_cast<float>(*pBlueSourcePlane++) + ltConfiguration._blueShift;
        *pGreenFloat32++ = static_cast<float>(*pGreenSourcePlane++) + ltConfiguration._greenShift;
        *pRedFloat32++ = static_cast<float>(*pRedSourcePlane++) + ltConfiguration._redShift;
    }

    // Second pass: narrow each float sample to fp16 (Float16 comes from
    // float16.h — presumably an IEEE half conversion; confirm).
    pBlueFloat32 = &meanAdjustedFloat32[0];
    pGreenFloat32 = pBlueFloat32 + numPixels;
    pRedFloat32 = pGreenFloat32 + numPixels;
    uint16_t* pBlueDestinationPlane = &meanAdjustedData[0];
    uint16_t* pGreenDestinationPlane = pBlueDestinationPlane + numPixels;
    uint16_t* pRedDestinationPlane = pGreenDestinationPlane + numPixels;

    for (uint32_t i = 0; i < numPixels; i++) {
        *pBlueDestinationPlane++ = Float16(*pBlueFloat32++);
        *pGreenDestinationPlane++ = Float16(*pGreenFloat32++);
        *pRedDestinationPlane++ = Float16(*pRedFloat32++);
    }

    // Now map the data to the layout expected by the DLA.
    // _indexes[out] holds the source sample index, or -1 for padding slots.
    size_t nLayoutEntries = _indexes.size();
    std::vector<uint16_t> layoutTransformData(nLayoutEntries);

    for (size_t outputIndex = 0; outputIndex < nLayoutEntries; outputIndex++) {
        int32_t inputIndex = _indexes[outputIndex];
        if (inputIndex >= 0)
            layoutTransformData[outputIndex] = meanAdjustedData[inputIndex];
        else
            layoutTransformData[outputIndex] = 0;
    }

    return layoutTransformData;
}
+
+bool RawImage::DumpLayoutTransform() {
+ if (!_spBmpFile) return false;
+
+ std::filesystem::path filePath(_filePath);
+ filePath.replace_extension("raw");
+ std::ofstream rawRgbaFile(filePath, std::fstream::binary);
+ if (rawRgbaFile.bad()) return false;
+
+ uint32_t numPixels = _spBmpFile->GetNumPixels();
+ uint32_t numChannels = 4;
+ uint32_t numSamples = numPixels * numChannels;
+ std::vector<uint8_t> buffer(numSamples);
+ uint8_t* pSourceData = _spBmpFile->GetData().data();
+
+ uint8_t* pBlueSourcePlane = pSourceData;
+ uint8_t* pGreenSourcePlane = pBlueSourcePlane + numPixels;
+ uint8_t* pRedSourcePlane = pGreenSourcePlane + numPixels;
+ uint8_t* pDestination = buffer.data();
+
+ for (uint32_t i = 0; i < numPixels; i++) {
+ *pDestination++ = *pBlueSourcePlane++;
+ *pDestination++ = *pGreenSourcePlane++;
+ *pDestination++ = *pRedSourcePlane++;
+ *pDestination++ = 0;
+ }
+
+ rawRgbaFile.write((char*)buffer.data(), buffer.size());
+
+ filePath.replace_extension("lt");
+ std::ofstream transformFile(filePath, std::fstream::binary);
+ if (transformFile.bad()) return false;
+
+ transformFile.write((char*)GetData(), GetSize());
+
+ return true;
+}
+
+// Convert from RGBA to planar BGR
+std::vector<uint8_t> RawImage::MakePlanar(uint32_t width, uint32_t height, const std::vector<uint8_t>& data) {
+ uint32_t channelSize = width * height;
+ std::vector<uint8_t> planarData(channelSize * 3);
+ uint8_t* pBPlane = planarData.data();
+ uint8_t* pGPlane = pBPlane + channelSize;
+ uint8_t* pRPlane = pGPlane + channelSize;
+ const uint8_t* pInputRGBA = data.data();
+
+ for (uint32_t i = 0; i < channelSize; i++) {
+ *pRPlane++ = *pInputRGBA++;
+ *pGPlane++ = *pInputRGBA++;
+ *pBPlane++ = *pInputRGBA++;
+
+ // Skip alpha channel
+ uint8_t alpha = *pInputRGBA++;
+ alpha = alpha;
+ }
+
+ return planarData;
+}
+
// Build the output->input sample index table used by LayoutTransform().
// For every (channel, row, column) input sample, compute where it lands in
// the DLA's channel-vectorized layout; entries left at -1 are padding slots
// that the transform fills with zero. The stride/padding variables are fixed
// at 1/0 here, so several terms below degenerate — they are kept in the
// generic form, presumably to mirror the hardware's addressing formula
// (TODO confirm against the DLA layout documentation).
void RawImage::GenerateLayoutIndexes(const ILayoutTransform::Configuration& ltConfiguration) {
    size_t nEntries = ltConfiguration._width * ltConfiguration._height * ltConfiguration._cVector;

    uint32_t c_vector = ltConfiguration._cVector;
    uint32_t width_stride = 1;
    uint32_t height_stride = 1;
    uint32_t input_width = ltConfiguration._width;
    uint32_t input_height = ltConfiguration._height;
    uint32_t input_channels = 3;
    uint32_t output_width = ltConfiguration._width;
    uint32_t output_width_banked = ltConfiguration._width;
    uint32_t output_height = ltConfiguration._height;
    uint32_t pad_left = 0;
    uint32_t pad_top = 0;

    // -1 marks "no source sample" (zero-filled padding in the output).
    _indexes.resize(nEntries, -1);

    for (uint32_t c = 0; c < input_channels; c++) {
        for (uint32_t h = 0; h < input_height; h++) {
            for (uint32_t w = 0; w < input_width; w++) {
                // Output spatial position after striding/padding.
                uint32_t output_w = (w + pad_left) / width_stride;
                uint32_t output_h = (h + pad_top) / height_stride;
                // Depth index folds the channel and the sub-stride phase.
                uint32_t output_d = c * height_stride * width_stride + ((h + pad_top) % height_stride) * width_stride +
                                    (w + pad_left) % width_stride;
                // Split depth into a c_vector group and a lane within it.
                uint32_t output_d_c_vector = output_d / c_vector;
                uint32_t cvec = output_d % c_vector;
                // Linear index into the planar (C,H,W) source data.
                uint32_t inIndex = c * input_height * input_width + h * input_width + w;

                uint32_t outIndex = (output_d_c_vector * output_height * output_width_banked * c_vector) +
                                    (output_h * output_width_banked * c_vector) + (output_w * c_vector) + cvec;

                if ((output_h < output_height) && (output_w < output_width)) {
                    _indexes[outIndex] = static_cast<int32_t>(inIndex);
                }
            }
        }
    }
}
diff --git a/python/openvino/runtime/streaming/image_streaming_app/raw_image.h b/python/openvino/runtime/streaming/image_streaming_app/raw_image.h
new file mode 100644
index 0000000..9cb08b4
--- /dev/null
+++ b/python/openvino/runtime/streaming/image_streaming_app/raw_image.h
@@ -0,0 +1,52 @@
+// Copyright 2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#pragma once
+#include <cstdint>
+#include <filesystem>
+#include <memory>
+#include <vector>
+#include "ILayoutTransform.h"
+#include "bmp_file.h"
+#include "float16.h"
+
// One source image for the streaming demo: either a BMP (optionally run
// through the software layout transform) or a pre-transformed ".lt" dump.
class RawImage {
 public:
    // Load filePath; ".lt" files are loaded verbatim as fp16 transform
    // output, anything else as a BMP. runLayoutTransform selects whether
    // the BMP is transformed with ltConfiguration.
    RawImage(std::filesystem::path filePath,
             bool runLayoutTransform,
             const ILayoutTransform::Configuration& ltConfiguration);
    // Pointer to / byte size of the data to stream to the device.
    uint8_t* GetData();
    size_t GetSize();
    std::string Filename() { return _filePath; }
    // Debug: dump interleaved ".raw" and transformed ".lt" sibling files.
    bool DumpLayoutTransform();
    // Static transform of an arbitrary planar image (width x height).
    static std::vector<uint16_t> LayoutTransform(uint32_t width,
                                                 uint32_t height,
                                                 const std::vector<uint8_t>& data,
                                                 const ILayoutTransform::Configuration& ltConfiguration);
    // Convert interleaved RGBA to planar B,G,R (alpha dropped).
    static std::vector<uint8_t> MakePlanar(uint32_t width, uint32_t height, const std::vector<uint8_t>& data);
    // True when GetSize() matches the DLA's expected input size.
    bool IsValid();

 private:
    // Builds the shared output->input index table from the configuration.
    static void GenerateLayoutIndexes(const ILayoutTransform::Configuration& ltConfiguration);
    // Transforms the loaded BMP into _layoutTransformData.
    void LayoutTransform(const ILayoutTransform::Configuration& ltConfiguration);
    // Core transform: shift, fp16-convert and scatter planar samples.
    static std::vector<uint16_t> LayoutTransform(const std::vector<uint8_t>& sourceData,
                                                 uint32_t numPixels,
                                                 const ILayoutTransform::Configuration& ltConfiguration);

    std::filesystem::path _filePath;              // source file path
    std::shared_ptr<BmpFile> _spBmpFile;          // set only for BMP inputs
    std::vector<uint16_t> _layoutTransformData;   // fp16 transform output
    static std::vector<int32_t> _indexes;         // shared layout index table
    bool _runLayoutTransform = false;             // true when _layoutTransformData is active
    ILayoutTransform::Configuration _ltConfiguration;
};
diff --git a/python/openvino/runtime/streaming/image_streaming_app/uio/CMakeLists.txt b/python/openvino/runtime/streaming/image_streaming_app/uio/CMakeLists.txt
new file mode 100644
index 0000000..4c445a2
--- /dev/null
+++ b/python/openvino/runtime/streaming/image_streaming_app/uio/CMakeLists.txt
@@ -0,0 +1,35 @@
+# Copyright 2023 Intel Corporation
+#
+# This software and the related documents are Intel copyrighted materials,
+# and your use of them is governed by the express license under which they
+# were provided to you ("License"). Unless the License provides otherwise,
+# you may not use, modify, copy, publish, distribute, disclose or transmit
+# this software or the related documents without Intel's prior written
+# permission.
+#
+# This software and the related documents are provided as is, with no express
+# or implied warranties, other than those that are expressly stated in the
+# License.
+
+project(uio)
+
+set(header_files "")
+set(source_files "")
+
+# specify the C++ standard
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED True)
+
+# Header files
+set(header_files ${header_files} "include/IUioDevice.h")
+set(header_files ${header_files} "source/UioDevice.h")
+
+# Source files
+set(source_files ${source_files} "source/UioDevice.cpp")
+
+set(all_files ${header_files} ${source_files})
+
+add_library(${PROJECT_NAME} STATIC ${all_files})
+
+# Include directories
+target_include_directories(${PROJECT_NAME} PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include")
diff --git a/python/openvino/runtime/streaming/image_streaming_app/uio/include/IUioDevice.h b/python/openvino/runtime/streaming/image_streaming_app/uio/include/IUioDevice.h
new file mode 100644
index 0000000..9b964a7
--- /dev/null
+++ b/python/openvino/runtime/streaming/image_streaming_app/uio/include/IUioDevice.h
@@ -0,0 +1,40 @@
+// Copyright 2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
#pragma once
#include <cstdint>
#include <filesystem>
#include <memory>
#include <string>
#include <vector>
+
namespace UIO {

// Identity of one /sys/class/uio entry, as discovered by GetDevices().
class DeviceItem {
 public:
    std::string _name;                // device name without the trailing index digits
    uint32_t _index = 0;              // numeric suffix parsed from the name (BUGFIX: was uninitialized)
    std::string _indexedName;         // full name as reported by the driver
    std::filesystem::path _rootPath;  // sysfs directory for this device
};

// Abstract register/memory access interface to a UIO device.
class IDevice {
 public:
    // BUGFIX: a polymorphic base needs a virtual destructor so that
    // destroying a device through IDevice* runs the derived cleanup
    // (unmapping and closing the descriptor).
    virtual ~IDevice() = default;

    // Find and map the device named deviceName + index; nullptr on failure.
    static std::shared_ptr<IDevice> Load(const std::string& deviceName, uint32_t index = 0);
    // Enumerate all uio* entries under /sys/class/uio.
    static std::vector<DeviceItem> GetDevices();

    virtual uint32_t Read(uint32_t registerIndex) = 0;
    virtual void Write(uint32_t registerIndex, uint32_t value) = 0;
    virtual void ReadBlock(void* host_addr, size_t offset, size_t size) = 0;
    virtual void WriteBlock(const void* host_addr, size_t offset, size_t size) = 0;
};

}  // namespace UIO
diff --git a/python/openvino/runtime/streaming/image_streaming_app/uio/source/UioDevice.cpp b/python/openvino/runtime/streaming/image_streaming_app/uio/source/UioDevice.cpp
new file mode 100644
index 0000000..d1b5d17
--- /dev/null
+++ b/python/openvino/runtime/streaming/image_streaming_app/uio/source/UioDevice.cpp
@@ -0,0 +1,168 @@
+// Copyright 2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#include "UioDevice.h"
+#include <fcntl.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <filesystem>
+#include <fstream>
+
+namespace UIO {
+static const std::string uioDriverFolder = "/sys/class/uio";
+
+std::shared_ptr<IDevice> IDevice::Load(const std::string& deviceName, uint32_t index) {
+ std::vector<DeviceItem> deviceItems = GetDevices();
+ std::string indexedDeviceName = deviceName + std::to_string(index);
+
+ for (auto& deviceItem : deviceItems) {
+ if (deviceItem._indexedName == indexedDeviceName) {
+ auto spUioDevice = std::make_shared<Device>(deviceItem);
+ return spUioDevice->IsValid() ? spUioDevice : nullptr;
+ }
+ }
+
+ return nullptr;
+}
+
+std::vector<DeviceItem> IDevice::GetDevices() {
+ std::vector<DeviceItem> deviceItems;
+
+ for (const auto& entry : std::filesystem::directory_iterator(uioDriverFolder)) {
+ // Filter out uio*
+ if (entry.is_directory()) {
+ std::filesystem::path filePath = entry.path();
+ std::string stem = filePath.filename();
+ if (stem.substr(0, 3) == "uio") {
+ std::string indexedDeviceName = Device::ReadStringFromFile(filePath / "name");
+ if (not indexedDeviceName.empty()) {
+ std::string deviceName;
+ uint32_t index = 0;
+ Device::SplitIndexedDeviceName(indexedDeviceName, deviceName, index);
+ deviceItems.push_back({deviceName, index, indexedDeviceName, filePath});
+ }
+ }
+ }
+ }
+
+ return deviceItems;
+}
+
///////////////////////////////////////////////////////////////////////////

// Map the device's register region on construction; the constructor cannot
// fail directly — callers must check IsValid() afterwards (Load() does).
Device::Device(const DeviceItem& deviceItem) : _deviceItem(deviceItem) { MapRegion(); }

Device::~Device() { UnmapRegion(); }

// Valid once the /dev node has been opened by MapRegion().
bool Device::IsValid() { return (_fd >= 0); }
+
+uint32_t Device::Read(uint32_t registerIndex) {
+ if (registerIndex >= _maximumRegisterIndex) return 0;
+
+ uint32_t* pRegister = (uint32_t*)_pPtr;
+
+ uint32_t value = pRegister[registerIndex];
+ return value;
+}
+
+void Device::Write(uint32_t registerIndex, uint32_t value) {
+ if (registerIndex < _maximumRegisterIndex) {
+ uint32_t* pRegister = (uint32_t*)_pPtr;
+ pRegister[registerIndex] = value;
+ }
+}
+
+void Device::ReadBlock(void* pHostDestination, size_t offset, size_t size) {
+ if ((offset + size) < _size) {
+ uint8_t* pDeviceMem = (uint8_t*)_pPtr + offset;
+ ::memcpy(pHostDestination, pDeviceMem, size);
+ }
+}
+
+void Device::WriteBlock(const void* pHostSourceAddress, size_t offset, size_t size) {
+ if ((offset + size) < _size) {
+ uint8_t* pDeviceMem = (uint8_t*)_pPtr + offset;
+ ::memcpy(pDeviceMem, pHostSourceAddress, size);
+ }
+}
+
+uint64_t Device::ReadValueFromFile(const std::filesystem::path& path) {
+ std::string line = ReadStringFromFile(path);
+ int base = (line.substr(0, 2) == "0x") ? 16 : 10;
+ return std::stoull(line, nullptr, base);
+}
+
+std::string Device::ReadStringFromFile(const std::filesystem::path& path) {
+ std::ifstream inputStream(path);
+ if (inputStream.good()) {
+ std::string line;
+ std::getline(inputStream, line);
+ return line;
+ }
+ return "";
+}
+
+bool Device::MapRegion() {
+ _size = ReadValueFromFile(_deviceItem._rootPath / "maps/map0/size");
+ _offset = ReadValueFromFile(_deviceItem._rootPath / "maps/map0/offset");
+ _physicalAddress = ReadValueFromFile(_deviceItem._rootPath / "maps/map0/addr");
+ _maximumRegisterIndex = _size / sizeof(uint32_t);
+
+ std::filesystem::path uioDevicePath = "/dev";
+ std::filesystem::path uioDeviceId = _deviceItem._rootPath.stem();
+ uioDevicePath /= uioDeviceId;
+
+ _fd = ::open(uioDevicePath.c_str(), O_RDWR);
+ if (_fd < 0) {
+ return false;
+ }
+
+ // Map the region into userspace
+ _pBase = (uint8_t*)::mmap(NULL, (size_t)_size, PROT_READ | PROT_WRITE, MAP_SHARED, _fd, 0);
+ if (_pBase == MAP_FAILED) {
+ return false;
+ }
+
+ // CST base address is at _pBase + _offset
+ _pPtr = (uint32_t*)(_pBase + _offset);
+
+ return true;
+};
+
+void Device::UnmapRegion() {
+ int r = 0;
+ if (_pBase) {
+ r = ::munmap(_pBase, _size);
+ _pBase = nullptr;
+ }
+
+ if (_fd >= 0) {
+ r = ::close(_fd);
+ _fd = -1;
+ }
+ (void)r;
+}
+
+void Device::SplitIndexedDeviceName(const std::string& indexedDeviceName, std::string& deviceName, uint32_t& index) {
+ int32_t len = static_cast<int32_t>(indexedDeviceName.length());
+ int32_t nDecimals = 0;
+ for (int32_t i = (len - 1); i >= 0; i--) {
+ if (::isdigit(indexedDeviceName[i])) nDecimals++;
+ }
+
+ deviceName = indexedDeviceName.substr(0, len - nDecimals);
+ index = std::stoul(indexedDeviceName.substr(len - nDecimals));
+}
+
+} // namespace UIO
diff --git a/python/openvino/runtime/streaming/image_streaming_app/uio/source/UioDevice.h b/python/openvino/runtime/streaming/image_streaming_app/uio/source/UioDevice.h
new file mode 100644
index 0000000..49c6f51
--- /dev/null
+++ b/python/openvino/runtime/streaming/image_streaming_app/uio/source/UioDevice.h
@@ -0,0 +1,56 @@
+// Copyright 2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#pragma once
+
+#include <filesystem>
+#include <memory>
+#include <string>
+#include <vector>
+#include "IUioDevice.h"
+
namespace UIO {
// Concrete UIO-backed device: opens /dev/<uioN> and mmaps its map0 region,
// exposing it through the IDevice register/block interface.
// NOTE(review): the single-argument constructor is not explicit — confirm
// implicit conversion from DeviceItem is intended.
class Device : public IDevice {
 public:
    Device(const DeviceItem& deviceItem);
    ~Device();

    // IDevice interface
    uint32_t Read(uint32_t registerIndex) override;
    void Write(uint32_t registerIndex, uint32_t value) override;
    void ReadBlock(void* host_addr, size_t offset, size_t size) override;
    void WriteBlock(const void* host_addr, size_t offset, size_t size) override;

    // True when the /dev node was opened and the region mapped.
    bool IsValid();
    static uint64_t ReadValueFromFile(const std::filesystem::path& path);
    static std::string ReadStringFromFile(const std::filesystem::path& path);
    static void SplitIndexedDeviceName(const std::string& indexedDeviceName, std::string& deviceName, uint32_t& index);

 private:
    Device() = delete;
    Device(Device const&) = delete;
    void operator=(Device const&) = delete;

    bool MapRegion();
    void UnmapRegion();

    DeviceItem _deviceItem;
    uint32_t _maximumRegisterIndex = 0;
    int _fd = -1; // File descriptor to UIO - used to indicate that the Device is valid
    uint64_t _physicalAddress = 0;
    uint64_t _size = 0; // Size of the mmapped region
    uint64_t _offset = 0; // Offset of the first register
    uint8_t* _pBase = nullptr; // Base of the mmapped region
    uint32_t* _pPtr = nullptr; // The first register
};
}  // namespace UIO
diff --git a/python/openvino/runtime/streaming/runtime_scripts/run_image_stream.sh b/python/openvino/runtime/streaming/runtime_scripts/run_image_stream.sh
new file mode 100755
index 0000000..db63761
--- /dev/null
+++ b/python/openvino/runtime/streaming/runtime_scripts/run_image_stream.sh
@@ -0,0 +1,7 @@
+#! /bin/sh
+# This script should be run from the /home/root/app folder.
+
+# Run the image streaming app, specifying the folder containing the source
+# images, and an upload rate
+./image_streaming_app -images_folder=/home/root/resnet-50-tf/sample_images -rate=50
+
diff --git a/python/openvino/runtime/streaming/runtime_scripts/run_inference_stream.sh b/python/openvino/runtime/streaming/runtime_scripts/run_inference_stream.sh
new file mode 100755
index 0000000..3d14302
--- /dev/null
+++ b/python/openvino/runtime/streaming/runtime_scripts/run_inference_stream.sh
@@ -0,0 +1,14 @@
#! /bin/sh
# This script should be run from the /home/root/app folder.

# Set the location of the shared libraries:
export LD_LIBRARY_PATH=.

# Immediately after startup, a Linux process rngd sometimes
# runs at 100% CPU for a few minutes. This can be stopped
# safely as there is no dependency on this.
# BUGFIX: ">& file" is bash/csh redirection and a syntax error under
# POSIX /bin/sh (dash: "Bad fd number"); use the portable form.
killall -9 rngd > /dev/null 2>&1

# Run the inference app, specifying the compiled model, the architecture file and the CoreDLA device name.
# NB: the model must be compiled with no folding. Use the option --ffolding-option=0 with the dlac compiler.
./streaming_inference_app -model=/home/root/resnet-50-tf/RN50_Performance_no_folding.bin -arch=/home/root/resnet-50-tf/A10_Performance.arch -device=HETERO:FPGA
diff --git a/python/openvino/runtime/streaming/streaming_inference_app/CMakeLists.txt b/python/openvino/runtime/streaming/streaming_inference_app/CMakeLists.txt
new file mode 100644
index 0000000..ec9cc4f
--- /dev/null
+++ b/python/openvino/runtime/streaming/streaming_inference_app/CMakeLists.txt
@@ -0,0 +1,29 @@
+# Copyright 2023 Intel Corporation
+#
+# This software and the related documents are Intel copyrighted materials,
+# and your use of them is governed by the express license under which they
+# were provided to you ("License"). Unless the License provides otherwise,
+# you may not use, modify, copy, publish, distribute, disclose or transmit
+# this software or the related documents without Intel's prior written
+# permission.
+#
+# This software and the related documents are provided as is, with no express
+# or implied warranties, other than those that are expressly stated in the
+# License.
+
# Streaming inference demo executable: links against the OpenVINO runtime
# and the CoreDLA runtime plugin.
project(streaming_inference_app)

# Specify the C++ standard.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED True)

set(all_files
    streaming_inference_app.cpp
    streaming_inference_app.h
    command_line.cpp
    command_line.h)

# Targets
add_executable(${PROJECT_NAME} ${all_files})

target_link_libraries(${PROJECT_NAME} openvino::runtime)
target_link_libraries(${PROJECT_NAME} coreDlaRuntimePlugin)
diff --git a/python/openvino/runtime/streaming/streaming_inference_app/categories.txt b/python/openvino/runtime/streaming/streaming_inference_app/categories.txt
new file mode 100644
index 0000000..d77b8ba
--- /dev/null
+++ b/python/openvino/runtime/streaming/streaming_inference_app/categories.txt
@@ -0,0 +1,1001 @@
+-
+class ID 0
+class ID 1
+class ID 2
+class ID 3
+class ID 4
+class ID 5
+class ID 6
+class ID 7
+class ID 8
+class ID 9
+class ID 10
+class ID 11
+class ID 12
+class ID 13
+class ID 14
+class ID 15
+class ID 16
+class ID 17
+class ID 18
+class ID 19
+class ID 20
+class ID 21
+class ID 22
+class ID 23
+class ID 24
+class ID 25
+class ID 26
+class ID 27
+class ID 28
+class ID 29
+class ID 30
+class ID 31
+class ID 32
+class ID 33
+class ID 34
+class ID 35
+class ID 36
+class ID 37
+class ID 38
+class ID 39
+class ID 40
+class ID 41
+class ID 42
+class ID 43
+class ID 44
+class ID 45
+class ID 46
+class ID 47
+class ID 48
+class ID 49
+class ID 50
+class ID 51
+class ID 52
+class ID 53
+class ID 54
+class ID 55
+class ID 56
+class ID 57
+class ID 58
+class ID 59
+class ID 60
+class ID 61
+class ID 62
+class ID 63
+class ID 64
+class ID 65
+class ID 66
+class ID 67
+class ID 68
+class ID 69
+class ID 70
+class ID 71
+class ID 72
+class ID 73
+class ID 74
+class ID 75
+class ID 76
+class ID 77
+class ID 78
+class ID 79
+class ID 80
+class ID 81
+class ID 82
+class ID 83
+class ID 84
+class ID 85
+class ID 86
+class ID 87
+class ID 88
+class ID 89
+class ID 90
+class ID 91
+class ID 92
+class ID 93
+class ID 94
+class ID 95
+class ID 96
+class ID 97
+class ID 98
+class ID 99
+class ID 100
+class ID 101
+class ID 102
+class ID 103
+class ID 104
+class ID 105
+class ID 106
+class ID 107
+class ID 108
+class ID 109
+class ID 110
+class ID 111
+class ID 112
+class ID 113
+class ID 114
+class ID 115
+class ID 116
+class ID 117
+class ID 118
+class ID 119
+class ID 120
+class ID 121
+class ID 122
+class ID 123
+class ID 124
+class ID 125
+class ID 126
+class ID 127
+class ID 128
+class ID 129
+class ID 130
+class ID 131
+class ID 132
+class ID 133
+class ID 134
+class ID 135
+class ID 136
+class ID 137
+class ID 138
+class ID 139
+class ID 140
+class ID 141
+class ID 142
+class ID 143
+class ID 144
+class ID 145
+class ID 146
+class ID 147
+class ID 148
+class ID 149
+class ID 150
+class ID 151
+class ID 152
+class ID 153
+class ID 154
+class ID 155
+class ID 156
+class ID 157
+class ID 158
+class ID 159
+class ID 160
+class ID 161
+class ID 162
+class ID 163
+class ID 164
+class ID 165
+class ID 166
+class ID 167
+class ID 168
+class ID 169
+class ID 170
+class ID 171
+class ID 172
+class ID 173
+class ID 174
+class ID 175
+class ID 176
+class ID 177
+class ID 178
+class ID 179
+class ID 180
+class ID 181
+class ID 182
+class ID 183
+class ID 184
+class ID 185
+class ID 186
+class ID 187
+class ID 188
+class ID 189
+class ID 190
+class ID 191
+class ID 192
+class ID 193
+class ID 194
+class ID 195
+class ID 196
+class ID 197
+class ID 198
+class ID 199
+class ID 200
+class ID 201
+class ID 202
+class ID 203
+class ID 204
+class ID 205
+class ID 206
+class ID 207
+class ID 208
+class ID 209
+class ID 210
+class ID 211
+class ID 212
+class ID 213
+class ID 214
+class ID 215
+class ID 216
+class ID 217
+class ID 218
+class ID 219
+class ID 220
+class ID 221
+class ID 222
+class ID 223
+class ID 224
+class ID 225
+class ID 226
+class ID 227
+class ID 228
+class ID 229
+class ID 230
+class ID 231
+class ID 232
+class ID 233
+class ID 234
+class ID 235
+class ID 236
+class ID 237
+class ID 238
+class ID 239
+class ID 240
+class ID 241
+class ID 242
+class ID 243
+class ID 244
+class ID 245
+class ID 246
+class ID 247
+class ID 248
+class ID 249
+class ID 250
+class ID 251
+class ID 252
+class ID 253
+class ID 254
+class ID 255
+class ID 256
+class ID 257
+class ID 258
+class ID 259
+class ID 260
+class ID 261
+class ID 262
+class ID 263
+class ID 264
+class ID 265
+class ID 266
+class ID 267
+class ID 268
+class ID 269
+class ID 270
+class ID 271
+class ID 272
+class ID 273
+class ID 274
+class ID 275
+class ID 276
+class ID 277
+class ID 278
+class ID 279
+class ID 280
+class ID 281
+class ID 282
+class ID 283
+class ID 284
+class ID 285
+class ID 286
+class ID 287
+class ID 288
+class ID 289
+class ID 290
+class ID 291
+class ID 292
+class ID 293
+class ID 294
+class ID 295
+class ID 296
+class ID 297
+class ID 298
+class ID 299
+class ID 300
+class ID 301
+class ID 302
+class ID 303
+class ID 304
+class ID 305
+class ID 306
+class ID 307
+class ID 308
+class ID 309
+class ID 310
+class ID 311
+class ID 312
+class ID 313
+class ID 314
+class ID 315
+class ID 316
+class ID 317
+class ID 318
+class ID 319
+class ID 320
+class ID 321
+class ID 322
+class ID 323
+class ID 324
+class ID 325
+class ID 326
+class ID 327
+class ID 328
+class ID 329
+class ID 330
+class ID 331
+class ID 332
+class ID 333
+class ID 334
+class ID 335
+class ID 336
+class ID 337
+class ID 338
+class ID 339
+class ID 340
+class ID 341
+class ID 342
+class ID 343
+class ID 344
+class ID 345
+class ID 346
+class ID 347
+class ID 348
+class ID 349
+class ID 350
+class ID 351
+class ID 352
+class ID 353
+class ID 354
+class ID 355
+class ID 356
+class ID 357
+class ID 358
+class ID 359
+class ID 360
+class ID 361
+class ID 362
+class ID 363
+class ID 364
+class ID 365
+class ID 366
+class ID 367
+class ID 368
+class ID 369
+class ID 370
+class ID 371
+class ID 372
+class ID 373
+class ID 374
+class ID 375
+class ID 376
+class ID 377
+class ID 378
+class ID 379
+class ID 380
+class ID 381
+class ID 382
+class ID 383
+class ID 384
+class ID 385
+class ID 386
+class ID 387
+class ID 388
+class ID 389
+class ID 390
+class ID 391
+class ID 392
+class ID 393
+class ID 394
+class ID 395
+class ID 396
+class ID 397
+class ID 398
+class ID 399
+class ID 400
+class ID 401
+class ID 402
+class ID 403
+class ID 404
+class ID 405
+class ID 406
+class ID 407
+class ID 408
+class ID 409
+class ID 410
+class ID 411
+class ID 412
+class ID 413
+class ID 414
+class ID 415
+class ID 416
+class ID 417
+class ID 418
+class ID 419
+class ID 420
+class ID 421
+class ID 422
+class ID 423
+class ID 424
+class ID 425
+class ID 426
+class ID 427
+class ID 428
+class ID 429
+class ID 430
+class ID 431
+class ID 432
+class ID 433
+class ID 434
+class ID 435
+class ID 436
+class ID 437
+class ID 438
+class ID 439
+class ID 440
+class ID 441
+class ID 442
+class ID 443
+class ID 444
+class ID 445
+class ID 446
+class ID 447
+class ID 448
+class ID 449
+class ID 450
+class ID 451
+class ID 452
+class ID 453
+class ID 454
+class ID 455
+class ID 456
+class ID 457
+class ID 458
+class ID 459
+class ID 460
+class ID 461
+class ID 462
+class ID 463
+class ID 464
+class ID 465
+class ID 466
+class ID 467
+class ID 468
+class ID 469
+class ID 470
+class ID 471
+class ID 472
+class ID 473
+class ID 474
+class ID 475
+class ID 476
+class ID 477
+class ID 478
+class ID 479
+class ID 480
+class ID 481
+class ID 482
+class ID 483
+class ID 484
+class ID 485
+class ID 486
+class ID 487
+class ID 488
+class ID 489
+class ID 490
+class ID 491
+class ID 492
+class ID 493
+class ID 494
+class ID 495
+class ID 496
+class ID 497
+class ID 498
+class ID 499
+class ID 500
+class ID 501
+class ID 502
+class ID 503
+class ID 504
+class ID 505
+class ID 506
+class ID 507
+class ID 508
+class ID 509
+class ID 510
+class ID 511
+class ID 512
+class ID 513
+class ID 514
+class ID 515
+class ID 516
+class ID 517
+class ID 518
+class ID 519
+class ID 520
+class ID 521
+class ID 522
+class ID 523
+class ID 524
+class ID 525
+class ID 526
+class ID 527
+class ID 528
+class ID 529
+class ID 530
+class ID 531
+class ID 532
+class ID 533
+class ID 534
+class ID 535
+class ID 536
+class ID 537
+class ID 538
+class ID 539
+class ID 540
+class ID 541
+class ID 542
+class ID 543
+class ID 544
+class ID 545
+class ID 546
+class ID 547
+class ID 548
+class ID 549
+class ID 550
+class ID 551
+class ID 552
+class ID 553
+class ID 554
+class ID 555
+class ID 556
+class ID 557
+class ID 558
+class ID 559
+class ID 560
+class ID 561
+class ID 562
+class ID 563
+class ID 564
+class ID 565
+class ID 566
+class ID 567
+class ID 568
+class ID 569
+class ID 570
+class ID 571
+class ID 572
+class ID 573
+class ID 574
+class ID 575
+class ID 576
+class ID 577
+class ID 578
+class ID 579
+class ID 580
+class ID 581
+class ID 582
+class ID 583
+class ID 584
+class ID 585
+class ID 586
+class ID 587
+class ID 588
+class ID 589
+class ID 590
+class ID 591
+class ID 592
+class ID 593
+class ID 594
+class ID 595
+class ID 596
+class ID 597
+class ID 598
+class ID 599
+class ID 600
+class ID 601
+class ID 602
+class ID 603
+class ID 604
+class ID 605
+class ID 606
+class ID 607
+class ID 608
+class ID 609
+class ID 610
+class ID 611
+class ID 612
+class ID 613
+class ID 614
+class ID 615
+class ID 616
+class ID 617
+class ID 618
+class ID 619
+class ID 620
+class ID 621
+class ID 622
+class ID 623
+class ID 624
+class ID 625
+class ID 626
+class ID 627
+class ID 628
+class ID 629
+class ID 630
+class ID 631
+class ID 632
+class ID 633
+class ID 634
+class ID 635
+class ID 636
+class ID 637
+class ID 638
+class ID 639
+class ID 640
+class ID 641
+class ID 642
+class ID 643
+class ID 644
+class ID 645
+class ID 646
+class ID 647
+class ID 648
+class ID 649
+class ID 650
+class ID 651
+class ID 652
+class ID 653
+class ID 654
+class ID 655
+class ID 656
+class ID 657
+class ID 658
+class ID 659
+class ID 660
+class ID 661
+class ID 662
+class ID 663
+class ID 664
+class ID 665
+class ID 666
+class ID 667
+class ID 668
+class ID 669
+class ID 670
+class ID 671
+class ID 672
+class ID 673
+class ID 674
+class ID 675
+class ID 676
+class ID 677
+class ID 678
+class ID 679
+class ID 680
+class ID 681
+class ID 682
+class ID 683
+class ID 684
+class ID 685
+class ID 686
+class ID 687
+class ID 688
+class ID 689
+class ID 690
+class ID 691
+class ID 692
+class ID 693
+class ID 694
+class ID 695
+class ID 696
+class ID 697
+class ID 698
+class ID 699
+class ID 700
+class ID 701
+class ID 702
+class ID 703
+class ID 704
+class ID 705
+class ID 706
+class ID 707
+class ID 708
+class ID 709
+class ID 710
+class ID 711
+class ID 712
+class ID 713
+class ID 714
+class ID 715
+class ID 716
+class ID 717
+class ID 718
+class ID 719
+class ID 720
+class ID 721
+class ID 722
+class ID 723
+class ID 724
+class ID 725
+class ID 726
+class ID 727
+class ID 728
+class ID 729
+class ID 730
+class ID 731
+class ID 732
+class ID 733
+class ID 734
+class ID 735
+class ID 736
+class ID 737
+class ID 738
+class ID 739
+class ID 740
+class ID 741
+class ID 742
+class ID 743
+class ID 744
+class ID 745
+class ID 746
+class ID 747
+class ID 748
+class ID 749
+class ID 750
+class ID 751
+class ID 752
+class ID 753
+class ID 754
+class ID 755
+class ID 756
+class ID 757
+class ID 758
+class ID 759
+class ID 760
+class ID 761
+class ID 762
+class ID 763
+class ID 764
+class ID 765
+class ID 766
+class ID 767
+class ID 768
+class ID 769
+class ID 770
+class ID 771
+class ID 772
+class ID 773
+class ID 774
+class ID 775
+class ID 776
+class ID 777
+class ID 778
+class ID 779
+class ID 780
+class ID 781
+class ID 782
+class ID 783
+class ID 784
+class ID 785
+class ID 786
+class ID 787
+class ID 788
+class ID 789
+class ID 790
+class ID 791
+class ID 792
+class ID 793
+class ID 794
+class ID 795
+class ID 796
+class ID 797
+class ID 798
+class ID 799
+class ID 800
+class ID 801
+class ID 802
+class ID 803
+class ID 804
+class ID 805
+class ID 806
+class ID 807
+class ID 808
+class ID 809
+class ID 810
+class ID 811
+class ID 812
+class ID 813
+class ID 814
+class ID 815
+class ID 816
+class ID 817
+class ID 818
+class ID 819
+class ID 820
+class ID 821
+class ID 822
+class ID 823
+class ID 824
+class ID 825
+class ID 826
+class ID 827
+class ID 828
+class ID 829
+class ID 830
+class ID 831
+class ID 832
+class ID 833
+class ID 834
+class ID 835
+class ID 836
+class ID 837
+class ID 838
+class ID 839
+class ID 840
+class ID 841
+class ID 842
+class ID 843
+class ID 844
+class ID 845
+class ID 846
+class ID 847
+class ID 848
+class ID 849
+class ID 850
+class ID 851
+class ID 852
+class ID 853
+class ID 854
+class ID 855
+class ID 856
+class ID 857
+class ID 858
+class ID 859
+class ID 860
+class ID 861
+class ID 862
+class ID 863
+class ID 864
+class ID 865
+class ID 866
+class ID 867
+class ID 868
+class ID 869
+class ID 870
+class ID 871
+class ID 872
+class ID 873
+class ID 874
+class ID 875
+class ID 876
+class ID 877
+class ID 878
+class ID 879
+class ID 880
+class ID 881
+class ID 882
+class ID 883
+class ID 884
+class ID 885
+class ID 886
+class ID 887
+class ID 888
+class ID 889
+class ID 890
+class ID 891
+class ID 892
+class ID 893
+class ID 894
+class ID 895
+class ID 896
+class ID 897
+class ID 898
+class ID 899
+class ID 900
+class ID 901
+class ID 902
+class ID 903
+class ID 904
+class ID 905
+class ID 906
+class ID 907
+class ID 908
+class ID 909
+class ID 910
+class ID 911
+class ID 912
+class ID 913
+class ID 914
+class ID 915
+class ID 916
+class ID 917
+class ID 918
+class ID 919
+class ID 920
+class ID 921
+class ID 922
+class ID 923
+class ID 924
+class ID 925
+class ID 926
+class ID 927
+class ID 928
+class ID 929
+class ID 930
+class ID 931
+class ID 932
+class ID 933
+class ID 934
+class ID 935
+class ID 936
+class ID 937
+class ID 938
+class ID 939
+class ID 940
+class ID 941
+class ID 942
+class ID 943
+class ID 944
+class ID 945
+class ID 946
+class ID 947
+class ID 948
+class ID 949
+class ID 950
+class ID 951
+class ID 952
+class ID 953
+class ID 954
+class ID 955
+class ID 956
+class ID 957
+class ID 958
+class ID 959
+class ID 960
+class ID 961
+class ID 962
+class ID 963
+class ID 964
+class ID 965
+class ID 966
+class ID 967
+class ID 968
+class ID 969
+class ID 970
+class ID 971
+class ID 972
+class ID 973
+class ID 974
+class ID 975
+class ID 976
+class ID 977
+class ID 978
+class ID 979
+class ID 980
+class ID 981
+class ID 982
+class ID 983
+class ID 984
+class ID 985
+class ID 986
+class ID 987
+class ID 988
+class ID 989
+class ID 990
+class ID 991
+class ID 992
+class ID 993
+class ID 994
+class ID 995
+class ID 996
+class ID 997
+class ID 998
+class ID 999
diff --git a/python/openvino/runtime/streaming/streaming_inference_app/command_line.cpp b/python/openvino/runtime/streaming/streaming_inference_app/command_line.cpp
new file mode 100644
index 0000000..794310b
--- /dev/null
+++ b/python/openvino/runtime/streaming/streaming_inference_app/command_line.cpp
@@ -0,0 +1,72 @@
+// Copyright 2021-2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#include "command_line.h"
+#include <algorithm>
+
// Remove leading and trailing whitespace (space, newline, CR, tab) in place.
// An all-whitespace input becomes the empty string.
static void TrimString(std::string& trimString) {
  const char* const kWhitespace = " \n\r\t";
  const size_t first = trimString.find_first_not_of(kWhitespace);
  if (first == std::string::npos) {
    trimString.clear();
    return;
  }
  const size_t last = trimString.find_last_not_of(kWhitespace);
  trimString = trimString.substr(first, last - first + 1);
}
+
// Convert a string to lower case in place (C locale / ASCII).
// The cast through unsigned char matters: passing a plain char with a
// negative value (e.g. from UTF-8 bytes) to ::tolower is undefined behavior.
static void MakeLower(std::string& stringValue) {
  std::transform(stringValue.begin(), stringValue.end(), stringValue.begin(),
                 [](unsigned char c) { return static_cast<char>(::tolower(c)); });
}
+
+// Program -option=value
+CommandLine::CommandLine(int argumentCount, char* argumentValues[]) {
+ if (argumentCount > 0) _executableName = argumentValues[0];
+
+ for (int i = 1; i < argumentCount; i++) {
+ std::string inputString(argumentValues[i]);
+ std::string nextChar = inputString.substr(0, 1);
+ if ((nextChar == "-") or (nextChar == "/")) {
+ inputString = inputString.substr(1);
+ size_t equals = inputString.find("=");
+ std::string option;
+ std::string value;
+
+ if (equals == std::string::npos) {
+ option = inputString;
+ } else {
+ option = inputString.substr(0, equals);
+ value = inputString.substr(equals + 1);
+ }
+
+ TrimString(option);
+ TrimString(value);
+ MakeLower(option);
+ _optionMap[option] = value;
+ }
+ }
+}
+
+std::string CommandLine::GetOptionValue(const char* optionName) {
+ auto i = _optionMap.find(optionName);
+ if (i != _optionMap.end())
+ return i->second;
+ else
+ return "";
+}
+
+bool CommandLine::HaveOption(const char* optionName) { return (_optionMap.find(optionName) != _optionMap.end()); }
+
+bool CommandLine::GetOption(const char* optionName, std::string& optionValue) {
+ auto i = _optionMap.find(optionName);
+ if (i == _optionMap.end()) return false;
+
+ optionValue = i->second;
+ return true;
+}
+
+size_t CommandLine::NumOptions() { return _optionMap.size(); }
diff --git a/python/openvino/runtime/streaming/streaming_inference_app/command_line.h b/python/openvino/runtime/streaming/streaming_inference_app/command_line.h
new file mode 100644
index 0000000..41b12f0
--- /dev/null
+++ b/python/openvino/runtime/streaming/streaming_inference_app/command_line.h
@@ -0,0 +1,31 @@
+// Copyright 2021-2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#pragma once
+#include <string>
+#include <unordered_map>
+
/**
 * Simple "-option=value" command line parser.
 *
 * Arguments of the form "-option=value" or "/option=value" are collected into
 * an option -> value map. Option names are case-insensitive (stored lower
 * case); values keep their case. See command_line.cpp for parsing rules.
 */
class CommandLine {
 public:
  /// Parse argc/argv; argumentValues[0] is remembered as the executable name.
  CommandLine(int argumentCount, char* argumentValues[]);

  /// Returns the value for optionName, or "" when the option is absent.
  std::string GetOptionValue(const char* optionName);
  /// Copies the value into optionValue; returns false when the option is absent.
  bool GetOption(const char* optionName, std::string& optionValue);
  /// Returns true when optionName was supplied.
  bool HaveOption(const char* optionName);
  /// Returns argv[0], or "" if argc was 0.
  std::string GetExecutableName() { return _executableName; }
  /// Number of parsed options.
  size_t NumOptions();

 private:
  std::string _executableName;                              // argv[0]
  std::unordered_map<std::string, std::string> _optionMap;  // lower-cased name -> value
};
diff --git a/python/openvino/runtime/streaming/streaming_inference_app/streaming_inference_app.cpp b/python/openvino/runtime/streaming/streaming_inference_app/streaming_inference_app.cpp
new file mode 100644
index 0000000..d0e1ed0
--- /dev/null
+++ b/python/openvino/runtime/streaming/streaming_inference_app/streaming_inference_app.cpp
@@ -0,0 +1,413 @@
+// Copyright 2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
#include "streaming_inference_app.h"

#include <fcntl.h>
#include <signal.h>
#include <sys/utsname.h>
#include <unistd.h>

#include <algorithm>
#include <chrono>
#include <cstdlib>
#include <filesystem>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <memory>
#include <sstream>
#include <thread>
#include <vector>

#include "dla_plugin_config.hpp"
+
+using namespace std::chrono_literals;
+
// Definitions of the StreamingInferenceApp static members that are shared
// between the main thread, the inference completion callbacks, and the
// SIGINT handler.
std::ofstream StreamingInferenceApp::_resultsStream("results.txt");
std::mutex StreamingInferenceApp::_signalMutex;
std::condition_variable StreamingInferenceApp::_signalConditionVariable;
std::chrono::time_point<std::chrono::system_clock> StreamingInferenceApp::_startTime;
+
+int main(int numParams, char* paramValues[]) {
+ StreamingInferenceApp app(numParams, paramValues);
+
+ try {
+ app.Run();
+ } catch (const std::exception& ex) {
+ std::cerr << ex.what() << '\n';
+ }
+ return 0;
+}
+
// Parse the command line, install the SIGINT handler (OsStartup), and load
// the human-readable class names used when reporting results.
StreamingInferenceApp::StreamingInferenceApp(int numParams, char* paramValues[])
    : _commandLine(numParams, paramValues) {
  OsStartup();
  LoadClassNames();
}
+
+StreamingInferenceApp::~StreamingInferenceApp() {
+ timespec waitTimeout = {};
+ if (_pCancelSemaphore) {
+ // Reset the cancel semaphore
+ int r = 0;
+ do {
+ r = ::sem_timedwait(_pCancelSemaphore, &waitTimeout);
+ } while (r == 0);
+ ::sem_close(_pCancelSemaphore);
+ }
+
+ if (_pReadyForImageStreamSemaphore) {
+ // Reset the ready semaphore
+ int r = 0;
+ do {
+ r = ::sem_timedwait(_pReadyForImageStreamSemaphore, &waitTimeout);
+ } while (r == 0);
+ ::sem_close(_pReadyForImageStreamSemaphore);
+ }
+}
+
+void StreamingInferenceApp::Run() {
+ std::filesystem::path pluginsFilename = "plugins.xml";
+
+ std::string deviceName;
+ std::string arch;
+ std::string model;
+
+ // Get the command line options for the model, arch file, and device
+ if (not _commandLine.GetOption("model", model) or not _commandLine.GetOption("arch", arch) or
+ not _commandLine.GetOption("device", deviceName)) {
+ return Usage();
+ }
+
+ std::filesystem::path architectureFilename = arch;
+ std::filesystem::path compiledModelFilename = model;
+
+ // Check that the provided files do in fact exist
+ if (not CheckFileExists(architectureFilename, "architecture") or not CheckFileExists(pluginsFilename, "plugins") or
+ not CheckFileExists(compiledModelFilename, "compiled model")) {
+ return;
+ }
+
+ InferenceEngine::Core inferenceEngine(pluginsFilename);
+
+ // Setup CoreDLA private configuration parameters
+ const std::map<std::string, std::string> configParameters;
+ inferenceEngine.SetConfig({{DLIAPlugin::properties::arch_path.name(), architectureFilename}}, "FPGA");
+
+ // If dropSourceBuffers is 0, no input buffers are dropped
+ // If dropSourceBuffers is 1, then 1 buffer is processed, 1 gets dropped
+ // If dropSourceBuffers is 2, then 1 buffer is processed, 2 get dropped, etc.
+ uint32_t dropSourceBuffers = 0;
+
+ inferenceEngine.SetConfig({{DLIAPlugin::properties::streaming_drop_source_buffers.name(), std::to_string(dropSourceBuffers)},
+ {DLIAPlugin::properties::external_streaming.name(), CONFIG_VALUE(YES)}},
+ "FPGA");
+
+ std::ifstream inputFile(compiledModelFilename, std::fstream::binary);
+ if (not inputFile) {
+ std::cout << "Failed to load compiled model file.\n";
+ return;
+ }
+
+ // Load the model to the device
+ InferenceEngine::ExecutableNetwork importedNetwork = inferenceEngine.ImportNetwork(inputFile, deviceName, {});
+
+ // The plugin defines the number of inferences requests required for streaming
+ uint32_t numStreamingInferenceRequests = importedNetwork.GetMetric(DLIAPlugin::properties::num_streaming_inference_requests.name()).as<uint32_t>();
+ const std::string cancelSemaphoreName = importedNetwork.GetMetric(DLIAPlugin::properties::cancel_semaphore_name.name()).as<std::string>();
+ _cancelSemaphoreName = cancelSemaphoreName;
+
+ for (uint32_t i = 0; i < numStreamingInferenceRequests; i++) {
+ auto spInferenceData = std::make_shared<SingleInferenceData>(this, importedNetwork, i);
+ _inferences.push_back(spInferenceData);
+ }
+
+ // Start the inference requests. Streaming inferences will reschedule
+ // themselves when complete
+ for (auto& inference : _inferences) {
+ inference->StartAsync();
+ }
+
+ std::cout << "Ready to start image input stream.\n";
+
+ // Signal the image streaming app that we are ready, so it can
+ // begin transferring files
+ SetReadyForImageStreamSemaphore();
+
+ // Wait until Ctrl+C
+ bool done = false;
+ while (not done) {
+ std::unique_lock<std::mutex> lock(_signalMutex);
+ done = (_signalConditionVariable.wait_for(lock, 1000ms) != std::cv_status::timeout);
+ }
+
+ SetShutdownSemaphore();
+
+ for (auto& inference : _inferences) {
+ inference->Cancel();
+ }
+
+ _inferences.clear();
+}
+
+
+void StreamingInferenceApp::SetShutdownSemaphore() {
+ _pCancelSemaphore = ::sem_open(_cancelSemaphoreName.c_str(), O_CREAT, 0644, 0);
+ if (_pCancelSemaphore) {
+ ::sem_post(_pCancelSemaphore);
+ }
+}
+
+
+void StreamingInferenceApp::SetReadyForImageStreamSemaphore() {
+ _pReadyForImageStreamSemaphore = ::sem_open("/CoreDLA_ready_for_streaming", O_CREAT, 0644, 0);
+ if (_pReadyForImageStreamSemaphore) {
+ ::sem_post(_pReadyForImageStreamSemaphore);
+ }
+}
+
+
+/**
+ * Print a help menu to the console
+ */
+void StreamingInferenceApp::Usage() {
+ std::cout << "Usage:\n";
+ std::cout << "\tstreaming_inference_app -model=<model> -arch=<arch> -device=<device>\n\n";
+ std::cout << "Where:\n";
+ std::cout << "\t<model> is the compiled model binary file, eg /home/root/resnet-50-tf/RN50_Performance_no_folding.bin\n";
+ std::cout << "\t<arch> is the architecture file, eg /home/root/resnet-50-tf/A10_Performance.arch\n";
+ std::cout << "\t<device> is the OpenVINO device ID, eg HETERO:FPGA or HETERO:FPGA,CPU\n";
+}
+
+
+/**
+ * Check that a file exists
+ *
+ * @param[in] filename Filename to check
+ * @param[in] message Description of file to display if it does not exist
+ * @returns true if the file exists, false otherwise
+ */
+bool StreamingInferenceApp::CheckFileExists(const std::filesystem::path& filename, const std::string& message) {
+ if (not std::filesystem::exists(filename)) {
+ std::cout << "Can't find " << message << ", '" << filename.c_str() << "'\n";
+ return false;
+ }
+
+ return true;
+}
+
+////////////
+
// Global inference sequence counter, incremented from StartAsync().
std::atomic<uint32_t> SingleInferenceData::_atomic{0};
// Completed-inference count across all requests.
// NOTE(review): not atomic -- only safe if the plugin never runs completion
// callbacks concurrently; confirm the callback threading model.
uint32_t SingleInferenceData::_numResults = 0;
+
/**
 * Create one inference request bound to a zero-initialized output blob.
 *
 * The request's completion callback is set to ProcessResult(), which reports
 * the result and restarts the request, keeping the stream running.
 */
SingleInferenceData::SingleInferenceData(StreamingInferenceApp* pApp,
                                         InferenceEngine::ExecutableNetwork& importedNetwork,
                                         uint32_t index)
    : _pApp(pApp), _importedNetwork(importedNetwork), _index(index), _inferenceCount(0) {
  // Set up output blob from the network's first output.
  // NOTE(review): assumes the network has at least one output -- begin() on
  // an empty map would be undefined behavior; confirm upstream guarantees.
  InferenceEngine::ConstOutputsDataMap outputsInfo = importedNetwork.GetOutputsInfo();
  std::shared_ptr<const InferenceEngine::Data> spOutputInfo = outputsInfo.begin()->second;
  std::string outputName = outputsInfo.begin()->first;

  _spOutputBlob = CreateOutputBlob(spOutputInfo);

  // Create an inference request and set its completion callback.
  // [=] captures `this` by pointer; this object owns the request, so it
  // outlives the callback registration.
  _inferenceRequest = importedNetwork.CreateInferRequest();
  auto inferenceRequestCompleteCB = [=]() { ProcessResult(); };
  _inferenceRequest.SetCompletionCallback(inferenceRequestCompleteCB);

  // Assign the output blob to the inference request
  _inferenceRequest.SetBlob(outputName, _spOutputBlob);
}
+
+
+std::shared_ptr<InferenceEngine::Blob> SingleInferenceData::CreateOutputBlob(
+ std::shared_ptr<const InferenceEngine::Data> spOutputInfo) {
+ const InferenceEngine::TensorDesc& outputTensorDesc = spOutputInfo->getTensorDesc();
+ std::shared_ptr<InferenceEngine::Blob> pOutputBob = InferenceEngine::make_shared_blob<float>(outputTensorDesc);
+ pOutputBob->allocate();
+
+ InferenceEngine::MemoryBlob::Ptr pMemoryBlob = InferenceEngine::as<InferenceEngine::MemoryBlob>(pOutputBob);
+ if (pMemoryBlob) {
+ auto lockedMemory = pMemoryBlob->wmap();
+ float* pOutputBlobData = lockedMemory.as<float*>();
+ if (pOutputBlobData) {
+ size_t outputSize = pOutputBob->size();
+ for (size_t i = 0; i < outputSize; i++) {
+ pOutputBlobData[i] = 0.0f;
+ }
+ }
+ }
+
+ return pOutputBob;
+}
+
+
// Start (or restart) the asynchronous inference, tagging this run with the
// next value of the global inference sequence counter.
void SingleInferenceData::StartAsync() {
  _inferenceCount = _atomic++;
  _inferenceRequest.StartAsync();
}

// Block until the in-flight inference completes.
void SingleInferenceData::Wait() { _inferenceRequest.Wait(); }

// Abort the in-flight inference.
void SingleInferenceData::Cancel() { _inferenceRequest.Cancel(); }
+
+
/**
 * Stores the results of an inference
 *
 * The index corresponds to the category of the image, and the score is
 * the confidence level of the image.
 *
 * operator< intentionally inverts the comparison so that std::sort yields
 * descending score order (highest confidence first). It is const-qualified
 * so comparisons work on const objects and satisfy the Compare requirements.
 */
class ResultItem {
 public:
  uint32_t _index;
  float _score;
  bool operator<(const ResultItem& other) const { return (_score > other._score); }
};
+
+
+/**
+ * Called when inference request has completed
+ *
+ * The inference results are floating point numbers consisting of the score for each category.
+ * The scores are then sorted and the highest is written to the console. The top 5 scores of the
+ * first 1000 images are saved to results.txt.
+ *
+ * Set as a callback in SingleInferenceData()
+ */
+void SingleInferenceData::ProcessResult() {
+ if (_pApp and _pApp->IsCancelling()) {
+ return;
+ }
+
+ // Increment the number of inference results that have returned thus far
+ _numResults++;
+
+ // If this is the first returned inference, store the current time to calculate the inference rate
+ if (_numResults == 1) {
+ StreamingInferenceApp::_startTime = std::chrono::system_clock::now();
+ } else if (_numResults == 101) {
+ // The inference rate is calculated afer 100 results have been received
+ auto endTime = std::chrono::system_clock::now();
+ auto duration = endTime - StreamingInferenceApp::_startTime;
+ double durationMS = (double)std::chrono::duration_cast<std::chrono::milliseconds>(duration).count();
+ double durationSecondsOne = durationMS / 100000.0;
+ double rate = 1.0 / durationSecondsOne;
+ std::cout << "Inference rate = " << rate << '\n';
+ }
+
+ // Create a float pointer to the returned data
+ size_t outputSize = _spOutputBlob->size();
+ float* pOutputData = _spOutputBlob->buffer().as<float*>();
+ if (!pOutputData) {
+ return;
+ }
+
+ // Store each score as a ResultItem
+ std::vector<ResultItem> results;
+ for (size_t i = 0; i < outputSize; i++) {
+ results.push_back({(uint32_t)i, pOutputData[i]});
+ }
+
+ // Sort the scores and set up the output streams
+ std::sort(results.begin(), results.end());
+ std::stringstream fileString;
+ std::stringstream outString;
+ bool flushFile = false;
+
+ // Store the top 5 results of the first 1000 images to be written to a file
+ if (_numResults <= 1000) {
+ fileString << "Result: image[" << _numResults << "]\n";
+ fileString << std::fixed << std::setprecision(1);
+
+ for (size_t i = 0; i < 5; i++) {
+ std::string className = _pApp->_imageNetClasses[results[i]._index];
+ float score = results[i]._score * 100.0f;
+ fileString << (i + 1) << ". " << className << ", score = " << score << '\n';
+ }
+
+ fileString << '\n';
+ }
+
+ if (_numResults == 1001) {
+ fileString << "End of results capture\n";
+ flushFile = true;
+ }
+
+ // Store the top score to write to the console
+ outString << std::fixed << std::setprecision(1);
+ std::string className = _pApp->_imageNetClasses[results[0]._index];
+ float score = results[0]._score * 100.0f;
+ outString << _numResults << " - " << className << ", score = " << score << '\n';
+
+ // Write the results to the file
+ std::string writeFileString = fileString.str();
+ if (not writeFileString.empty()) {
+ StreamingInferenceApp::_resultsStream << writeFileString;
+ if (flushFile) {
+ StreamingInferenceApp::_resultsStream << std::endl;
+ }
+ }
+
+ // Write the top score to the console
+ std::cout << outString.str();
+
+ // Start again
+ StartAsync();
+}
+
+
/**
 * Load the categories and store them in _imageNetClasses
 *
 * Reads categories.txt (one class name per line) from the working directory.
 * If the file is missing or the line count does not match, generic
 * placeholder names ("NONE" + "Image class #N") are used instead.
 *
 * NOTE(review): the validity check requires exactly 1001 lines
 * (classIndex == 1001), but both console messages tell the user the file
 * should contain 1000 lines. One of the two is wrong -- confirm whether the
 * expected format is 1000 class names plus a background/NONE entry, or 1000
 * names only (in which case the check, or the load offset, needs fixing).
 */
void StreamingInferenceApp::LoadClassNames() {
  _imageNetClasses.resize(1001);

  bool validClassFile = false;
  std::filesystem::path classNameFilePath = "categories.txt";

  if (std::filesystem::exists(classNameFilePath)) {
    size_t classIndex = 0;
    std::ifstream classNameStream(classNameFilePath);

    if (classNameStream) {
      std::string className;
      while (std::getline(classNameStream, className)) {
        // Names beyond the expected count are read but discarded
        if (classIndex < 1001) _imageNetClasses[classIndex] = className;

        classIndex++;
      }

      validClassFile = (classIndex == 1001);
      if (not validClassFile) {
        std::cout << "Ignoring the categories.txt file. The file is expected to be a text file "
                     "with 1000 lines.\n";
      }
    }
  } else {
    std::cout << "No categories.txt file found. This file should contain 1000\n"
                 "lines, with the name of each category on each line.\n";
  }

  // Fall back to generated placeholder names when no valid file was loaded
  if (not validClassFile) {
    _imageNetClasses[0] = "NONE";
    for (size_t i = 1; i <= 1000; i++) {
      _imageNetClasses[i] = "Image class #" + std::to_string(i);
    }
  }
}
+
// SIGINT (Ctrl+C) handler: wakes the wait loop in Run() so the application
// can shut down.
// NOTE(review): locking a std::mutex, notifying a condition_variable, and
// writing to std::cout are not async-signal-safe; if the signal arrives
// while the main thread holds _signalMutex this can deadlock. A safer
// pattern is a volatile sig_atomic_t / std::atomic flag polled by Run().
static void SigIntHandler(int) {
  std::cout << "\nCtrl+C detected. Shutting down application\n";
  std::lock_guard<std::mutex> lock(StreamingInferenceApp::_signalMutex);
  StreamingInferenceApp::_signalConditionVariable.notify_one();
}
+
+void StreamingInferenceApp::OsStartup() {
+ // Ctrl+C will exit the application
+ signal(SIGINT, SigIntHandler);
+}
diff --git a/python/openvino/runtime/streaming/streaming_inference_app/streaming_inference_app.h b/python/openvino/runtime/streaming/streaming_inference_app/streaming_inference_app.h
new file mode 100644
index 0000000..3cdafa0
--- /dev/null
+++ b/python/openvino/runtime/streaming/streaming_inference_app/streaming_inference_app.h
@@ -0,0 +1,74 @@
+// Copyright 2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#pragma once
+#include <semaphore.h>
+#include <atomic>
+#include <condition_variable>
+#include <filesystem>
+#include "command_line.h"
+#include "inference_engine.hpp"
+
+class SingleInferenceData;
+using SingleInferenceDataPtr = std::shared_ptr<SingleInferenceData>;
+
/**
 * Streaming inference demo application.
 *
 * Imports a compiled model onto the FPGA device, starts the plugin-defined
 * number of streaming inference requests, and prints/records classification
 * results until Ctrl+C is received.
 *
 * NOTE(review): this header uses std::ofstream, std::vector, std::mutex and
 * std::chrono but relies on transitive includes for them -- consider adding
 * <fstream>, <vector>, <mutex> and <chrono> explicitly.
 */
class StreamingInferenceApp {
  friend class SingleInferenceData;

 public:
  StreamingInferenceApp(int numParams, char* paramValues[]);
  ~StreamingInferenceApp();
  /// Print command line usage to the console.
  void Usage();
  /// Main entry: parse options, import the model, stream until Ctrl+C.
  void Run();
  /// True once the cancel semaphore has been opened (shutdown in progress).
  bool IsCancelling() { return (_pCancelSemaphore != nullptr); }

  // Shared with the SIGINT handler and the inference completion callbacks.
  static std::mutex _signalMutex;
  static std::condition_variable _signalConditionVariable;
  static std::chrono::time_point<std::chrono::system_clock> _startTime;  // time of first result
  static std::ofstream _resultsStream;                                   // "results.txt" output

 private:
  void OsStartup();                        // install the SIGINT handler
  bool CheckFileExists(const std::filesystem::path& filename, const std::string& message);
  void SetShutdownSemaphore();             // post the plugin's cancel semaphore
  void SetReadyForImageStreamSemaphore();  // tell the image streamer we are ready
  void LoadClassNames();                   // populate _imageNetClasses (1001 entries)

  std::vector<SingleInferenceDataPtr> _inferences;
  CommandLine _commandLine;
  sem_t* _pCancelSemaphore = nullptr;
  sem_t* _pReadyForImageStreamSemaphore = nullptr;
  std::string _cancelSemaphoreName;  // reported by the plugin via GetMetric
  std::vector<std::string> _imageNetClasses;
};
+
/**
 * One streaming inference request plus its pre-allocated output blob.
 *
 * ProcessResult() is installed as the request's completion callback; it
 * reports the classification result and immediately restarts the request.
 */
class SingleInferenceData {
 public:
  SingleInferenceData(StreamingInferenceApp* pApp, InferenceEngine::ExecutableNetwork& importedNetwork, uint32_t index);
  void StartAsync();  // tag with the next sequence number and start
  void Wait();
  void Cancel();

 private:
  void ProcessResult();  // completion callback: report result, restart
  std::shared_ptr<InferenceEngine::Blob> CreateOutputBlob(std::shared_ptr<const InferenceEngine::Data> spOutputInfo);

  StreamingInferenceApp* _pApp;  // owning application (not owned here)
  InferenceEngine::ExecutableNetwork& _importedNetwork;
  std::shared_ptr<InferenceEngine::Blob> _spOutputBlob;
  InferenceEngine::InferRequest _inferenceRequest;
  uint32_t _index;           // request index (diagnostic)
  uint32_t _inferenceCount;  // sequence number of the current run
  static uint32_t _numResults;           // completed inferences across all requests
  static std::atomic<uint32_t> _atomic;  // global inference sequence counter
};