diff options
Diffstat (limited to 'python/openvino/runtime/coredla_device/mmd/dcp_a10_pac')
24 files changed, 5643 insertions, 0 deletions
diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/.gitignore b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/.gitignore new file mode 100644 index 0000000..66e06bf --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/.gitignore @@ -0,0 +1,18 @@ +*~ +*# +*.marks +release_build/ +build/ +example_designs/mem_bandwidth/bin/ +example_designs/mem_bandwidth/simulation.tar.gz +example_designs/mem_bandwidth/temp_simulation/ +linux64/lib/ +linux64/libexec/diagnose +linux64/libexec/program +ase/mpf_src +*.pyc +*.swp +*.kwlp +*.kwps +temp_simulation/ +simulation.tar.gz diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/CMakeLists.txt b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/CMakeLists.txt new file mode 100644 index 0000000..28dcfa4 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/CMakeLists.txt @@ -0,0 +1,63 @@ +# (C) 2017 Intel Corporation. All rights reserved. +# Your use of Intel Corporation's design tools, logic functions and other +# software and tools, and its AMPP partner logic functions, and any output +# files any of the foregoing (including device programming or simulation +# files), and any associated documentation or information are expressly subject +# to the terms and conditions of the Intel Program License Subscription +# Agreement, Intel MegaCore Function License Agreement, or other applicable +# license agreement, including, without limitation, that your use is for the +# sole purpose of programming logic devices manufactured by Intel and sold by +# Intel or its authorized distributors. Please refer to the applicable +# agreement for further details. 
+ +cmake_minimum_required(VERSION 2.8.12) +project(mmd) + +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules") + +find_package(OPAE REQUIRED) +find_package(NUMA REQUIRED) + +# DLA specific modifications made to the MMD +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDLA_MMD") + +enable_language(C ASM) + +set(ASM_OPTIONS "-x assembler-with-cpp") +if(${CMAKE_C_COMPILER_ID} STREQUAL "Clang") + set(ASM_OPTIONS "${ASM_OPTIONS} -no-integrated-as") +endif() + +set(CMAKE_ASM_FLAGS "${CFLAGS} ${ASM_OPTIONS}") + +set(MMD_SRC + ./host/ccip_mmd.cpp + ./host/ccip_mmd_device.cpp + ./host/dma_work_thread.cpp + ./host/fpga_dma.c + ./host/kernel_interrupt.cpp + ./host/mmd_dma.cpp + ./host/memcpy_s_fast.c + ./host/x86-sse2.S +) + +# Add a shared library target called intel_opae_mmd +# and build it from the MMD_SRC files +add_library(intel_opae_mmd SHARED ${MMD_SRC}) + +# Specify the include directories to be used when compiling intel_opae_mmd library +target_include_directories(intel_opae_mmd PUBLIC + ${CMAKE_CURRENT_SOURCE_DIR}/include + ) + +# Specify libraries needed when liking the intel_opae_mmd library +target_link_libraries(intel_opae_mmd + libopae-c + libnuma +) + +# Set the installation rules for the project +install(TARGETS intel_opae_mmd + LIBRARY DESTINATION lib + COMPONENT intel_opae_mmd +) diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/cmake/modules/FindNUMA.cmake b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/cmake/modules/FindNUMA.cmake new file mode 100644 index 0000000..c981150 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/cmake/modules/FindNUMA.cmake @@ -0,0 +1,34 @@ +# - Try to find libnuma +# Once done will define: +# +# NUMA_FOUND - system has libnuma +# NUMA_INCLUDE_DIRS - include directory with numa.h +# NUMA_LIBRARIES - link with this for libnuma + +find_path(NUMA_INCLUDE_DIRS + NAMES numa.h + PATHS + ${LIBNUMA_ROOT}/include + /usr/include + 
/p/psg/swip/dla/resources/numactl/2.0.16/include + + ) + +find_library(NUMA_LIBRARIES + NAMES numa + PATHS + ${LIBNUMA_ROOT}/lib + ${LIBNUMA_ROOT}/lib64 + /usr/lib + /usr/lib64 + /p/psg/swip/dla/resources/numactl/2.0.16/lib + + ) + +FIND_PACKAGE_HANDLE_STANDARD_ARGS(NUMA + REQUIRED_VARS NUMA_INCLUDE_DIRS NUMA_LIBRARIES) + +add_library(libnuma IMPORTED SHARED) +set_target_properties(libnuma PROPERTIES + IMPORTED_LOCATION ${NUMA_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES ${NUMA_INCLUDE_DIRS}) diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/cmake/modules/FindOPAE.cmake b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/cmake/modules/FindOPAE.cmake new file mode 100644 index 0000000..6395d7c --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/cmake/modules/FindOPAE.cmake @@ -0,0 +1,44 @@ +# - Try to find libintelfpga +# Once done, this will define +# +# libopae-c_FOUND - system has libopae-c +# libopae-c_INCLUDE_DIRS - the libopae-c include directories +# libopae-c_LIBRARIES - link these to use libopae-c + +find_package(PkgConfig) +pkg_check_modules(PC_OPAE QUIET opae-c) + +# Use pkg-config to get hints about paths +execute_process(COMMAND pkg-config --cflags opae-c --silence-errors + COMMAND cut -d I -f 2 + OUTPUT_VARIABLE OPAE-C_PKG_CONFIG_INCLUDE_DIRS) +set(OPAE-C_PKG_CONFIG_INCLUDE_DIRS "${OPAE-C_PKG_CONFIG_INCLUDE_DIRS}" CACHE STRING "Compiler flags for OPAE-C library") + +# Include dir +find_path(libopae-c_INCLUDE_DIRS + NAMES opae/fpga.h + PATHS ${LIBOPAE-C_ROOT}/include + ${OPAE-C_PKG_CONFIG_INCLUDE_DIRS} + /usr/local/include + /usr/include + ${CMAKE_EXTRA_INCLUDES}) + +# The library itself +find_library(libopae-c_LIBRARIES + NAMES opae-c + PATHS ${LIBOPAE-C_ROOT}/lib + ${LIBOPAE-C_ROOT}/lib64 + /usr/local/lib + /usr/lib + /lib + /usr/lib/x86_64-linux-gnu + ${CMAKE_EXTRA_LIBS}) + +FIND_PACKAGE_HANDLE_STANDARD_ARGS(OPAE + REQUIRED_VARS libopae-c_LIBRARIES libopae-c_INCLUDE_DIRS) + +add_library(libopae-c IMPORTED 
SHARED) +set_target_properties(libopae-c PROPERTIES + IMPORTED_LOCATION ${libopae-c_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES ${libopae-c_INCLUDE_DIRS}) + diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/.gitignore b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/.gitignore new file mode 100644 index 0000000..1530978 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/.gitignore @@ -0,0 +1 @@ +*.o
\ No newline at end of file diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/afu_bbb_util.h b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/afu_bbb_util.h new file mode 100644 index 0000000..6d8f9fa --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/afu_bbb_util.h @@ -0,0 +1,123 @@ +/* (C) 1992-2017 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. */ + +/** + * \fpga_dma.h + * \brief FPGA DMA BBB API Header + * + * Known Limitations + * - Driver does not support Address Span Extender + * - Implementation is not optimized for performance. 
+ * User buffer data is copied into a DMA-able buffer before the transfer + * - Supports only synchronous (blocking) transfers + */ + +#ifndef AFU_BBB_UTIL_H__ +#define AFU_BBB_UTIL_H__ + +#include <assert.h> +#include <opae/fpga.h> +#include <uuid/uuid.h> + +#define DFH_FEATURE_EOL(dfh) (((dfh >> 40) & 1) == 1) +#define DFH_FEATURE(dfh) ((dfh >> 60) & 0xf) +#define DFH_FEATURE_IS_PRIVATE(dfh) (DFH_FEATURE(dfh) == 3) +#define DFH_FEATURE_IS_BBB(dfh) (DFH_FEATURE(dfh) == 2) +#define DFH_FEATURE_IS_AFU(dfh) (DFH_FEATURE(dfh) == 1) +#define DFH_FEATURE_NEXT(dfh) ((dfh >> 16) & 0xffffff) + +static bool find_dfh_by_guid(fpga_handle afc_handle, + uint64_t find_id_l, + uint64_t find_id_h, + uint64_t *result_offset = NULL, + uint64_t *result_next_offset = NULL) { + assert(find_id_l); + assert(find_id_h); + + uint64_t offset = 0; + if (result_offset) { + offset = *result_offset; + } + uint64_t dfh = 0; + + // Limit the maximum number of DFH search iterations to avoid getting stuck + // in an infinte loop in case the DFH_FEATURE_EOL is not found. Limit of + // 5000 is very conservaitve. In practice search should terminate in 3 or + // fewer iterations. 
+ int MAX_DFH_SEARCHES = 5000; + int dfh_search_iterations = 0; + + do { + fpgaReadMMIO64(afc_handle, 0, offset, &dfh); + + int is_bbb = DFH_FEATURE_IS_BBB(dfh); + int is_afu = DFH_FEATURE_IS_AFU(dfh); + + if (is_afu || is_bbb) { + uint64_t id_l = 0; + uint64_t id_h = 0; + fpgaReadMMIO64(afc_handle, 0, offset + 8, &id_l); + fpgaReadMMIO64(afc_handle, 0, offset + 16, &id_h); + + if (find_id_l == id_l && find_id_h == id_h) { + if (result_offset) *result_offset = offset; + if (result_next_offset) *result_next_offset = DFH_FEATURE_NEXT(dfh); + return true; + } + } + offset += DFH_FEATURE_NEXT(dfh); + + dfh_search_iterations++; + if (dfh_search_iterations > MAX_DFH_SEARCHES) { + return false; + } + } while (!DFH_FEATURE_EOL(dfh)); + + return false; +} + +static bool find_dfh_by_guid(fpga_handle afc_handle, + const char *guid_str, + uint64_t *result_offset = NULL, + uint64_t *result_next_offset = NULL) { + fpga_guid guid; + + if (uuid_parse(guid_str, guid) < 0) return 0; + + uint32_t i; + uint32_t s; + + uint64_t find_id_l = 0; + uint64_t find_id_h = 0; + + // The API expects the MSB of the GUID at [0] and the LSB at [15]. + s = 64; + for (i = 0; i < 8; ++i) { + s -= 8; + find_id_h = ((find_id_h << 8) | (0xff & guid[i])); + } + + s = 64; + for (i = 0; i < 8; ++i) { + s -= 8; + find_id_l = ((find_id_l << 8) | (0xff & guid[8 + i])); + } + + return find_dfh_by_guid(afc_handle, find_id_l, find_id_h, result_offset, result_next_offset); +} + +#endif // AFU_BBB_UTIL_H__ diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/ccip_mmd.cpp b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/ccip_mmd.cpp new file mode 100644 index 0000000..b7cd06a --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/ccip_mmd.cpp @@ -0,0 +1,655 @@ +/* (C) 1992-2017 Intel Corporation. 
*/ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. */ + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <zlib.h> + +#include <cassert> +#include <iomanip> +#include <iostream> +#include <map> +#include <sstream> + +#ifdef DLA_MMD +#include <chrono> +#include <thread> +#endif + +#include <safe_string/safe_string.h> +#include "memcpy_s_fast.h" + +#include "aocl_mmd.h" +#include "ccip_mmd_device.h" + +using namespace intel_opae_mmd; + +#define ACL_DCP_ERROR_IF(COND, NEXT, ...) \ + do { \ + if (COND) { \ + printf("\nMMD ERROR: " __VA_ARGS__); \ + fflush(stdout); \ + NEXT; \ + } \ + } while (0) + +#define ACL_PKG_SECTION_DCP_GBS_GZ ".acl.gbs.gz" + +// If the MMD is loaded dynamically, destructors in the MMD will execute before the destructors in the runtime +// upon program termination. 
The DeviceMapManager guards accesses to the device/handle maps to make sure +// the runtime doesn't get to reference them after MMD destructors have been called. +// Destructor makes sure that all devices are closed at program termination regardless of what the runtime does. +// Implemented as a singleton. +class DeviceMapManager final { + public: + typedef std::map<int, CcipDevice*> t_handle_to_dev_map; + typedef std::map<uint64_t, int> t_id_to_handle_map; + + static const int SUCCESS = 0; + static const int FAILURE = -1; + + // Returns handle and device pointer to the device with the specified name + // Creates a new entry for this device if it doesn't already exist + // Return 0 on success, -1 on failure + int get_or_create_device(const char* board_name, int* handle, CcipDevice** device); + + // Return obj id based on BSP name. + uint64_t id_from_name(const char* board_name); + + // Return MMD handle based on obj id. Returned value is negative if board doesn't exist + inline int handle_from_id(uint64_t obj_id); + + // Return pointer to CCIP device based on MMD handle. 
Returned value is null if board doesn't exist + CcipDevice* device_from_handle(int handle); + + // Closes specified device if it exists + void close_device_if_exists(int handle); + + // Returns a reference to the class singleton + static DeviceMapManager& get_instance() { + static DeviceMapManager instance; + return instance; + } + + DeviceMapManager(DeviceMapManager const&) = delete; + void operator=(DeviceMapManager const&) = delete; + ~DeviceMapManager() { + // delete all allocated CcipDevice* entries + while (handle_to_dev_map->size() > 0) { + int handle = handle_to_dev_map->begin()->first; + aocl_mmd_close(handle); + } + delete handle_to_dev_map; + delete id_to_handle_map; + handle_to_dev_map = nullptr; + id_to_handle_map = nullptr; + } + + private: + DeviceMapManager() { + handle_to_dev_map = new t_handle_to_dev_map(); + id_to_handle_map = new t_id_to_handle_map(); + } + t_handle_to_dev_map* handle_to_dev_map = nullptr; + t_id_to_handle_map* id_to_handle_map = nullptr; +}; +static DeviceMapManager& device_manager = DeviceMapManager::get_instance(); + +int DeviceMapManager::get_or_create_device(const char* board_name, int* handle, CcipDevice** device) { + int _handle = CCIP_MMD_INVALID_PARAM; + CcipDevice* _device = nullptr; + + if (id_to_handle_map == nullptr || handle_to_dev_map == nullptr) { + return DeviceMapManager::FAILURE; + } + + uint64_t obj_id = id_from_name(board_name); + if (id_to_handle_map->count(obj_id) == 0) { + try { + _device = new CcipDevice(obj_id); + _handle = _device->get_mmd_handle(); + id_to_handle_map->insert({obj_id, _handle}); + handle_to_dev_map->insert({_handle, _device}); + } catch (std::runtime_error& e) { + LOG_ERR("%s\n", e.what()); + delete _device; + return DeviceMapManager::FAILURE; + } + } else { + _handle = id_to_handle_map->at(obj_id); + _device = handle_to_dev_map->at(_handle); + } + + (*handle) = _handle; + (*device) = _device; + return DeviceMapManager::SUCCESS; +} + +uint64_t DeviceMapManager::id_from_name(const char* 
board_name) { + uint64_t obj_id = 0; + if (CcipDevice::parse_board_name(board_name, obj_id)) { + return obj_id; + } else { + // TODO: add error hanlding for DeviceMapManager (make sure 0 is marked as invalid device) + return 0; + } +} + +inline int DeviceMapManager::handle_from_id(uint64_t obj_id) { + int handle = CCIP_MMD_INVALID_PARAM; + if (id_to_handle_map) { + auto it = id_to_handle_map->find(obj_id); + if (it != id_to_handle_map->end()) { + handle = it->second; + } + } + return handle; +} + +CcipDevice* DeviceMapManager::device_from_handle(int handle) { + CcipDevice* dev = nullptr; + if (handle_to_dev_map) { + auto it = handle_to_dev_map->find(handle); + if (it != handle_to_dev_map->end()) { + return it->second; + } + } + return dev; +} + +void DeviceMapManager::close_device_if_exists(int handle) { + if (handle_to_dev_map) { + if (handle_to_dev_map->count(handle) > 0) { + CcipDevice* dev = handle_to_dev_map->at(handle); + uint64_t obj_id = dev->get_fpga_obj_id(); + delete dev; + handle_to_dev_map->erase(handle); + id_to_handle_map->erase(obj_id); + } + } +} + +// Interface for checking if AFU has BSP loaded +bool ccip_mmd_bsp_loaded(const char* name) { + uint64_t obj_id = device_manager.id_from_name(name); + if (!obj_id) { + return false; + } + + int handle = device_manager.handle_from_id(obj_id); + if (handle > 0) { + CcipDevice* dev = device_manager.device_from_handle(handle); + if (dev) + return dev->bsp_loaded(); + else + return false; + } else { + bool bsp_loaded = false; + try { + CcipDevice dev(obj_id); + bsp_loaded = dev.bsp_loaded(); + } catch (std::runtime_error& e) { + LOG_ERR("%s\n", e.what()); + return false; + } + return bsp_loaded; + } +} + +static int get_offline_num_acl_boards(bool bsp_only = true) { + fpga_guid dcp_guid; + fpga_result res = FPGA_OK; + uint32_t num_matches = 0; + bool ret_err = false; + fpga_properties filter = NULL; + + if (uuid_parse(DCP_OPENCL_BSP_AFU_ID, dcp_guid) < 0) { + LOG_ERR("Error parsing guid '%s'\n", 
DCP_OPENCL_BSP_AFU_ID); + ret_err = true; + goto out; + } + + res = fpgaGetProperties(NULL, &filter); + if (res != FPGA_OK) { + LOG_ERR("Error creating properties object: %s\n", fpgaErrStr(res)); + ret_err = true; + goto out; + } + + if (bsp_only) { + res = fpgaPropertiesSetGUID(filter, dcp_guid); + if (res != FPGA_OK) { + LOG_ERR("Error setting GUID: %s\n", fpgaErrStr(res)); + ret_err = true; + goto out; + } + } + + res = fpgaPropertiesSetObjectType(filter, FPGA_ACCELERATOR); + if (res != FPGA_OK) { + LOG_ERR("Error setting object type: %s\n", fpgaErrStr(res)); + ret_err = true; + goto out; + } + + res = fpgaEnumerate(&filter, 1, NULL, 0, &num_matches); + if (res != FPGA_OK) { + LOG_ERR("Error enumerating AFCs: %s\n", fpgaErrStr(res)); + ret_err = true; + goto out; + } + +out: + if (filter) fpgaDestroyProperties(&filter); + + if (ret_err) { + return CCIP_MMD_AOCL_ERR; + } else { + return num_matches; + } +} + +bool static get_offline_board_names(std::string& boards, bool bsp_only = true) { + fpga_guid dcp_guid; + fpga_result res = FPGA_OK; + uint32_t num_matches = 0; + fpga_properties filter = nullptr; + fpga_properties prop = nullptr; + std::ostringstream board_name; + fpga_token* toks = nullptr; + uint64_t obj_id; + bool success = true; + + if (uuid_parse(DCP_OPENCL_BSP_AFU_ID, dcp_guid) < 0) { + LOG_ERR("Error parsing guid '%s'\n", DCP_OPENCL_BSP_AFU_ID); + success = false; + goto cleanup; + } + + res = fpgaGetProperties(NULL, &filter); + if (res != FPGA_OK) { + LOG_ERR("Error creating properties object: %s\n", fpgaErrStr(res)); + success = false; + goto cleanup; + } + + res = fpgaPropertiesSetObjectType(filter, FPGA_ACCELERATOR); + if (res != FPGA_OK) { + LOG_ERR("Error setting object type: %s\n", fpgaErrStr(res)); + success = false; + goto cleanup; + } + + if (bsp_only) { + res = fpgaPropertiesSetGUID(filter, dcp_guid); + if (res != FPGA_OK) { + LOG_ERR("Error setting GUID: %s\n", fpgaErrStr(res)); + success = false; + goto cleanup; + } + } + res = 
fpgaEnumerate(&filter, 1, NULL, 0, &num_matches); + if (res != FPGA_OK) { + LOG_ERR("Error enumerating AFCs: %s\n", fpgaErrStr(res)); + success = false; + goto cleanup; + } + + toks = static_cast<fpga_token*>(calloc(num_matches, sizeof(fpga_token))); + if (toks == NULL) { + LOG_ERR("Error allocating memory\n"); + success = false; + goto cleanup; + } + + res = fpgaEnumerate(&filter, 1, toks, num_matches, &num_matches); + if (res != FPGA_OK) { + LOG_ERR("Error enumerating AFCs: %s\n", fpgaErrStr(res)); + success = false; + goto cleanup; + } + + for (unsigned int i = 0; i < num_matches; i++) { + if (prop) fpgaDestroyProperties(&prop); + res = fpgaGetProperties(toks[i], &prop); + if (res == FPGA_OK) { + res = fpgaPropertiesGetObjectID(prop, &obj_id); + if (res != FPGA_OK) { + LOG_ERR("Error reading object ID: %s\n", fpgaErrStr(res)); + success = false; + break; + } + boards.append(CcipDevice::get_board_name(BSP_NAME, obj_id)); + if (i < num_matches - 1) boards.append(";"); + } else { + success = false; + LOG_ERR("Error reading properties: %s\n", fpgaErrStr(res)); + } + } + +cleanup: + if (prop) { + fpgaDestroyProperties(&prop); + } + if (filter) { + fpgaDestroyProperties(&filter); + } + if (toks) { + for (unsigned i = 0; i < num_matches; i++) { + if (toks[i]) { + fpgaDestroyToken(&toks[i]); + } + } + free(toks); + } + + return success; +} + +int aocl_mmd_yield(int handle) { + DEBUG_PRINT("* Called: aocl_mmd_yield\n"); + YIELD_DELAY(); + + CcipDevice* dev = device_manager.device_from_handle(handle); + assert(dev); + if (dev) { + return dev->yield(); + } + + return 0; +} + +// Macros used for acol_mmd_get_offline_info and aocl_mmd_get_info +#define RESULT_INT(X) \ + { \ + *((int*)param_value) = X; \ + if (param_size_ret) *param_size_ret = sizeof(int); \ + } +#define RESULT_STR(X) \ + do { \ + unsigned Xlen = strlen(X) + 1; \ + unsigned Xcpylen = (param_value_size <= Xlen) ? 
param_value_size : Xlen; \ + memcpy_s_fast((void*)param_value, param_value_size, X, Xcpylen); \ + if (param_size_ret) *param_size_ret = Xcpylen; \ + } while (0) + +int aocl_mmd_get_offline_info(aocl_mmd_offline_info_t requested_info_id, + size_t param_value_size, + void* param_value, + size_t* param_size_ret) { + // aocl_mmd_get_offline_info can be called many times by the runtime + // and it is expensive to query the system. Only compute values first + // time aocl_mmd_get_offline_info called future iterations use saved results + static bool initialized = false; + static int mem_type_info; + static int num_acl_boards; + static std::string boards; + static bool success; + + if (!initialized) { + mem_type_info = (int)AOCL_MMD_PHYSICAL_MEMORY; + num_acl_boards = get_offline_num_acl_boards(); + success = get_offline_board_names(boards, true); + initialized = true; + } + + switch (requested_info_id) { + case AOCL_MMD_VERSION: + RESULT_STR(AOCL_MMD_VERSION_STRING); + break; + case AOCL_MMD_NUM_BOARDS: { + if (num_acl_boards >= 0) { + RESULT_INT(num_acl_boards); + } else { + return CCIP_MMD_AOCL_ERR; + } + break; + } + case AOCL_MMD_VENDOR_NAME: + RESULT_STR("Intel Corp"); + break; + case AOCL_MMD_BOARD_NAMES: { + if (success) { + RESULT_STR(boards.c_str()); + } else { + return CCIP_MMD_AOCL_ERR; + } + break; + } + case AOCL_MMD_VENDOR_ID: + RESULT_INT(0); + break; + case AOCL_MMD_USES_YIELD: + RESULT_INT(KernelInterrupt::yield_is_enabled()); + break; + case AOCL_MMD_MEM_TYPES_SUPPORTED: + RESULT_INT(mem_type_info); + break; + } + + return 0; +} + +int ccip_mmd_get_offline_board_names(size_t param_value_size, void* param_value, size_t* param_size_ret) { + std::string boards; + bool success = get_offline_board_names(boards, false); + if (success) { + RESULT_STR(boards.c_str()); + } else { + RESULT_INT(-1); + } + + return 0; +} + +int aocl_mmd_get_info( + int handle, aocl_mmd_info_t requested_info_id, size_t param_value_size, void* param_value, size_t* param_size_ret) { + 
DEBUG_PRINT("called aocl_mmd_get_info\n"); + CcipDevice* dev = device_manager.device_from_handle(handle); + if (dev == NULL) return 0; + + assert(param_value); + switch (requested_info_id) { + case AOCL_MMD_BOARD_NAME: { + std::ostringstream board_name; + board_name << "Intel PAC Platform" + << " (" << dev->get_dev_name() << ")"; + RESULT_STR(board_name.str().c_str()); + break; + } + case AOCL_MMD_NUM_KERNEL_INTERFACES: + RESULT_INT(1); + break; + case AOCL_MMD_KERNEL_INTERFACES: + RESULT_INT(AOCL_MMD_KERNEL); + break; +#ifdef SIM + case AOCL_MMD_PLL_INTERFACES: + RESULT_INT(-1); + break; +#else + case AOCL_MMD_PLL_INTERFACES: + RESULT_INT(-1); + break; +#endif + case AOCL_MMD_MEMORY_INTERFACE: + RESULT_INT(AOCL_MMD_MEMORY); + break; + case AOCL_MMD_PCIE_INFO: { + RESULT_STR(dev->get_bdf().c_str()); + break; + } + case AOCL_MMD_BOARD_UNIQUE_ID: + RESULT_INT(0); + break; + case AOCL_MMD_TEMPERATURE: { + if (param_value_size == sizeof(float)) { + float* ptr = static_cast<float*>(param_value); + *ptr = dev->get_temperature(); + if (param_size_ret) *param_size_ret = sizeof(float); + } + break; + } + case AOCL_MMD_CONCURRENT_READS: + RESULT_INT(1); + break; + case AOCL_MMD_CONCURRENT_WRITES: + RESULT_INT(1); + break; + case AOCL_MMD_CONCURRENT_READS_OR_WRITES: + RESULT_INT(2); + break; + } + return 0; +} + +#undef RESULT_INT +#undef RESULT_STR + +int aocl_mmd_set_interrupt_handler(int handle, aocl_mmd_interrupt_handler_fn fn, void* user_data) { + CcipDevice* dev = device_manager.device_from_handle(handle); + if (dev) { + dev->set_kernel_interrupt(fn, user_data); + } else { + return CCIP_MMD_AOCL_ERR; + } + return 0; +} + +int aocl_mmd_set_status_handler(int handle, aocl_mmd_status_handler_fn fn, void* user_data) { + CcipDevice* dev = device_manager.device_from_handle(handle); + if (dev) dev->set_status_handler(fn, user_data); + // TODO: handle error condition if dev null + return 0; +} + +// Host to device-global-memory write +int aocl_mmd_write(int handle, 
aocl_mmd_op_t op, size_t len, const void* src, int mmd_interface, size_t offset) { + DCP_DEBUG_MEM("\n- aocl_mmd_write: %d\t %p\t %lu\t %p\t %d\t %lu\n", handle, op, len, src, mmd_interface, offset); + CcipDevice* dev = device_manager.device_from_handle(handle); + if (dev) + return dev->write_block(op, mmd_interface, src, offset, len); + else + return -1; + // TODO: handle error condition if dev null +} + +int aocl_mmd_read(int handle, aocl_mmd_op_t op, size_t len, void* dst, int mmd_interface, size_t offset) { + DCP_DEBUG_MEM("\n+ aocl_mmd_read: %d\t %p\t %lu\t %p\t %d\t %lu\n", handle, op, len, dst, mmd_interface, offset); + CcipDevice* dev = device_manager.device_from_handle(handle); + if (dev) + return dev->read_block(op, mmd_interface, dst, offset, len); + else + return -1; + // TODO: handle error condition if dev null +} + +int aocl_mmd_open(const char* name) { + DEBUG_PRINT("Opening device: %s\n", name); + + uint64_t obj_id = device_manager.id_from_name(name); + if (!obj_id) { + return CCIP_MMD_INVALID_PARAM; + } + + int handle; + CcipDevice* dev = nullptr; + if (device_manager.get_or_create_device(name, &handle, &dev) != DeviceMapManager::SUCCESS) { + delete dev; + return CCIP_MMD_AOCL_ERR; + } + + assert(dev); + if (dev->bsp_loaded()) { + if (!dev->initialize_bsp()) { + LOG_ERR("Error initializing bsp\n"); + return CCIP_MMD_BSP_INIT_FAILED; + } + } else { + return CCIP_MMD_BSP_NOT_LOADED; + } + + return handle; +} + +int aocl_mmd_close(int handle) { + device_manager.close_device_if_exists(handle); + + return 0; +} + +// CoreDLA modifications +// To support multiple different FPGA boards, anything board specific must be implemented in a +// board-specific MMD instead of the CoreDLA runtime layer. 
+#ifdef DLA_MMD +// Query functions to get board-specific values +AOCL_MMD_CALL int dla_mmd_get_max_num_instances() { return 2; } +AOCL_MMD_CALL uint64_t dla_mmd_get_ddr_size_per_instance() { return 1ULL << 32; } +AOCL_MMD_CALL double dla_mmd_get_ddr_clock_freq() { return 266.666667; } // MHz + +// Helper functions for the wrapper functions around CSR and DDR +uint64_t dla_get_raw_csr_address(int instance, uint64_t addr) { return 0x38000 + (0x1000 * instance) + addr; } +uint64_t dla_get_raw_ddr_address(int instance, uint64_t addr) { return (1ULL << 32) * instance + addr; } + +// Wrappers around CSR and DDR reads and writes to abstract away board-specific offsets +AOCL_MMD_CALL int dla_mmd_csr_write(int handle, int instance, uint64_t addr, const uint32_t* data) { + return aocl_mmd_write(handle, NULL, sizeof(uint32_t), data, AOCL_MMD_KERNEL, dla_get_raw_csr_address(instance, addr)); +} +AOCL_MMD_CALL int dla_mmd_csr_read(int handle, int instance, uint64_t addr, uint32_t* data) { + return aocl_mmd_read(handle, NULL, sizeof(uint32_t), data, AOCL_MMD_KERNEL, dla_get_raw_csr_address(instance, addr)); +} +AOCL_MMD_CALL int dla_mmd_ddr_write(int handle, int instance, uint64_t addr, uint64_t length, const void* data) { + return aocl_mmd_write(handle, NULL, length, data, AOCL_MMD_MEMORY, dla_get_raw_ddr_address(instance, addr)); +} +AOCL_MMD_CALL int dla_mmd_ddr_read(int handle, int instance, uint64_t addr, uint64_t length, void* data) { + return aocl_mmd_read(handle, NULL, length, data, AOCL_MMD_MEMORY, dla_get_raw_ddr_address(instance, addr)); +} + +// Get the PLL clock frequency in MHz, returns a negative value if there is an error +AOCL_MMD_CALL double dla_mmd_get_coredla_clock_freq(int handle) { + constexpr uint64_t hw_timer_address = 0x37000; + const uint32_t start_bit = 1; + const uint32_t stop_bit = 2; + + // Send the start command to the hardware counter + std::chrono::high_resolution_clock::time_point time_before = std::chrono::high_resolution_clock::now(); + int 
status = aocl_mmd_write(handle, NULL, sizeof(uint32_t), &start_bit, AOCL_MMD_KERNEL, hw_timer_address); + assert(status == 0); + + // Unlikely to sleep for exactly 10 milliseconds, but it doesn't matter since we use a high resolution clock to + // determine the amount of time between the start and stop commands for the hardware counter + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + + // Send the stop command to the hardware counter + std::chrono::high_resolution_clock::time_point time_after = std::chrono::high_resolution_clock::now(); + status = aocl_mmd_write(handle, NULL, sizeof(uint32_t), &stop_bit, AOCL_MMD_KERNEL, hw_timer_address); + assert(status == 0); + + // Read back the value of the counter + uint32_t counter = 0; + status = aocl_mmd_read(handle, NULL, sizeof(uint32_t), &counter, AOCL_MMD_KERNEL, hw_timer_address); + assert(status == 0); + + // Calculate the clock frequency of the counter, which is running on clk_dla + double elapsed_seconds = std::chrono::duration_cast<std::chrono::duration<double>>(time_after - time_before).count(); + return 1.0e-6 * counter / elapsed_seconds; // 1.0e-6 is to convert to MHz +} + +#endif diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/ccip_mmd_device.cpp b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/ccip_mmd_device.cpp new file mode 100644 index 0000000..9bc055a --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/ccip_mmd_device.cpp @@ -0,0 +1,579 @@ +/* (C) 1992-2017 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. 
See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. */ + +#include <assert.h> +#include <numa.h> + +#include <unistd.h> +#include <fstream> +#include <iomanip> +#include <iostream> +#include <limits> +#include <sstream> + +#include <safe_string/safe_string.h> +#include "memcpy_s_fast.h" + +#include "ccip_mmd_device.h" + +// TODO: better encapsulation of afu_bbb_util functions +#include "afu_bbb_util.h" + +#define MMD_COPY_BUFFER_SIZE (1024 * 1024) + +#define MEM_WINDOW_BBB_GUID "72347537-7821-4125-442a-472d4b615064" +#define MEM_WINDOW_BBB_SIZE 8192 + +#define MSGDMA_BBB_GUID "ef82def7-f6ec-40fc-a914-9a35bace01ea" +#define MSGDMA_BBB_SIZE 256 + +#define NULL_DFH_BBB_GUID "da1182b1-b344-4e23-90fe-6aab12a0132f" +#define BSP_AFU_GUID "96ef4230-dafa-cb5f-18b7-9ffa2ee54aa0" + +using namespace intel_opae_mmd; + +int CcipDevice::next_mmd_handle{1}; + +std::string CcipDevice::get_board_name(std::string prefix, uint64_t obj_id) { + std::ostringstream stream; + stream << prefix << std::setbase(16) << obj_id; + return stream.str(); +} + +CcipDevice::CcipDevice(uint64_t obj_id) + : fpga_obj_id(obj_id), + 
kernel_interrupt_thread(NULL), + event_update(NULL), + event_update_user_data(NULL), + enable_set_numa(false), + fme_sysfs_temp_initialized(false), + bus(0), + device(0), + function(0), + afu_initialized(false), + bsp_initialized(false), + mmio_is_mapped(false), + afc_handle(NULL), + filter(NULL), + afc_token(NULL), + dma_ch0_dfh_offset(0), + dma_ch1_dfh_offset(0), + dma_ase_dfh_offset(0), + dma_host_to_fpga(NULL), + dma_fpga_to_host(NULL), + mmd_copy_buffer(NULL) { + // Note that this constructor is not thread-safe because next_mmd_handle + // is shared between all class instances + mmd_handle = next_mmd_handle; + if (next_mmd_handle == std::numeric_limits<int>::max()) + next_mmd_handle = 1; + else + next_mmd_handle++; + + mmd_copy_buffer = (char *)malloc(MMD_COPY_BUFFER_SIZE); + if (mmd_copy_buffer == NULL) { + throw std::runtime_error(std::string("malloc failed for mmd_copy_buffer")); + } + + fpga_result res = FPGA_OK; + uint32_t num_matches; + + res = fpgaGetProperties(NULL, &filter); + if (res != FPGA_OK) { + throw std::runtime_error(std::string("Error creating properties object: ") + std::string(fpgaErrStr(res))); + } + + res = fpgaPropertiesSetObjectType(filter, FPGA_ACCELERATOR); + if (res != FPGA_OK) { + throw std::runtime_error(std::string("Error setting object type: ") + std::string(fpgaErrStr(res))); + } + + res = fpgaPropertiesSetObjectID(filter, obj_id); + if (res != FPGA_OK) { + throw std::runtime_error(std::string("Error setting object ID: ") + std::string(fpgaErrStr(res))); + } + + res = fpgaEnumerate(&filter, 1, &afc_token, 1, &num_matches); + if (res != FPGA_OK) { + throw std::runtime_error(std::string("Error enumerating AFCs: ") + std::string(fpgaErrStr(res))); + } + + if (num_matches < 1) { + res = fpgaDestroyProperties(&filter); + throw std::runtime_error("AFC not found"); + } + + res = fpgaOpen(afc_token, &afc_handle, 0); + if (res != FPGA_OK) { + throw std::runtime_error(std::string("Error opening AFC: ") + std::string(fpgaErrStr(res))); + } 
+
+  fpga_properties prop = nullptr;
+  res = fpgaGetProperties(afc_token, &prop);
+  if (res != FPGA_OK) {
+    throw std::runtime_error(std::string("Error reading properties: ") + std::string(fpgaErrStr(res)));
+  }
+
+  if (prop) {
+    res = fpgaPropertiesGetBus(prop, &bus);
+    if (res != FPGA_OK) {
+      throw std::runtime_error(std::string("Error reading bus: ") + std::string(fpgaErrStr(res)));
+    }
+    res = fpgaPropertiesGetDevice(prop, &device);
+    if (res != FPGA_OK) {
+      throw std::runtime_error(std::string("Error reading device: ") + std::string(fpgaErrStr(res)));
+    }
+    res = fpgaPropertiesGetFunction(prop, &function);
+    if (res != FPGA_OK) {
+      throw std::runtime_error(std::string("Error reading function: ") + std::string(fpgaErrStr(res)));
+    }
+    fpgaDestroyProperties(&prop);
+  }
+
+  initialize_fme_sysfs();
+
+  mmd_dev_name = get_board_name(BSP_NAME, obj_id);
+  afu_initialized = true;
+}
+
+// Return true if board name parses correctly, false if it does not
+// Return the parsed object_id in obj_id as an [out] parameter
+bool CcipDevice::parse_board_name(const char *board_name_str, uint64_t &obj_id) {
+  std::string prefix(BSP_NAME);
+  std::string board_name(board_name_str);
+
+  obj_id = 0;
+  // The name is invalid if it is too short to carry an object-ID suffix OR if
+  // it does not begin with the expected BSP prefix. The original code joined
+  // these checks with '&&', which let a sufficiently long name with the wrong
+  // prefix slip through and feed arbitrary text to std::stol below.
+  if (board_name.length() <= prefix.length() || board_name.compare(0, prefix.length(), prefix) != 0) {
+    LOG_ERR("Error parsing device name '%s'\n", board_name_str);
+    return false;
+  }
+
+  std::string device_num_str = board_name.substr(prefix.length());
+  obj_id = std::stol(device_num_str, 0, 16);
+
+  // Assume that OPAE does not use 0 as a valid object ID. This is true for now
+  // but relies somewhat on an implementation dependent feature.
+  assert(obj_id > 0);
+  return true;
+}
+
+// Read information directly from sysfs. This is non-portable and relies on
+// paths set in driver (will not interoperate between DFH driver in up-stream
+// kernel and Intel driver distributed with PAC cards).
In the future hopefully +// OPAE can provide SDK to read this information +void CcipDevice::initialize_fme_sysfs() { + const int MAX_LEN = 250; + char temp_fmepath[MAX_LEN]; + char numa_path[MAX_LEN]; + + // HACK: currently ObjectID is constructed using its lower 20 bits + // as the device minor number. The device minor number also matches + // the device ID in sysfs. This is a simple way to construct a path + // to the device FME using information that is already available (object_id). + // Eventually this code should be replaced with a direct call to OPAE C API, + // but API does not currently expose the device temperature. + int dev_num = 0xFFFFF & fpga_obj_id; + + // Path to temperature value + snprintf(temp_fmepath, + MAX_LEN, + "/sys/class/fpga/intel-fpga-dev.%d/intel-fpga-fme.%d/thermal_mgmt/temperature", + dev_num, + dev_num); + // Path to NUMA node + snprintf(numa_path, MAX_LEN, "/sys/class/fpga/intel-fpga-dev.%d/device/numa_node", dev_num); + + // Try to open the sysfs file. If open succeeds then set as initialized + // to be able to read temperature in future. If open fails then not + // initalized and skip attempt to read temperature in future. + FILE *tmp; + tmp = fopen(temp_fmepath, "r"); + if (tmp) { + fme_sysfs_temp_path = std::string(temp_fmepath); + fme_sysfs_temp_initialized = true; + fclose(tmp); + } + + // Read NUMA node and set value for future use. 
If not available set to -1 + // and disable use of NUMA setting + std::ifstream sysfs_numa_node(numa_path, std::ifstream::in); + if (sysfs_numa_node.is_open()) { + sysfs_numa_node >> fpga_numa_node; + sysfs_numa_node.close(); + if (std::stoi(fpga_numa_node) >= 0) { + enable_set_numa = true; + } else { + enable_set_numa = false; + } + } else { + enable_set_numa = false; + fpga_numa_node = "-1"; + } +} + +bool CcipDevice::find_dma_dfh_offsets() { + uint64_t dfh_offset = 0; + uint64_t next_dfh_offset = 0; + if (find_dfh_by_guid(afc_handle, MSGDMA_BBB_GUID, &dfh_offset, &next_dfh_offset)) { + dma_ch0_dfh_offset = dfh_offset; + DEBUG_PRINT("DMA CH1 offset: 0x%lX\t GUID: %s\n", dma_ch0_dfh_offset, MSGDMA_BBB_GUID); + } else { + fprintf(stderr, "Error initalizing DMA: Cannot find DMA channel 0 DFH offset\n"); + return false; + } + + dfh_offset += next_dfh_offset; + if (find_dfh_by_guid(afc_handle, MSGDMA_BBB_GUID, &dfh_offset, &next_dfh_offset)) { + dma_ch1_dfh_offset = dfh_offset; + DEBUG_PRINT("DMA CH2 offset: 0x%lX\t GUID: %s\n", dma_ch1_dfh_offset, MSGDMA_BBB_GUID); + } else { + fprintf(stderr, "Error initalizing DMA. Cannot find DMA channel 2 DFH offset\n"); + return false; + } + + dfh_offset = 0; + if (find_dfh_by_guid(afc_handle, MEM_WINDOW_BBB_GUID, &dfh_offset, &next_dfh_offset)) { + dma_ase_dfh_offset = dfh_offset; + DEBUG_PRINT("DMA ASE offset: 0x%lX\t GUID: %s\n", dma_ase_dfh_offset, MEM_WINDOW_BBB_GUID); + } else { + fprintf(stderr, "Error initalizing DMA. 
Cannot find ASE DFH offset\n"); + return false; + } + + assert(dma_ch0_dfh_offset != 0); + assert(dma_ch1_dfh_offset != 0); + assert(dma_ase_dfh_offset != 0); + assert(dma_ch0_dfh_offset != dma_ch1_dfh_offset); + + return true; +} + +bool CcipDevice::initialize_bsp() { + if (bsp_initialized) { + return true; + } + + fpga_result res = fpgaMapMMIO(afc_handle, 0, NULL); + if (res != FPGA_OK) { + LOG_ERR("Error mapping MMIO space: %s\n", fpgaErrStr(res)); + return false; + } + mmio_is_mapped = true; + + /* Reset AFC */ + res = fpgaReset(afc_handle); + if (res != FPGA_OK) { + LOG_ERR("Error resetting AFC: %s\n", fpgaErrStr(res)); + return false; + } + AFU_RESET_DELAY(); + + // DMA performance is heavily dependent on the memcpy operation that transfers + // data from user allocated buffer to the pinned buffer that is used for + // DMA. On some machines with multiple NUMA nodes it is critical for performance + // that the pinned buffer is located on the NUMA node as the threads that + // performs the DMA operation. + // + // The performance also improves slighlty if the DMA threads are on the same + // NUMA node as the FPGA PCI device. + // + // This code pins memory allocation to occur from FPGA NUMA node prior to + // initializing the DMA buffers. It also pins all threads in the process + // to run on this same node. 
+ struct bitmask *mask = NULL; + if (enable_set_numa) { + mask = numa_parse_nodestring(fpga_numa_node.c_str()); + numa_set_membind(mask); + int ret = numa_run_on_node_mask_all(mask); + if (ret < 0) { + fprintf(stderr, " Error setting NUMA node mask\n"); + } + } + + find_dma_dfh_offsets(); + + const int dma_ch0_interrupt_num = 0; // DMA channel 0 hardcoded to interrupt 0 + dma_host_to_fpga = new mmd_dma(afc_handle, mmd_handle, dma_ch0_dfh_offset, dma_ase_dfh_offset, dma_ch0_interrupt_num); + if (!dma_host_to_fpga->initialized()) { + LOG_ERR("Error initializing mmd dma\n"); + delete dma_host_to_fpga; + return false; + } + + const int dma_ch1_interrupt_num = 2; // DMA channel 1 hardcoded to interrupt 2 + dma_fpga_to_host = new mmd_dma(afc_handle, mmd_handle, dma_ch1_dfh_offset, dma_ase_dfh_offset, dma_ch1_interrupt_num); + if (!dma_fpga_to_host->initialized()) { + fprintf(stderr, "Error initializing mmd dma\n"); + return false; + } + + // Turn off membind restriction in order to allow future allocation to + // occur on different NUMA nodes if needed. Hypothesis is that only + // the pinned buffers are performance critical for the memcpy. Other + // allocations in the process can occur on other NUMA nodes if needed. 
+ if (enable_set_numa) { + numa_set_membind(numa_nodes_ptr); + numa_free_nodemask(mask); + } + + kernel_interrupt_thread = new KernelInterrupt(afc_handle, mmd_handle); + + if (!kernel_interrupt_thread->initialized()) { + LOG_ERR("Error initializing kernel interrupts\n"); + delete kernel_interrupt_thread; + return false; + } + + bsp_initialized = true; + return bsp_initialized; +} + +CcipDevice::~CcipDevice() { + int num_errors = 0; + if (mmd_copy_buffer) { + free(mmd_copy_buffer); + mmd_copy_buffer = NULL; + } + + if (kernel_interrupt_thread) { + delete kernel_interrupt_thread; + kernel_interrupt_thread = NULL; + } + + if (dma_host_to_fpga) { + delete dma_host_to_fpga; + dma_host_to_fpga = NULL; + } + + if (dma_fpga_to_host) { + delete dma_fpga_to_host; + dma_fpga_to_host = NULL; + } + + if (mmio_is_mapped) { + if (fpgaUnmapMMIO(afc_handle, 0)) num_errors++; + } + + if (afc_handle) { + if (fpgaClose(afc_handle) != FPGA_OK) num_errors++; + } + + if (afc_token) { + if (fpgaDestroyToken(&afc_token) != FPGA_OK) num_errors++; + } + + if (filter) { + if (fpgaDestroyProperties(&filter) != FPGA_OK) num_errors++; + } + + if (num_errors > 0) { + DEBUG_PRINT("Error freeing resources in destructor\n"); + } +} + +int CcipDevice::yield() { + if (kernel_interrupt_thread) kernel_interrupt_thread->yield(); + return 0; +} + +bool CcipDevice::bsp_loaded() { + fpga_guid dcp_guid; + fpga_guid afu_guid; + fpga_properties prop; + fpga_result res; + + if (uuid_parse(DCP_OPENCL_BSP_AFU_ID, dcp_guid) < 0) { + LOG_ERR("Error parsing guid '%s'\n", DCP_OPENCL_BSP_AFU_ID); + return false; + } + + res = fpgaGetProperties(afc_token, &prop); + if (res != FPGA_OK) { + LOG_ERR("Error reading properties: %s\n", fpgaErrStr(res)); + fpgaDestroyProperties(&prop); + return false; + } + + res = fpgaPropertiesGetGUID(prop, &afu_guid); + if (res != FPGA_OK) { + LOG_ERR("Error reading GUID\n"); + fpgaDestroyProperties(&prop); + return false; + } + + fpgaDestroyProperties(&prop); + if (uuid_compare(dcp_guid, 
afu_guid) == 0) { + return true; + } else { + return false; + } +} + +std::string CcipDevice::get_bdf() { + std::ostringstream bdf; + bdf << std::setfill('0') << std::setw(2) << unsigned(bus) << ":" << std::setfill('0') << std::setw(2) + << unsigned(device) << "." << unsigned(function); + + return bdf.str(); +} + +float CcipDevice::get_temperature() { + float temp = 0; + if (fme_sysfs_temp_initialized) { + std::ifstream sysfs_temp(fme_sysfs_temp_path, std::ifstream::in); + sysfs_temp >> temp; + sysfs_temp.close(); + } + return temp; +} + +void CcipDevice::set_kernel_interrupt(aocl_mmd_interrupt_handler_fn fn, void *user_data) { + if (kernel_interrupt_thread) { + kernel_interrupt_thread->set_kernel_interrupt(fn, user_data); + } +} + +void CcipDevice::set_status_handler(aocl_mmd_status_handler_fn fn, void *user_data) { + event_update = fn; + event_update_user_data = user_data; + dma_host_to_fpga->set_status_handler(fn, user_data); + dma_fpga_to_host->set_status_handler(fn, user_data); +} + +void CcipDevice::event_update_fn(aocl_mmd_op_t op, int status) { + event_update(mmd_handle, event_update_user_data, op, status); +} + +int CcipDevice::read_block(aocl_mmd_op_t op, int mmd_interface, void *host_addr, size_t offset, size_t size) { + fpga_result res; + + // The mmd_interface is defined as the base address of the MMIO write. Access + // to memory requires special functionality. 
Otherwise do direct MMIO read of + // base address + offset + if (mmd_interface == AOCL_MMD_MEMORY) { + res = dma_fpga_to_host->read_memory(op, static_cast<uint64_t *>(host_addr), offset, size); + } else { + res = read_mmio(host_addr, mmd_interface + offset, size); + + if (op) { + // TODO: check what status value should really be instead of just using 0 + // Also handle case when op is NULL + this->event_update_fn(op, 0); + } + } + + if (res != FPGA_OK) { + LOG_ERR("fpgaReadMMIO error: %s\n", fpgaErrStr(res)); + return -1; + } else { + return 0; + } +} + +int CcipDevice::write_block(aocl_mmd_op_t op, int mmd_interface, const void *host_addr, size_t offset, size_t size) { + fpga_result res; + + // The mmd_interface is defined as the base address of the MMIO write. Access + // to memory requires special functionality. Otherwise do direct MMIO write + if (mmd_interface == AOCL_MMD_MEMORY) { + res = dma_host_to_fpga->write_memory(op, static_cast<const uint64_t *>(host_addr), offset, size); + } else { + res = write_mmio(host_addr, mmd_interface + offset, size); + + if (op) { + // TODO: check what 'status' value should really be. Right now just + // using 0 as was done in previous CCIP MMD. 
Also handle case if op is NULL + this->event_update_fn(op, 0); + } + } + + // TODO: check what status values aocl wants and also parse the result + if (res != FPGA_OK) { + LOG_ERR("fpgaWriteMMIO error: %s\n", fpgaErrStr(res)); + return -1; + } else { + return 0; + } +} + +fpga_result CcipDevice::read_mmio(void *host_addr, size_t mmio_addr, size_t size) { + fpga_result res = FPGA_OK; + + DCP_DEBUG_MEM("read_mmio start: %p\t %lx\t %lu\n", host_addr, mmio_addr, size); + + // HACK: need extra delay for opencl sw reset + if (mmio_addr == KERNEL_SW_RESET_BASE) OPENCL_SW_RESET_DELAY(); + + uint64_t *host_addr64 = static_cast<uint64_t *>(host_addr); + while (size >= 8) { + res = fpgaReadMMIO64(afc_handle, 0, mmio_addr, host_addr64); + if (res != FPGA_OK) return res; + host_addr64 += 1; + mmio_addr += 8; + size -= 8; + } + + uint32_t *host_addr32 = reinterpret_cast<uint32_t *>(host_addr64); + while (size >= 4) { + res = fpgaReadMMIO32(afc_handle, 0, mmio_addr, host_addr32); + if (res != FPGA_OK) return res; + host_addr32 += 1; + mmio_addr += 4; + size -= 4; + } + + if (size > 0) { + uint32_t read_data; + res = fpgaReadMMIO32(afc_handle, 0, mmio_addr, &read_data); + if (res != FPGA_OK) return res; + memcpy_s_fast(host_addr32, size, &read_data, size); + } + + return res; +} + +fpga_result CcipDevice::write_mmio(const void *host_addr, size_t mmio_addr, size_t size) { + fpga_result res = FPGA_OK; + + DEBUG_PRINT("write_mmio\n"); + + // HACK: need extra delay for opencl sw reset + if (mmio_addr == KERNEL_SW_RESET_BASE) OPENCL_SW_RESET_DELAY(); + + const uint64_t *host_addr64 = static_cast<const uint64_t *>(host_addr); + while (size >= 8) { + res = fpgaWriteMMIO64(afc_handle, 0, mmio_addr, *host_addr64); + if (res != FPGA_OK) return res; + host_addr64 += 1; + mmio_addr += 8; + size -= 8; + } + + const uint32_t *host_addr32 = reinterpret_cast<const uint32_t *>(host_addr64); + while (size > 0) { + uint32_t tmp_data32 = 0; + size_t chunk_size = (size >= 4) ? 
4 : size; + memcpy_s_fast(&tmp_data32, sizeof(tmp_data32), host_addr32, chunk_size); + res = fpgaWriteMMIO32(afc_handle, 0, mmio_addr, tmp_data32); + if (res != FPGA_OK) return res; + host_addr32 += 1; + mmio_addr += chunk_size; + size -= chunk_size; + } + + return res; +} diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/ccip_mmd_device.h b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/ccip_mmd_device.h new file mode 100644 index 0000000..f8088ac --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/ccip_mmd_device.h @@ -0,0 +1,187 @@ +/* (C) 1992-2017 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. 
*/ + +#ifndef _CCIP_MMD_DEVICE_H +#define _CCIP_MMD_DEVICE_H + +#include <limits.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include <string> + +#pragma push_macro("_GNU_SOURCE") +#undef _GNU_SOURCE +#define _GNU_SOURCE +#include <sched.h> +#pragma pop_macro("_GNU_SOURCE") + +#include <opae/fpga.h> +#include <uuid/uuid.h> + +#include "aocl_mmd.h" +#include "kernel_interrupt.h" +#include "mmd_dma.h" + +// Tune delay for simulation or HW. Eventually delay +// should be removed for HW, may still be needed for ASE simulation +#ifdef SIM +#define DELAY_MULTIPLIER 100 +#else +#define DELAY_MULTIPLIER 1 +#endif + +// Most AOCL_MMD_CALL functions return negative number in case of error, +// CCIP_MMD_AOCL_ERR is used to indicate an error from the MMD that is being +// returned to the runtime. Simply set to -2 for now since neither interface +// defines a meaning to return codes for errors. +#define CCIP_MMD_AOCL_ERR -1 + +// NOTE: some of the code relies on invalid handle returning -1 +// future TODO eliminate dependency on specific error values +#define CCIP_MMD_INVALID_PARAM -1 + +// Our diagnostic script relies on handle values < -1 to determine when +// a valid device is present but a functioning BSP is not loaded. +#define CCIP_MMD_BSP_NOT_LOADED -2 +#define CCIP_MMD_BSP_INIT_FAILED -3 + +// Delay settings +// TODO: Figure out why these delays are needed and +// have requirement removed (at least for HW) +#define MMIO_DELAY() +#define YIELD_DELAY() usleep(1 * DELAY_MULTIPLIER) +#define OPENCL_SW_RESET_DELAY() usleep(5000 * DELAY_MULTIPLIER) +#define AFU_RESET_DELAY() usleep(20000 * DELAY_MULTIPLIER) + +#define KERNEL_SW_RESET_BASE (AOCL_MMD_KERNEL + 0x30) + +#define DCP_OPENCL_BSP_AFU_ID "63B3779B-8BDD-4F03-9CEB-0301181D6AEF" + +#define BSP_NAME "pac_" + +// LOG ERRORS +#define CCIP_MMD_ERR_LOGGING 1 +#ifdef CCIP_MMD_ERR_LOGGING +#define LOG_ERR(...) 
fprintf(stderr, __VA_ARGS__) +#else +#define LOG_ERR(...) +#endif + +// debugging +#ifdef DEBUG +#define DEBUG_PRINT(...) fprintf(stderr, __VA_ARGS__) +#else +#define DEBUG_PRINT(...) +#endif + +#ifdef DEBUG_MEM +#define DCP_DEBUG_MEM(...) fprintf(stderr, __VA_ARGS__) +#else +#define DCP_DEBUG_MEM(...) +#endif + +enum { +#ifndef DLA_MMD // IRQ offsets no longer exist in DLA hardware (removed from board.qsys) + AOCL_IRQ_POLLING_BASE = 0x0100, // CSR to polling interrupt status + AOCL_IRQ_MASKING_BASE = 0x0108, // CSR to set/unset interrupt mask + AOCL_MMD_KERNEL = 0x4000, /* Control interface into kernel interface */ +#else + AOCL_MMD_KERNEL = 0, // CoreDLA completely removes the Opencl kernel interface, repurposed for CSRs +#endif + AOCL_MMD_MEMORY = 0x100000 /* Data interface to device memory */ +}; + +enum AfuStatu { CCIP_MMD_INVALID_ID = 0, CCIP_MMD_BSP, CCIP_MMD_AFU }; + +class CcipDevice final { + public: + CcipDevice(uint64_t); + CcipDevice(const CcipDevice &) = delete; + CcipDevice &operator=(const CcipDevice &) = delete; + ~CcipDevice(); + + static std::string get_board_name(std::string prefix, uint64_t obj_id); + static bool parse_board_name(const char *board_name, uint64_t &obj_id); + + int get_mmd_handle() { return mmd_handle; } + uint64_t get_fpga_obj_id() { return fpga_obj_id; } + std::string get_dev_name() { return mmd_dev_name; } + std::string get_bdf(); + float get_temperature(); + bool initialize_bsp(); + void set_kernel_interrupt(aocl_mmd_interrupt_handler_fn fn, void *user_data); + void set_status_handler(aocl_mmd_status_handler_fn fn, void *user_data); + int yield(); + void event_update_fn(aocl_mmd_op_t op, int status); + bool bsp_loaded(); + + int read_block(aocl_mmd_op_t op, int mmd_interface, void *host_addr, size_t dev_addr, size_t size); + + int write_block(aocl_mmd_op_t op, int mmd_interface, const void *host_addr, size_t dev_addr, size_t size); + + private: + static int next_mmd_handle; + + int mmd_handle; + uint64_t fpga_obj_id; + 
std::string mmd_dev_name; + intel_opae_mmd::KernelInterrupt *kernel_interrupt_thread; + aocl_mmd_status_handler_fn event_update; + void *event_update_user_data; + + // HACK: use the sysfs path to read temperature value and NUMA node + // this should be replaced with OPAE call once that is + // available + std::string fme_sysfs_temp_path; + std::string fpga_numa_node; + bool enable_set_numa; + bool fme_sysfs_temp_initialized; + void initialize_fme_sysfs(); + + void initialize_local_cpus_sysfs(); + + bool find_dma_dfh_offsets(); + + uint8_t bus; + uint8_t device; + uint8_t function; + + bool afu_initialized; + bool bsp_initialized; + bool mmio_is_mapped; + + fpga_handle afc_handle; + fpga_properties filter; + fpga_token afc_token; + uint64_t dma_ch0_dfh_offset; + uint64_t dma_ch1_dfh_offset; + uint64_t dma_ase_dfh_offset; + intel_opae_mmd::mmd_dma *dma_host_to_fpga; + intel_opae_mmd::mmd_dma *dma_fpga_to_host; + + char *mmd_copy_buffer; + + // Helper functions + fpga_result read_mmio(void *host_addr, size_t dev_addr, size_t size); + fpga_result write_mmio(const void *host_addr, size_t dev_addr, size_t size); +}; + +#endif // _CCIP_MMD_DEVICE_H diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/dma_work_thread.cpp b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/dma_work_thread.cpp new file mode 100644 index 0000000..30113eb --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/dma_work_thread.cpp @@ -0,0 +1,151 @@ +/* (C) 1992-2017 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. 
See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. */ + +#include "dma_work_thread.h" +#include <assert.h> +#include <poll.h> +#include <stdlib.h> +#include <string.h> +#include <cstdint> +#include <iostream> +#include <thread> +#include "ccip_mmd_device.h" +#include "eventfd_wrapper.h" +#include "mmd_dma.h" + +using namespace intel_opae_mmd; + +dma_work_thread::dma_work_thread(mmd_dma &mmd_dma_arg) + : m_initialized(false), + m_thread_wake_event(NULL), + m_thread(NULL), + m_work_queue_mutex(), + m_work_queue(), + m_mmd_dma(mmd_dma_arg) { + m_thread_wake_event = new eventfd_wrapper(); + if (!m_thread_wake_event->initialized()) return; + + m_thread = new std::thread(work_thread, std::ref(*this)); + + m_initialized = true; +} + +dma_work_thread::~dma_work_thread() { + // kill the thread + if (m_thread) { + // send message to thread to end it + m_thread_wake_event->notify(UINT64_MAX - 1); + + // join with thread until it ends + m_thread->join(); + + delete m_thread; + m_thread = NULL; + } + + if (m_thread_wake_event) { + delete m_thread_wake_event; + m_thread_wake_event = NULL; + } + + m_initialized = false; +} + 
+// Worker-thread entry point. Blocks on the eventfd until enqueue_dma()
+// signals new work (or the destructor signals shutdown with UINT64_MAX - 1),
+// then drains the requested number of items from the shared queue and runs
+// each DMA transfer outside the queue lock.
+void dma_work_thread::work_thread(dma_work_thread &obj) {
+  int res;
+
+  // get eventfd handle
+  int thread_signal_fd = obj.m_thread_wake_event->get_fd();
+
+  struct pollfd pollfd_setup;
+  while (1) {
+    pollfd_setup.fd = thread_signal_fd;
+    pollfd_setup.events = POLLIN;
+    pollfd_setup.revents = 0;
+    res = poll(&pollfd_setup, 1, -1);
+    if (res < 0) {
+      fprintf(stderr, "Poll error errno = %s\n", strerror(errno));
+    } else if (res > 0 && pollfd_setup.revents == POLLIN) {
+      uint64_t count_work_items = 0;
+      ssize_t bytes_read = read(thread_signal_fd, &count_work_items, sizeof(count_work_items));
+      if (bytes_read > 0) {
+        // Fix: the original referenced an undeclared variable 'count' here,
+        // which fails to compile whenever DEBUG is defined.
+        DEBUG_PRINT("Poll success. Return=%d count=%lu\n", res, count_work_items);
+      } else {
+        // TODO: the MMD should not exit. But I have a different branch
+        // I'm working on that will change synchronization to use
+        // condition variable instead of eventfd in synchronization
+        // within the same process. Will remove this exit() call at
+        // when PR for that change is submitted.
+        fprintf(stderr, "Error: poll failed: %s\n", bytes_read < 0 ? strerror(errno) : "zero bytes read");
+        exit(-1);
+      }
+
+      // Ensure count is in proper range
+      const unsigned long MAX_WORK_ITEMS = 1000000000;
+      if (count_work_items > MAX_WORK_ITEMS && count_work_items != (UINT64_MAX - 1)) {
+        fprintf(stderr, "Error: poll value is out of range");
+        exit(-1);
+      }
+
+      obj.m_work_queue_mutex.lock();
+      if (obj.m_work_queue.empty() && count_work_items == UINT64_MAX - 1) {
+        // The maximum value of count is set when there is no work left
+        // The work queue must also be empty
+        // This thread can break out of the loop
+        obj.m_work_queue_mutex.unlock();
+        break;
+      }
+
+      std::queue<dma_work_item> items;
+      for (uint64_t i = 0; i < count_work_items; i++) {
+        // Check if there are enough jobs in the work queue as requested (count)
+        if (obj.m_work_queue.empty()) {
+          fprintf(stderr, "Poll error. Not enough tasks in queue.");
+          exit(-1);
+        }
+        dma_work_item item = obj.m_work_queue.front();
+        items.push(item);
+        obj.m_work_queue.pop();
+      }
+      obj.m_work_queue_mutex.unlock();
+
+      // Perform the transfers after dropping the queue lock so enqueue_dma()
+      // is never blocked behind an in-flight DMA.
+      while (!items.empty()) {
+        dma_work_item item = items.front();
+        obj.do_dma(item);
+        items.pop();
+      }
+    }
+  }
+}
+
+// Queue an asynchronous request (non-NULL op) for the worker thread and wake
+// it, or run a blocking request (NULL op) inline on the caller's thread.
+// Returns 0 for queued work, otherwise the do_dma() result.
+int dma_work_thread::enqueue_dma(dma_work_item &item) {
+  if (item.op) {
+    m_work_queue_mutex.lock();
+    m_work_queue.push(item);
+    m_work_queue_mutex.unlock();
+    // send message to thread to wake it
+    // setting count to 1 as only 1 job is pushed to the work queue
+    m_thread_wake_event->notify(1);
+    return 0;
+  } else {
+    // if op is not specified, it is a blocking operation and we don't use
+    // the thread
+    return do_dma(item);
+  }
+}
+
+// Forward a single work item to the owning mmd_dma object.
+int dma_work_thread::do_dma(dma_work_item &item) { return m_mmd_dma.do_dma(item); }
diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/dma_work_thread.h b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/dma_work_thread.h
new file mode 100644
index 0000000..0afb036
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/dma_work_thread.h
@@ -0,0 +1,73 @@
+/* (C) 1992-2017 Intel Corporation. */
+/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */
+/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */
+/* and/or other countries. Other marks and brands may be claimed as the property */
+/* of others.
See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. */ + +#ifndef _DMA_WORK_THREAD_H +#define _DMA_WORK_THREAD_H + +#include <opae/fpga.h> + +#include <mutex> +#include <queue> +#include <thread> + +#include "aocl_mmd.h" + +namespace intel_opae_mmd { + +// forward class definitions +class eventfd_wrapper; +class mmd_dma; + +class dma_work_item { + public: + aocl_mmd_op_t op; + uint64_t *rd_host_addr; + const uint64_t *wr_host_addr; + size_t dev_addr; + size_t size; +}; + +class dma_work_thread final { + public: + dma_work_thread(mmd_dma &mmd_dma_arg); + ~dma_work_thread(); + + bool initialized() { return m_initialized; } + + int enqueue_dma(dma_work_item &item); + int do_dma(dma_work_item &item); + + private: + static void work_thread(dma_work_thread &obj); + + bool m_initialized; + + eventfd_wrapper *m_thread_wake_event; + std::thread *m_thread; + std::mutex m_work_queue_mutex; + std::queue<dma_work_item> m_work_queue; + + mmd_dma &m_mmd_dma; + + // not used and not implemented + dma_work_thread(dma_work_thread &other); + dma_work_thread &operator=(const dma_work_thread &other); +}; // class dma_work_thread + 
+}; // namespace intel_opae_mmd + +#endif // _DMA_WORK_THREAD_H diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/eventfd_wrapper.h b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/eventfd_wrapper.h new file mode 100644 index 0000000..2de3f74 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/eventfd_wrapper.h @@ -0,0 +1,74 @@ +/* (C) 1992-2017 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. 
*/ + +#ifndef _EVENTFD_WRAPPER_H +#define _EVENTFD_WRAPPER_H + +#include <sys/eventfd.h> +#include <unistd.h> + +namespace intel_opae_mmd { + +// simple wrapper class for managing eventfd objects +class eventfd_wrapper final { + public: + eventfd_wrapper() { + m_initialized = false; + // Note: EFD_SEMAPHORE and EFD_NONBLOCK are not set + // The implementation of functions using eventfd assumes that + m_fd = eventfd(0, 0); + if (m_fd < 0) { + fprintf(stderr, "eventfd : %s", strerror(errno)); + return; + } + + m_initialized = true; + } + + ~eventfd_wrapper() { + if (m_initialized) { + if (close(m_fd) < 0) { + fprintf(stderr, "eventfd : %s", strerror(errno)); + } + } + } + + bool notify(uint64_t count) { + ssize_t res = write(m_fd, &count, sizeof(count)); + if (res < 0) { + fprintf(stderr, "eventfd : %s", strerror(errno)); + return false; + } + return true; + } + + int get_fd() { return m_fd; } + bool initialized() { return m_initialized; } + + private: + // not used and not implemented + eventfd_wrapper(eventfd_wrapper& other); + eventfd_wrapper& operator=(const eventfd_wrapper& other); + + // member varaibles + int m_fd; + int m_initialized; +}; // class eventfd_wrapper + +}; // namespace intel_opae_mmd + +#endif // _EVENTFD_WRAPPER_H diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/fpga_dma.c b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/fpga_dma.c new file mode 100644 index 0000000..6c8df30 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/fpga_dma.c @@ -0,0 +1,1313 @@ +// Copyright 2018-2020 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). 
Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. + +// This is derived from OPAE + OpenCL PAC BSP + +/** + * \fpga_dma.c + * \brief FPGA DMA User-mode driver + */ + +#include "fpga_dma.h" +#include <assert.h> +#include <errno.h> +#include <opae/fpga.h> +#include <poll.h> +#include <safe_string/safe_string.h> +#include <stdbool.h> +#include <stddef.h> +#include <stdlib.h> +#include <string.h> +#include <sys/mman.h> +#include <unistd.h> +#include "fpga_dma_internal.h" +#include "memcpy_s_fast.h" + +#ifdef SIM +#define USE_ASE +#else +// TODO: Need this until we can adequately sync MMIO R/W with pointer accesses. +// Causes module to use fpgaMMIORead32() instead of foo = *ptr; +#define USE_ASE +#endif + +#ifdef FPGA_DMA_DEBUG +static int err_cnt = 0; +#endif + +#ifdef CHECK_DELAYS +double poll_wait_count = 0; +double buf_full_count = 0; +#endif + +/* + * macro for checking return codes + */ +#define ON_ERR_GOTO(res, label, desc) \ + do { \ + if ((res) != FPGA_OK) { \ + error_print("Error %s: %s\n", (desc), fpgaErrStr(res)); \ + goto label; \ + } \ + } while (0) + +#define ON_ERR_RETURN(res, desc) \ + do { \ + if ((res) != FPGA_OK) { \ + error_print("Error %s: %s\n", (desc), fpgaErrStr(res)); \ + return (res); \ + } \ + } while (0) + +// Internal Functions + +/** + * MMIOWrite64Blk + * + * @brief Writes a block of 64-bit values to FPGA MMIO space + * @param[in] dma Handle to the FPGA DMA object + * @param[in] device FPGA address + * @param[in] host Host buffer address + * @param[in] count Size in bytes + * @return fpga_result FPGA_OK on success, return code otherwise + * + */ +static fpga_result MMIOWrite64Blk(fpga_dma_handle dma_h, uint64_t 
device, uint64_t host, uint64_t bytes) { + assert(IS_ALIGNED_QWORD(device)); + assert(IS_ALIGNED_QWORD(bytes)); + + uint64_t *haddr = (uint64_t *)host; + uint64_t i; + fpga_result res = FPGA_OK; + +#ifndef USE_ASE + volatile uint64_t *dev_addr = HOST_MMIO_64_ADDR(dma_h, device); +#endif + + debug_print("copying %lld bytes from 0x%p to 0x%p\n", (long long int)bytes, haddr, (void *)device); + for (i = 0; i < bytes / sizeof(uint64_t); i++) { +#ifdef USE_ASE + res = fpgaWriteMMIO64(dma_h->fpga_h, dma_h->mmio_num, device, *haddr); + ON_ERR_RETURN(res, "fpgaWriteMMIO64"); + haddr++; + device += sizeof(uint64_t); +#else + *dev_addr++ = *haddr++; +#endif + } + return res; +} + +/** + * MMIOWrite32Blk + * + * @brief Writes a block of 32-bit values to FPGA MMIO space + * @param[in] dma Handle to the FPGA DMA object + * @param[in] device FPGA address + * @param[in] host Host buffer address + * @param[in] count Size in bytes + * @return fpga_result FPGA_OK on success, return code otherwise + * + */ +static fpga_result MMIOWrite32Blk(fpga_dma_handle dma_h, uint64_t device, uint64_t host, uint64_t bytes) { + assert(IS_ALIGNED_DWORD(device)); + assert(IS_ALIGNED_DWORD(bytes)); + + uint32_t *haddr = (uint32_t *)host; + uint64_t i; + fpga_result res = FPGA_OK; + +#ifndef USE_ASE + volatile uint32_t *dev_addr = HOST_MMIO_32_ADDR(dma_h, device); +#endif + + debug_print("copying %lld bytes from 0x%p to 0x%p\n", (long long int)bytes, haddr, (void *)device); + for (i = 0; i < bytes / sizeof(uint32_t); i++) { +#ifdef USE_ASE + res = fpgaWriteMMIO32(dma_h->fpga_h, dma_h->mmio_num, device, *haddr); + ON_ERR_RETURN(res, "fpgaWriteMMIO32"); + haddr++; + device += sizeof(uint32_t); +#else + *dev_addr++ = *haddr++; +#endif + } + return res; +} + +/** + * MMIORead64Blk + * + * @brief Reads a block of 64-bit values from FPGA MMIO space + * @param[in] dma Handle to the FPGA DMA object + * @param[in] device FPGA address + * @param[in] host Host buffer address + * @param[in] count Size in bytes + * 
@return fpga_result FPGA_OK on success, return code otherwise + * + */ +static fpga_result MMIORead64Blk(fpga_dma_handle dma_h, uint64_t device, uint64_t host, uint64_t bytes) { + assert(IS_ALIGNED_QWORD(device)); + assert(IS_ALIGNED_QWORD(bytes)); + + uint64_t *haddr = (uint64_t *)host; + uint64_t i; + fpga_result res = FPGA_OK; + +#ifndef USE_ASE + volatile uint64_t *dev_addr = HOST_MMIO_64_ADDR(dma_h, device); +#endif + + debug_print("copying %lld bytes from 0x%p to 0x%p\n", (long long int)bytes, (void *)device, haddr); + for (i = 0; i < bytes / sizeof(uint64_t); i++) { +#ifdef USE_ASE + res = fpgaReadMMIO64(dma_h->fpga_h, dma_h->mmio_num, device, haddr); + ON_ERR_RETURN(res, "fpgaReadMMIO64"); + haddr++; + device += sizeof(uint64_t); +#else + *haddr++ = *dev_addr++; +#endif + } + return res; +} + +/** + * MMIORead32Blk + * + * @brief Reads a block of 32-bit values from FPGA MMIO space + * @param[in] dma Handle to the FPGA DMA object + * @param[in] device FPGA address + * @param[in] host Host buffer address + * @param[in] count Size in bytes + * @return fpga_result FPGA_OK on success, return code otherwise + * + */ +static fpga_result MMIORead32Blk(fpga_dma_handle dma_h, uint64_t device, uint64_t host, uint64_t bytes) { + assert(IS_ALIGNED_DWORD(device)); + assert(IS_ALIGNED_DWORD(bytes)); + + uint32_t *haddr = (uint32_t *)host; + uint64_t i; + fpga_result res = FPGA_OK; + +#ifndef USE_ASE + volatile uint32_t *dev_addr = HOST_MMIO_32_ADDR(dma_h, device); +#endif + + debug_print("copying %lld bytes from 0x%p to 0x%p\n", (long long int)bytes, (void *)device, haddr); + for (i = 0; i < bytes / sizeof(uint32_t); i++) { +#ifdef USE_ASE + res = fpgaReadMMIO32(dma_h->fpga_h, dma_h->mmio_num, device, haddr); + ON_ERR_RETURN(res, "fpgaReadMMIO32"); + haddr++; + device += sizeof(uint32_t); +#else + *haddr++ = *dev_addr++; +#endif + } + return res; +} + +// Feature type is BBB +static inline bool fpga_dma_feature_is_bbb(uint64_t dfh) { + // BBB is type 2 + return ((dfh >> 
AFU_DFH_TYPE_OFFSET) & 0xf) == FPGA_DMA_BBB; +} + +/** + * _switch_to_ase_page + * + * @brief Updates the current page of ASE to the address given + * @param[in] dma_h Handle to the FPGA DMA object + * @param[in] addr Address to which the ASE page should be switched + * @return Nothing. Side-effect is to update the current page in the DMA handle. + * + */ +static inline void _switch_to_ase_page(fpga_dma_handle dma_h, uint64_t addr) { + uint64_t requested_page = addr & ~DMA_ADDR_SPAN_EXT_WINDOW_MASK; + + if (requested_page != dma_h->cur_ase_page) { + MMIOWrite64Blk(dma_h, ASE_CNTL_BASE(dma_h), (uint64_t)&requested_page, sizeof(requested_page)); + dma_h->cur_ase_page = requested_page; + } +} + +/** + * _send_descriptor + * + * @brief Queues a DMA descriptor to the FPGA + * @param[in] dma_h Handle to the FPGA DMA object + * @param[in] desc Pointer to a descriptor structure to send + * @return fpga_result FPGA_OK on success, return code otherwise + * + */ +static fpga_result _send_descriptor(fpga_dma_handle dma_h, msgdma_ext_desc_t *desc) { + fpga_result res = FPGA_OK; + msgdma_status_t status = {0}; + + debug_print("desc.rd_address = %x\n", desc->rd_address); + debug_print("desc.wr_address = %x\n", desc->wr_address); + debug_print("desc.len = %x\n", desc->len); + debug_print("desc.wr_burst_count = %x\n", desc->wr_burst_count); + debug_print("desc.rd_burst_count = %x\n", desc->rd_burst_count); + debug_print("desc.wr_stride %x\n", desc->wr_stride); + debug_print("desc.rd_stride %x\n", desc->rd_stride); + debug_print("desc.rd_address_ext %x\n", desc->rd_address_ext); + debug_print("desc.wr_address_ext %x\n", desc->wr_address_ext); + + debug_print("SGDMA_CSR_BASE = %lx SGDMA_DESC_BASE=%lx\n", dma_h->dma_csr_base, dma_h->dma_desc_base); + +#ifdef CHECK_DELAYS + bool first = true; +#endif + do { + res = MMIORead32Blk(dma_h, CSR_STATUS(dma_h), (uint64_t)&status.reg, sizeof(status.reg)); + ON_ERR_GOTO(res, out, "MMIORead32Blk"); +#ifdef CHECK_DELAYS + if (first && 
status.st.desc_buf_full) { + buf_full_count++; + first = false; + } +#endif + } while (status.st.desc_buf_full); + + res = MMIOWrite64Blk(dma_h, dma_h->dma_desc_base, (uint64_t)desc, sizeof(*desc)); + ON_ERR_GOTO(res, out, "MMIOWrite64Blk"); + +out: + return res; +} + +/** + * _do_dma + * + * @brief Performs a DMA transaction with the FPGA + * @param[in] dma_h Handle to the FPGA DMA object + * @param[in] dst Pointer to a host or FPGA buffer to send or retrieve + * @param[in] src Pointer to a host or FPGA buffer to send or retrieve + * @param[in] count Number of bytes + * @param[in] is_last_desc True if this is the last buffer of a batch + * @param[in] type Direction of transfer + * @param[in] intr_en True means to ask for an interrupt from the FPGA + * @return fpga_result FPGA_OK on success, return code otherwise + * + */ +static fpga_result _do_dma(fpga_dma_handle dma_h, + uint64_t dst, + uint64_t src, + int count, + int is_last_desc, + fpga_dma_transfer_t type, + bool intr_en) { + msgdma_ext_desc_t desc = {0}; + fpga_result res = FPGA_OK; + int alignment_offset = 0; + int segment_size = 0; + + // src, dst and count must be 64-byte aligned + if (dst % FPGA_DMA_ALIGN_BYTES != 0 || src % FPGA_DMA_ALIGN_BYTES != 0 || count % FPGA_DMA_ALIGN_BYTES != 0) { + return FPGA_INVALID_PARAM; + } + // these fields are fixed for all DMA transfers + desc.seq_num = 0; + desc.wr_stride = 1; + desc.rd_stride = 1; + + desc.control.go = 1; + if (intr_en) + desc.control.transfer_irq_en = 1; + else + desc.control.transfer_irq_en = 0; + + // Enable "earlyreaddone" in the control field of the descriptor except the last. + // Setting early done causes the read logic to move to the next descriptor + // before the previous descriptor completes. + // This elminates a few hundred clock cycles of waiting between transfers. 
+ if (!is_last_desc) + desc.control.early_done_en = 1; + else + desc.control.early_done_en = 0; + + if (type == FPGA_TO_FPGA_MM) { + desc.rd_address = src & FPGA_DMA_MASK_32_BIT; + desc.wr_address = dst & FPGA_DMA_MASK_32_BIT; + desc.len = count; + desc.wr_burst_count = 4; + desc.rd_burst_count = 4; + desc.rd_address_ext = (src >> 32) & FPGA_DMA_MASK_32_BIT; + desc.wr_address_ext = (dst >> 32) & FPGA_DMA_MASK_32_BIT; + + res = _send_descriptor(dma_h, &desc); + ON_ERR_GOTO(res, out, "_send_descriptor"); + } + // either FPGA to Host or Host to FPGA transfer so we need to make sure the DMA transaction is aligned to the burst + // size (CCIP restriction) + else { + // need to determine if the CCIP (host) address is aligned to 4CL (256B). When 0 the CCIP address is aligned. + alignment_offset = + (type == HOST_TO_FPGA_MM) ? (src % (4 * FPGA_DMA_ALIGN_BYTES)) : (dst % (4 * FPGA_DMA_ALIGN_BYTES)); + + // not aligned to 4CL so performing a short transfer to get aligned + if (alignment_offset != 0) { + desc.rd_address = src & FPGA_DMA_MASK_32_BIT; + desc.wr_address = dst & FPGA_DMA_MASK_32_BIT; + desc.wr_burst_count = 1; + desc.rd_burst_count = 1; + desc.rd_address_ext = (src >> 32) & FPGA_DMA_MASK_32_BIT; + desc.wr_address_ext = (dst >> 32) & FPGA_DMA_MASK_32_BIT; + + // count isn't large enough to hit next 4CL boundary + if (((4 * FPGA_DMA_ALIGN_BYTES) - alignment_offset) >= count) { + segment_size = count; + count = 0; // only had to transfer count amount of data to reach the end of the provided buffer + } else { + segment_size = (4 * FPGA_DMA_ALIGN_BYTES) - alignment_offset; + src += segment_size; + dst += segment_size; + count -= segment_size; // subtract the segment size from count since the transfer below will bring us into 4CL + // alignment + desc.control.transfer_irq_en = 0; + } + + // will post short transfer to align to a 4CL (256 byte) boundary + desc.len = segment_size; + + res = _send_descriptor(dma_h, &desc); + ON_ERR_GOTO(res, out, "_send_descriptor"); + } 
+ // at this point we are 4CL (256 byte) aligned + // if there is at least 4CL (256 bytes) of data to transfer, post bursts of 4 + if (count >= (4 * FPGA_DMA_ALIGN_BYTES)) { + desc.rd_address = src & FPGA_DMA_MASK_32_BIT; + desc.wr_address = dst & FPGA_DMA_MASK_32_BIT; + desc.wr_burst_count = 4; + desc.rd_burst_count = 4; + desc.rd_address_ext = (src >> 32) & FPGA_DMA_MASK_32_BIT; + desc.wr_address_ext = (dst >> 32) & FPGA_DMA_MASK_32_BIT; + + // buffer ends on 4CL boundary + if ((count % (4 * FPGA_DMA_ALIGN_BYTES)) == 0) { + segment_size = count; + count = 0; // transfer below will move the remainder of the buffer + } + // buffers do not end on 4CL boundary so transfer only up to the last 4CL boundary leaving a segment at the end to + // finish later + else { + segment_size = count - (count % (4 * FPGA_DMA_ALIGN_BYTES)); // round count down to the nearest multiple of 4CL + src += segment_size; + dst += segment_size; + count -= segment_size; + desc.control.transfer_irq_en = 0; + } + + desc.len = segment_size; + + res = _send_descriptor(dma_h, &desc); + ON_ERR_GOTO(res, out, "_send_descriptor"); + } + // at this point we have posted all the bursts of length 4 we can but there might be 64, 128, or 192 bytes of data + // to transfer still if buffer did not end on 4CL (256 byte) boundary post short transfer to handle the remainder + if (count > 0) { + desc.rd_address = src & FPGA_DMA_MASK_32_BIT; + desc.wr_address = dst & FPGA_DMA_MASK_32_BIT; + desc.len = count; + desc.wr_burst_count = 1; + desc.rd_burst_count = 1; + desc.rd_address_ext = (src >> 32) & FPGA_DMA_MASK_32_BIT; + desc.wr_address_ext = (dst >> 32) & FPGA_DMA_MASK_32_BIT; + if (intr_en) desc.control.transfer_irq_en = 1; + // will post short transfer to move the remainder of the buffer + res = _send_descriptor(dma_h, &desc); + ON_ERR_GOTO(res, out, "_send_descriptor"); + } + + } // end of FPGA --> Host or Host --> FPGA transfer + +out: + return res; +} + +fpga_result fpgaDmaChannelOpen(fpga_handle fpga, 
uint64_t dfh_offset, int interrupt_num, fpga_dma_handle *dma_p) { + fpga_result res = FPGA_OK; + fpga_dma_handle dma_h = NULL; + int i = 0; + if (!fpga) { + return FPGA_INVALID_PARAM; + } + if (!dma_p) { + return FPGA_INVALID_PARAM; + } + // init the dma handle + dma_h = (fpga_dma_handle)malloc(sizeof(struct _dma_handle_t)); + if (!dma_h) { + return FPGA_NO_MEMORY; + } + dma_h->fpga_h = fpga; + for (i = 0; i < FPGA_DMA_MAX_BUF; i++) dma_h->dma_buf_ptr[i] = NULL; + dma_h->mmio_num = 0; + dma_h->cur_ase_page = 0xffffffffffffffffUll; + + // Discover DMA BBB by traversing the device feature list + bool dma_found = false; + +#ifndef USE_ASE + res = fpgaMapMMIO(dma_h->fpga_h, 0, (uint64_t **)&dma_h->mmio_va); + ON_ERR_GOTO(res, out, "fpgaMapMMIO"); +#endif + + dfh_feature_t dfh = {0}; + res = MMIORead64Blk(dma_h, dfh_offset, (uint64_t)&dfh, sizeof(dfh)); + ON_ERR_GOTO(res, out, "MMIORead64Blk"); + + if (fpga_dma_feature_is_bbb(dfh.dfh) && (dfh.feature_uuid_lo == FPGA_DMA_UUID_L) && + (dfh.feature_uuid_hi == FPGA_DMA_UUID_H)) { + dma_h->dma_base = dfh_offset; + dma_h->dma_csr_base = dma_h->dma_base + FPGA_DMA_CSR; + dma_h->dma_desc_base = dma_h->dma_base + FPGA_DMA_DESC; + dma_h->dma_ase_cntl_base = dma_h->dma_base + FPGA_DMA_ADDR_SPAN_EXT_CNTL; + dma_h->dma_ase_data_base = dma_h->dma_base + FPGA_DMA_ADDR_SPAN_EXT_DATA; + dma_found = true; + *dma_p = dma_h; + res = FPGA_OK; + } else { + *dma_p = NULL; + res = FPGA_NOT_FOUND; + goto out; + } + + // Buffer size must be page aligned for prepareBuffer + for (i = 0; i < FPGA_DMA_MAX_BUF; i++) { + res = fpgaPrepareBuffer( + dma_h->fpga_h, FPGA_DMA_BUF_SIZE, (void **)&(dma_h->dma_buf_ptr[i]), &dma_h->dma_buf_wsid[i], 0); + ON_ERR_GOTO(res, out, "fpgaPrepareBuffer"); + + // Make sure it's actually allocated + dma_h->dma_buf_ptr[i][0] = 0xff; + madvise((void *)dma_h->dma_buf_ptr[i], FPGA_DMA_BUF_SIZE, MADV_SEQUENTIAL); + + res = fpgaGetIOAddress(dma_h->fpga_h, dma_h->dma_buf_wsid[i], &dma_h->dma_buf_iova[i]); + ON_ERR_GOTO(res, 
rel_buf, "fpgaGetIOAddress"); + } + + // Allocate magic number buffer + res = fpgaPrepareBuffer(dma_h->fpga_h, FPGA_DMA_ALIGN_BYTES, (void **)&(dma_h->magic_buf), &dma_h->magic_wsid, 0); + ON_ERR_GOTO(res, out, "fpgaPrepareBuffer"); + + dma_h->magic_buf[0] = 0xff; + + res = fpgaGetIOAddress(dma_h->fpga_h, dma_h->magic_wsid, &dma_h->magic_iova); + ON_ERR_GOTO(res, rel_buf, "fpgaGetIOAddress"); + memset((void *)dma_h->magic_buf, 0, FPGA_DMA_ALIGN_BYTES); + + // turn on global interrupts + msgdma_ctrl_t ctrl = {0}; + ctrl.ct.global_intr_en_mask = 1; + res = MMIOWrite32Blk(dma_h, CSR_CONTROL(dma_h), (uint64_t)&ctrl.reg, sizeof(ctrl.reg)); + ON_ERR_GOTO(res, rel_buf, "MMIOWrite32Blk"); + + // register interrupt event handle + res = fpgaCreateEventHandle(&dma_h->eh); + ON_ERR_GOTO(res, rel_buf, "fpgaCreateEventHandle"); + + res = fpgaRegisterEvent(dma_h->fpga_h, FPGA_EVENT_INTERRUPT, dma_h->eh, interrupt_num /*vector id */); + ON_ERR_GOTO(res, destroy_eh, "fpgaRegisterEvent"); + + return FPGA_OK; + +destroy_eh: + res = fpgaDestroyEventHandle(&dma_h->eh); + ON_ERR_GOTO(res, rel_buf, "fpgaDestroyEventHandle"); + +rel_buf: + for (i = 0; i < FPGA_DMA_MAX_BUF; i++) { + res = fpgaReleaseBuffer(dma_h->fpga_h, dma_h->dma_buf_wsid[i]); + ON_ERR_GOTO(res, out, "fpgaReleaseBuffer"); + } +out: + if (!dma_found) { + free(dma_h); + } + return res; +} + +/** + * _read_memory_mmio_unaligned + * + * @brief Performs a unaligned read(address not 4/8/64 byte aligned) from FPGA address(device address). + * @param[in] dma Handle to the FPGA DMA object + * @param[in] dev_addr FPGA address + * @param[in] host_addr Host buffer address + * @param[in] count Size in bytes, always less than 8bytes. 
+ * @return fpga_result FPGA_OK on success, return code otherwise + * + */ +static fpga_result _read_memory_mmio_unaligned(fpga_dma_handle dma_h, + uint64_t dev_addr, + uint64_t host_addr, + uint64_t count) { + fpga_result res = FPGA_OK; + + assert(count < QWORD_BYTES); + + if (0 == count) return res; + + uint64_t shift = dev_addr % QWORD_BYTES; + debug_print("shift = %08lx , count = %08lx \n", shift, count); + + _switch_to_ase_page(dma_h, dev_addr); + uint64_t dev_aligned_addr = (dev_addr - shift) & DMA_ADDR_SPAN_EXT_WINDOW_MASK; + + // read data from device memory + uint64_t read_tmp = 0; + res = MMIORead64Blk(dma_h, ASE_DATA_BASE(dma_h) + dev_aligned_addr, (uint64_t)&read_tmp, sizeof(read_tmp)); + if (res != FPGA_OK) return res; + + // overlay our data + memcpy_s_fast((void *)host_addr, count, ((char *)(&read_tmp)) + shift, count); + + return res; +} + +/** + * _write_memory_mmio_unaligned + * + * @brief Performs an unaligned write(address not 4/8/64 byte aligned) to FPGA address(device address). + * @param[in] dma Handle to the FPGA DMA object + * @param[in] dev_addr FPGA address + * @param[in] host_addr Host buffer address + * @param[in] count Size in bytes, always less than 8bytes. 
+ * @return fpga_result FPGA_OK on success, return code otherwise + * + */ +static fpga_result _write_memory_mmio_unaligned(fpga_dma_handle dma_h, + uint64_t dev_addr, + uint64_t host_addr, + uint64_t count) { + fpga_result res = FPGA_OK; + + assert(count < QWORD_BYTES); + + if (0 == count) return res; + + uint64_t shift = dev_addr % QWORD_BYTES; + debug_print("shift = %08lx , count = %08lx \n", shift, count); + + _switch_to_ase_page(dma_h, dev_addr); + uint64_t dev_aligned_addr = (dev_addr - (dev_addr % QWORD_BYTES)) & DMA_ADDR_SPAN_EXT_WINDOW_MASK; + + // read data from device memory + uint64_t read_tmp = 0; + res = MMIORead64Blk(dma_h, ASE_DATA_BASE(dma_h) + dev_aligned_addr, (uint64_t)&read_tmp, sizeof(read_tmp)); + if (res != FPGA_OK) return res; + + // overlay our data + memcpy_s_fast(((char *)(&read_tmp)) + shift, count, (void *)host_addr, count); + + // write back to device + res = MMIOWrite64Blk(dma_h, ASE_DATA_BASE(dma_h) + dev_aligned_addr, (uint64_t)&read_tmp, sizeof(read_tmp)); + if (res != FPGA_OK) return res; + + return res; +} + +/** + * _write_memory_mmio + * + * @brief Writes to a DWORD/QWORD aligned memory address(FPGA address). + * @param[in] dma Handle to the FPGA DMA object + * @param[in/out] dst_ptr Pointer to the FPGA address + * @param[in/out] src_ptr Pointer to the Host buffer address + * @param[in/out] count Pointer to the Size in bytes + * @return fpga_result FPGA_OK on success, return code otherwise. 
Updates src, dst, and count + * + */ +static fpga_result _write_memory_mmio(fpga_dma_handle dma_h, uint64_t *dst_ptr, uint64_t *src_ptr, uint64_t *count) { + fpga_result res = FPGA_OK; + + if (*count < DWORD_BYTES) return res; + + assert(*count >= DWORD_BYTES); + assert(IS_ALIGNED_DWORD(*dst_ptr)); + if (!IS_ALIGNED_DWORD(*dst_ptr)) // If QWORD aligned, this will be true + return FPGA_EXCEPTION; + + uint64_t src = *src_ptr; + uint64_t dst = *dst_ptr; + uint64_t align_bytes = *count; + uint64_t offset = 0; + + if (!IS_ALIGNED_QWORD(dst)) { + // Write out a single DWORD to get QWORD aligned + _switch_to_ase_page(dma_h, dst); + offset = dst & DMA_ADDR_SPAN_EXT_WINDOW_MASK; + res = MMIOWrite32Blk(dma_h, ASE_DATA_BASE(dma_h) + offset, (uint64_t)src, DWORD_BYTES); + ON_ERR_RETURN(res, "MMIOWrite32Blk"); + src += DWORD_BYTES; + dst += DWORD_BYTES; + align_bytes -= DWORD_BYTES; + } + + if (0 == align_bytes) return res; + + assert(IS_ALIGNED_QWORD(dst)); + + // Write out blocks of 64-bit values + while (align_bytes >= QWORD_BYTES) { + uint64_t left_in_page = DMA_ADDR_SPAN_EXT_WINDOW; + left_in_page -= dst & DMA_ADDR_SPAN_EXT_WINDOW_MASK; + uint64_t size_to_copy = min(left_in_page, (align_bytes & ~(QWORD_BYTES - 1))); + if (size_to_copy < QWORD_BYTES) break; + _switch_to_ase_page(dma_h, dst); + offset = dst & DMA_ADDR_SPAN_EXT_WINDOW_MASK; + res = MMIOWrite64Blk(dma_h, ASE_DATA_BASE(dma_h) + offset, (uint64_t)src, size_to_copy); + ON_ERR_RETURN(res, "MMIOWrite64Blk"); + src += size_to_copy; + dst += size_to_copy; + align_bytes -= size_to_copy; + } + + if (align_bytes >= DWORD_BYTES) { + // Write out remaining DWORD + _switch_to_ase_page(dma_h, dst); + offset = dst & DMA_ADDR_SPAN_EXT_WINDOW_MASK; + res = MMIOWrite32Blk(dma_h, ASE_DATA_BASE(dma_h) + offset, (uint64_t)src, DWORD_BYTES); + ON_ERR_RETURN(res, "MMIOWrite32Blk"); + src += DWORD_BYTES; + dst += DWORD_BYTES; + align_bytes -= DWORD_BYTES; + } + + assert(align_bytes < DWORD_BYTES); + + *src_ptr = src; + *dst_ptr = 
dst; + *count = align_bytes; + return res; +} + +/** + * _ase_host_to_fpga + * + * @brief Tx "count" bytes from HOST to FPGA using Address span expander(ASE)- will internally make + * calls to handle unaligned and aligned MMIO writes. + * @param[in] dma Handle to the FPGA DMA object + * @param[in/out] dst_ptr Pointer to the FPGA address + * @param[in/out] src_ptr Pointer to the Host buffer address + * @param[in] count Size in bytes + * @return fpga_result FPGA_OK on success, return code otherwise. Updates src and dst + * + */ +static fpga_result _ase_host_to_fpga(fpga_dma_handle dma_h, uint64_t *dst_ptr, uint64_t *src_ptr, uint64_t count) { + fpga_result res = FPGA_OK; + uint64_t dst = *dst_ptr; + uint64_t src = *src_ptr; + uint64_t count_left = count; + uint64_t unaligned_size = 0; + + debug_print("dst_ptr = %08lx , count = %08lx, src = %08lx \n", *dst_ptr, count, *src_ptr); + + // Aligns address to 8 byte using dst masking method + if (!IS_ALIGNED_DWORD(dst) && !IS_ALIGNED_QWORD(dst)) { + unaligned_size = QWORD_BYTES - (dst % QWORD_BYTES); + if (unaligned_size > count_left) unaligned_size = count_left; + res = _write_memory_mmio_unaligned(dma_h, dst, src, unaligned_size); + if (res != FPGA_OK) return res; + count_left -= unaligned_size; + src += unaligned_size; + dst += unaligned_size; + } + // Handles 8/4 byte MMIO transfer + res = _write_memory_mmio(dma_h, &dst, &src, &count_left); + if (res != FPGA_OK) return res; + + // Left over unaligned count bytes are transfered using dst masking method + unaligned_size = QWORD_BYTES - (dst % QWORD_BYTES); + if (unaligned_size > count_left) unaligned_size = count_left; + + res = _write_memory_mmio_unaligned(dma_h, dst, src, unaligned_size); + if (res != FPGA_OK) return res; + + count_left -= unaligned_size; + + *dst_ptr = dst + unaligned_size; + *src_ptr = src + unaligned_size; + + return FPGA_OK; +} + +/** + * _read_memory_mmio + * + * @brief Reads a DWORD/QWORD aligned memory address(FPGA address). 
+ * @param[in] dma Handle to the FPGA DMA object + * @param[in/out] dst_ptr Pointer to the Host Buffer Address + * @param[in/out] src_ptr Pointer to the FPGA address + * @param[in/out] count Pointer to the size in bytes + * @return fpga_result FPGA_OK on success, return code otherwise. Updates src, dst, and count + * + */ +static fpga_result _read_memory_mmio(fpga_dma_handle dma_h, uint64_t *src_ptr, uint64_t *dst_ptr, uint64_t *count) { + fpga_result res = FPGA_OK; + + if (*count < DWORD_BYTES) return res; + + assert(*count >= DWORD_BYTES); + assert(IS_ALIGNED_DWORD(*src_ptr)); + if (!IS_ALIGNED_DWORD(*src_ptr)) // If QWORD aligned, this will be true + return FPGA_EXCEPTION; + + uint64_t src = *src_ptr; + uint64_t dst = *dst_ptr; + uint64_t align_bytes = *count; + uint64_t offset = 0; + + if (!IS_ALIGNED_QWORD(src)) { + // Read a single DWORD to get QWORD aligned + _switch_to_ase_page(dma_h, src); + offset = src & DMA_ADDR_SPAN_EXT_WINDOW_MASK; + res = MMIORead32Blk(dma_h, ASE_DATA_BASE(dma_h) + offset, (uint64_t)dst, DWORD_BYTES); + ON_ERR_RETURN(res, "MMIORead32Blk"); + src += DWORD_BYTES; + dst += DWORD_BYTES; + align_bytes -= DWORD_BYTES; + } + + if (0 == align_bytes) return res; + + assert(IS_ALIGNED_QWORD(src)); + + // Read blocks of 64-bit values + while (align_bytes >= QWORD_BYTES) { + uint64_t left_in_page = DMA_ADDR_SPAN_EXT_WINDOW; + left_in_page -= src & DMA_ADDR_SPAN_EXT_WINDOW_MASK; + uint64_t size_to_copy = min(left_in_page, (align_bytes & ~(QWORD_BYTES - 1))); + if (size_to_copy < QWORD_BYTES) break; + _switch_to_ase_page(dma_h, src); + offset = src & DMA_ADDR_SPAN_EXT_WINDOW_MASK; + res = MMIORead64Blk(dma_h, ASE_DATA_BASE(dma_h) + offset, (uint64_t)dst, size_to_copy); + ON_ERR_RETURN(res, "MMIORead64Blk"); + src += size_to_copy; + dst += size_to_copy; + align_bytes -= size_to_copy; + } + + if (align_bytes >= DWORD_BYTES) { + // Read remaining DWORD + _switch_to_ase_page(dma_h, src); + offset = src & DMA_ADDR_SPAN_EXT_WINDOW_MASK; + res = 
MMIORead32Blk(dma_h, ASE_DATA_BASE(dma_h) + offset, (uint64_t)dst, DWORD_BYTES); + ON_ERR_RETURN(res, "MMIORead32Blk"); + src += DWORD_BYTES; + dst += DWORD_BYTES; + align_bytes -= DWORD_BYTES; + } + + assert(align_bytes < DWORD_BYTES); + + *src_ptr = src; + *dst_ptr = dst; + *count = align_bytes; + return res; +} + +/** + * _ase_fpga_to_host + * + * @brief Tx "count" bytes from FPGA to HOST using Address span expander(ASE)- will internally make + * calls to handle unaligned and aligned MMIO writes. + * @param[in] dma Handle to the FPGA DMA object + * @param[in/out] dst_ptr Pointer to the Host Buffer Address + * @param[in/out] src_ptr Pointer to the FPGA address + * @param[in/out] count Size in bytes + * @return fpga_result FPGA_OK on success, return code otherwise. Updates src and dst + * + */ +static fpga_result _ase_fpga_to_host(fpga_dma_handle dma_h, uint64_t *src_ptr, uint64_t *dst_ptr, uint64_t count) { + fpga_result res = FPGA_OK; + uint64_t src = *src_ptr; + uint64_t dst = *dst_ptr; + uint64_t count_left = count; + uint64_t unaligned_size = 0; + + debug_print("dst_ptr = %08lx , count = %08lx, src = %08lx \n", *dst_ptr, count, *src_ptr); + + // Aligns address to 8 byte using src masking method + if (!IS_ALIGNED_DWORD(src) && !IS_ALIGNED_QWORD(src)) { + unaligned_size = QWORD_BYTES - (src % QWORD_BYTES); + if (unaligned_size > count_left) unaligned_size = count_left; + res = _read_memory_mmio_unaligned(dma_h, src, dst, unaligned_size); + if (res != FPGA_OK) return res; + count_left -= unaligned_size; + dst += unaligned_size; + src += unaligned_size; + } + // Handles 8/4 byte MMIO transfer + res = _read_memory_mmio(dma_h, &src, &dst, &count_left); + if (res != FPGA_OK) return res; + + // Left over unaligned count bytes are transfered using src masking method + unaligned_size = QWORD_BYTES - (src % QWORD_BYTES); + if (unaligned_size > count_left) unaligned_size = count_left; + + res = _read_memory_mmio_unaligned(dma_h, src, dst, unaligned_size); + if (res != 
FPGA_OK) return res; + + count_left -= unaligned_size; + + *dst_ptr = dst + unaligned_size; + *src_ptr = src + unaligned_size; + + return FPGA_OK; +} + +static fpga_result clear_interrupt(fpga_dma_handle dma_h) { + // clear interrupt by writing 1 to IRQ bit in status register + msgdma_status_t status = {0}; + status.st.irq = 1; + + return MMIOWrite32Blk(dma_h, CSR_STATUS(dma_h), (uint64_t)&status.reg, sizeof(status.reg)); +} + +static fpga_result poll_interrupt(fpga_dma_handle dma_h) { + struct pollfd pfd = {0}; + msgdma_status_t status = { 0 }; + fpga_result res = FPGA_OK; + int poll_res; + + res = fpgaGetOSObjectFromEventHandle(dma_h->eh, &pfd.fd); + ON_ERR_GOTO(res, out, "fpgaGetOSObjectFromEventHandle failed\n"); + + pfd.events = POLLIN; + +#ifdef CHECK_DELAYS + if (0 == poll(&pfd, 1, 0)) poll_wait_count++; +#endif + poll_res = poll(&pfd, 1, FPGA_DMA_TIMEOUT_MSEC); + MMIORead32Blk(dma_h, CSR_STATUS(dma_h), (uint64_t)& status.reg, sizeof(status.reg)); + if (poll_res < 0) { + fprintf(stderr, "Poll error errno = %s DMA status reg: 0x%x\n", strerror(errno), status.reg); + res = FPGA_EXCEPTION; + goto out; + } else if (poll_res == 0) { + fprintf(stderr, "Poll(interrupt) timeout DMA status reg: 0x%x\n", status.reg); + res = FPGA_EXCEPTION; + } else { + uint64_t count = 0; + ssize_t bytes_read = read(pfd.fd, &count, sizeof(count)); + if (bytes_read > 0) { + debug_print("Poll success. 
Return = %d, count = %d\n", poll_res, (int)count); + res = FPGA_OK; + } else { + fprintf(stderr, "Error: poll failed read: zero bytes read"); + res = FPGA_EXCEPTION; + } + } + +out: + clear_interrupt(dma_h); + return res; +} + +static fpga_result _issue_magic(fpga_dma_handle dma_h) { + fpga_result res = FPGA_OK; + *(dma_h->magic_buf) = 0x0ULL; + + res = _do_dma(dma_h, + dma_h->magic_iova | FPGA_DMA_WF_HOST_MASK, + FPGA_DMA_WF_ROM_MAGIC_NO_MASK, + 64, + 1, + FPGA_TO_HOST_MM, + FPGA2HOST_IRQ_REQ /*intr_en */); + return res; +} + +static void _wait_magic(fpga_dma_handle dma_h) { +#ifndef SKIP_FPGA2HOST_IRQ + poll_interrupt(dma_h); +#endif + while (*(dma_h->magic_buf) != FPGA_DMA_WF_MAGIC_NO) + ; + *(dma_h->magic_buf) = 0x0ULL; +} + +fpga_result transferHostToFpga( + fpga_dma_handle dma_h, uint64_t dst, uint64_t src, size_t count, fpga_dma_transfer_t type) { + fpga_result res = FPGA_OK; + uint64_t i = 0; + uint64_t count_left = count; + uint64_t aligned_addr = 0; + uint64_t align_bytes = 0; + int issued_intr = 0; + debug_print("Host To Fpga ----------- src = %08lx, dst = %08lx \n", src, dst); + if (!IS_DMA_ALIGNED(dst)) { + if (count_left < FPGA_DMA_ALIGN_BYTES) { + res = _ase_host_to_fpga(dma_h, &dst, &src, count_left); + ON_ERR_GOTO(res, out, "HOST_TO_FPGA_MM Transfer failed\n"); + return res; + } else { + aligned_addr = ((dst / FPGA_DMA_ALIGN_BYTES) + 1) * FPGA_DMA_ALIGN_BYTES; + align_bytes = aligned_addr - dst; + res = _ase_host_to_fpga(dma_h, &dst, &src, align_bytes); + ON_ERR_GOTO(res, out, "HOST_TO_FPGA_MM Transfer failed\n"); + count_left = count_left - align_bytes; + } + } + if (count_left) { + uint32_t dma_chunks = count_left / FPGA_DMA_BUF_SIZE; + count_left -= (dma_chunks * FPGA_DMA_BUF_SIZE); + debug_print( + "DMA TX : dma chuncks = %d, count_left = %08lx, dst = %08lx, src = %08lx \n", dma_chunks, count_left, dst, src); + + for (i = 0; i < dma_chunks; i++) { + // constant size transfer, no length check required for memcpy + 
memcpy_s_fast(dma_h->dma_buf_ptr[i % FPGA_DMA_MAX_BUF], + FPGA_DMA_BUF_SIZE, + (void *)(src + i * FPGA_DMA_BUF_SIZE), + FPGA_DMA_BUF_SIZE); + // The value of FPGA_DMA_MAX_BUF is 2. Thus FPGA_DMA_MAX_BUF/2 -- 1, so the comparison + // is always i % 1 == 0, which will always be true. This means that the i == (dma_chunks -1) + // portion of the conditional will never be reached. However, for clarity and in case + // FPGA_DMA_MAX_BUF changes, I will leave the conditional as is and apply a coverity supression + // coverity[deadcode:FALSE] + if ((i % (FPGA_DMA_MAX_BUF / 2) == (FPGA_DMA_MAX_BUF / 2) - 1) || i == (dma_chunks - 1) /*last descriptor */) { + if (i == (FPGA_DMA_MAX_BUF / 2) - 1) { + res = _do_dma(dma_h, + (dst + i * FPGA_DMA_BUF_SIZE), + dma_h->dma_buf_iova[i % FPGA_DMA_MAX_BUF] | FPGA_DMA_HOST_MASK, + FPGA_DMA_BUF_SIZE, + 0, + type, + true); + } else { + if (issued_intr) poll_interrupt(dma_h); + res = _do_dma(dma_h, + (dst + i * FPGA_DMA_BUF_SIZE), + dma_h->dma_buf_iova[i % FPGA_DMA_MAX_BUF] | FPGA_DMA_HOST_MASK, + FPGA_DMA_BUF_SIZE, + 0, + type, + true /*intr_en */); + } + issued_intr = 1; + } else { + res = _do_dma(dma_h, + (dst + i * FPGA_DMA_BUF_SIZE), + dma_h->dma_buf_iova[i % FPGA_DMA_MAX_BUF] | FPGA_DMA_HOST_MASK, + FPGA_DMA_BUF_SIZE, + 0, + type, + false /*intr_en */); + } + } + if (issued_intr) { + poll_interrupt(dma_h); + issued_intr = 0; + } + if (count_left) { + uint64_t dma_tx_bytes = (count_left / FPGA_DMA_ALIGN_BYTES) * FPGA_DMA_ALIGN_BYTES; + if (dma_tx_bytes != 0) { + debug_print("dma_tx_bytes = %08lx was transfered using DMA\n", dma_tx_bytes); + if (dma_tx_bytes > FPGA_DMA_BUF_SIZE) { + res = FPGA_NO_MEMORY; + ON_ERR_GOTO(res, out, "Illegal transfer size\n"); + } + + memcpy_s_fast( + dma_h->dma_buf_ptr[0], dma_tx_bytes, (void *)(src + dma_chunks * FPGA_DMA_BUF_SIZE), dma_tx_bytes); + res = _do_dma(dma_h, + (dst + dma_chunks * FPGA_DMA_BUF_SIZE), + dma_h->dma_buf_iova[0] | FPGA_DMA_HOST_MASK, + dma_tx_bytes, + 1, + type, + true /*intr_en */); 
+ ON_ERR_GOTO(res, out, "HOST_TO_FPGA_MM Transfer failed\n"); + poll_interrupt(dma_h); + } + count_left -= dma_tx_bytes; + if (count_left) { + dst = dst + dma_chunks * FPGA_DMA_BUF_SIZE + dma_tx_bytes; + src = src + dma_chunks * FPGA_DMA_BUF_SIZE + dma_tx_bytes; + res = _ase_host_to_fpga(dma_h, &dst, &src, count_left); + ON_ERR_GOTO(res, out, "HOST_TO_FPGA_MM Transfer failed\n"); + } + } + } +out: + return res; +} + +fpga_result transferFpgaToHost( + fpga_dma_handle dma_h, uint64_t dst, uint64_t src, size_t count, fpga_dma_transfer_t type) { + fpga_result res = FPGA_OK; + uint64_t i = 0; + uint64_t j = 0; + uint64_t count_left = count; + uint64_t aligned_addr = 0; + uint64_t align_bytes = 0; + int wf_issued = 0; + + debug_print("FPGA To Host ----------- src = %08lx, dst = %08lx \n", src, dst); + if (!IS_DMA_ALIGNED(src)) { + if (count_left < FPGA_DMA_ALIGN_BYTES) { + res = _ase_fpga_to_host(dma_h, &src, &dst, count_left); + ON_ERR_GOTO(res, out, "FPGA_TO_HOST_MM Transfer failed"); + return res; + } else { + aligned_addr = ((src / FPGA_DMA_ALIGN_BYTES) + 1) * FPGA_DMA_ALIGN_BYTES; + align_bytes = aligned_addr - src; + res = _ase_fpga_to_host(dma_h, &src, &dst, align_bytes); + ON_ERR_GOTO(res, out, "FPGA_TO_HOST_MM Transfer failed"); + count_left = count_left - align_bytes; + } + } + if (count_left) { + uint32_t dma_chunks = count_left / FPGA_DMA_BUF_SIZE; + count_left -= (dma_chunks * FPGA_DMA_BUF_SIZE); + debug_print( + "DMA TX : dma chunks = %d, count_left = %08lx, dst = %08lx, src = %08lx \n", dma_chunks, count_left, dst, src); + uint64_t pending_buf = 0; + for (i = 0; i < dma_chunks; i++) { + res = _do_dma(dma_h, + dma_h->dma_buf_iova[i % (FPGA_DMA_MAX_BUF)] | FPGA_DMA_HOST_MASK, + (src + i * FPGA_DMA_BUF_SIZE), + FPGA_DMA_BUF_SIZE, + 1, + type, + false /*intr_en */); + ON_ERR_GOTO(res, out, "FPGA_TO_HOST_MM Transfer failed"); + + const int num_pending = i - pending_buf + 1; + if (num_pending == (FPGA_DMA_MAX_BUF / 2)) { // Enters this loop only once,after first 
batch of descriptors. + res = _issue_magic(dma_h); + ON_ERR_GOTO(res, out, "Magic number issue failed"); + wf_issued = 1; + } + if (num_pending > (FPGA_DMA_MAX_BUF - 1) || i == (dma_chunks - 1) /*last descriptor */) { + if (wf_issued) { + _wait_magic(dma_h); + for (j = 0; j < (FPGA_DMA_MAX_BUF / 2); j++) { + // constant size transfer; no length check required + memcpy_s_fast((void *)(dst + pending_buf * FPGA_DMA_BUF_SIZE), + FPGA_DMA_BUF_SIZE, + dma_h->dma_buf_ptr[pending_buf % (FPGA_DMA_MAX_BUF)], + FPGA_DMA_BUF_SIZE); + pending_buf++; + } + wf_issued = 0; + } + res = _issue_magic(dma_h); + ON_ERR_GOTO(res, out, "Magic number issue failed"); + wf_issued = 1; + } + } + + if (wf_issued) _wait_magic(dma_h); + + // clear out final dma memcpy operations + while (pending_buf < dma_chunks) { + // constant size transfer; no length check required + memcpy_s_fast((void *)(dst + pending_buf * FPGA_DMA_BUF_SIZE), + FPGA_DMA_BUF_SIZE, + dma_h->dma_buf_ptr[pending_buf % (FPGA_DMA_MAX_BUF)], + FPGA_DMA_BUF_SIZE); + pending_buf++; + } + if (count_left > 0) { + uint64_t dma_tx_bytes = (count_left / FPGA_DMA_ALIGN_BYTES) * FPGA_DMA_ALIGN_BYTES; + if (dma_tx_bytes != 0) { + debug_print("dma_tx_bytes = %08lx was transfered using DMA\n", dma_tx_bytes); + res = _do_dma(dma_h, + dma_h->dma_buf_iova[0] | FPGA_DMA_HOST_MASK, + (src + dma_chunks * FPGA_DMA_BUF_SIZE), + dma_tx_bytes, + 1, + type, + false /*intr_en */); + ON_ERR_GOTO(res, out, "FPGA_TO_HOST_MM Transfer failed"); + res = _issue_magic(dma_h); + ON_ERR_GOTO(res, out, "Magic number issue failed"); + _wait_magic(dma_h); + if (dma_tx_bytes > FPGA_DMA_BUF_SIZE) { + res = FPGA_NO_MEMORY; + ON_ERR_GOTO(res, out, "Illegal transfer size\n"); + } + memcpy_s_fast( + (void *)(dst + dma_chunks * FPGA_DMA_BUF_SIZE), dma_tx_bytes, dma_h->dma_buf_ptr[0], dma_tx_bytes); + } + count_left -= dma_tx_bytes; + if (count_left) { + dst = dst + dma_chunks * FPGA_DMA_BUF_SIZE + dma_tx_bytes; + src = src + dma_chunks * FPGA_DMA_BUF_SIZE + dma_tx_bytes; 
+ res = _ase_fpga_to_host(dma_h, &src, &dst, count_left); + ON_ERR_GOTO(res, out, "FPGA_TO_HOST_MM Transfer failed"); + } + } + } +out: + return res; +} + +fpga_result transferFpgaToFpga( + fpga_dma_handle dma_h, uint64_t dst, uint64_t src, size_t count, fpga_dma_transfer_t type) { + fpga_result res = FPGA_OK; + uint64_t i = 0; + uint64_t count_left = count; + uint64_t *tmp_buf = NULL; + if (IS_DMA_ALIGNED(dst) && IS_DMA_ALIGNED(src) && IS_DMA_ALIGNED(count_left)) { + uint32_t dma_chunks = count_left / FPGA_DMA_BUF_SIZE; + count_left -= (dma_chunks * FPGA_DMA_BUF_SIZE); + debug_print("!!!FPGA to FPGA!!! TX :dma chunks = %d, count = %08lx, dst = %08lx, src = %08lx \n", + dma_chunks, + count_left, + dst, + src); + + for (i = 0; i < dma_chunks; i++) { + res = _do_dma(dma_h, + (dst + i * FPGA_DMA_BUF_SIZE), + (src + i * FPGA_DMA_BUF_SIZE), + FPGA_DMA_BUF_SIZE, + 0, + type, + false /*intr_en */); + ON_ERR_GOTO(res, out, "FPGA_TO_FPGA_MM Transfer failed"); + if ((i + 1) % FPGA_DMA_MAX_BUF == 0 || i == (dma_chunks - 1) /*last descriptor */) { + res = _issue_magic(dma_h); + ON_ERR_GOTO(res, out, "Magic number issue failed"); + _wait_magic(dma_h); + } + } + if (count_left > 0) { + debug_print("Count_left = %08lx was transfered using DMA\n", count_left); + res = _do_dma(dma_h, + (dst + dma_chunks * FPGA_DMA_BUF_SIZE), + (src + dma_chunks * FPGA_DMA_BUF_SIZE), + count_left, + 1, + type, + false /*intr_en */); + ON_ERR_GOTO(res, out, "FPGA_TO_FPGA_MM Transfer failed"); + res = _issue_magic(dma_h); + ON_ERR_GOTO(res, out, "Magic number issue failed"); + _wait_magic(dma_h); + } + } else { + if ((src < dst) && (src + count_left >= dst)) { + debug_print("Overlapping addresses, Provide correct dst address\n"); + return FPGA_NOT_SUPPORTED; + } + uint32_t tx_chunks = count_left / FPGA_DMA_BUF_ALIGN_SIZE; + count_left -= (tx_chunks * FPGA_DMA_BUF_ALIGN_SIZE); + debug_print("!!!FPGA to FPGA TX!!! 
: tx chunks = %d, count = %08lx, dst = %08lx, src = %08lx \n", + tx_chunks, + count_left, + dst, + src); + tmp_buf = (uint64_t *)malloc(FPGA_DMA_BUF_ALIGN_SIZE); + for (i = 0; i < tx_chunks; i++) { + res = transferFpgaToHost( + dma_h, (uint64_t)tmp_buf, (src + i * FPGA_DMA_BUF_ALIGN_SIZE), FPGA_DMA_BUF_ALIGN_SIZE, FPGA_TO_HOST_MM); + ON_ERR_GOTO(res, out_spl, "FPGA_TO_FPGA_MM Transfer failed"); + res = transferHostToFpga( + dma_h, (dst + i * FPGA_DMA_BUF_ALIGN_SIZE), (uint64_t)tmp_buf, FPGA_DMA_BUF_ALIGN_SIZE, HOST_TO_FPGA_MM); + ON_ERR_GOTO(res, out_spl, "FPGA_TO_FPGA_MM Transfer failed"); + } + if (count_left > 0) { + res = transferFpgaToHost( + dma_h, (uint64_t)tmp_buf, (src + tx_chunks * FPGA_DMA_BUF_ALIGN_SIZE), count_left, FPGA_TO_HOST_MM); + ON_ERR_GOTO(res, out_spl, "FPGA_TO_FPGA_MM Transfer failed"); + res = transferHostToFpga( + dma_h, (dst + tx_chunks * FPGA_DMA_BUF_ALIGN_SIZE), (uint64_t)tmp_buf, count_left, HOST_TO_FPGA_MM); + ON_ERR_GOTO(res, out_spl, "FPGA_TO_FPGA_MM Transfer failed"); + } + free(tmp_buf); + } +out: + return res; +out_spl: + free(tmp_buf); + return res; +} + +fpga_result fpgaDmaTransferSync( + fpga_dma_handle dma_h, uint64_t dst, uint64_t src, size_t count, fpga_dma_transfer_t type) { + fpga_result res = FPGA_OK; + + if (!dma_h) return FPGA_INVALID_PARAM; + + if (type >= FPGA_MAX_TRANSFER_TYPE) return FPGA_INVALID_PARAM; + + if (!dma_h->fpga_h) return FPGA_INVALID_PARAM; + + if (type == HOST_TO_FPGA_MM) { + res = transferHostToFpga(dma_h, dst, src, count, HOST_TO_FPGA_MM); + } else if (type == FPGA_TO_HOST_MM) { + res = transferFpgaToHost(dma_h, dst, src, count, FPGA_TO_HOST_MM); + } else if (type == FPGA_TO_FPGA_MM) { + res = transferFpgaToFpga(dma_h, dst, src, count, FPGA_TO_FPGA_MM); + } else { + // Should not be possible, since we have handled all fpga_dma_transfer_t types + assert(0); + } + + return res; +} + +fpga_result fpgaDmaTransferAsync(fpga_dma_handle dma, + uint64_t dst, + uint64_t src, + size_t count, + 
fpga_dma_transfer_t type, + fpga_dma_transfer_cb cb, + void *context) { + // TODO + return FPGA_NOT_SUPPORTED; +} + +fpga_result fpgaDmaClose(fpga_dma_handle dma_h) { + fpga_result res = FPGA_OK; + int i = 0; + if (!dma_h) { + res = FPGA_INVALID_PARAM; + goto out; + } + + if (!dma_h->fpga_h) { + res = FPGA_INVALID_PARAM; + goto out; + } + + for (i = 0; i < FPGA_DMA_MAX_BUF; i++) { + res = fpgaReleaseBuffer(dma_h->fpga_h, dma_h->dma_buf_wsid[i]); + ON_ERR_GOTO(res, out, "fpgaReleaseBuffer failed"); + } + + res = fpgaReleaseBuffer(dma_h->fpga_h, dma_h->magic_wsid); + ON_ERR_GOTO(res, out, "fpgaReleaseBuffer"); + + fpgaUnregisterEvent(dma_h->fpga_h, FPGA_EVENT_INTERRUPT, dma_h->eh); + fpgaDestroyEventHandle(&dma_h->eh); + + // turn off global interrupts + msgdma_ctrl_t ctrl = {0}; + ctrl.ct.global_intr_en_mask = 0; + res = MMIOWrite32Blk(dma_h, CSR_CONTROL(dma_h), (uint64_t)&ctrl.reg, sizeof(ctrl.reg)); + ON_ERR_GOTO(res, out, "MMIOWrite32Blk"); + +out: + free((void *)dma_h); + return res; +} diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/fpga_dma.h b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/fpga_dma.h new file mode 100644 index 0000000..e382696 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/fpga_dma.h @@ -0,0 +1,141 @@ +// Copyright 2017-2020 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. 
+ +// This is derived from OPAE + OpenCL PAC BSP + +/** + * \fpga_dma.h + * \brief FPGA DMA BBB API Header + * + * Known Limitations + * - Supports only synchronous (blocking) transfers + */ + +#ifndef __FPGA_DMA_H__ +#define __FPGA_DMA_H__ + +#include <opae/fpga.h> + +//#define DEBUG_MEM 1 +//#define FPGA_DMA_DEBUG 1 +#define SKIP_FPGA2HOST_IRQ 1 +#ifdef SKIP_FPGA2HOST_IRQ +#define FPGA2HOST_IRQ_REQ false +#else +#define FPGA2HOST_IRQ_REQ true +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * The DMA driver supports host to FPGA, FPGA to host and FPGA + * to FPGA transfers. The FPGA interface can be streaming + * or memory-mapped. Streaming interfaces are not currently + * supported. + */ +typedef enum { + HOST_TO_FPGA_MM = 0, // Memory mapped FPGA interface + FPGA_TO_HOST_MM, // Memory mapped FPGA interface + FPGA_TO_FPGA_MM, // Memory mapped FPGA interface + FPGA_MAX_TRANSFER_TYPE, +} fpga_dma_transfer_t; + +typedef struct _dma_handle_t *fpga_dma_handle; + +// Callback for asynchronous DMA transfers +typedef void (*fpga_dma_transfer_cb)(void *context); + +/** + * fpgaDmaOpen + * + * @brief Open a handle to DMA BBB. + * Scans the device feature chain looking for a DMA BBB. + * + * @param[in] fpga Handle to the FPGA AFU object obtained via fpgaOpen() + * @param[in] dma_base to DMA channel DFH + * @param[in] interrupt_num interrupt number assigned to DMA channel + * @param[out] dma DMA object handle + * @returns FPGA_OK on success, return code otherwise + */ +fpga_result fpgaDmaChannelOpen(fpga_handle fpga, uint64_t dma_base, int interrupt_num, fpga_dma_handle *dma); + +/** + * fpgaDmaTransferSync + * + * @brief Perform a blocking copy of 'count' bytes from memory area pointed + * by src to memory area pointed by dst where fpga_dma_transfer_t specifies the + * type of memory transfer. 
+ * @param[in] dma Handle to the FPGA DMA object + * @param[in] dst Address of the destination buffer + * @param[in] src Address of the source buffer + * @param[in] count Size in bytes + * @param[in] type Must be one of the following values: + * HOST_TO_FPGA_MM - Copy data from host memory to memory mapped FPGA interface. + * User must specify valid src and dst. + * FPGA_TO_HOST_MM - Copy data from memory mapped FPGA interface to host memory + * User must specify valid src and dst. + * FPGA_TO_FPGA_MM - Copy data between memory mapped FPGA interfaces + * User must specify valid src and dst. + * @return fpga_result FPGA_OK on success, return code otherwise + * + */ +fpga_result fpgaDmaTransferSync( + fpga_dma_handle dma, uint64_t dst, uint64_t src, size_t count, fpga_dma_transfer_t type); + +/** + * fpgaDmaTransferAsync (Not supported) + * + * @brief Perform a non-blocking copy of 'count' bytes from memory area pointed + * by src to memory area pointed by dst where fpga_dma_transfer_t specifies the + * type of memory transfer. + * @param[in] dma Handle to the FPGA DMA object + * @param[in] dst Address of the destination buffer + * @param[in] src Address of the source buffer + * @param[in] count Size in bytes + * @param[in] type Must be one of the following values: + * HOST_TO_FPGA_MM - Copy data from host memory to memory mapped FPGA interface. + * User must specify valid src and dst. + * FPGA_TO_HOST_MM - Copy data from memory mapped FPGA interface to host memory + * User must specify valid src and dst. + * FPGA_TO_FPGA_MM - Copy data between memory mapped FPGA interfaces + * User must specify valid src and dst. 
+ * @param[in] cb Callback to invoke when DMA transfer is complete + * @param[in] context Pointer to define user-defined context + * @return fpga_result FPGA_OK on success, return code otherwise + * + */ +fpga_result fpgaDmaTransferAsync(fpga_dma_handle dma, + uint64_t dst, + uint64_t src, + size_t count, + fpga_dma_transfer_t type, + fpga_dma_transfer_cb cb, + void *context); + +/** + * fpgaDmaClose + * + * @brief Close the DMA BBB handle. + * + * @param[in] dma DMA object handle + * @returns FPGA_OK on success, return code otherwise + */ +fpga_result fpgaDmaClose(fpga_dma_handle dma); + +#ifdef __cplusplus +} +#endif + +#endif // __FPGA_DMA_H__ diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/fpga_dma_internal.h b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/fpga_dma_internal.h new file mode 100644 index 0000000..e4c8373 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/fpga_dma_internal.h @@ -0,0 +1,289 @@ +// Copyright 2018-2020 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. + +// This is derived from OPAE + OpenCL PAC BSP + +/** + * \fpga_dma_internal.h + * \brief FPGA DMA BBB Internal Header + */ + +#ifndef __FPGA_DMA_INT_H__ +#define __FPGA_DMA_INT_H__ + +#include <opae/fpga.h> +#include "x86-sse2.h" + +#ifdef CHECK_DELAYS +#pragma message "Compiled with -DCHECK_DELAYS. 
Not to be used in production" +#endif + +#ifdef FPGA_DMA_DEBUG +#pragma message "Compiled with -DFPGA_DMA_DEBUG. Not to be used in production" +#endif + +#ifndef max +#define max(a, b) \ + ({ \ + __typeof__(a) _a = (a); \ + __typeof__(b) _b = (b); \ + _a > _b ? _a : _b; \ + }) +#endif + +#ifndef min +#define min(a, b) \ + ({ \ + __typeof__(a) _a = (a); \ + __typeof__(b) _b = (b); \ + _a < _b ? _a : _b; \ + }) +#endif + +#define FPGA_DMA_TIMEOUT_MSEC (5000) + +#define QWORD_BYTES 8 +#define DWORD_BYTES 4 +#define IS_ALIGNED_DWORD(addr) (addr % 4 == 0) +#define IS_ALIGNED_QWORD(addr) (addr % 8 == 0) + +#define FPGA_DMA_UUID_H 0xef82def7f6ec40fc +#define FPGA_DMA_UUID_L 0xa9149a35bace01ea +#define FPGA_DMA_WF_MAGIC_NO 0x5772745F53796E63ULL +#define FPGA_DMA_HOST_MASK 0x2000000000000 +#define FPGA_DMA_WF_HOST_MASK 0x3000000000000 +#define FPGA_DMA_WF_ROM_MAGIC_NO_MASK 0x1000000000000 + +#define AFU_DFH_REG 0x0 +#define AFU_DFH_NEXT_OFFSET 16 +#define AFU_DFH_EOL_OFFSET 40 +#define AFU_DFH_TYPE_OFFSET 60 + +// BBB Feature ID (refer CCI-P spec) +#define FPGA_DMA_BBB 0x2 + +// Feature ID for DMA BBB +#define FPGA_DMA_BBB_FEATURE_ID 0x765 + +// DMA Register offsets from base +#define FPGA_DMA_CSR 0x40 +#define FPGA_DMA_DESC 0x60 +#define FPGA_DMA_ADDR_SPAN_EXT_CNTL 0x200 +#define FPGA_DMA_ADDR_SPAN_EXT_DATA 0x1000 + +#define DMA_ADDR_SPAN_EXT_WINDOW (4 * 1024) +#define DMA_ADDR_SPAN_EXT_WINDOW_MASK ((uint64_t)(DMA_ADDR_SPAN_EXT_WINDOW - 1)) + +#define FPGA_DMA_MASK_32_BIT 0xFFFFFFFF + +#define FPGA_DMA_CSR_BUSY (1 << 0) +#define FPGA_DMA_DESC_BUFFER_EMPTY 0x2 +#define FPGA_DMA_DESC_BUFFER_FULL 0x4 + +#define FPGA_DMA_ALIGN_BYTES 64 +#define IS_DMA_ALIGNED(addr) (addr % FPGA_DMA_ALIGN_BYTES == 0) + +#define CSR_BASE(dma_handle) ((uint64_t)dma_handle->dma_csr_base) +#define ASE_DATA_BASE(dma_handle) ((uint64_t)dma_handle->dma_ase_data_base) +#define ASE_CNTL_BASE(dma_handle) ((uint64_t)dma_handle->dma_ase_cntl_base) +#define HOST_MMIO_32_ADDR(dma_handle, offset) \ + 
((volatile uint32_t *)((uint64_t)(dma_handle)->mmio_va + (uint64_t)(offset))) +#define HOST_MMIO_64_ADDR(dma_handle, offset) \ + ((volatile uint64_t *)((uint64_t)(dma_handle)->mmio_va + (uint64_t)(offset))) +#define HOST_MMIO_32(dma_handle, offset) (*HOST_MMIO_32_ADDR(dma_handle, offset)) +#define HOST_MMIO_64(dma_handle, offset) (*HOST_MMIO_64_ADDR(dma_handle, offset)) + +#define CSR_STATUS(dma_h) (CSR_BASE(dma_h) + offsetof(msgdma_csr_t, status)) +#define CSR_CONTROL(dma_h) (CSR_BASE(dma_h) + offsetof(msgdma_csr_t, ctrl)) + +// Granularity of DMA transfer (maximum bytes that can be packed +// in a single descriptor).This value must match configuration of +// the DMA IP. Larger transfers will be broken down into smaller +// transactions. +#define FPGA_DMA_BUF_SIZE (1024 * 1024 * 2UL) +#define FPGA_DMA_BUF_ALIGN_SIZE FPGA_DMA_BUF_SIZE + +// Convenience macros + +#ifdef FPGA_DMA_DEBUG +#define debug_print(fmt, ...) \ + do { \ + if (FPGA_DMA_DEBUG) { \ + fprintf(stderr, "%s (%d) : ", __FUNCTION__, __LINE__); \ + fprintf(stderr, fmt, ##__VA_ARGS__); \ + } \ + } while (0) +#define error_print(fmt, ...) \ + do { \ + fprintf(stderr, "%s (%d) : ", __FUNCTION__, __LINE__); \ + fprintf(stderr, fmt, ##__VA_ARGS__); \ + err_cnt++; \ + } while (0) +#else +#define debug_print(...) +#define error_print(...) 
+#endif + +#define FPGA_DMA_MAX_BUF 2 + +typedef struct __attribute__((__packed__)) { + uint64_t dfh; + uint64_t feature_uuid_lo; + uint64_t feature_uuid_hi; +} dfh_feature_t; + +typedef union { + uint64_t reg; + struct { + uint64_t feature_type : 4; + uint64_t reserved_8 : 8; + uint64_t afu_minor : 4; + uint64_t reserved_7 : 7; + uint64_t end_dfh : 1; + uint64_t next_dfh : 24; + uint64_t afu_major : 4; + uint64_t feature_id : 12; + } bits; +} dfh_reg_t; + +struct _dma_handle_t { + fpga_handle fpga_h; + uint32_t mmio_num; + uint64_t mmio_va; + uint64_t cur_ase_page; + uint64_t dma_base; + uint64_t dma_offset; + uint64_t dma_csr_base; + uint64_t dma_desc_base; + uint64_t dma_ase_cntl_base; + uint64_t dma_ase_data_base; + // Interrupt event handle + fpga_event_handle eh; + // magic number buffer + volatile uint64_t *magic_buf; + uint64_t magic_iova; + uint64_t magic_wsid; + uint64_t *dma_buf_ptr[FPGA_DMA_MAX_BUF]; + uint64_t dma_buf_wsid[FPGA_DMA_MAX_BUF]; + uint64_t dma_buf_iova[FPGA_DMA_MAX_BUF]; +}; + +typedef union { + uint32_t reg; + struct { + uint32_t tx_channel : 8; + uint32_t generate_sop : 1; + uint32_t generate_eop : 1; + uint32_t park_reads : 1; + uint32_t park_writes : 1; + uint32_t end_on_eop : 1; + uint32_t reserved_1 : 1; + uint32_t transfer_irq_en : 1; + uint32_t early_term_irq_en : 1; + uint32_t trans_error_irq_en : 8; + uint32_t early_done_en : 1; + uint32_t reserved_2 : 6; + uint32_t go : 1; + }; +} msgdma_desc_ctrl_t; + +typedef struct __attribute__((__packed__)) { + // 0x0 + uint32_t rd_address; + // 0x4 + uint32_t wr_address; + // 0x8 + uint32_t len; + // 0xC + uint16_t seq_num; + uint8_t rd_burst_count; + uint8_t wr_burst_count; + // 0x10 + uint16_t rd_stride; + uint16_t wr_stride; + // 0x14 + uint32_t rd_address_ext; + // 0x18 + uint32_t wr_address_ext; + // 0x1c + msgdma_desc_ctrl_t control; +} msgdma_ext_desc_t; + +typedef union { + uint32_t reg; + struct { + uint32_t busy : 1; + uint32_t desc_buf_empty : 1; + uint32_t desc_buf_full : 1; + 
uint32_t rsp_buf_empty : 1; + uint32_t rsp_buf_full : 1; + uint32_t stopped : 1; + uint32_t resetting : 1; + uint32_t stopped_on_errror : 1; + uint32_t stopped_on_early_term : 1; + uint32_t irq : 1; + uint32_t reserved : 22; + } st; +} msgdma_status_t; + +typedef union { + uint32_t reg; + struct { + uint32_t stop_dispatcher : 1; + uint32_t reset_dispatcher : 1; + uint32_t stop_on_error : 1; + uint32_t stopped_on_early_term : 1; + uint32_t global_intr_en_mask : 1; + uint32_t stop_descriptors : 1; + uint32_t rsvd : 22; + } ct; +} msgdma_ctrl_t; + +typedef union { + uint32_t reg; + struct { + uint32_t rd_fill_level : 16; + uint32_t wr_fill_level : 16; + } fl; +} msgdma_fill_level_t; + +typedef union { + uint32_t reg; + struct { + uint32_t rsp_fill_level : 16; + uint32_t rsvd : 16; + } rsp; +} msgdma_rsp_level_t; + +typedef union { + uint32_t reg; + struct { + uint32_t rd_seq_num : 16; + uint32_t wr_seq_num : 16; + } seq; +} msgdma_seq_num_t; + +typedef struct __attribute__((__packed__)) { + // 0x0 + msgdma_status_t status; + // 0x4 + msgdma_ctrl_t ctrl; + // 0x8 + msgdma_fill_level_t fill_level; + // 0xc + msgdma_rsp_level_t rsp; + // 0x10 + msgdma_seq_num_t seq_num; +} msgdma_csr_t; + +#endif // __FPGA_DMA_INT_H__ diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/kernel_interrupt.cpp b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/kernel_interrupt.cpp new file mode 100644 index 0000000..206b98a --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/kernel_interrupt.cpp @@ -0,0 +1,278 @@ +/* (C) 1992-2017 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. 
See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. */ + +#include <poll.h> +#include <stdlib.h> + +#include <thread> + +#include "ccip_mmd_device.h" +#include "eventfd_wrapper.h" +#include "kernel_interrupt.h" + +using namespace intel_opae_mmd; + +// if ENABLE_OPENCL_KERNEL_INTERRUPTS is set at compile time, interrupts will +// be enabled. 
+#define ENABLE_OPENCL_KERNEL_INTERRUPTS + +// if ENABLE_OPENCL_KERNEL_POLLING_THREAD is set at compile time, a thread will +// replace yield and the thread will call runtime call back + +// DLA runtime assumes interrupt service routing will run on its own (instead of runtime yielding to MMD) when hardware +// interrupts +#ifdef DLA_MMD +#define ENABLE_OPENCL_KERNEL_POLLING_THREAD +#endif + +// ccip interrupt line that is used for kernel +#define MMD_KERNEL_INTERRUPT_LINE_NUM 1 + +KernelInterrupt::KernelInterrupt(fpga_handle fpga_handle_arg, int mmd_handle) + : m_initialized(false), + m_eventfd_wrapper(NULL), + m_thread(NULL), + m_kernel_interrupt_fn(NULL), + m_kernel_interrupt_user_data(NULL), + m_fpga_handle(fpga_handle_arg), + m_mmd_handle(mmd_handle), + m_event_handle(0) { + enable_interrupts(); +} + +KernelInterrupt::~KernelInterrupt() { disable_interrupts(); } + +void KernelInterrupt::disable_interrupts() { + // kill the thread + if (m_thread) { + // send message to thread to end it + m_eventfd_wrapper->notify(1); + + // join with thread until it ends + m_thread->join(); + + delete m_thread; + m_thread = NULL; + } + + if (m_eventfd_wrapper) { + delete m_eventfd_wrapper; + m_eventfd_wrapper = NULL; + } + + if (m_event_handle) { + fpga_result res; +#ifdef ENABLE_OPENCL_KERNEL_INTERRUPTS + res = fpgaUnregisterEvent(m_fpga_handle, FPGA_EVENT_INTERRUPT, m_event_handle); + if (res != FPGA_OK) { + fprintf(stderr, "error fpgaUnregisterEvent"); + } +#endif + + res = fpgaDestroyEventHandle(&m_event_handle); + if (res != FPGA_OK) { + fprintf(stderr, "error fpgaDestroyEventHandle"); + } + } + + // disable opencl kernel interrupts +#ifndef DLA_MMD // IRQ offsets no longer exist in DLA hardware (removed from board.qsys) + set_interrupt_mask(0x00000000); +#endif + + m_initialized = false; +} + +void KernelInterrupt::enable_interrupts() { + m_eventfd_wrapper = new eventfd_wrapper(); + if (!m_eventfd_wrapper->initialized()) return; + +#ifdef 
ENABLE_OPENCL_KERNEL_POLLING_THREAD + m_thread = new std::thread(interrupt_polling_thread, std::ref(*this)); +#endif + + fpga_result res; + // Create event + res = fpgaCreateEventHandle(&m_event_handle); + if (res != FPGA_OK) { + fprintf(stderr, "error creating event handle"); + return; + } + +#ifdef ENABLE_OPENCL_KERNEL_INTERRUPTS + // Register user interrupt with event handle + res = fpgaRegisterEvent(m_fpga_handle, FPGA_EVENT_INTERRUPT, m_event_handle, MMD_KERNEL_INTERRUPT_LINE_NUM); + if (res != FPGA_OK) { + fprintf(stderr, "error registering event"); + res = fpgaDestroyEventHandle(&m_event_handle); + return; + } + + // enable opencl kernel interrupts +#ifndef DLA_MMD // IRQ offsets no longer exist in DLA hardware (removed from board.qsys) + set_interrupt_mask(0x00000001); +#endif +#endif + + m_initialized = true; +} + +#ifndef DLA_MMD // IRQ offsets no longer exist in DLA hardware (removed from board.qsys) +void KernelInterrupt::set_interrupt_mask(uint32_t intr_mask) { + fpga_result res; + res = fpgaWriteMMIO32(m_fpga_handle, 0, AOCL_IRQ_MASKING_BASE, intr_mask); + if (res != FPGA_OK) { + fprintf(stderr, "Error fpgaWriteMMIO32: %d\n", res); + return; + } +} +#endif + +void KernelInterrupt::interrupt_polling_thread(KernelInterrupt& obj) { + bool thread_is_active = true; + while (thread_is_active) { +#ifdef ENABLE_OPENCL_KERNEL_INTERRUPTS + const int timeout = -1; +#else + const int timeout = 0; + usleep(100); +#endif + thread_is_active = obj.poll_interrupt(timeout); + } +} + +bool KernelInterrupt::poll_interrupt(int poll_timeout_arg) { + fpga_result fpga_res; + + int res; + // get eventfd handles + int intr_fd; + fpga_res = fpgaGetOSObjectFromEventHandle(m_event_handle, &intr_fd); + if (fpga_res != FPGA_OK) { + fprintf(stderr, "error getting event file handle"); + return false; + } + int thread_signal_fd = m_eventfd_wrapper->get_fd(); + + struct pollfd pollfd_arr[2]; + pollfd_arr[0].fd = intr_fd; + pollfd_arr[0].events = POLLIN; + pollfd_arr[0].revents = 0; + 
pollfd_arr[1].fd = thread_signal_fd;
+  pollfd_arr[1].events = POLLIN;
+  pollfd_arr[1].revents = 0;
+  res = poll(pollfd_arr, 2, poll_timeout_arg);
+  if (res < 0) {
+    fprintf(stderr, "Poll error errno = %s\n", strerror(errno));
+    return false;
+  } else if (res > 0 && (pollfd_arr[0].revents & POLLIN)) {
+    // Fix: revents is a bitmask — POLLHUP/POLLERR may be set alongside
+    // POLLIN, so test the POLLIN bit instead of comparing the whole word.
+    // Hardware interrupt fired: drain the eventfd counter so the next
+    // poll() does not return immediately.
+    uint64_t count;
+    ssize_t bytes_read = read(intr_fd, &count, sizeof(count));
+    if (bytes_read > 0) {
+      DEBUG_PRINT("Poll success. Return=%d count=%lu\n", res, count);
+    } else {
+      fprintf(stderr, "Error: poll failed: %s\n", bytes_read < 0 ? strerror(errno) : "zero bytes read");
+      // TODO: remove exit call. Revist this when fixing kernel interrupts
+      exit(-1);
+    }
+  } else if (res > 0 && (pollfd_arr[1].revents & POLLIN)) {
+    // Fix (same as above): bit-test revents rather than equality-compare.
+    // Shutdown notification from disable_interrupts(): consume it and tell
+    // the polling thread to stop by returning false.
+    uint64_t count;
+    ssize_t bytes_read = read(thread_signal_fd, &count, sizeof(count));
+    if (bytes_read > 0) {
+      DEBUG_PRINT("Poll success. Return=%d count=%lu\n", res, count);
+    } else {
+      fprintf(stderr, "Error: poll failed: %s\n", bytes_read < 0 ? strerror(errno) : "zero bytes read");
+      // TODO: remove exit call. 
Revist this when fixing kernel interrupts + exit(-1); + } + return false; + } else { + // no event fd event happened +#ifdef ENABLE_OPENCL_KERNEL_INTERRUPTS + return false; +#endif + } + +#ifdef DLA_MMD + run_kernel_interrupt_fn(); +#else // IRQ offsets no longer exist in DLA hardware (removed from board.qsys) + + // probobly not required for interrupt polling but we poll the interrupt + // csr line to make sure an interrupt was actually triggered + uint32_t irqval = 0; + fpga_res = fpgaReadMMIO32(m_fpga_handle, 0, AOCL_IRQ_POLLING_BASE, &irqval); + if (fpga_res != FPGA_OK) { + fprintf(stderr, "Error fpgaReadMMIO32: %d\n", fpga_res); + return false; + } + + DEBUG_PRINT("irqval: %u\n", irqval); + if (irqval) run_kernel_interrupt_fn(); + +#ifdef ENABLE_OPENCL_KERNEL_INTERRUPTS + // workaround for fb:530016 + // check if irq line is still high and generate another interrupt event + fpga_res = fpgaReadMMIO32(m_fpga_handle, 0, AOCL_IRQ_POLLING_BASE, &irqval); + if (fpga_res != FPGA_OK) { + fprintf(stderr, "Error fpgaReadMMIO32: %d\n", fpga_res); + return false; + } + + // signal intr event fd + if (irqval) { + DEBUG_PRINT("CRITICAL WARNING: irqval has not been cleared by aocl runtime\n"); + uint64_t count = 1; + ssize_t res = write(intr_fd, &count, sizeof(count)); + if (res < 0) { + fprintf(stderr, "eventfd : %s", strerror(errno)); + return false; + } + } +#endif +#endif + + return true; +} + +bool KernelInterrupt::yield_is_enabled() { +#ifdef ENABLE_OPENCL_KERNEL_POLLING_THREAD + return false; +#else + return true; +#endif +} + +void KernelInterrupt::yield() { +#ifdef ENABLE_OPENCL_KERNEL_POLLING_THREAD + usleep(0); +#else + poll_interrupt(0); +#endif +} + +void KernelInterrupt::run_kernel_interrupt_fn() { + if (m_kernel_interrupt_fn) { + m_kernel_interrupt_fn(m_mmd_handle, m_kernel_interrupt_user_data); + } else { + fprintf(stderr, "m_kernel_interrupt_fn is NULL. 
No interrupt handler set!\n"); + } +} + +void KernelInterrupt::set_kernel_interrupt(aocl_mmd_interrupt_handler_fn fn, void* user_data) { + m_kernel_interrupt_fn = fn; + m_kernel_interrupt_user_data = user_data; +} diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/kernel_interrupt.h b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/kernel_interrupt.h new file mode 100644 index 0000000..44e9b50 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/kernel_interrupt.h @@ -0,0 +1,75 @@ +/* (C) 1992-2017 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. 
*/ + +#ifndef _KERNEL_INTERRUPT_H +#define _KERNEL_INTERRUPT_H + +#include <opae/fpga.h> + +#include <atomic> +#include <thread> + +#include "aocl_mmd.h" + +namespace intel_opae_mmd { + +class eventfd_wrapper; + +class KernelInterrupt final { + public: + KernelInterrupt(fpga_handle fpga_handle_arg, int mmd_handle); + ~KernelInterrupt(); + + bool initialized() { return m_initialized; } + + void set_kernel_interrupt(aocl_mmd_interrupt_handler_fn fn, void* user_data); + void yield(); + static bool yield_is_enabled(); + + void enable_interrupts(); + void disable_interrupts(); + + private: +#ifndef DLA_MMD // IRQ offsets no longer exist in DLA hardware (removed from board.qsys) + void set_interrupt_mask(uint32_t intr_mask); +#endif + void run_kernel_interrupt_fn(); + bool poll_interrupt(int poll_timeout_arg); + + static void interrupt_polling_thread(KernelInterrupt& obj); + + bool m_initialized; + eventfd_wrapper* m_eventfd_wrapper; + + std::thread* m_thread; + + aocl_mmd_interrupt_handler_fn m_kernel_interrupt_fn; + void* m_kernel_interrupt_user_data; + + fpga_handle m_fpga_handle; + int m_mmd_handle; + + fpga_event_handle m_event_handle; + + // not used and not implemented + KernelInterrupt(KernelInterrupt& other); + KernelInterrupt& operator=(const KernelInterrupt& other); +}; // class KernelInterrupt + +}; // namespace intel_opae_mmd + +#endif // _KERNEL_INTERRUPT_H diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/memcpy_s_fast.c b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/memcpy_s_fast.c new file mode 100644 index 0000000..65d7f1a --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/memcpy_s_fast.c @@ -0,0 +1,133 @@ +// Copyright 2018-2020 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). 
Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. + +// This is derived from OPAE + OpenCL PAC BSP + +#pragma push_macro("_GNU_SOURCE") +#undef _GNU_SOURCE +#define _GNU_SOURCE + +#include <assert.h> +#include <safe_string/safe_string.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> + +#include "memcpy_s_fast.h" +#include "x86-sse2.h" + +#pragma pop_macro("_GNU_SOURCE") + +static void *memcpy_setup(void *dst, size_t max, const void *src, size_t n); + +memcpy_fn_t p_memcpy = memcpy_setup; // Initial value points to setup routine + +/** + * SSE2_memcpy + * + * @brief memcpy using SSE2 or REP MOVSB + * @param[in] dst Pointer to the destination memory + * @param[in] max Size in bytes of destination + * @param[in] src Pointer to the source memory + * @param[in] n Size in bytes to copy + * @return dst + * + */ +static void *SSE2_memcpy(void *dst, size_t max, const void *src, size_t n) { + assert(n <= max); + + void *ldst = dst; + void *lsrc = (void *)src; + if (IS_CL_ALIGNED(src) && IS_CL_ALIGNED(dst)) // 64-byte aligned + { + if (n >= MIN_SSE2_SIZE) // Arbitrary crossover performance point + { + debug_print("copying 0x%lx bytes with SSE2\n", (uint64_t)ALIGN_TO_CL(n)); + aligned_block_copy_sse2((int64_t * __restrict) dst, (int64_t * __restrict) src, ALIGN_TO_CL(n)); + ldst = (void *)((uint64_t)dst + ALIGN_TO_CL(n)); + lsrc = (void *)((uint64_t)src + ALIGN_TO_CL(n)); + n -= ALIGN_TO_CL(n); + } + } else { + if (n >= MIN_SSE2_SIZE) // Arbitrary crossover performance point + { + debug_print("copying 0x%lx bytes (unaligned) with SSE2\n", (uint64_t)ALIGN_TO_CL(n)); + unaligned_block_copy_sse2((int64_t * __restrict) dst, (int64_t * 
__restrict) src, ALIGN_TO_CL(n)); + ldst = (void *)((uint64_t)dst + ALIGN_TO_CL(n)); + lsrc = (void *)((uint64_t)src + ALIGN_TO_CL(n)); + n -= ALIGN_TO_CL(n); + } + } + + if (n) { + register unsigned long int dummy; + debug_print("copying 0x%lx bytes with REP MOVSB\n", n); + __asm__ __volatile__("rep movsb\n" + : "=&D"(ldst), "=&S"(lsrc), "=&c"(dummy) + : "0"(ldst), "1"(lsrc), "2"(n) + : "memory"); + } + + return dst; +} + +/** + * memcpy_wrap + * + * @brief Trampoline for memcpy + * @param[in] dst Pointer to the destination memory + * @param[in] max Size in bytes of destination + * @param[in] src Pointer to the source memory + * @param[in] n Size in bytes to copy + * @return dst + * + */ + +#ifdef ENABLE_MEMCPY_ENV_VAR_CHECK +static void *memcpy_wrap(void *dst, size_t max, const void *src, size_t n) { return memcpy(dst, src, n); } +#endif // ENABLE_MEMCPY_ENV_VAR_CHECK + +/** + * memcpy_setup + * Will be called on the first memcpy_s_fast invocation only. + * + * @brief Set up which memcpy routine will be used at runtime + * @param[in] dst Pointer to the destination memory + * @param[in] max Size in bytes of destination + * @param[in] src Pointer to the source memory + * @param[in] n Size in bytes to copy + * @return dst + * + */ + +static void *memcpy_setup(void *dst, size_t max, const void *src, size_t n) { + // Default to SSE2_memcpy + p_memcpy = SSE2_memcpy; + +// +#ifdef ENABLE_MEMCPY_ENV_VAR_CHECK + char *pmemcpy = getenv(USE_MEMCPY_ENV); + + if (pmemcpy) { + if (!strcasecmp(pmemcpy, "libc")) { + p_memcpy = memcpy_wrap; + } else if (!strcasecmp(pmemcpy, "sse2")) { + p_memcpy = SSE2_memcpy; + } else if (!strcasecmp(pmemcpy, "memcpy_s")) { + p_memcpy = (memcpy_fn_t)memcpy_s; + } + } +#endif // #ifdef ENABLE_MEMCPY_ENV_VAR_CHECK + + return p_memcpy(dst, max, src, n); +} diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/memcpy_s_fast.h b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/memcpy_s_fast.h new file mode 100644 index 
0000000..08056d3 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/memcpy_s_fast.h @@ -0,0 +1,69 @@ +/* (C) 1992-2017 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. */ + +#ifndef MEMCPY_S_FAST_H_ +#define MEMCPY_S_FAST_H_ + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// Constants needed in memcpy routines +// Arbitrary crossover point for using SSE2 over rep movsb +#define MIN_SSE2_SIZE 4096 + +// TODO: hidden environment variables to experiment with performance +// in production software are not a good idea in my opinion. Commenting out +// for now but hopefully can remove this code completely in the long term. 
+//#define USE_MEMCPY_ENV "PAC_MEMCPY" + +#define CACHE_LINE_SIZE 64 +#define ALIGN_TO_CL(x) ((uint64_t)(x) & ~(CACHE_LINE_SIZE - 1)) +#define IS_CL_ALIGNED(x) (((uint64_t)(x) & (CACHE_LINE_SIZE - 1)) == 0) + +// Convenience macros +#ifdef DEBUG_MEM +#define debug_print(fmt, ...) \ + do { \ + if (FPGA_DMA_DEBUG) { \ + fprintf(stderr, "%s (%d) : ", __FUNCTION__, __LINE__); \ + fprintf(stderr, fmt, ##__VA_ARGS__); \ + } \ + } while (0) + +#define error_print(fmt, ...) \ + do { \ + fprintf(stderr, "%s (%d) : ", __FUNCTION__, __LINE__); \ + fprintf(stderr, fmt, ##__VA_ARGS__); \ + err_cnt++; \ + } while (0) +#else +#define debug_print(...) +#define error_print(...) +#endif + +typedef void *(*memcpy_fn_t)(void *dst, size_t max, const void *src, size_t len); + +extern memcpy_fn_t p_memcpy; + +#define memcpy_s_fast(a, b, c, d) p_memcpy(a, b, c, d) + +#ifdef __cplusplus +} +#endif // __cplusplus + +#endif // MEMCPY_S_FAST_H diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/mmd_dma.cpp b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/mmd_dma.cpp new file mode 100644 index 0000000..92337a3 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/mmd_dma.cpp @@ -0,0 +1,434 @@ +/* (C) 1992-2017 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. 
See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. */ + +#include <assert.h> +#include <stdlib.h> +#include <string.h> +#include <sys/mman.h> + +#include <safe_string/safe_string.h> +#include "memcpy_s_fast.h" + +#include "ccip_mmd_device.h" +#include "mmd_dma.h" + +using namespace intel_opae_mmd; + +// disable dma and only use mmio. this is very slow. +//#define DISABLE_DMA + +// Each MSGDMA_BBB DFH is now 0x100 instead of 0x2_0000 (it needed to be 0x2_0000 previously because +// the ASE component was within the msgdma_bbb.qsys). +// Original addressing: +// board_afu_dfh: 0x0-0x3f. +// msgdma_bbb_csr: 0x2_0000-0x2_1fff. +// Original range at board.ddr_board.msgdma_bbb: 0x2_0000- 0x2_1fff. +// DFH : 0x0-0x3f. +// ASE.cntl : 0x200-0x207. +// ASE.windowed_slave : 0x1000-0x1fff. 
+// Current addressing (with ASE removed from the msgdma_bbb and now living on its own in ddr_board.qsys): +// From top-level board.qsys (base address 0x0): +// board | dfh : 0x0_0000 - 0x0_003f +// board | ddr_board.ase : 0x1_0000 - 0x1_1fff +// board | ddr_board.msgdma_bbb_0 : 0x2_0000 - 0x2_007f +// board | ddr_board.msgdma_bbb_1 : 0x2_0100 - 0x2_017f +// board | ddr_board.null_dfh : 0x2_0200 - 0x2_023f +// From ase.qsys (base address: 0x1_0000): +// board.ddr_board.ase.dfh_csr : 0x0-0x3f +// board.ddr_board.ase.ASE.cntl : 0x200-0x207 +// board.ddr_board.ase.ASE.windowed_slave : 0x1000-0x1fff +// From msgdma_bbb.qsys inst0 (base address: 0x2_0000) +// board.ddr_board.msgdma_bbb_inst_0.dfh_csr : 0x0-0x3f +// board.ddr_board.msgdma_bbb_inst_0.modular_sgdma_dispatcher.CSR : 0x40-0x5f +// board.ddr_board.msgdma_bbb_inst_0.modular_sgdma_dispatcher.Descriptor_slave : 0x60-0x7f +// From msgdma_bbb.qsys inst1 (base address: 0x2_0100) +// board.ddr_board.msgdma_bbb_inst_1.dfh_csr : 0x0-0x3f +// board.ddr_board.msgdma_bbb_inst_1.modular_sgdma_dispatcher.CSR : 0x40-0x5f +// board.ddr_board.msgdma_bbb_inst_1.modular_sgdma_dispatcher.Descriptor_slave : 0x60-0x7f + +#define MEM_WINDOW_CRTL 0x200 +#define MEM_WINDOW_MEM 0x1000 +#define MEM_WINDOW_SPAN (4 * 1024) +#define MEM_WINDOW_SPAN_MASK ((long)(MEM_WINDOW_SPAN - 1)) +#define MINIMUM_DMA_SIZE 256 +#define DMA_ALIGNMENT 256 + +#ifdef DEBUG_MEM +#define DCP_DEBUG_DMA(...) fprintf(stderr, __VA_ARGS__) +#else +#define DCP_DEBUG_DMA(...) 
+#endif + +mmd_dma::mmd_dma(fpga_handle fpga_handle_arg, + int mmd_handle, + uint64_t dfh_offset_arg, + uint64_t ase_bbb_addr_arg, + int interrupt_num_arg) + : m_initialized(false), + m_dma_op_mutex(), + m_status_handler_fn(NULL), + m_status_handler_user_data(NULL), + m_fpga_handle(fpga_handle_arg), + m_mmd_handle(mmd_handle), + dfh_offset(dfh_offset_arg), + interrupt_num(interrupt_num_arg), + dma_h(NULL), + msgdma_bbb_base_addr(0), + ase_bbb_base_addr(ase_bbb_addr_arg) { +#ifndef DISABLE_DMA + + fpga_result res; + res = fpgaDmaChannelOpen(m_fpga_handle, dfh_offset, interrupt_num, &dma_h); + if (res != FPGA_OK) { + m_dma_work_thread = NULL; + fprintf(stderr, "Error initializing DMA: %s\n", fpgaErrStr(res)); + return; + } +#endif // DISABLE_DMA + + m_dma_work_thread = new dma_work_thread(*this); + if (!m_dma_work_thread->initialized()) { + return; + } + + m_initialized = true; +} + +mmd_dma::~mmd_dma() { + // kill the thread + if (m_dma_work_thread) { + delete m_dma_work_thread; + m_dma_work_thread = NULL; + } + + if (dma_h) { + if (fpgaDmaClose(dma_h) != FPGA_OK) fprintf(stderr, "Error closing DMA\n"); + } + m_initialized = false; +} + +void mmd_dma::reinit_dma() { + if (!m_initialized) return; + + if (dma_h) { + m_initialized = false; + + fpga_result res; + res = fpgaDmaClose(dma_h); + dma_h = NULL; + if (res != FPGA_OK) { + fprintf(stderr, "Error closing DMA\n"); + return; + } + + res = fpgaDmaChannelOpen(m_fpga_handle, dfh_offset, interrupt_num, &dma_h); + if (res != FPGA_OK) { + fprintf(stderr, "Error initializing DMA: %s\n", fpgaErrStr(res)); + return; + } + + m_initialized = true; + } +} + +void mmd_dma::set_status_handler(aocl_mmd_status_handler_fn fn, void *user_data) { + m_status_handler_fn = fn; + m_status_handler_user_data = user_data; +} + +void mmd_dma::event_update_fn(aocl_mmd_op_t op, int status) { + m_status_handler_fn(m_mmd_handle, m_status_handler_user_data, op, status); +} + +fpga_result mmd_dma::do_dma(dma_work_item &item) { + // main dma 
function needs to be thread safe because dma csr operations + // are not thread safe + std::lock_guard<std::mutex> lock(m_dma_op_mutex); + + fpga_result res = FPGA_OK; + assert(item.rd_host_addr != NULL || item.wr_host_addr != NULL); + + // Tell the kernel we'll need these and they're sequential + uint64_t addr = item.rd_host_addr ? (uint64_t)item.rd_host_addr : (uint64_t)item.wr_host_addr; + addr = addr & ~((uint64_t)getpagesize() - 1); // Align to page boundary + size_t remainder = ((size_t)getpagesize() - (addr & getpagesize())) & ~(getpagesize() - 1); + madvise((void *)addr, item.size + remainder, MADV_SEQUENTIAL); + + if (item.rd_host_addr) { + res = read_memory(item.rd_host_addr, item.dev_addr, item.size); + } else { + assert(item.wr_host_addr); + res = write_memory(item.wr_host_addr, item.dev_addr, item.size); + } + + if (item.op) { + // TODO: check what 'status' value should really be. Right now just + // using 0 as was done in previous CCIP MMD. Also handle case if op is NULL + event_update_fn(item.op, 0); + } + + return res; +} + +fpga_result mmd_dma::enqueue_dma(dma_work_item &item) { + return static_cast<fpga_result>(m_dma_work_thread->enqueue_dma(item)); +} + +fpga_result mmd_dma::read_memory(aocl_mmd_op_t op, uint64_t *host_addr, size_t dev_addr, size_t size) { + assert(host_addr); + dma_work_item item; + item.op = op; + item.rd_host_addr = host_addr; + item.wr_host_addr = NULL; + item.dev_addr = dev_addr; + item.size = size; + + return enqueue_dma(item); +} + +fpga_result mmd_dma::write_memory(aocl_mmd_op_t op, const uint64_t *host_addr, size_t dev_addr, size_t size) { + assert(host_addr); + dma_work_item item; + item.op = op; + item.rd_host_addr = NULL; + item.wr_host_addr = host_addr; + item.dev_addr = dev_addr; + item.size = size; + + return enqueue_dma(item); +} + +fpga_result mmd_dma::read_memory(uint64_t *host_addr, size_t dev_addr, size_t size) { + DCP_DEBUG_DMA("DCP DEBUG: read_memory %p %lx %ld\n", host_addr, dev_addr, size); + fpga_result 
res = FPGA_OK; + + // check for alignment + if (dev_addr % DMA_ALIGNMENT != 0) { + // check for mmio alignment + uint64_t mmio_shift = dev_addr % 8; + if (mmio_shift != 0) { + size_t unaligned_size = 8 - mmio_shift; + if (unaligned_size > size) unaligned_size = size; + + read_memory_mmio_unaligned(host_addr, dev_addr, unaligned_size); + + if (size > unaligned_size) + res = read_memory( + (uint64_t *)(((char *)host_addr) + unaligned_size), dev_addr + unaligned_size, size - unaligned_size); + return res; + } + + // TODO: need to do a shift here + return read_memory_mmio(host_addr, dev_addr, size); + } + + // check size + if (size < MINIMUM_DMA_SIZE) return read_memory_mmio(host_addr, dev_addr, size); + + size_t remainder = (size % DMA_ALIGNMENT); + size_t dma_size = size - remainder; + +#ifdef DISABLE_DMA + res = read_memory_mmio(host_addr, dev_addr, dma_size); +#else + res = fpgaDmaTransferSync(dma_h, (uint64_t)host_addr /*dst*/, dev_addr /*src*/, dma_size, FPGA_TO_HOST_MM); +#endif + if (res != FPGA_OK) return res; + + if (remainder) res = read_memory_mmio(host_addr + dma_size / 8, dev_addr + dma_size, remainder); + + if (res != FPGA_OK) return res; + + DCP_DEBUG_DMA("DCP DEBUG: host_addr=%p, dev_addr=%lx, size=%ld\n", host_addr, dev_addr, size); + DCP_DEBUG_DMA("DCP DEBUG: remainder=%ld, dma_size=%ld, size=%ld\n", remainder, dma_size, size); + + DCP_DEBUG_DMA("DCP DEBUG: mmd_dma::read_memory done!\n"); + return FPGA_OK; +} + +fpga_result mmd_dma::read_memory_mmio_unaligned(void *host_addr, size_t dev_addr, size_t size) { + DCP_DEBUG_DMA("DCP DEBUG: read_memory_mmio_unaligned %p %lx %ld\n", host_addr, dev_addr, size); + fpga_result res = FPGA_OK; + + uint64_t shift = dev_addr % 8; + + assert(size + shift <= 8); + + uint64_t cur_mem_page = dev_addr & ~MEM_WINDOW_SPAN_MASK; + res = fpgaWriteMMIO64(m_fpga_handle, 0, ase_bbb_base_addr + MEM_WINDOW_CRTL, cur_mem_page); + if (res != FPGA_OK) return res; + + uint64_t dev_aligned_addr = dev_addr - shift; + + // read data 
from device memory + uint64_t read_tmp; + res = fpgaReadMMIO64( + m_fpga_handle, 0, (ase_bbb_base_addr + MEM_WINDOW_MEM) + ((dev_aligned_addr)&MEM_WINDOW_SPAN_MASK), &read_tmp); + if (res != FPGA_OK) return res; + // overlay our data + memcpy_s_fast(host_addr, size, ((char *)(&read_tmp)) + shift, size); + + return FPGA_OK; +} + +fpga_result mmd_dma::read_memory_mmio(uint64_t *host_addr, size_t dev_addr, size_t size) { + DCP_DEBUG_DMA("DCP DEBUG: read_memory_mmio %p %lx %ld\n", host_addr, dev_addr, size); + + fpga_result res = FPGA_OK; + uint64_t cur_mem_page = dev_addr & ~MEM_WINDOW_SPAN_MASK; + res = fpgaWriteMMIO64(m_fpga_handle, 0, ase_bbb_base_addr + MEM_WINDOW_CRTL, cur_mem_page); + if (res != FPGA_OK) return res; + DCP_DEBUG_DMA("DCP DEBUG: set page %08lx\n", cur_mem_page); + for (size_t i = 0; i < size / 8; i++) { + uint64_t mem_page = dev_addr & ~MEM_WINDOW_SPAN_MASK; + if (mem_page != cur_mem_page) { + cur_mem_page = mem_page; + res = fpgaWriteMMIO64(m_fpga_handle, 0, ase_bbb_base_addr + MEM_WINDOW_CRTL, cur_mem_page); + if (res != FPGA_OK) return res; + DCP_DEBUG_DMA("DCP DEBUG: set page %08lx\n", cur_mem_page); + } + DCP_DEBUG_DMA("DCP DEBUG: read data %8p %08lx %16p\n", host_addr, dev_addr, host_addr); + res = fpgaReadMMIO64( + m_fpga_handle, 0, (ase_bbb_base_addr + MEM_WINDOW_MEM) + (dev_addr & MEM_WINDOW_SPAN_MASK), host_addr); + if (res != FPGA_OK) return res; + + host_addr += 1; + dev_addr += 8; + } + + if (size % 8 != 0) { + res = read_memory_mmio_unaligned(host_addr, dev_addr, size % 8); + if (res != FPGA_OK) return res; + } + + DCP_DEBUG_DMA("DCP DEBUG: mmd_dma::read_memory_mmio done!\n"); + return FPGA_OK; +} + +fpga_result mmd_dma::write_memory(const uint64_t *host_addr, size_t dev_addr, size_t size) { + DCP_DEBUG_DMA("DCP DEBUG: write_memory %p %lx %ld\n", host_addr, dev_addr, size); + fpga_result res = FPGA_OK; + + // check for alignment + if (dev_addr % DMA_ALIGNMENT != 0) { + // check for mmio alignment + uint64_t mmio_shift = dev_addr % 8; 
+ if (mmio_shift != 0) { + size_t unaligned_size = 8 - mmio_shift; + if (unaligned_size > size) unaligned_size = size; + + DCP_DEBUG_DMA("DCP DEBUG: write_memory %ld %ld %ld\n", mmio_shift, unaligned_size, size); + write_memory_mmio_unaligned(host_addr, dev_addr, unaligned_size); + + if (size > unaligned_size) + res = write_memory( + (uint64_t *)(((char *)host_addr) + unaligned_size), dev_addr + unaligned_size, size - unaligned_size); + return res; + } + + // TODO: need to do a shift here + return write_memory_mmio(host_addr, dev_addr, size); + } + + // check size + if (size < MINIMUM_DMA_SIZE) return write_memory_mmio(host_addr, dev_addr, size); + + size_t remainder = (size % DMA_ALIGNMENT); + size_t dma_size = size - remainder; + +// TODO: make switch for MMIO +#ifdef DISABLE_DMA + res = write_memory_mmio(host_addr, dev_addr, dma_size); +#else + res = fpgaDmaTransferSync(dma_h, dev_addr /*dst*/, (uint64_t)host_addr /*src*/, dma_size, HOST_TO_FPGA_MM); +#endif + if (res != FPGA_OK) return res; + + if (remainder) res = write_memory(host_addr + dma_size / 8, dev_addr + dma_size, remainder); + + if (res != FPGA_OK) return res; + + DCP_DEBUG_DMA("DCP DEBUG: host_addr=%p, dev_addr=%lx, size=%ld\n", host_addr, dev_addr, size); + DCP_DEBUG_DMA("DCP DEBUG: remainder=%ld, dma_size=%ld, size=%ld\n", remainder, dma_size, size); + + DCP_DEBUG_DMA("DCP DEBUG: mmd_dma::write_memory done!\n"); + return FPGA_OK; +} + +fpga_result mmd_dma::write_memory_mmio_unaligned(const uint64_t *host_addr, size_t dev_addr, size_t size) { + DCP_DEBUG_DMA("DCP DEBUG: write_memory_mmio_unaligned %p %lx %ld\n", host_addr, dev_addr, size); + fpga_result res = FPGA_OK; + + uint64_t shift = dev_addr % 8; + + assert(size + shift <= 8); + + uint64_t cur_mem_page = dev_addr & ~MEM_WINDOW_SPAN_MASK; + res = fpgaWriteMMIO64(m_fpga_handle, 0, ase_bbb_base_addr + MEM_WINDOW_CRTL, cur_mem_page); + if (res != FPGA_OK) return res; + + uint64_t dev_aligned_addr = dev_addr - shift; + + // read data from device 
memory + uint64_t read_tmp; + res = fpgaReadMMIO64( + m_fpga_handle, 0, (ase_bbb_base_addr + MEM_WINDOW_MEM) + ((dev_aligned_addr)&MEM_WINDOW_SPAN_MASK), &read_tmp); + if (res != FPGA_OK) return res; + // overlay our data + memcpy_s_fast(((char *)(&read_tmp)) + shift, size, host_addr, size); + + // write back to device + res = fpgaWriteMMIO64( + m_fpga_handle, 0, (ase_bbb_base_addr + MEM_WINDOW_MEM) + (dev_aligned_addr & MEM_WINDOW_SPAN_MASK), read_tmp); + if (res != FPGA_OK) return res; + + return FPGA_OK; +} + +fpga_result mmd_dma::write_memory_mmio(const uint64_t *host_addr, size_t dev_addr, size_t size) { + DCP_DEBUG_DMA("DCP DEBUG: write_memory_mmio %p %lx %ld\n", host_addr, dev_addr, size); + + fpga_result res = FPGA_OK; + uint64_t cur_mem_page = dev_addr & ~MEM_WINDOW_SPAN_MASK; + res = fpgaWriteMMIO64(m_fpga_handle, 0, ase_bbb_base_addr + MEM_WINDOW_CRTL, cur_mem_page); + if (res != FPGA_OK) return res; + DCP_DEBUG_DMA("DCP DEBUG: set page %08lx\n", cur_mem_page); + for (size_t i = 0; i < size / 8; i++) { + uint64_t mem_page = dev_addr & ~MEM_WINDOW_SPAN_MASK; + if (mem_page != cur_mem_page) { + cur_mem_page = mem_page; + res = fpgaWriteMMIO64(m_fpga_handle, 0, ase_bbb_base_addr + MEM_WINDOW_CRTL, cur_mem_page); + if (res != FPGA_OK) return res; + DCP_DEBUG_DMA("DCP DEBUG: set page %08lx\n", cur_mem_page); + } + DCP_DEBUG_DMA("DCP DEBUG: write data %8p %08lx %016lx\n", host_addr, dev_addr, *host_addr); + res = fpgaWriteMMIO64( + m_fpga_handle, 0, (ase_bbb_base_addr + MEM_WINDOW_MEM) + (dev_addr & MEM_WINDOW_SPAN_MASK), *host_addr); + if (res != FPGA_OK) return res; + + host_addr += 1; + dev_addr += 8; + } + + if (size % 8 != 0) { + res = write_memory_mmio_unaligned(host_addr, dev_addr, size % 8); + if (res != FPGA_OK) return res; + } + + DCP_DEBUG_DMA("DCP DEBUG: aocl_mmd_write done!\n"); + return FPGA_OK; +} diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/mmd_dma.h 
b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/mmd_dma.h new file mode 100644 index 0000000..ff33aed --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/mmd_dma.h @@ -0,0 +1,97 @@ +/* (C) 1992-2017 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. 
*/ + +#ifndef _MMD_DMA_H +#define _MMD_DMA_H + +#pragma push_macro("_GNU_SOURCE") +#undef _GNU_SOURCE +#define _GNU_SOURCE +#include <sched.h> +#pragma pop_macro("_GNU_SOURCE") + +#include <opae/fpga.h> + +#include <mutex> + +#include "aocl_mmd.h" +#include "dma_work_thread.h" +#include "fpga_dma.h" + +namespace intel_opae_mmd { + +class eventfd_wrapper; + +class mmd_dma final { + public: + mmd_dma(fpga_handle fpga_handle_arg, + int mmd_handle, + uint64_t dfh_offset_arg, + uint64_t ase_bbb_addr_arg, + int interrupt_num_arg); + ~mmd_dma(); + + bool initialized() { return m_initialized; } + + fpga_result read_memory(aocl_mmd_op_t op, uint64_t *host_addr, size_t dev_addr, size_t size); + fpga_result write_memory(aocl_mmd_op_t op, const uint64_t *host_addr, size_t dev_addr, size_t size); + fpga_result do_dma(dma_work_item &item); + + void set_status_handler(aocl_mmd_status_handler_fn fn, void *user_data); + + // used after reconfigation + void reinit_dma(); + + void bind_to_node(void); + + private: + // Helper functions + fpga_result enqueue_dma(dma_work_item &item); + fpga_result read_memory(uint64_t *host_addr, size_t dev_addr, size_t size); + fpga_result write_memory(const uint64_t *host_addr, size_t dev_addr, size_t size); + fpga_result read_memory_mmio(uint64_t *host_addr, size_t dev_addr, size_t size); + fpga_result write_memory_mmio(const uint64_t *host_addr, size_t dev_addr, size_t size); + fpga_result write_memory_mmio_unaligned(const uint64_t *host_addr, size_t dev_addr, size_t size); + fpga_result read_memory_mmio_unaligned(void *host_addr, size_t dev_addr, size_t size); + + void event_update_fn(aocl_mmd_op_t op, int status); + + bool m_initialized; + + dma_work_thread *m_dma_work_thread; + std::mutex m_dma_op_mutex; + + aocl_mmd_status_handler_fn m_status_handler_fn; + void *m_status_handler_user_data; + + fpga_handle m_fpga_handle; + int m_mmd_handle; + + uint64_t dfh_offset; + int interrupt_num; + fpga_dma_handle dma_h; + uint64_t msgdma_bbb_base_addr; + 
uint64_t ase_bbb_base_addr; + + // not used and not implemented + mmd_dma(mmd_dma &other); + mmd_dma &operator=(const mmd_dma &other); +}; // class mmd_dma + +}; // namespace intel_opae_mmd + +#endif // _MMD_DMA_H diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/x86-sse2.S b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/x86-sse2.S new file mode 100644 index 0000000..e1fb5d3 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/x86-sse2.S @@ -0,0 +1,269 @@ +// From TinyMembench v0.4, with slight modifications for Windows. +/* + * Copyright © 2011 Siarhei Siamashka <siarhei.siamashka@gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#if defined(__i386__) || defined(__amd64__) + +.intel_syntax noprefix +.text + +#define PREFETCH_DISTANCE 256 + +.macro asm_function_helper function_name + .global \function_name +.func \function_name +\function_name: +#ifdef __amd64__ + #ifdef _WIN64 + .set DST, rcx + .set SRC, rdx + .set SIZE, r8 + #else + .set DST, rdi + .set SRC, rsi + .set SIZE, rdx + #endif +#else + mov eax, [esp + 4] + mov ecx, [esp + 8] + mov edx, [esp + 12] + .set DST, eax + .set SRC, ecx + .set SIZE, edx +#endif +.endm + +.macro asm_function function_name +#if defined(_WIN32) && !defined(_WIN64) + asm_function_helper _\function_name +#else + asm_function_helper \function_name +#endif +.endm + +.macro push3 a, b, c + push \a + push \b + push \c +.endm + +.macro pop3 a, b, c + pop \c + pop \b + pop \a +.endm + +/*****************************************************************************/ + +asm_function aligned_block_copy_movsb +0: +#ifdef __amd64__ + push3 rdi rsi rcx + push3 DST SRC SIZE + pop3 rdi rsi rcx + rep movsb + pop3 rdi rsi rcx +#else + push3 edi esi ecx + push3 DST SRC SIZE + pop3 edi esi ecx + rep movsb + pop3 edi esi ecx +#endif + ret +.endfunc + +asm_function aligned_block_copy_movsd +0: +#ifdef __amd64__ + push3 rdi rsi rcx + push3 DST SRC SIZE + pop3 rdi rsi rcx + sar rcx, 2 + rep movsd + pop3 rdi rsi rcx +#else + push3 edi esi ecx + push3 DST SRC SIZE + pop3 edi esi ecx + sar ecx, 2 + rep movsd + pop3 edi esi ecx +#endif + ret +.endfunc + +asm_function unaligned_block_copy_sse2 +0: + movdqu xmm0, [SRC + 0] + movdqu xmm1, [SRC + 16] + movdqu xmm2, [SRC + 32] + movdqu xmm3, [SRC + 48] + movdqu [DST + 0], xmm0 + movdqu [DST + 16], xmm1 + movdqu [DST + 32], xmm2 + movdqu [DST + 48], xmm3 + add SRC, 64 + add DST, 64 + sub SIZE, 64 + jg 0b + ret +.endfunc + +asm_function aligned_block_copy_sse2 +0: + movdqa xmm0, [SRC + 0] + movdqa xmm1, [SRC + 16] + movdqa xmm2, [SRC + 32] + movdqa xmm3, [SRC + 48] + movdqa [DST + 0], xmm0 + movdqa [DST + 16], xmm1 + movdqa [DST + 
32], xmm2 + movdqa [DST + 48], xmm3 + add SRC, 64 + add DST, 64 + sub SIZE, 64 + jg 0b + ret +.endfunc + +asm_function aligned_block_copy_nt_sse2 +0: + movdqa xmm0, [SRC + 0] + movdqa xmm1, [SRC + 16] + movdqa xmm2, [SRC + 32] + movdqa xmm3, [SRC + 48] + movntdq [DST + 0], xmm0 + movntdq [DST + 16], xmm1 + movntdq [DST + 32], xmm2 + movntdq [DST + 48], xmm3 + add SRC, 64 + add DST, 64 + sub SIZE, 64 + jg 0b + ret +.endfunc + +asm_function aligned_block_copy_pf32_sse2 +0: + prefetchnta [SRC + PREFETCH_DISTANCE] + prefetchnta [SRC + PREFETCH_DISTANCE + 32] + movdqa xmm0, [SRC + 0] + movdqa xmm1, [SRC + 16] + movdqa xmm2, [SRC + 32] + movdqa xmm3, [SRC + 48] + movdqa [DST + 0], xmm0 + movdqa [DST + 16], xmm1 + movdqa [DST + 32], xmm2 + movdqa [DST + 48], xmm3 + add SRC, 64 + add DST, 64 + sub SIZE, 64 + jg 0b + ret +.endfunc + +asm_function aligned_block_copy_nt_pf32_sse2 +0: + prefetchnta [SRC + PREFETCH_DISTANCE] + prefetchnta [SRC + PREFETCH_DISTANCE + 32] + movdqa xmm0, [SRC + 0] + movdqa xmm1, [SRC + 16] + movdqa xmm2, [SRC + 32] + movdqa xmm3, [SRC + 48] + movntdq [DST + 0], xmm0 + movntdq [DST + 16], xmm1 + movntdq [DST + 32], xmm2 + movntdq [DST + 48], xmm3 + add SRC, 64 + add DST, 64 + sub SIZE, 64 + jg 0b + ret +.endfunc + +asm_function aligned_block_copy_pf64_sse2 +0: + prefetchnta [SRC + PREFETCH_DISTANCE] + movdqa xmm0, [SRC + 0] + movdqa xmm1, [SRC + 16] + movdqa xmm2, [SRC + 32] + movdqa xmm3, [SRC + 48] + movdqa [DST + 0], xmm0 + movdqa [DST + 16], xmm1 + movdqa [DST + 32], xmm2 + movdqa [DST + 48], xmm3 + add SRC, 64 + add DST, 64 + sub SIZE, 64 + jg 0b + ret +.endfunc + +asm_function aligned_block_copy_nt_pf64_sse2 +0: + prefetchnta [SRC + PREFETCH_DISTANCE] + movdqa xmm0, [SRC + 0] + movdqa xmm1, [SRC + 16] + movdqa xmm2, [SRC + 32] + movdqa xmm3, [SRC + 48] + movntdq [DST + 0], xmm0 + movntdq [DST + 16], xmm1 + movntdq [DST + 32], xmm2 + movntdq [DST + 48], xmm3 + add SRC, 64 + add DST, 64 + sub SIZE, 64 + jg 0b + ret +.endfunc + +asm_function 
aligned_block_fill_sse2 + movdqa xmm0, [SRC + 0] +0: + movdqa [DST + 0], xmm0 + movdqa [DST + 16], xmm0 + movdqa [DST + 32], xmm0 + movdqa [DST + 48], xmm0 + add DST, 64 + sub SIZE, 64 + jg 0b + ret +.endfunc + +asm_function aligned_block_fill_nt_sse2 + movdqa xmm0, [SRC + 0] +0: + movntdq [DST + 0], xmm0 + movntdq [DST + 16], xmm0 + movntdq [DST + 32], xmm0 + movntdq [DST + 48], xmm0 + add DST, 64 + sub SIZE, 64 + jg 0b + ret +.endfunc + +/*****************************************************************************/ + +#endif diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/x86-sse2.h b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/x86-sse2.h new file mode 100644 index 0000000..6ebe2ef --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/x86-sse2.h @@ -0,0 +1,54 @@ +// From TinyMembench v0.4, with slight modifications for Windows. +/* + * Copyright © 2011 Siarhei Siamashka <siarhei.siamashka@gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#ifndef __X86_SSE2_H__ +#define __X86_SSE2_H__ + +#include <stdint.h> + +#ifdef __cplusplus +extern "C" { +#endif + +void aligned_block_copy_movsb(int64_t* __restrict dst, int64_t* __restrict src, int size); +void aligned_block_copy_movsd(int64_t* __restrict dst, int64_t* __restrict src, int size); + +void aligned_block_copy_sse2(int64_t* __restrict dst, int64_t* __restrict src, int size); +void unaligned_block_copy_sse2(int64_t* __restrict dst, int64_t* __restrict src, int size); +void aligned_block_copy_nt_sse2(int64_t* __restrict dst, int64_t* __restrict src, int size); + +void aligned_block_copy_pf32_sse2(int64_t* __restrict dst, int64_t* __restrict src, int size); +void aligned_block_copy_pf64_sse2(int64_t* __restrict dst, int64_t* __restrict src, int size); + +void aligned_block_copy_nt_pf32_sse2(int64_t* __restrict dst, int64_t* __restrict src, int size); +void aligned_block_copy_nt_pf64_sse2(int64_t* __restrict dst, int64_t* __restrict src, int size); + +void aligned_block_fill_sse2(int64_t* __restrict dst, int64_t* __restrict src, int size); + +void aligned_block_fill_nt_sse2(int64_t* __restrict dst, int64_t* __restrict src, int size); + +#ifdef __cplusplus +} +#endif +#endif diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/include/aocl_mmd.h b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/include/aocl_mmd.h new file mode 100644 index 0000000..edb46c7 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/include/aocl_mmd.h @@ -0,0 +1,489 @@ +#ifndef AOCL_MMD_H +#define AOCL_MMD_H + +/* (C) 1992-2019 Intel Corporation. 
*/ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* Support for memory mapped ACL devices. + * + * Typical API lifecycle, from the perspective of the caller. + * + * 1. aocl_mmd_open must be called first, to provide a handle for further + * operations. + * + * 2. The interrupt and status handlers must be set. + * + * 3. Read and write operations are performed. + * + * 4. aocl_mmd_close may be called to shut down the device. No further + * operations are permitted until a subsequent aocl_mmd_open call. + * + * aocl_mmd_get_offline_info can be called anytime including before + * open. aocl_mmd_get_info can be called anytime between open and close. 
+ */ + +#ifndef AOCL_MMD_CALL +#if defined(_WIN32) +#define AOCL_MMD_CALL __declspec(dllimport) +#else +#define AOCL_MMD_CALL __attribute__((visibility ("default"))) +#endif +#endif + +#ifndef WEAK +#if defined(_WIN32) +#define WEAK +#else +/* This normally comes with "__attribute__((weak))" but for reasons not presently + * understood, the shared library is not properly loaded on Ubuntu18 when the functions + * are weak. + */ +#define WEAK +#endif +#endif + +#include <cstddef> //size_t + +/* The MMD API's version - the runtime expects this string when + * AOCL_MMD_VERSION is queried. This changes only if the API has changed */ +#define AOCL_MMD_VERSION_STRING "18.1" + +/* Memory types that can be supported - bitfield. Other than physical memory + * these types closely align with the OpenCL SVM types. + * + * AOCL_MMD_PHYSICAL_MEMORY - The vendor interface includes IP to communicate + * directly with physical memory such as DDR, QDR, etc. + * + * AOCL_MMD_SVM_COARSE_GRAIN_BUFFER - The vendor interface includes support for + * caching SVM pointer data andy requires explicit function calls from the user + * to sychronize the cache between the host processor and the FPGA. This level + * of SVM is not currently supported by Altera except as a subset of + * SVM_FINE_GAIN_SYSTEM support. + * + * AOCL_MMD_SVM_FINE_GRAIN_BUFFER - The vendor interface includes support for + * caching SVM pointer data and requires additional information from the user + * and/or host runtime that can be collected during pointer allocation in order + * to sychronize the cache between the host processor and the FPGA. Once this + * additional data is provided for an SVM pointer, the vendor interface handles + * cache synchronization between the host processor & the FPGA automatically. + * This level of SVM is not currently supported by Altera except as a subset + * of SVM_FINE_GRAIN_SYSTEM support. 
+ * + * AOCL_MMD_SVM_FINE_GRAIN_SYSTEM - The vendor interface includes support for + * caching SVM pointer data and does not require any additional information to + * sychronize the cache between the host processor and the FPGA. The vendor + * interface handles cache synchronization between the host processor & the + * FPGA automatically for all SVM pointers. This level of SVM support is + * currently under development by Altera and some features may not be fully + * supported. + */ +#define AOCL_MMD_PHYSICAL_MEMORY (1 << 0) +#define AOCL_MMD_SVM_COARSE_GRAIN_BUFFER (1 << 1) +#define AOCL_MMD_SVM_FINE_GRAIN_BUFFER (1 << 2) +#define AOCL_MMD_SVM_FINE_GRAIN_SYSTEM (1 << 3) + +/* program modes - bitfield + * + * AOCL_MMD_PROGRAM_PRESERVE_GLOBAL_MEM - preserve contents of global memory + * when this bit is is set to 1. If programming can't occur without preserving + * global memory contents, the program function must fail, in which case the + * runtime may re-invoke program with this bit set to 0, allowing programming + * to occur even if doing so destroys global memory contents. + * + * more modes are reserved for stacking on in the future + */ +#define AOCL_MMD_PROGRAM_PRESERVE_GLOBAL_MEM (1 << 0) +typedef int aocl_mmd_program_mode_t; + +typedef void* aocl_mmd_op_t; + +typedef struct { + unsigned lo; /* 32 least significant bits of time value. */ + unsigned hi; /* 32 most significant bits of time value. */ +} aocl_mmd_timestamp_t; + +/* Defines the set of characteristics that can be probed about the board before + * opening a device. The type of data returned by each is specified in + * parentheses in the adjacent comment. + * + * AOCL_MMD_NUM_BOARDS and AOCL_MMD_BOARD_NAMES + * These two fields can be used to implement multi-device support. The MMD + * layer may have a list of devices it is capable of interacting with, each + * identified with a unique name. 
The length of the list should be returned + * in AOCL_MMD_NUM_BOARDS, and the names of these devices returned in + * AOCL_MMD_BOARD_NAMES. The OpenCL runtime will try to call aocl_mmd_open + * for each board name returned in AOCL_MMD_BOARD_NAMES. + * + * */ +typedef enum { + AOCL_MMD_VERSION = 0, /* Version of MMD (char*)*/ + AOCL_MMD_NUM_BOARDS = 1, /* Number of candidate boards (int)*/ + AOCL_MMD_BOARD_NAMES = 2, /* Names of boards available delimiter=; (char*)*/ + AOCL_MMD_VENDOR_NAME = 3, /* Name of vendor (char*) */ + AOCL_MMD_VENDOR_ID = 4, /* An integer ID for the vendor (int) */ + AOCL_MMD_USES_YIELD = 5, /* 1 if yield must be called to poll hw (int) */ + /* The following can be combined in a bit field: + * AOCL_MMD_PHYSICAL_MEMORY, AOCL_MMD_SVM_COARSE_GRAIN_BUFFER, AOCL_MMD_SVM_FINE_GRAIN_BUFFER, + * AOCL_MMD_SVM_FINE_GRAIN_SYSTEM. Prior to 14.1, all existing devices supported physical memory and no types of SVM + * memory, so this is the default when this operation returns '0' for board MMDs with a version prior to 14.1 + */ + AOCL_MMD_MEM_TYPES_SUPPORTED = 6, +} aocl_mmd_offline_info_t; + +/* Defines the set of characteristics that can be probed about the board after + * opening a device. This can involve communication to the device + * + * AOCL_MMD_NUM_KERNEL_INTERFACES - The number of kernel interfaces, usually 1 + * + * AOCL_MMD_KERNEL_INTERFACES - the handle for each kernel interface. + * param_value will have size AOCL_MMD_NUM_KERNEL_INTERFACES * sizeof int + * + * AOCL_MMD_PLL_INTERFACES - the handle for each pll associated with each + * kernel interface. 
If a kernel interface is not clocked by acl_kernel_clk + * then return -1 + * + * */ +typedef enum { + AOCL_MMD_NUM_KERNEL_INTERFACES = 1, /* Number of Kernel interfaces (int) */ + AOCL_MMD_KERNEL_INTERFACES = 2, /* Kernel interface (int*) */ + AOCL_MMD_PLL_INTERFACES = 3, /* Kernel clk handles (int*) */ + AOCL_MMD_MEMORY_INTERFACE = 4, /* Global memory handle (int) */ + AOCL_MMD_TEMPERATURE = 5, /* Temperature measurement (float) */ + AOCL_MMD_PCIE_INFO = 6, /* PCIe information (char*) */ + AOCL_MMD_BOARD_NAME = 7, /* Name of board (char*) */ + AOCL_MMD_BOARD_UNIQUE_ID = 8, /* Unique ID of board (int) */ + AOCL_MMD_CONCURRENT_READS = 9, /* # of parallel reads; 1 is serial*/ + AOCL_MMD_CONCURRENT_WRITES = 10, /* # of parallel writes; 1 is serial*/ + AOCL_MMD_CONCURRENT_READS_OR_WRITES = 11 /* total # of concurent operations read + writes*/ +} aocl_mmd_info_t; + +typedef struct { + unsigned long long int exception_type; + void* user_private_info; + size_t user_cb; +} aocl_mmd_interrupt_info; + +typedef void (*aocl_mmd_interrupt_handler_fn)(int handle, void* user_data); +typedef void (*aocl_mmd_device_interrupt_handler_fn)(int handle, aocl_mmd_interrupt_info* data_in, void* user_data); +typedef void (*aocl_mmd_status_handler_fn)(int handle, void* user_data, aocl_mmd_op_t op, int status); + +/* Get information about the board using the enum aocl_mmd_offline_info_t for + * offline info (called without a handle), and the enum aocl_mmd_info_t for + * info specific to a certain board. + * Arguments: + * + * requested_info_id - a value from the aocl_mmd_offline_info_t enum + * + * param_value_size - size of the param_value field in bytes. This should + * match the size of the return type expected as indicated in the enum + * definition. For example, the AOCL_MMD_TEMPERATURE returns a float, so + * the param_value_size should be set to sizeof(float) and you should + * expect the same number of bytes returned in param_size_ret. 
+ * + * param_value - pointer to the variable that will receive the returned info + * + * param_size_ret - receives the number of bytes of data actually returned + * + * Returns: a negative value to indicate error. + */ +AOCL_MMD_CALL int aocl_mmd_get_offline_info(aocl_mmd_offline_info_t requested_info_id, + size_t param_value_size, + void* param_value, + size_t* param_size_ret) WEAK; + +AOCL_MMD_CALL int aocl_mmd_get_info(int handle, + aocl_mmd_info_t requested_info_id, + size_t param_value_size, + void* param_value, + size_t* param_size_ret) WEAK; + +/* Open and initialize the named device. + * + * The name is typically one specified by the AOCL_MMD_BOARD_NAMES offline + * info. + * + * Arguments: + * name - open the board with this name (provided as a C-style string, + * i.e. NUL terminated ASCII.) + * + * Returns: the non-negative integer handle for the board, otherwise a + * negative value to indicate error. Upon receiving the error, the OpenCL + * runtime will proceed to open other known devices, hence the MMD mustn't + * exit the application if an open call fails. + */ +AOCL_MMD_CALL int aocl_mmd_open(const char* name) WEAK; + +/* Close an opened device, by its handle. + * Returns: 0 on success, negative values on error. + */ +AOCL_MMD_CALL int aocl_mmd_close(int handle) WEAK; + +/* Set the interrupt handler for the opened device. + * The interrupt handler is called whenever the client needs to be notified + * of an asynchronous event signalled by the device internals. + * For example, the kernel has completed or is stalled. + * + * Important: Interrupts from the kernel must be ignored until this handler is + * set + * + * Arguments: + * fn - the callback function to invoke when a kernel interrupt occurs + * user_data - the data that should be passed to fn when it is called. 
+ * + * Returns: 0 if successful, negative on error + */ +AOCL_MMD_CALL int aocl_mmd_set_interrupt_handler(int handle, aocl_mmd_interrupt_handler_fn fn, void* user_data) WEAK; + +/* Set the device interrupt handler for the opened device. + * The device interrupt handler is called whenever the client needs to be notified + * of a device event signalled by the device internals. + * For example, an ECC error has been reported. + * + * Important: Interrupts from the device must be ignored until this handler is + * set + * + * Arguments: + * fn - the callback function to invoke when a device interrupt occurs + * user_data - the data that should be passed to fn when it is called. + * + * Returns: 0 if successful, negative on error + */ +AOCL_MMD_CALL int aocl_mmd_set_device_interrupt_handler(int handle, + aocl_mmd_device_interrupt_handler_fn fn, + void* user_data) WEAK; + +/* Set the operation status handler for the opened device. + * The operation status handler is called with + * status 0 when the operation has completed successfully. + * status negative when the operation completed with errors. + * + * Arguments: + * fn - the callback function to invoke when a status update is to be + * performed. + * user_data - the data that should be passed to fn when it is called. + * + * Returns: 0 if successful, negative on error + */ +AOCL_MMD_CALL int aocl_mmd_set_status_handler(int handle, aocl_mmd_status_handler_fn fn, void* user_data) WEAK; + +/* If AOCL_MMD_USES_YIELD is 1, this function is called when the host is idle + * and hence possibly waiting for events to be processed by the device. + * If AOCL_MMD_USES_YIELD is 0, this function is never called and the MMD is + * assumed to provide status/event updates via some other execution thread + * such as through an interrupt handler. 
+ * + * Returns: non-zero if the yield function performed useful work such as + * processing DMA transactions, 0 if there is no useful work to be performed + * + * NOTE: yield may be called continuously as long as it reports that it has useful work + */ +AOCL_MMD_CALL int aocl_mmd_yield(int handle) WEAK; + +/* Read, write and copy operations on a single interface. + * If op is NULL + * - Then these calls must block until the operation is complete. + * - The status handler is not called for this operation. + * + * If op is non-NULL, then: + * - These may be non-blocking calls + * - The status handler must be called upon completion, with status 0 + * for success, and a negative value for failure. + * + * Arguments: + * op - the operation object used to track this operations progress + * + * len - the size in bytes to transfer + * + * src - the host buffer being read from + * + * dst - the host buffer being written to + * + * mmd_interface - the handle to the interface being accessed. E.g. To + * access global memory this handle will be whatever is returned by + * aocl_mmd_get_info when called with AOCL_MMD_MEMORY_INTERFACE. + * + * offset/src_offset/dst_offset - the byte offset within the interface that + * the transfer will begin at. + * + * The return value is 0 if the operation launch was successful, and + * negative otherwise. + */ +AOCL_MMD_CALL int aocl_mmd_read( + int handle, aocl_mmd_op_t op, size_t len, void* dst, int mmd_interface, size_t offset) WEAK; +AOCL_MMD_CALL int aocl_mmd_write( + int handle, aocl_mmd_op_t op, size_t len, const void* src, int mmd_interface, size_t offset) WEAK; +AOCL_MMD_CALL int aocl_mmd_copy( + int handle, aocl_mmd_op_t op, size_t len, int mmd_interface, size_t src_offset, size_t dst_offset) WEAK; + +/* Host Channel create operation + * Opens channel between host and kernel. + * + * Arguments: + * channel_name - name of channel to initialize. 
Same name as used in board_spec.xml + * + * queue_depth - the size in bytes of pinned memory queue in system memory + * + * direction - the direction of the channel + * + * The return value is negative if initialization was unsuccessful, and + * positive otherwise. Positive return value is handle to the channel to be used for + * subsequent calls for the channel. + */ +AOCL_MMD_CALL int aocl_mmd_hostchannel_create(int handle, char* channel_name, size_t queue_depth, int direction) WEAK; + +/* Host Channel destroy operation + * Closes channel between host and kernel. + * + * Arguments: + * channel - the handle to the channel to close, that was obtained with + * create channel + * + * The return value is 0 if the destroy was successful, and negative + * otherwise. + */ +AOCL_MMD_CALL int aocl_mmd_hostchannel_destroy(int handle, int channel) WEAK; + +/* Host Channel get buffer operation + * Provide host with pointer to buffer they can access to to write or + * read from kernel, along with space or data available in the buffer + * in bytes. + * + * Arguments: + * channel - the handle to the channel to get the buffer for + * + * buffer_size - the address that this call will write the amount of + * space or data that's available in the buffer, + * depending on direction of the channel, in bytes + * + * status - the address that this call will write to for result of this + * call. Value will be 0 for success, and negative otherwise + * + * The return value is the pointer to the buffer that host can write + * to or read from. NULL if the status is negative. + */ +AOCL_MMD_CALL void* aocl_mmd_hostchannel_get_buffer(int handle, int channel, size_t* buffer_size, int* status) WEAK; + +/* Host Channel acknowledge buffer operation + * Acknowledge to the channel that the user has written or read data from + * it. This will make the data or additional buffer space available to + * write to or read from kernel. 
+ * + * Arguments: + * channel - the handle to the channel that user is acknowledging + * + * send_size - the size in bytes that the user is acknowledging + * + * status - the address that this call will write to for result of this + * call. Value will be 0 for success, and negative otherwise + * + * The return value is equal to send_size if send_size was less than or + * equal to the buffer_size from get buffer call. If send_size was + * greater, then return value is the amount that was actually sent. + */ +AOCL_MMD_CALL size_t aocl_mmd_hostchannel_ack_buffer(int handle, int channel, size_t send_size, int* status) WEAK; + +/* Program the device + * + * The host will guarantee that no operations are currently executing on the + * device. That means the kernels will be idle and no read/write/copy + * commands are active. Interrupts should be disabled and the FPGA should + * be reprogrammed with the data from user_data which has size size. The host + * will then call aocl_mmd_set_status_handler and aocl_mmd_set_interrupt_handler + * again. At this point interrupts can be enabled. + * + * The new handle to the board after reprogram does not have to be the same as + * the one before. + * + * Arguments: + * user_data - The binary contents of the fpga.bin file created during + * Quartus II compilation. + * size - the size in bytes of user_data + * program_mode - bit field for programming attributes. See + * aocl_mmd_program_mode_t definition + * + * Returns: the new non-negative integer handle for the board, otherwise a + * negative value to indicate error. + */ +AOCL_MMD_CALL int aocl_mmd_program(int handle, void* user_data, size_t size, aocl_mmd_program_mode_t program_mode) WEAK; + +/* Shared memory allocator + * Allocates memory that is shared between the host and the FPGA. The + * host will access this memory using the pointer returned by + * aocl_mmd_shared_mem_alloc, while the FPGA will access the shared memory + * using device_ptr_out. 
If shared memory is not supported this should return + * NULL. + * + * Shared memory survives FPGA reprogramming if the CPU is not rebooted. + * + * Arguments: + * size - the size of the shared memory to allocate + * device_ptr_out - will receive the pointer value used by the FPGA (the device) + * to access the shared memory. Cannot be NULL. The type is + * unsigned long long to handle the case where the host has a + * smaller pointer size than the device. + * + * Returns: The pointer value to be used by the host to access the shared + * memory if successful, otherwise NULL. + */ +AOCL_MMD_CALL void* aocl_mmd_shared_mem_alloc(int handle, size_t size, unsigned long long* device_ptr_out) WEAK; + +/* Shared memory de-allocator + * Frees previously allocated shared memory. If shared memory is not supported, + * this function should do nothing. + * + * Arguments: + * host_ptr - the host pointer that points to the shared memory, as returned by + * aocl_mmd_shared_mem_alloc + * size - the size of the shared memory to free. Must match the size + * originally passed to aocl_mmd_shared_mem_alloc + */ +AOCL_MMD_CALL void aocl_mmd_shared_mem_free(int handle, void* host_ptr, size_t size) WEAK; + +/* DEPRECATED. Use aocl_mmd_program instead + * This reprogram API is only for mmd version previous than 18.1 + */ +AOCL_MMD_CALL int aocl_mmd_reprogram(int handle, void* user_data, size_t size) WEAK; + +// CoreDLA modifications +// To support multiple different FPGA boards, anything board specific must be implemented in a +// board-specific MMD instead of the CoreDLA runtime layer. 
+#ifdef DLA_MMD +#include <cstdint> +// Query functions to get board-specific values +AOCL_MMD_CALL int dla_mmd_get_max_num_instances() WEAK; +AOCL_MMD_CALL uint64_t dla_mmd_get_ddr_size_per_instance() WEAK; +AOCL_MMD_CALL double dla_mmd_get_ddr_clock_freq() WEAK; + +// Wrappers around CSR and DDR reads and writes to abstract away board-specific offsets +AOCL_MMD_CALL int dla_mmd_csr_write(int handle, int instance, uint64_t addr, const uint32_t* data) WEAK; +AOCL_MMD_CALL int dla_mmd_csr_read(int handle, int instance, uint64_t addr, uint32_t* data) WEAK; +AOCL_MMD_CALL int dla_mmd_ddr_write(int handle, int instance, uint64_t addr, uint64_t length, const void* data) WEAK; +AOCL_MMD_CALL int dla_mmd_ddr_read(int handle, int instance, uint64_t addr, uint64_t length, void* data) WEAK; + +// Get the clk_dla PLL clock frequency in MHz, returns a negative value if there is an error +AOCL_MMD_CALL double dla_mmd_get_coredla_clock_freq(int handle) WEAK; +#endif + +#ifdef __cplusplus +} +#endif + +#endif |
