| author | Eric Dao <eric@erickhangdao.com> | 2025-03-10 17:54:31 -0400 |
|---|---|---|
| committer | Eric Dao <eric@erickhangdao.com> | 2025-03-10 17:54:31 -0400 |
| commit | ab224e2e6ba65f5a369ec392f99cd8845ad06c98 (patch) | |
| tree | a1e757e9341863ed52b8ad4c5a1c45933aab9da4 /python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie | |
| parent | 40da1752f2c8639186b72f6838aa415e854d0b1d (diff) | |
Diffstat (limited to 'python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie')
13 files changed, 3137 insertions, 0 deletions
diff --git a/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/CMakeLists.txt b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/CMakeLists.txt new file mode 100644 index 0000000..445a304 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/CMakeLists.txt @@ -0,0 +1,62 @@ +# (C) 2017 Intel Corporation. All rights reserved. +# Your use of Intel Corporation's design tools, logic functions and other +# software and tools, and its AMPP partner logic functions, and any output +# files any of the foregoing (including device programming or simulation +# files), and any associated documentation or information are expressly subject +# to the terms and conditions of the Intel Program License Subscription +# Agreement, Intel MegaCore Function License Agreement, or other applicable +# license agreement, including, without limitation, that your use is for the +# sole purpose of programming logic devices manufactured by Intel and sold by +# Intel or its authorized distributors. Please refer to the applicable +# agreement for further details. + +cmake_minimum_required(VERSION 2.8.12) +project(mmd) + +add_definitions(-DI_DK_AFU_ID="11446C9D-AA42-4085-9B3D-4EEF9429A4AD") + +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules") + +find_package(OPAE REQUIRED) +find_package(NUMA REQUIRED) + +# DLA specific modifications made to the MMD +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDLA_MMD") + +enable_language(C ASM) + +set(ASM_OPTIONS "-x assembler-with-cpp") +if(${CMAKE_C_COMPILER_ID} STREQUAL "Clang") + set(ASM_OPTIONS "${ASM_OPTIONS} -no-integrated-as") +endif() + +set(CMAKE_ASM_FLAGS "${CFLAGS} ${ASM_OPTIONS}") + +set(MMD_SRC + ./host/mmd.cpp + ./host/mmd_device.cpp + ./host/mmd_dma.cpp + ./host/mmd_helper.cpp + ./host/kernel_interrupt.cpp +) + +# Add a shared library target called intel_opae_mmd +# and build it from the MMD_SRC files +add_library(intel_opae_mmd SHARED ${MMD_SRC}) + +# Specify the include directories to be used when compiling intel_opae_mmd library +target_include_directories(intel_opae_mmd PUBLIC + ${CMAKE_CURRENT_SOURCE_DIR}/include + ) + +# Specify libraries needed when linking the intel_opae_mmd library +target_link_libraries(intel_opae_mmd + libopae-c + libnuma +) + +# Set the installation rules for the project +install(TARGETS intel_opae_mmd + LIBRARY DESTINATION lib + COMPONENT intel_opae_mmd +) diff --git a/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/cmake/modules/FindNUMA.cmake b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/cmake/modules/FindNUMA.cmake new file mode 100755 index 0000000..c981150 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/cmake/modules/FindNUMA.cmake @@ -0,0 +1,34 @@ +# - Try to find libnuma +# Once done will define: +# +# NUMA_FOUND - system has libnuma +# NUMA_INCLUDE_DIRS - include directory with numa.h +# NUMA_LIBRARIES - link with this for libnuma + +find_path(NUMA_INCLUDE_DIRS + NAMES numa.h + PATHS + ${LIBNUMA_ROOT}/include + /usr/include + /p/psg/swip/dla/resources/numactl/2.0.16/include + + ) + +find_library(NUMA_LIBRARIES + NAMES numa + PATHS + ${LIBNUMA_ROOT}/lib + ${LIBNUMA_ROOT}/lib64 + /usr/lib + /usr/lib64 + /p/psg/swip/dla/resources/numactl/2.0.16/lib + + ) + +FIND_PACKAGE_HANDLE_STANDARD_ARGS(NUMA + REQUIRED_VARS NUMA_INCLUDE_DIRS NUMA_LIBRARIES) + +add_library(libnuma IMPORTED SHARED) +set_target_properties(libnuma PROPERTIES + IMPORTED_LOCATION ${NUMA_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES ${NUMA_INCLUDE_DIRS}) diff --git 
a/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/cmake/modules/FindOPAE.cmake b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/cmake/modules/FindOPAE.cmake new file mode 100755 index 0000000..6395d7c --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/cmake/modules/FindOPAE.cmake @@ -0,0 +1,44 @@ +# - Try to find libintelfpga +# Once done, this will define +# +# libopae-c_FOUND - system has libopae-c +# libopae-c_INCLUDE_DIRS - the libopae-c include directories +# libopae-c_LIBRARIES - link these to use libopae-c + +find_package(PkgConfig) +pkg_check_modules(PC_OPAE QUIET opae-c) + +# Use pkg-config to get hints about paths +execute_process(COMMAND pkg-config --cflags opae-c --silence-errors + COMMAND cut -d I -f 2 + OUTPUT_VARIABLE OPAE-C_PKG_CONFIG_INCLUDE_DIRS) +set(OPAE-C_PKG_CONFIG_INCLUDE_DIRS "${OPAE-C_PKG_CONFIG_INCLUDE_DIRS}" CACHE STRING "Compiler flags for OPAE-C library") + +# Include dir +find_path(libopae-c_INCLUDE_DIRS + NAMES opae/fpga.h + PATHS ${LIBOPAE-C_ROOT}/include + ${OPAE-C_PKG_CONFIG_INCLUDE_DIRS} + /usr/local/include + /usr/include + ${CMAKE_EXTRA_INCLUDES}) + +# The library itself +find_library(libopae-c_LIBRARIES + NAMES opae-c + PATHS ${LIBOPAE-C_ROOT}/lib + ${LIBOPAE-C_ROOT}/lib64 + /usr/local/lib + /usr/lib + /lib + /usr/lib/x86_64-linux-gnu + ${CMAKE_EXTRA_LIBS}) + +FIND_PACKAGE_HANDLE_STANDARD_ARGS(OPAE + REQUIRED_VARS libopae-c_LIBRARIES libopae-c_INCLUDE_DIRS) + +add_library(libopae-c IMPORTED SHARED) +set_target_properties(libopae-c PROPERTIES + IMPORTED_LOCATION ${libopae-c_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES ${libopae-c_INCLUDE_DIRS}) + diff --git a/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/kernel_interrupt.cpp b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/kernel_interrupt.cpp new file mode 100644 index 0000000..97882d4 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/kernel_interrupt.cpp @@ -0,0 +1,257 @@ +// (c) 1992-2024 Intel Corporation. +// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words +// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. +// and/or other countries. Other marks and brands may be claimed as the property +// of others. See Trademarks on intel.com for full list of Intel trademarks or +// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) +// Your use of Intel Corporation's design tools, logic functions and other +// software and tools, and its AMPP partner logic functions, and any output +// files any of the foregoing (including device programming or simulation +// files), and any associated documentation or information are expressly subject +// to the terms and conditions of the Altera Program License Subscription +// Agreement, Intel MegaCore Function License Agreement, or other applicable +// license agreement, including, without limitation, that your use is for the +// sole purpose of programming logic devices manufactured by Intel and sold by +// Intel or its authorized distributors. Please refer to the applicable +// agreement for further details. 
+ +#include "kernel_interrupt.h" + +#include <poll.h> +#include <sys/eventfd.h> + +#include <cassert> +#include <cstdint> +#include <cstdlib> +#include <iostream> +#include <thread> + +#include "mmd_device.h" + +using namespace intel_opae_mmd; + +static const int mmd_kernel_interrupt_line_num = 1; +static const uint32_t enable_int_mask = 0x00000001; +static const uint32_t disable_int_mask = 0x00000000; + +bool KernelInterrupt::enable_thread = false; + +static const int debug_log_level = 0; + +// TODO: use consistent function throughout MMD for controlling debug +// messages. This debug_print function is from OFS. +static void debug_print(std::string &err_msg, int msglog) { + if (debug_log_level >= msglog) { + std::cerr << "KernelInterrupt: " << err_msg << std::endl; + } +} + +static inline void check_result(fpga_result res, const char *err_str) { + if (res == FPGA_OK) { + return; + } + std::string opae_err_str = + std::string("KernelInterrupt: ") + std::string(err_str) + std::string(": ") + std::string(fpgaErrStr(res)); +} + +/** KernelInterrupt constructor + */ +KernelInterrupt::KernelInterrupt(fpga_handle fpga_handle_arg, int mmd_handle) + : m_work_thread_active(false), + m_eventfd(0), + m_kernel_interrupt_fn(nullptr), + m_kernel_interrupt_user_data(nullptr), + m_fpga_handle(fpga_handle_arg), + m_mmd_handle(mmd_handle), + m_event_handle(nullptr) { + if (std::getenv("MMD_ENABLE_DEBUG")) { + MMD_DEBUG("DEBUG LOG : KernelInterrupt Constructor\n"); + } + set_member_for_interrupts(); + enable_interrupts(); +} + +/** KernelInterrupt destructor + * calls disable_interrupts() + */ +KernelInterrupt::~KernelInterrupt() { + if (std::getenv("MMD_ENABLE_DEBUG")) { + MMD_DEBUG("DEBUG LOG : KernelInterrupt Destructor\n"); + } + try { + disable_interrupts(); + } catch (...) { + std::string err("destructor error"); + debug_print(err, 0); + } +} + +/** disable_interrupts() function is used in KernelInterrupt destructor + * if interupt not enabled , !enable_thread + * then disable interrupt mask + * else if interrupts are used, + * call noftify_work_thread(), join the thread + * we call OPAE API fpgaUnregisterEvent() to unregister FPGA event, + * it tells driver caller is no longer interested in notification for event associated with m_event_handle + * we call OPAE API fpgaDestroyEventHandle() to free resources + */ +void KernelInterrupt::disable_interrupts() { + if (!enable_thread) { + if (std::getenv("MMD_ENABLE_DEBUG")) { + MMD_DEBUG("DEBUG LOG : KernelInterrupt disabling interrupts\n"); + } + assert(m_work_thread_active == false); + return; + } + + m_work_thread_active = false; + notify_work_thread(); + m_work_thread->join(); + + if (m_event_handle != nullptr) { + fpga_result res; + + res = fpgaUnregisterEvent(m_fpga_handle, FPGA_EVENT_INTERRUPT, m_event_handle); + check_result(res, "error fpgaUnregisterEvent"); + + res = fpgaDestroyEventHandle(&m_event_handle); + check_result(res, "error fpgaDestroyEventHandle"); + } + if (std::getenv("MMD_ENABLE_DEBUG")) { + MMD_DEBUG("DEBUG LOG : KernelInterrupt disabling interrupts\n"); + } +} + +/** notify_work_thread() function is called by disable_interrupts() function + * eventfd object created by OPAE API fpgaGetOSObjectFromEventHandle() , m_eventfd, + * can be used as an event wait/notify mechanism by user space applications and by kernel, + * to notify user space applications of events + * every time write() is performed on eventfd, + * the value of uint64_t being written is added to count and wakeup is performed. 
+ * We dont use read() below but read() will return count value to user space and reset count to 0 + */ +void KernelInterrupt::notify_work_thread() { + uint64_t val = 1; + ssize_t res = write(m_eventfd, &val, sizeof(val)); + if (res < 0) { + std::cerr << "Warning: KernelInterrupts::notify_work_thread()" + " write to eventfd failed: " + << strerror(errno) << std::endl; + } +} + +/** enable_interrupts() function is called by Kernel Interrupt constructor + * if interrupt is not enabled it will disable interrupt mask , set thread active as false and return + * if interrupt is enabled, it will use OPAE APIs to create event handle fpgaCreateEventHandle() + * OPAE event APIs provide functions for handling asynchronous events such as errors and interrupts + * Associated with every event a process has registered for is an fpga_event_handle, + * which encapsulates OS specific data structure for event objects + * On Linux fpga_event_handle can be used as file descriptor + * and passed to select(), poll() and similar functions to wait for asynchronous events + * OPAE API fpgaRegisterEvent() is used to tell driver that caller is interested in notification for event specified + * OPAE API fpgaGetOSObjectFromEventHandle() checks validity of event handle and + * gets OS object used to subscribe and unsubscribe to events + * we create a thread and call work_thread() + */ +void KernelInterrupt::enable_interrupts() { + if (!enable_thread) { + if (std::getenv("MMD_ENABLE_DEBUG")) { + MMD_DEBUG("DEBUG LOG : KernelInterrupt enabling interrupts\n"); + } + m_work_thread_active = false; + return; + } + + fpga_result res; + + res = fpgaCreateEventHandle(&m_event_handle); + check_result(res, "error creating event handle"); + + res = fpgaRegisterEvent(m_fpga_handle, FPGA_EVENT_INTERRUPT, m_event_handle, mmd_kernel_interrupt_line_num); + check_result(res, "error registering event"); + + res = fpgaGetOSObjectFromEventHandle(m_event_handle, &m_eventfd); + check_result(res, "error getting event file handle"); + + m_work_thread_active = true; + m_work_thread = std::unique_ptr<std::thread>(new std::thread([this] { this->work_thread(); })); + if (std::getenv("MMD_ENABLE_DEBUG")) { + MMD_DEBUG("DEBUG LOG : KernelInterrupt enabling interrupts\n"); + } +} + +/** work_thread() is called from enable_interrupts() function while creating new thread + * it calls wait_for_event(), disables interrupt mask + * creates lock_guard with m_mutex, calls kernel interrupt function and then enables interrupt mask + */ +void KernelInterrupt::work_thread() { + while (m_work_thread_active) { + wait_for_event(); + std::lock_guard<std::mutex> lock(m_mutex); + if (m_kernel_interrupt_fn != nullptr) { + m_kernel_interrupt_fn(m_mmd_handle, m_kernel_interrupt_user_data); + } + } +} + +/** wait_for_event() is called from work_thread() function + * it uses poll() function to wait for event on a file descriptor, + * the m_event_fd file descriptor which we got from fpgaOSObjectFromEventHandle() + * poll() uses pollfd struct, which inncludes + * fd - file descriptor, events - requested events, revents - returned events + * timeout argument in poll() specifies number of milliseconds, + * poll() will block waiting for file descriptor + * On success, poll() returns a nonnegative value which is the + * number of elements in the pollfds whose revents fields have been + * set to a nonzero value (indicating an event or an error). 
A + * return value of zero indicates that the system call timed out + * before any file descriptors became read + */ +void KernelInterrupt::wait_for_event() { + // Use timeout when polling eventfd because sometimes interrupts are missed. + // This may be caused by knonw race condition with runtime, or there may + // be occasional events lost from OPAE. + + MMD_DEBUG("DEBUG LOG : KernelInterrupt waiting for event using poll()\n"); + const int timeout_ms = 250; + struct pollfd pfd = {.fd = m_eventfd, .events = POLLIN, .revents = 0}; + int num_events = poll(&pfd, 1, timeout_ms); + if (num_events <= 0) { + std::string err(num_events < 0 ? strerror(errno) : "timed out"); + std::string err_str("poll(): "); + debug_print(err_str.append(err), 1); + } else if (pfd.revents != POLLIN) { + std::string err("poll error num: ", pfd.revents); + debug_print(err, 0); + } else { + uint64_t val = 0; + ssize_t bytes_read = read(pfd.fd, &val, sizeof(val)); + if (bytes_read < 0) { + std::string err(strerror(errno)); + std::string err_str("read: "); + debug_print(err_str.append(err), 1); + } + } +} + +void KernelInterrupt::set_kernel_interrupt(aocl_mmd_interrupt_handler_fn fn, void *user_data) { + MMD_DEBUG("DEBUG LOG : KernelInterrupt setting kernel interrupt\n"); + std::lock_guard<std::mutex> lock(m_mutex); + m_kernel_interrupt_fn = fn; + m_kernel_interrupt_user_data = user_data; +} + +/** Configure interrupts + * set_member_for_interrupts() called from KernelInterrupts constructor + */ +void KernelInterrupt::set_member_for_interrupts() { + static bool initialized = false; + if (initialized) { + return; + } + // Use interrupts + MMD_DEBUG("DEBUG LOG : Using interrupts\n"); + + enable_thread = true; + initialized = true; +} diff --git a/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/kernel_interrupt.h b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/kernel_interrupt.h new file mode 100644 index 0000000..9ea6e68 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/kernel_interrupt.h @@ -0,0 +1,68 @@ +// (c) 1992-2024 Intel Corporation. +// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words +// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. +// and/or other countries. Other marks and brands may be claimed as the property +// of others. See Trademarks on intel.com for full list of Intel trademarks or +// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) +// Your use of Intel Corporation's design tools, logic functions and other +// software and tools, and its AMPP partner logic functions, and any output +// files any of the foregoing (including device programming or simulation +// files), and any associated documentation or information are expressly subject +// to the terms and conditions of the Altera Program License Subscription +// Agreement, Intel MegaCore Function License Agreement, or other applicable +// license agreement, including, without limitation, that your use is for the +// sole purpose of programming logic devices manufactured by Intel and sold by +// Intel or its authorized distributors. Please refer to the applicable +// agreement for further details. 
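The wait/notify mechanism implemented in kernel_interrupt.cpp above pairs an eventfd with poll(): notify_work_thread() write()s a 64-bit value to the eventfd (the value is added to the kernel-side counter, waking any poller), while wait_for_event() poll()s the descriptor with a 250 ms timeout and then read()s it, which returns and resets the counter. A minimal standalone sketch of that pattern (illustration only, not part of the MMD sources):

```cpp
// Sketch of the eventfd + poll() wait/notify pattern used by KernelInterrupt.
#include <poll.h>
#include <sys/eventfd.h>
#include <unistd.h>

#include <cstdint>
#include <cstdio>
#include <thread>

int main() {
  int efd = eventfd(0, 0);  // counter starts at zero
  if (efd < 0) return 1;

  std::thread waiter([efd] {
    struct pollfd pfd = {efd, POLLIN, 0};
    int n = poll(&pfd, 1, 250);  // block for up to 250 ms, like wait_for_event()
    if (n > 0 && (pfd.revents & POLLIN)) {
      uint64_t count = 0;
      ssize_t r = read(efd, &count, sizeof(count));  // returns the counter, resets it to 0
      if (r == (ssize_t)sizeof(count))
        std::printf("woken, count=%llu\n", (unsigned long long)count);
    } else {
      std::printf("timed out or poll error\n");
    }
  });

  uint64_t one = 1;
  if (write(efd, &one, sizeof(one)) < 0)  // adds 1 to the counter, wakes the poller
    std::perror("write(eventfd)");
  waiter.join();
  close(efd);
  return 0;
}
```

The 250 ms poll timeout mirrors the defensive timeout wait_for_event() uses to cope with occasionally missed interrupts.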
+ +#ifndef KERNEL_INTERRUPT_H_ +#define KERNEL_INTERRUPT_H_ + +#include <opae/fpga.h> + +#include <atomic> +#include <chrono> +#include <mutex> +#include <thread> + +#include "aocl_mmd.h" + +namespace intel_opae_mmd { + +class KernelInterrupt final { + public: + KernelInterrupt(fpga_handle fpga_handle_arg, int mmd_handle); + ~KernelInterrupt(); + + void enable_interrupts(); + void disable_interrupts(); + void set_kernel_interrupt(aocl_mmd_interrupt_handler_fn fn, void *user_data); + + KernelInterrupt(const KernelInterrupt &) = delete; + KernelInterrupt &operator=(const KernelInterrupt &) = delete; + KernelInterrupt(KernelInterrupt &&) = delete; + KernelInterrupt &operator=(KernelInterrupt &&) = delete; + + private: + static void set_member_for_interrupts(); + + void notify_work_thread(); + void wait_for_event(); + void work_thread(); + + static bool enable_thread; + + std::mutex m_mutex; + std::unique_ptr<std::thread> m_work_thread; + std::atomic<bool> m_work_thread_active; + int m_eventfd; + aocl_mmd_interrupt_handler_fn m_kernel_interrupt_fn; + void *m_kernel_interrupt_user_data; + fpga_handle m_fpga_handle; + int m_mmd_handle; + fpga_event_handle m_event_handle; +}; + +}; // namespace intel_opae_mmd + +#endif // KERNEL_INTERRUPT_H_ diff --git a/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd.cpp b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd.cpp new file mode 100644 index 0000000..58cd8e0 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd.cpp @@ -0,0 +1,830 @@ +// (c) 1992-2024 Intel Corporation. +// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words +// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. +// and/or other countries. Other marks and brands may be claimed as the property +// of others. See Trademarks on intel.com for full list of Intel trademarks or +// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) +// Your use of Intel Corporation's design tools, logic functions and other +// software and tools, and its AMPP partner logic functions, and any output +// files any of the foregoing (including device programming or simulation +// files), and any associated documentation or information are expressly subject +// to the terms and conditions of the Altera Program License Subscription +// Agreement, Intel MegaCore Function License Agreement, or other applicable +// license agreement, including, without limitation, that your use is for the +// sole purpose of programming logic devices manufactured by Intel and sold by +// Intel or its authorized distributors. Please refer to the applicable +// agreement for further details. + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <zlib.h> + +#include <linux/mman.h> +#include <sys/mman.h> + +// On some systems MAP_HUGE_2MB is not defined. It should be defined for all +// platforms that DCP supports, but we also want ability to compile MMD on +// CentOS 6 systems. 
+#ifndef MAP_HUGE_SHIFT +#define MAP_HUGE_SHIFT 26 +#endif + +#ifndef MAP_HUGE_2MB +#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT) +#endif + +#ifndef MAP_HUGE_1GB +#define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT) +#endif + +#include <algorithm> +#include <cassert> +#include <cstdio> +#include <iomanip> +#include <iostream> +#include <map> +#include <sstream> +#include <unordered_map> +#include <vector> +#ifdef DLA_MMD +#include <chrono> +#include <thread> +#endif + +#include "aocl_mmd.h" +#include "mmd_device.h" + +bool diagnose = 0; + +/** If the MMD is loaded dynamically, destructors in the MMD will execute before + * the destructors in the runtime upon program termination. The DeviceMapManager + * guards accesses to the device/handle maps to make sure the runtime doesn't + * get to reference them after MMD destructors have been called. Destructor + * makes sure that all devices are closed at program termination regardless of + * what the runtime does. Implemented as a singleton. + */ +class DeviceMapManager final { + public: + /** C++ std map data structure to keep track of + * object id -> handle and handle -> device + */ + typedef std::map<int, Device *> t_handle_to_dev_map; + typedef std::map<uint64_t, int> t_id_to_handle_map; + + static const int SUCCESS = 0; + static const int FAILURE = -1; + + /** Returns handle and device pointer to the device with the specified name + * Creates a new entry for this device if it doesn't already exist + * Return 0 on success, -1 on failure + */ + int get_or_create_device(const char *board_name, int *handle, Device **device); + + /** Return obj id based on ASP name.*/ + uint64_t id_from_name(const char *board_name); + + /** Return MMD handle based on obj id. Returned value is negative if board + * doesn't exist + */ + inline int handle_from_id(uint64_t obj_id); + + /** Return pointer to device based on MMD handle. 
Returned value is null + * if board doesn't exist + */ + Device *device_from_handle(int handle); + + /** Closes specified device if it exists */ + void close_device_if_exists(int handle); + + /* Returns a reference to the class singleton */ + static DeviceMapManager &get_instance() { + static DeviceMapManager instance; + return instance; + } + + DeviceMapManager(DeviceMapManager const &) = delete; + void operator=(DeviceMapManager const &) = delete; + ~DeviceMapManager() { + // delete all allocated Device* entries + while (handle_to_dev_map->size() > 0) { + int handle = handle_to_dev_map->begin()->first; + aocl_mmd_close(handle); +#ifdef SIM + std::cout << "# mmd.cpp: When destroying DeviceMapManager in ASE, assume it worked.\n"; + break; +#endif + MMD_DEBUG("DEBUG LOG : In DeviceMapManager destructor, closing device with handle %d \n", handle); + } + delete handle_to_dev_map; + delete id_to_handle_map; + handle_to_dev_map = nullptr; + id_to_handle_map = nullptr; + } + + private: + DeviceMapManager() { + handle_to_dev_map = new t_handle_to_dev_map(); + id_to_handle_map = new t_id_to_handle_map(); + + MMD_DEBUG("DEBUG LOG : Constructing DeviceMapManager object\n"); + } + t_handle_to_dev_map *handle_to_dev_map = nullptr; + t_id_to_handle_map *id_to_handle_map = nullptr; +}; +static DeviceMapManager &device_manager = DeviceMapManager::get_instance(); + +/** Returns handle and device pointer to the device with the specified name + * Creates a new entry for this device if it doesn't already exist + * Return 0 on success, -1 on failure + */ +int DeviceMapManager::get_or_create_device(const char *board_name, int *handle, Device **device) { + int _handle = MMD_INVALID_PARAM; + Device *_device = nullptr; + + if (id_to_handle_map == nullptr || handle_to_dev_map == nullptr) { + MMD_DEBUG( + "DEBUG LOG : Failure in DeviceMapManager::get_or_create_device,id_to_handle_map or handle_to_dev_map is " + "NULL\n"); + return DeviceMapManager::FAILURE; + } + + uint64_t obj_id = id_from_name(board_name); + if (!obj_id) { + MMD_DEBUG("DEBUG LOG : Failure in DeviceMapManager::get_or_create_device. obj_id : %ld \n", obj_id); + return false; + } + if (id_to_handle_map->count(obj_id) == 0) { + try { + _device = new Device(obj_id); + _handle = _device->get_mmd_handle(); + id_to_handle_map->insert({obj_id, _handle}); + handle_to_dev_map->insert({_handle, _device}); + } catch (std::runtime_error &e) { + MMD_DEBUG("DEBUG LOG : Failure in DeviceMapManager::get_or_create_device %s\n", e.what()); + delete _device; + return DeviceMapManager::FAILURE; + } + MMD_DEBUG("DEBUG LOG : Success in creating new device object handle : %d \n", _handle); + } else { + _handle = id_to_handle_map->at(obj_id); + _device = handle_to_dev_map->at(_handle); + MMD_DEBUG("DEBUG LOG : Success in retrieving device metadata(handle , object) , handle : %d\n", _handle); + } + + (*handle) = _handle; + (*device) = _device; + + MMD_DEBUG("DEBUG LOG : Success in creating new device object , handle : %d\n", _handle); + return DeviceMapManager::SUCCESS; +} + +/** Return obj id based on ASP name.*/ +uint64_t DeviceMapManager::id_from_name(const char *board_name) { + uint64_t obj_id = 0; + if (Device::parse_board_name(board_name, obj_id)) { + MMD_DEBUG("DEBUG LOG : Success in retrieving object id from board name\n"); + return obj_id; + } else { + MMD_DEBUG("DEBUG LOG : Failed to retrieve object id from board name\n"); + return 0; + } +} + +/** Return MMD handle based on obj id. 
Returned value is negative if board + * doesn't exist + */ +inline int DeviceMapManager::handle_from_id(uint64_t obj_id) { + int handle = MMD_INVALID_PARAM; + if (id_to_handle_map) { + auto it = id_to_handle_map->find(obj_id); + if (it != id_to_handle_map->end()) { + handle = it->second; + } + MMD_DEBUG("DEBUG LOG : Success in retrieving handle from object id. handle : %d \n", handle); + } else { + MMD_DEBUG("DEBUG LOG : Failed to retrieve handle from object id \n"); + } + return handle; +} + +/** Return pointer to device based on MMD handle. Returned value is null + * if board doesn't exist + */ +Device *DeviceMapManager::device_from_handle(int handle) { + Device *dev = nullptr; + if (handle_to_dev_map) { + auto it = handle_to_dev_map->find(handle); + if (it != handle_to_dev_map->end()) { + return it->second; + } + MMD_DEBUG("DEBUG LOG : Success in retrieving device from handle. handle : %d \n", handle); + } else { + MMD_DEBUG("DEBUG LOG : Failed to retrieve device from handle\n"); + } + return dev; +} + +/** Closes specified device if it exists */ +void DeviceMapManager::close_device_if_exists(int handle) { + if (handle_to_dev_map) { + if (handle_to_dev_map->count(handle) > 0) { + Device *dev = handle_to_dev_map->at(handle); + uint64_t obj_id = dev->get_fpga_obj_id(); + delete dev; + + handle_to_dev_map->erase(handle); + id_to_handle_map->erase(obj_id); + MMD_DEBUG("DEBUG LOG : Closing device with handle : %d\n", handle); + } else { + MMD_DEBUG("DEBUG LOG : Nothing to close. Device with handle : %d already closed\n", handle); + } + } else { + MMD_DEBUG("DEBUG LOG : Error, no handle to device map entry found for handle : %d \n", handle); + } +} + +/** Interface for checking if AFU has ASP loaded */ +bool mmd_asp_loaded(const char *name) { + uint64_t obj_id = device_manager.id_from_name(name); + if (!obj_id) { + MMD_DEBUG("DEBUG LOG : Error, no object id found for board : %s \n", name); + return false; + } + + int handle = device_manager.handle_from_id(obj_id); + if (handle > 0) { + Device *dev = device_manager.device_from_handle(handle); + if (dev) { + MMD_DEBUG("DEBUG LOG : ASP loaded for handle : %d \n", handle); + return dev->asp_loaded(); + } else { + MMD_DEBUG("DEBUG LOG : ASP not loaded for handle : %d \n", handle); + return false; + } + } else { + bool asp_loaded = false; + try { + Device dev(obj_id); + asp_loaded = dev.asp_loaded(); + } catch (std::runtime_error &e) { + MMD_DEBUG("DEBUG LOG : ASP not loaded for handle : %d , %s\n", handle, e.what()); + return false; + } + + MMD_DEBUG("DEBUG LOG : ASP loaded : %d (0 - not loaded , 1 - loaded) for handle : %d \n", asp_loaded, handle); + return asp_loaded; + } +} + +/** Function called as part of aocl_mmd_get_offline_info() + * to determine number of baords in system + */ +static unsigned int get_offline_num_acl_boards(const char *asp_uuid) { + bool asp_only = true; + fpga_guid guid; + fpga_result res = FPGA_OK; + uint32_t num_matches = 0; + bool ret_err = false; + fpga_properties filter = NULL; + + if (uuid_parse(asp_uuid, guid) < 0) { + MMD_DEBUG("Error parsing guid '%s'\n", asp_uuid); + ret_err = true; + goto out; + } + + res = fpgaGetProperties(NULL, &filter); + if (res != FPGA_OK) { + MMD_DEBUG("Error creating properties object: %s\n", fpgaErrStr(res)); + ret_err = true; + goto out; + } + + if (asp_only) { + res = fpgaPropertiesSetGUID(filter, guid); + if (res != FPGA_OK) { + MMD_DEBUG("Error setting GUID: %s\n", fpgaErrStr(res)); + ret_err = true; + goto out; + } + } + + res = fpgaPropertiesSetObjectType(filter, 
FPGA_ACCELERATOR); + if (res != FPGA_OK) { + MMD_DEBUG("Error setting object type: %s\n", fpgaErrStr(res)); + ret_err = true; + goto out; + } + + res = fpgaEnumerate(&filter, 1, NULL, 0, &num_matches); + if (res != FPGA_OK) { + MMD_DEBUG("Error enumerating AFCs: %s\n", fpgaErrStr(res)); + ret_err = true; + goto out; + } + +out: + if (filter) fpgaDestroyProperties(&filter); + + if (ret_err) { + return MMD_AOCL_ERR; + } else { + return num_matches; + } +} + +/** Function called as part of aocl_mmd_get_offline_info() + * to determine names of boards in the system + */ +static bool get_offline_board_names(std::string &boards, bool asp_only = true) { + boards = "dla_agx7_ofs_board"; + return true; +} + +// Macros used for acol_mmd_get_offline_info and aocl_mmd_get_info +#define RESULT_INT(X) \ + { \ + *((int *)param_value) = X; \ + if (param_size_ret) *param_size_ret = sizeof(int); \ + } +#define RESULT_SIZE_T(X) \ + { \ + *((size_t *)param_value) = X; \ + if (param_size_ret) *param_size_ret = sizeof(size_t); \ + } + +#define RESULT_STR(X) \ + do { \ + unsigned Xlen = strnlen(X, 4096) + 1; \ + unsigned Xcpylen = (param_value_size <= Xlen) ? param_value_size : Xlen; \ + memcpy((void *)param_value, X, Xcpylen); \ + if (param_size_ret) *param_size_ret = Xcpylen; \ + } while (0) + +/** Get information about the board using the enum aocl_mmd_offline_info_t for + * offline info (called without a handle), and the enum aocl_mmd_info_t for + * info specific to a certain board. + * Arguments: + * + * requested_info_id - a value from the aocl_mmd_offline_info_t enum + * + * param_value_size - size of the param_value field in bytes. This should + * match the size of the return type expected as indicated in the enum + * definition. + * + * param_value - pointer to the variable that will receive the returned info + * + * param_size_ret - receives the number of bytes of data actually returned + * + * Returns: a negative value to indicate error. + */ + +// From DLA perspective, only AOCL_MMD_BOARD_NAMES info we care +int aocl_mmd_get_offline_info(aocl_mmd_offline_info_t requested_info_id, + size_t param_value_size, + void *param_value, + size_t *param_size_ret) { + /** aocl_mmd_get_offline_info can be called many times by the runtime + * and it is expensive to query the system. Only compute values first + * time aocl_mmd_get_offline_info called future iterations use saved results + */ + static bool initialized = false; + static int mem_type_info; + static unsigned int num_acl_boards; + static std::string boards; + static bool success; + + if (!initialized) { + mem_type_info = (int)AOCL_MMD_PHYSICAL_MEMORY; + num_acl_boards = get_offline_num_acl_boards(I_DK_AFU_ID); + success = get_offline_board_names(boards, true); + initialized = true; + } + + switch (requested_info_id) { + case AOCL_MMD_VERSION: + RESULT_STR(AOCL_MMD_VERSION_STRING); + break; + case AOCL_MMD_NUM_BOARDS: { + RESULT_INT(num_acl_boards); + break; + } + case AOCL_MMD_VENDOR_NAME: + RESULT_STR("Intel Corp"); + break; + case AOCL_MMD_BOARD_NAMES: { + if (success) { + RESULT_STR(boards.c_str()); + } else { + return MMD_AOCL_ERR; + } + break; + } + case AOCL_MMD_VENDOR_ID: + RESULT_INT(0); + break; + case AOCL_MMD_USES_YIELD: + RESULT_INT(0); + break; + case AOCL_MMD_MEM_TYPES_SUPPORTED: + RESULT_INT(mem_type_info); + break; + } + + return 0; +} + +/** Get information about the board using the enum aocl_mmd_info_t for + * info specific to a certain board. 
+ * Arguments: + * + * requested_info_id - a value from the aocl_mmd_info_t enum + * + * param_value_size - size of the param_value field in bytes. This should + * match the size of the return type expected as indicated in the enum + * definition. For example, the AOCL_MMD_TEMPERATURE returns a float, so + * the param_value_size should be set to sizeof(float) and you should + * expect the same number of bytes returned in param_size_ret. + * + * param_value - pointer to the variable that will receive the returned info + * + * param_size_ret - receives the number of bytes of data actually returned + * + * Returns: a negative value to indicate error. + */ +int aocl_mmd_get_info( + int handle, aocl_mmd_info_t requested_info_id, size_t param_value_size, void *param_value, size_t *param_size_ret) { + MMD_DEBUG("DEBUG LOG : called aocl_mmd_get_info\n"); + Device *dev = device_manager.device_from_handle(handle); + if (dev == NULL) return 0; + + assert(param_value); + switch (requested_info_id) { + case AOCL_MMD_BOARD_NAME: { + std::ostringstream board_name; + board_name << "Intel OFS Platform" + << " (" << dev->get_dev_name() << ")"; + RESULT_STR(board_name.str().c_str()); + break; + } + case AOCL_MMD_NUM_KERNEL_INTERFACES: + RESULT_INT(1); + break; + case AOCL_MMD_KERNEL_INTERFACES: + RESULT_INT(AOCL_MMD_KERNEL); + break; +#ifdef SIM + case AOCL_MMD_PLL_INTERFACES: + RESULT_INT(-1); + break; +#else + case AOCL_MMD_PLL_INTERFACES: + RESULT_INT(-1); + break; +#endif + case AOCL_MMD_MEMORY_INTERFACE: + RESULT_INT(AOCL_MMD_MEMORY); + break; + case AOCL_MMD_PCIE_INFO: { + RESULT_STR(dev->get_bdf().c_str()); + break; + } + case AOCL_MMD_BOARD_UNIQUE_ID: + RESULT_INT(0); + break; + case AOCL_MMD_TEMPERATURE: { + if (param_value_size == sizeof(float)) { + float *ptr = static_cast<float *>(param_value); + *ptr = dev->get_temperature(); + if (param_size_ret) *param_size_ret = sizeof(float); + } + break; + } + case AOCL_MMD_CONCURRENT_READS: + RESULT_INT(1); + break; + case AOCL_MMD_CONCURRENT_WRITES: + RESULT_INT(1); + break; + case AOCL_MMD_CONCURRENT_READS_OR_WRITES: + RESULT_INT(2); + break; + + case AOCL_MMD_MIN_HOST_MEMORY_ALIGNMENT: + RESULT_SIZE_T(64); + break; + + case AOCL_MMD_HOST_MEM_CAPABILITIES: { + RESULT_INT(0); + break; + } + case AOCL_MMD_SHARED_MEM_CAPABILITIES: { + RESULT_INT(0); + break; + } + + case AOCL_MMD_DEVICE_MEM_CAPABILITIES: + RESULT_INT(0); + break; + case AOCL_MMD_HOST_MEM_CONCURRENT_GRANULARITY: + RESULT_SIZE_T(0); + break; + case AOCL_MMD_SHARED_MEM_CONCURRENT_GRANULARITY: + RESULT_SIZE_T(0); + break; + case AOCL_MMD_DEVICE_MEM_CONCURRENT_GRANULARITY: + RESULT_SIZE_T(0); + break; + } + return 0; +} + +#undef RESULT_INT +#undef RESULT_STR + +/** Set the interrupt handler for the opened device. + * The interrupt handler is called whenever the client needs to be notified + * of an asynchronous event signaled by the device internals. + * For example, the kernel has completed or is stalled. + * + * Important: Interrupts from the kernel must be ignored until this handler is + * set + * + * Arguments: + * fn - the callback function to invoke when a kernel interrupt occurs + * user_data - the data that should be passed to fn when it is called. 
+ * + * Returns: 0 if successful, negative on error + */ +int AOCL_MMD_CALL aocl_mmd_set_interrupt_handler(int handle, aocl_mmd_interrupt_handler_fn fn, void *user_data) { + Device *dev = device_manager.device_from_handle(handle); + if (dev) { + dev->set_kernel_interrupt(fn, user_data); + MMD_DEBUG("DEBUG LOG : Set kernel interrupt handler for device handle : %d\n", handle); + } else { + MMD_DEBUG("DEBUG LOG : Error setting kernel interrupt handler for device handle : %d\n", handle); + return MMD_AOCL_ERR; + } + return 0; +} + +/** Set the operation status handler for the opened device. + * The operation status handler is called with + * status 0 when the operation has completed successfully. + * status negative when the operation completed with errors. + * + * Arguments: + * fn - the callback function to invoke when a status update is to be + * performed. + * user_data - the data that should be passed to fn when it is called. + * + * Returns: 0 if successful, negative on error + */ + +int AOCL_MMD_CALL aocl_mmd_set_status_handler(int handle, aocl_mmd_status_handler_fn fn, void *user_data) { + Device *dev = device_manager.device_from_handle(handle); + if (dev) { + dev->set_status_handler(fn, user_data); + MMD_DEBUG("DEBUG LOG : Set status handler for device handle : %d\n", handle); + } + return 0; +} + +/** Host to device-global-memory write (HOST DDR -> FPGA DDR) + * If op is NULL + * - Then these calls must block until the operation is complete. + * - The status handler is not called for this operation. + * + * If op is non-NULL, then: + * - These may be non-blocking calls + * - The status handler must be called upon completion, with status 0 + * for success, and a negative value for failure. + * + * Arguments: + * op - the operation object used to track this operations progress + * + * len - the size in bytes to transfer + * + * src - the host buffer being read from + * + * dst - the host buffer being written to + * + * mmd_interface - the handle to the interface being accessed. E.g. To + * access global memory this handle will be whatever is returned by + * aocl_mmd_get_info when called with AOCL_MMD_MEMORY_INTERFACE. + * + * offset/src_offset/dst_offset - the byte offset within the interface that + * the transfer will begin at. + * + * The return value is 0 if the operation launch was successful, and + * negative otherwise. + */ +int AOCL_MMD_CALL +aocl_mmd_write(int handle, aocl_mmd_op_t op, size_t len, const void *src, int mmd_interface, size_t offset) { + MMD_DEBUG( + "DEBUG LOG : aocl_mmd_write: handle : %d\t operation : %p\t len : 0x%zx\t src : %p\t mmd_interface : %d\t offset " + ": 0x%zx\n", + handle, + op, + len, + src, + mmd_interface, + offset); + Device *dev = device_manager.device_from_handle(handle); + if (dev){ + return dev->write_block(op, mmd_interface, src, offset, len); + } + else { + MMD_DEBUG("DEBUG LOG : Error in aocl_mmd_write , device not found for handle : %d\n", handle); + return -1; + } +} + +/** Host reading from device-global-memory (FPGA DDR -> HOST DDR) + * If op is NULL + * - Then these calls must block until the operation is complete. + * - The status handler is not called for this operation. + * + * If op is non-NULL, then: + * - These may be non-blocking calls + * - The status handler must be called upon completion, with status 0 + * for success, and a negative value for failure. 
+ * + * Arguments: + * op - the operation object used to track this operations progress + * + * len - the size in bytes to transfer + * + * src - the host buffer being read from + * + * dst - the host buffer being written to + * + * mmd_interface - the handle to the interface being accessed. E.g. To + * access global memory this handle will be whatever is returned by + * aocl_mmd_get_info when called with AOCL_MMD_MEMORY_INTERFACE. + * + * offset/src_offset/dst_offset - the byte offset within the interface that + * the transfer will begin at. + * + * The return value is 0 if the operation launch was successful, and + * negative otherwise. + */ + +int AOCL_MMD_CALL aocl_mmd_read(int handle, aocl_mmd_op_t op, size_t len, void *dst, int mmd_interface, size_t offset) { + MMD_DEBUG( + "DEBUG LOG : aocl_mmd_read: handle : %d\t operation : %p\t len : 0x%zx\t dst : %p\t mmd_interface : %d\t offset " + ": 0x%zx\n", + handle, + op, + len, + dst, + mmd_interface, + offset); + Device *dev = device_manager.device_from_handle(handle); + if (dev){ + return dev->read_block(op, mmd_interface, dst, offset, len); + } + else { + MMD_DEBUG("DEBUG LOG : Error in aocl_mmd_read , device not found for handle : %d\n", handle); + return -1; + } +} + +/** Open and initialize the named device. + * + * The name is typically one specified by the AOCL_MMD_BOARD_NAMES offline + * info. + * + * Arguments: + * name - open the board with this name (provided as a C-style string, + * i.e. NUL terminated ASCII.) + * + * Returns: the non-negative integer handle for the board, otherwise a + * negative value to indicate error. Upon receiving the error, the OpenCL + * runtime will proceed to open other known devices, hence the MMD mustn't + * exit the application if an open call fails. + */ + +int AOCL_MMD_CALL aocl_mmd_open(const char *name) { + + MMD_DEBUG("DEBUG LOG : aocl_mmd_open, Opening device: %s\n", name); + + uint64_t obj_id = device_manager.id_from_name(name); + if (!obj_id) { + MMD_DEBUG("DEBUG LOG : Error while aocl_mmd_open, object id not found for board : %s\n", name); + return MMD_INVALID_PARAM; + } + + int handle; + Device *dev = nullptr; + if (device_manager.get_or_create_device(name, &handle, &dev) != DeviceMapManager::SUCCESS) { + if (std::getenv("MMD_PROGRAM_DEBUG") || std::getenv("MMD_DMA_DEBUG") || std::getenv("MMD_ENABLE_DEBUG")) { + MMD_DEBUG("DEBUG LOG : Error while aocl_mmd_open, device not found for board : %s\n", name); + } + return MMD_AOCL_ERR; + } + + assert(dev); + if (dev->asp_loaded()) { + if (!dev->initialize_asp()) { + MMD_DEBUG("DEBUG LOG : Error while aocl_mmd_open, Error initializing asp for board : %s\n", name); + return MMD_ASP_INIT_FAILED; + } + } else { + MMD_DEBUG("DEBUG LOG : Error while aocl_mmd_open, asp not loaded for board : %s\n", name); + return MMD_ASP_NOT_LOADED; + } + MMD_DEBUG("end of aocl_mmd_open \n"); + MMD_DEBUG("DEBUG LOG : Success aocl_mmd_open for board : %s, handle : %d \n", name, handle); + return handle; +} + +/** Close an opened device, by its handle. + * Returns: 0 on success, negative values on error. + */ +int AOCL_MMD_CALL aocl_mmd_close(int handle) { +#ifndef SIM + device_manager.close_device_if_exists(handle); +#else + std::cout << "# mmd.cpp: During simulation (ASE) we are not closing the device.\n"; +#endif + return 0; +} + +// CoreDLA modifications +// To support multiple different FPGA boards, anything board specific must be implemented in a +// board-specific MMD instead of the CoreDLA runtime layer. 
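Taken together, the entry points above (aocl_mmd_open, aocl_mmd_write, aocl_mmd_read, aocl_mmd_close) are what the CoreDLA runtime drives, before reaching the DLA-specific wrappers defined just below. A hedged usage sketch, not taken from the repository; the board name, offsets, and the blocking (op == NULL) transfers are illustrative assumptions:

```cpp
// Illustrative host-side use of the MMD entry points (board name string,
// offset, and transfer length are placeholders, not values from the diff).
#include <cstdint>
#include <cstdio>

#include "aocl_mmd.h"  // MMD API header from this repository

int main() {
  // "dla_agx7_ofs_board" is the name reported by AOCL_MMD_BOARD_NAMES above;
  // the runtime normally discovers it via aocl_mmd_get_offline_info().
  int handle = aocl_mmd_open("dla_agx7_ofs_board");
  if (handle < 0) {
    std::fprintf(stderr, "aocl_mmd_open failed: %d\n", handle);
    return 1;
  }

  // Blocking (op == NULL) write into device global memory, then read it back.
  uint32_t out[16] = {0xDEADBEEFu};
  uint32_t in[16] = {0};
  if (aocl_mmd_write(handle, NULL, sizeof(out), out, AOCL_MMD_MEMORY, 0) != 0 ||
      aocl_mmd_read(handle, NULL, sizeof(in), in, AOCL_MMD_MEMORY, 0) != 0) {
    std::fprintf(stderr, "transfer failed\n");
  } else {
    std::printf("readback word 0: 0x%08X\n", in[0]);
  }

  aocl_mmd_close(handle);
  return 0;
}
```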
+#ifdef DLA_MMD +// Query functions to get board-specific values +AOCL_MMD_CALL int dla_mmd_get_max_num_instances() { return 4; } + +// DLA can only uses 4GB DDR as of 2024.2 +AOCL_MMD_CALL uint64_t dla_mmd_get_ddr_size_per_instance() { return 1ULL << 32; } +AOCL_MMD_CALL double dla_mmd_get_ddr_clock_freq() { + #ifdef USE_N6001_BOARD + return 300.0; // MHz + #else + return 333.333333; // MHz + #endif +} + +// Helper functions for the wrapper functions around CSR and DDR +uint64_t dla_get_raw_csr_address(int instance, uint64_t addr) { return 0x10000 + (0x800 * instance) + addr; } +uint64_t dla_get_raw_ddr_address(int instance, uint64_t addr) { + #ifdef USE_N6001_BOARD + return (1ULL << 32) * instance + addr; + #else + return (1ULL << 33) * instance + addr; + #endif +} + +// Wrappers around CSR and DDR reads and writes to abstract away board-specific offsets +AOCL_MMD_CALL int dla_mmd_csr_write(int handle, int instance, uint64_t addr, const uint32_t *data) { + return aocl_mmd_write( + handle, NULL, sizeof(uint32_t), data, AOCL_MMD_DLA_CSR, dla_get_raw_csr_address(instance, addr)); +} + +AOCL_MMD_CALL int dla_mmd_csr_read(int handle, int instance, uint64_t addr, uint32_t *data) { + return aocl_mmd_read(handle, NULL, sizeof(uint32_t), data, AOCL_MMD_DLA_CSR, dla_get_raw_csr_address(instance, addr)); +} + +AOCL_MMD_CALL int dla_mmd_ddr_write(int handle, int instance, uint64_t addr, uint64_t length, const void *data) { + return aocl_mmd_write(handle, NULL, length, data, AOCL_MMD_MEMORY, dla_get_raw_ddr_address(instance, addr)); +} + +AOCL_MMD_CALL int dla_mmd_ddr_read(int handle, int instance, uint64_t addr, uint64_t length, void *data) { + return aocl_mmd_read(handle, NULL, length, data, AOCL_MMD_MEMORY, dla_get_raw_ddr_address(instance, addr)); +} + +AOCL_MMD_CALL double dla_mmd_get_coredla_clock_freq(int handle) { + constexpr uint64_t hw_timer_address = 0x37000; + const uint32_t start_bit = 1; + const uint32_t stop_bit = 2; + + // Send the start command to the hardware counter + std::chrono::high_resolution_clock::time_point time_before = std::chrono::high_resolution_clock::now(); + int status = aocl_mmd_write(handle, NULL, sizeof(uint32_t), &start_bit, AOCL_MMD_DLA_CSR, hw_timer_address); + assert(status == 0); + + // Unlikely to sleep for exactly 10 milliseconds, but it doesn't matter since we use a high resolution clock to + // determine the amount of time between the start and stop commands for the hardware counter + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + + // Send the stop command to the hardware counter + std::chrono::high_resolution_clock::time_point time_after = std::chrono::high_resolution_clock::now(); + status = aocl_mmd_write(handle, NULL, sizeof(uint32_t), &stop_bit, AOCL_MMD_DLA_CSR, hw_timer_address); + assert(status == 0); + + // Read back the value of the counter + uint32_t counter = 0; + status = aocl_mmd_read(handle, NULL, sizeof(uint32_t), &counter, AOCL_MMD_DLA_CSR, hw_timer_address); + assert(status == 0); + + // Calculate the clock frequency of the counter, which is running on clk_dla + double elapsed_seconds = std::chrono::duration_cast<std::chrono::duration<double>>(time_after - time_before).count(); + return 1.0e-6 * counter / elapsed_seconds; // 1.0e-6 is to convert to MHz +} +#endif diff --git a/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_device.cpp b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_device.cpp new file mode 100644 index 0000000..dd4ca42 --- /dev/null +++ 
b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_device.cpp @@ -0,0 +1,448 @@ +// (c) 1992-2024 Intel Corporation. +// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words +// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. +// and/or other countries. Other marks and brands may be claimed as the property +// of others. See Trademarks on intel.com for full list of Intel trademarks or +// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) +// Your use of Intel Corporation's design tools, logic functions and other +// software and tools, and its AMPP partner logic functions, and any output +// files any of the foregoing (including device programming or simulation +// files), and any associated documentation or information are expressly subject +// to the terms and conditions of the Altera Program License Subscription +// Agreement, Intel MegaCore Function License Agreement, or other applicable +// license agreement, including, without limitation, that your use is for the +// sole purpose of programming logic devices manufactured by Intel and sold by +// Intel or its authorized distributors. Please refer to the applicable +// agreement for further details. + +#include <assert.h> +#include <numa.h> + +#include <inttypes.h> +#include <string.h> +#include <unistd.h> +#include <fstream> +#include <iomanip> +#include <iostream> +#include <limits> +#include <sstream> + +#include "mmd_device.h" +#include "mmd_helper.h" + +int Device::next_mmd_handle{1}; + +/** + * The Device object is created for each device/board opened and + * it has methods to interact with fpga device. + * The entry point for Device is in DeviceMapManager Class + * which maintains mapping between device names and handles. + * Device Object is foundation for interacting with device. + */ +Device::Device(uint64_t obj_id) + : fpga_obj_id(obj_id), + kernel_interrupt_thread(NULL), + event_update(NULL), + event_update_user_data(NULL), + enable_set_numa(false), + fme_sysfs_temp_initialized(false), + bus(0), + device(0), + function(0), + afu_initialized(false), + asp_initialized(false), + mmio_is_mapped(false), + filter(NULL), + mmio_token(NULL), + mmio_handle(NULL), + fme_token(NULL), + guid(), + mmd_dma(NULL) { + // Note that this constructor is not thread-safe because next_mmd_handle + // is shared between all class instances + MMD_DEBUG("DEBUG LOG : Constructing Device object\n"); + + mmd_handle = next_mmd_handle; + if (next_mmd_handle == std::numeric_limits<int>::max()) + next_mmd_handle = 1; + else + next_mmd_handle++; + + fpga_properties filter = NULL; + uint32_t num_matches; + fpga_result r; + + // Set up a filter that will search for an accelerator + fpgaGetProperties(NULL, &filter); + fpgaPropertiesSetObjectType(filter, FPGA_ACCELERATOR); + + // Add the desired UUID to the filter + uuid_parse(I_DK_AFU_ID, guid); + fpgaPropertiesSetGUID(filter, guid); + + // Do the search across the available FPGA contexts + num_matches = 1; + fpgaEnumerate(&filter, 1, &mmio_token, 1, &num_matches); + + fpgaPropertiesGetParent(filter, &fme_token); + + // Not needed anymore so we destroy the filter + fpgaDestroyProperties(&filter); + + if (num_matches < 1) { + throw std::runtime_error(std::string("Cannot find accelerator")); + } + + // Open accelerator + r = fpgaOpen(mmio_token, &mmio_handle, 0); + assert(FPGA_OK == r); + + // While the token is available, check whether it is for HW + // or for ASE simulation. 
+ fpga_properties accel_props; + uint16_t vendor_id, dev_id; + fpgaGetProperties(mmio_token, &accel_props); + fpgaPropertiesGetVendorID(accel_props, &vendor_id); + fpgaPropertiesGetDeviceID(accel_props, &dev_id); + + afu_initialized = true; + MMD_DEBUG("DEBUG LOG : Done constructing Device object\n"); +} + +/** Return true if board name parses correctly, false if it does not + * Return the parsed object_id in obj_id as an [out] parameter + */ +bool Device::parse_board_name(const char *board_name_str, uint64_t &obj_id) { + MMD_DEBUG("DEBUG LOG : Parsing board name\n"); + std::string prefix(ASP_NAME); + std::string board_name(board_name_str); + + obj_id = 0; + if (board_name.length() <= prefix.length() && board_name.compare(0, prefix.length(), prefix)) { + MMD_DEBUG("DEBUG LOG : Error parsing device name '%s'\n", board_name_str); + return false; + } + + std::string device_num_str = board_name.substr(prefix.length()); + obj_id = std::stol(device_num_str, 0, 16); + + // Assume that OPAE does not use 0 as a valid object ID. This is true for now + // but relies somewhat on an implementaion dependent feature. + assert(obj_id > 0); + return true; +} + +/** initialize_asp() function is used in aocl_mmd_open() API + * It resets AFC and reinitializes DMA, Kernel Interrupts if in use + */ +bool Device::initialize_asp() { + MMD_DEBUG("DEBUG LOG : Initializing ASP ... \n"); + if (asp_initialized) { + MMD_DEBUG("DEBUG LOG : ASP already initialized \n"); + return true; + } + + fpga_result res = fpgaMapMMIO(mmio_handle, 0, NULL); + if (res != FPGA_OK) { + MMD_DEBUG("Error mapping MMIO space: %s\n", fpgaErrStr(res)); + return false; + } + mmio_is_mapped = true; + + // Trigger an user reset + uint64_t reset = 1; + fpgaWriteMMIO64(mmio_handle, 0, 0x40000, reset); + + AFU_RESET_DELAY(); + + // DMA performance is heavily dependent on the memcpy operation that transfers + // data from user allocated buffer to the pinned buffer that is used for + // DMA. On some machines with multiple NUMA nodes it is critical for + // performance that the pinned buffer is located on the NUMA node as the + // threads that performs the DMA operation. + // + // The performance also improves slighlty if the DMA threads are on the same + // NUMA node as the FPGA PCI device. + // + // This code pins memory allocation to occur from FPGA NUMA node prior to + // initializing the DMA buffers. It also pins all threads in the process + // to run on this same node. + struct bitmask *mask = NULL; + if (enable_set_numa) { + mask = numa_parse_nodestring(fpga_numa_node.c_str()); + numa_set_membind(mask); + int ret = numa_run_on_node_mask_all(mask); + if (ret < 0) { + fprintf(stderr, " Error setting NUMA node mask\n"); + } + } + + MMD_DEBUG("DEBUG LOG : Initializing HOST -> FPGA DMA channel \n"); + + mmd_dma = new intel_opae_mmd::mmd_dma(mmio_handle, mmd_handle); + if (!mmd_dma->initialized()) { + MMD_DEBUG("DEBUG LOG : Error initializing DMA channel \n"); + delete mmd_dma; + return false; + } + + // Turn off membind restriction in order to allow future allocation to + // occur on different NUMA nodes if needed. Hypothesis is that only + // the pinned buffers are performance critical for the memcpy. Other + // allocations in the process can occur on other NUMA nodes if needed. + if (enable_set_numa) { + numa_set_membind(numa_nodes_ptr); + numa_free_nodemask(mask); + } + +// Do not enable interrupt if polling mode is enabled in the DLA runtime. 
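The NUMA handling in initialize_asp() above follows a pin-then-relax pattern: memory allocation and all process threads are bound to the FPGA's NUMA node while the pinned DMA buffers are created, then the default policy is restored so later allocations may land on other nodes. A condensed sketch of that libnuma pattern (the node string is a placeholder; illustration only, not the MMD source):

```cpp
// Condensed libnuma pin-then-relax pattern (placeholder node string, e.g. "0").
#include <numa.h>
#include <cstdio>

void create_dma_buffers_on_fpga_node(const char *fpga_numa_node) {
  if (numa_available() < 0) return;  // libnuma unusable on this system

  struct bitmask *mask = numa_parse_nodestring(fpga_numa_node);
  if (!mask) return;

  numa_set_membind(mask);                     // allocations now come from this node
  if (numa_run_on_node_mask_all(mask) < 0) {  // move all threads of the process there
    std::fprintf(stderr, "Error setting NUMA node mask\n");
  }

  // ... allocate and pin the DMA buffers here ...

  numa_set_membind(numa_nodes_ptr);           // restore the "all nodes" default policy
  numa_free_nodemask(mask);
}
```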
+#ifndef COREDLA_RUNTIME_POLLING + try { + kernel_interrupt_thread = new intel_opae_mmd::KernelInterrupt(mmio_handle, mmd_handle); + } catch (const std::system_error &e) { + std::cerr << "Error initializing kernel interrupt thread: " << e.what() << e.code() << std::endl; + return false; + } catch (const std::exception &e) { + std::cerr << "Error initializing kernel interrupt thread: " << e.what() << std::endl; + return false; + } +#endif + + asp_initialized = true; + MMD_DEBUG("DEBUG LOG : ASP Initialized ! \n"); + return asp_initialized; +} + +/** Device Class Destructor implementation + * Properly releasing and free-ing memory + * part of best coding practices and help + * with stable system performance and + * helps reduce bugs + */ +Device::~Device() { + MMD_DEBUG("DEBUG LOG : Destructing Device object \n"); + int num_errors = 0; + + if (kernel_interrupt_thread != nullptr) { + delete kernel_interrupt_thread; + kernel_interrupt_thread = NULL; + } + + if (mmd_dma) { + delete mmd_dma; + mmd_dma = NULL; + } + + if (mmio_is_mapped) { + if (fpgaUnmapMMIO(mmio_handle, 0)) { + MMD_DEBUG("DEBUG LOG : fpgaUnmapMMIO failed\n"); + num_errors++; + } + } + + if (mmio_handle) { + if (fpgaClose(mmio_handle) != FPGA_OK) { + MMD_DEBUG("DEBUG LOG : fpgaClose mmio_handle failed\n"); + num_errors++; + } + } + + if (mmio_token) { + if (fpgaDestroyToken(&mmio_token) != FPGA_OK) { + MMD_DEBUG("DEBUG LOG : fpgaDestroyToken mmio_token failed\n"); + num_errors++; + } + } + + if (filter) { + if (fpgaDestroyProperties(&filter) != FPGA_OK) { + MMD_DEBUG("DEBUG LOG : fpgaDestroyProperties filter failed\n"); + num_errors++; + } + } + + if (num_errors > 0) { + MMD_DEBUG("DEBUG LOG : Error freeing resources in Device destructor\n"); + } +} + +/** asp_loaded() function which checks if asp is loaded on board + * it is used in aocl_mmd_open() API + */ +bool Device::asp_loaded() { + fpga_guid pci_guid; + fpga_guid afu_guid; + fpga_properties prop; + fpga_result res; + + if (uuid_parse(I_DK_AFU_ID, pci_guid) < 0) { + MMD_DEBUG("DEBUG LOG : Error parsing guid\n"); + return false; + } + + res = fpgaGetProperties(mmio_token, &prop); + if (res != FPGA_OK) { + MMD_DEBUG("DEBUG LOG : Error reading properties: %s \n", fpgaErrStr(res)); + fpgaDestroyProperties(&prop); + return false; + } + + if (!mmio_token) { + fpgaDestroyProperties(&prop); + MMD_DEBUG("DEBUG LOG : Error reading the mmio_token\n"); + return false; + } + + res = fpgaPropertiesGetGUID(prop, &afu_guid); + if (res != FPGA_OK) { + MMD_DEBUG("DEBUG LOG : Error reading GUID \n"); + fpgaDestroyProperties(&prop); + return false; + } + + fpgaDestroyProperties(&prop); + if (uuid_compare(pci_guid, afu_guid) == 0) { + MMD_DEBUG("DEBUG LOG : asp loaded : true \n"); + return true; + } else { + MMD_DEBUG("DEBUG LOG : asp loaded : false \n"); + return false; + } +} + +/** get_bdf() function is called + * in aocl_mmd_get_info() API + */ +std::string Device::get_bdf() { + std::ostringstream bdf; + bdf << std::setfill('0') << std::setw(2) << std::hex << unsigned(bus) << ":" << std::setfill('0') << std::setw(2) + << std::hex << unsigned(device) << "." << std::hex << unsigned(function); + + return bdf.str(); +} + +/** get_temperature() function is called + * in aocl_mmd_get_info() API + * We currently use hardcoded paths to retrieve temperature information + * We will replace with OPAE APIs in future + */ +float Device::get_temperature() { + if (std::getenv("MMD_ENABLE_DEBUG")) { + MMD_DEBUG("DEBUG LOG : Reading temperature ... 
\n"); + } + float temp = 0; + fpga_object obj; + const char *name; + name = "dfl_dev.*/spi_master/spi*/spi*.*/*-hwmon.*.auto/hwmon/hwmon*/temp1_input"; + fpga_result res; + res = fpgaTokenGetObject(fme_token, name, &obj, FPGA_OBJECT_GLOB); + if (res != FPGA_OK) { + MMD_DEBUG("DEBUG LOG : Error reading temperature monitor from BMC :"); + MMD_DEBUG(" %s \n", fpgaErrStr(res)); + temp = -999; + return temp; + } + + uint64_t value = 0; + fpgaObjectRead64(obj, &value, FPGA_OBJECT_SYNC); + fpgaDestroyObject(&obj); + temp = value / 1000; + return temp; +} + +/** set_kernel_interrupt() function is used in aocl_mmd_set_interrupt_handler() API + */ +void Device::set_kernel_interrupt(aocl_mmd_interrupt_handler_fn fn, void *user_data) { + MMD_DEBUG("DEBUG LOG : Device::set_kernel_interrupt() \n"); + if (kernel_interrupt_thread) { + kernel_interrupt_thread->set_kernel_interrupt(fn, user_data); + } +} + +/** set_kernel_interrupt() function is used in aocl_mmd_set_status_handler() API + */ +void Device::set_status_handler(aocl_mmd_status_handler_fn fn, void *user_data) { + MMD_DEBUG("DEBUG LOG : Device::set_status_handler() \n"); + event_update = fn; + event_update_user_data = user_data; +} + +/** event_update_fn() is used in read_block(), write_block(), copy_block() functions + * OPAE provides event API for handling asynchronous events sucj as errors and interrupts + * under the hood those are used + */ +void Device::event_update_fn(aocl_mmd_op_t op, int status) { + MMD_DEBUG("DEBUG LOG : Device::event_update_fn() \n"); + event_update(mmd_handle, event_update_user_data, op, status); +} + +/** read_block() is used in aocl_mmd_read() API + * as name suggests its used for fpga->host DMA and MMIO transfers + */ +int Device::read_block(aocl_mmd_op_t op, int mmd_interface, void *host_addr, size_t offset, size_t size) { + MMD_DEBUG("DEBUG LOG : Device::read_block()\n"); + int res; + + // The mmd_interface is defined as the base address of the MMIO write. Access + // to memory requires special functionality. Otherwise do direct MMIO read. + + if (mmd_interface == AOCL_MMD_MEMORY) { + std::unique_lock<std::mutex> dma_mutex_lock(m_dma_mutex); + MMD_DEBUG("DEBUG LOG : Using DMA to read block\n"); + res = mmd_dma->fpga_to_host(host_addr, (uint64_t)offset, size); + } else if (mmd_interface == AOCL_MMD_DLA_CSR) { + assert(size == 4); // DLA CSR read should be always size ==4 as of 2024.2 + MMD_DEBUG("DEBUG LOG : Using MMIO to read block in the DLA CSR space\n"); + res = read_mmio(host_addr, offset, size); + } else { + MMD_DEBUG("DEBUG LOG : Using MMIO to read block\n"); + res = read_mmio(host_addr, mmd_interface + offset, size); + + if (op) { + this->event_update_fn(op, res); + } + } + return res; +} + +/** write_block() is used in aocl_mmd_write() API + * as name suggests its used for DMA and MMIO transfers + */ +int Device::write_block(aocl_mmd_op_t op, int mmd_interface, const void *host_addr, size_t offset, size_t size) { + MMD_DEBUG("DEBUG LOG : Device::write_block()\n"); + int res; + + // The mmd_interface is defined as the base address of the MMIO write. Access + // to memory requires special functionality. 
Otherwise do direct MMIO write + if (mmd_interface == AOCL_MMD_MEMORY) { + std::unique_lock<std::mutex> dma_mutex_lock(m_dma_mutex); + MMD_DEBUG("DEBUG LOG : Using DMA to write block\n"); + res = mmd_dma->host_to_fpga(host_addr, (uint64_t)offset, size); + } else if (mmd_interface == AOCL_MMD_DLA_CSR) { + assert(size == 4); // DLA CSR read should be always size ==4 as of 2024.2 + MMD_DEBUG("DEBUG LOG : Using MMIO to read block in the DLA CSR space\n"); + res = write_mmio(host_addr, offset, size); + } else { + MMD_DEBUG("DEBUG LOG : Using MMIO to write block\n"); + res = write_mmio(host_addr, mmd_interface + offset, size); + if (op) { + this->event_update_fn(op, res); + } + } + + return res; +} + +/** read_mmio() is used in read_block() function + * it uses OPAE APIs fpgaReadMMIO64() and fpgaReadMMIO32() + */ +int Device::read_mmio(void *host_addr, size_t mmio_addr, size_t size) { + return mmd_helper::read_mmio(mmio_handle, host_addr, mmio_addr, size); +} + +/** write_mmio() is used in write_block() function + * it uses OPAE APIs fpgaWriteMMIO64() and fpgaWriteMMIO32() + */ +int Device::write_mmio(const void *host_addr, size_t mmio_addr, size_t size) { + return mmd_helper::write_mmio(mmio_handle, host_addr, mmio_addr, size); +} diff --git a/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_device.h b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_device.h new file mode 100644 index 0000000..1cded83 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_device.h @@ -0,0 +1,151 @@ +// (c) 1992-2024 Intel Corporation. +// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words +// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. +// and/or other countries. Other marks and brands may be claimed as the property +// of others. See Trademarks on intel.com for full list of Intel trademarks or +// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) +// Your use of Intel Corporation's design tools, logic functions and other +// software and tools, and its AMPP partner logic functions, and any output +// files any of the foregoing (including device programming or simulation +// files), and any associated documentation or information are expressly subject +// to the terms and conditions of the Altera Program License Subscription +// Agreement, Intel MegaCore Function License Agreement, or other applicable +// license agreement, including, without limitation, that your use is for the +// sole purpose of programming logic devices manufactured by Intel and sold by +// Intel or its authorized distributors. Please refer to the applicable +// agreement for further details. + +#ifndef MMD_DEVICE_H +#define MMD_DEVICE_H + +#include <limits.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include <string> + +#include <opae/fpga.h> + +#include <uuid/uuid.h> + +#include "aocl_mmd.h" +#include "mmd_dma.h" +#include "mmd_helper.h" + +#include "kernel_interrupt.h" + +// Tune delay for simulation or HW. Eventually delay +// should be removed for HW, may still be needed for ASE simulation +#ifdef SIM +#define DELAY_MULTIPLIER 100 +#else +#define DELAY_MULTIPLIER 1 +#endif + +// Most AOCL_MMD_CALL functions return negative number in case of error, +// MMD_AOCL_ERR is used to indicate an error from the MMD that is being +// returned to the runtime. 
Simply set to -2 for now since neither interface +// defines a meaning to return codes for errors. +#define MMD_AOCL_ERR -1 + +// NOTE: some of the code relies on invalid handle returning -1 +// future TODO eliminate dependency on specific error values +#define MMD_INVALID_PARAM -1 + +// Our diagnostic script relies on handle values < -1 to determine when +// a valid device is present but a functioning ASP is not loaded. +#define MMD_ASP_NOT_LOADED -2 +#define MMD_ASP_INIT_FAILED -3 + +// Delay settings +#define MMIO_DELAY() +#define YIELD_DELAY() usleep(1 * DELAY_MULTIPLIER) +#define OPENCL_SW_RESET_DELAY() usleep(5000 * DELAY_MULTIPLIER) +#define AFU_RESET_DELAY() usleep(20000 * DELAY_MULTIPLIER) + +#define KERNEL_SW_RESET_BASE (AOCL_MMD_KERNEL + 0x30) + +#define ASP_NAME "ofs_" + +#define SVM_MMD_MPF 0x24000 + +#define SVM_DDR_OFFSET 0x1000000000000 +#define PCI_DDR_OFFSET 0 + +enum { + // IRQ offsets no longer exist in DLA hardware (removed from board.qsys) + AOCL_IRQ_POLLING_BASE = 0x0100, // CSR to polling interrupt status + AOCL_IRQ_MASKING_BASE = 0x0108, // CSR to set/unset interrupt mask + AOCL_MMD_KERNEL = 0, + AOCL_MMD_MEMORY = 1, + AOCL_MMD_DLA_CSR = 2, +}; + +enum AfuStatu { MMD_INVALID_ID = 0, MMD_ASP, MMD_AFU }; + +class Device final { + public: + Device(uint64_t); + Device(const Device &) = delete; + Device &operator=(const Device &) = delete; + ~Device(); + + static bool parse_board_name(const char *board_name, uint64_t &obj_id); + + int get_mmd_handle() { return mmd_handle; } + uint64_t get_fpga_obj_id() { return fpga_obj_id; } + std::string get_dev_name() { return mmd_dev_name; } + std::string get_bdf(); + float get_temperature(); + + bool initialize_asp(); + void set_kernel_interrupt(aocl_mmd_interrupt_handler_fn fn, void *user_data); + void set_status_handler(aocl_mmd_status_handler_fn fn, void *user_data); + void event_update_fn(aocl_mmd_op_t op, int status); + bool asp_loaded(); + + int read_block(aocl_mmd_op_t op, int mmd_interface, void *host_addr, size_t dev_addr, size_t size); + int write_block(aocl_mmd_op_t op, int mmd_interface, const void *host_addr, size_t dev_addr, size_t size); + + private: + static int next_mmd_handle; + + int mmd_handle; + uint64_t fpga_obj_id; + std::string mmd_dev_name; + intel_opae_mmd::KernelInterrupt *kernel_interrupt_thread; + aocl_mmd_status_handler_fn event_update; + void *event_update_user_data; + + std::string fpga_numa_node; + bool enable_set_numa; + bool fme_sysfs_temp_initialized; + void initialize_fme_sysfs(); + void initialize_local_cpus_sysfs(); + bool find_dma_dfh_offsets(); + + uint8_t bus; + uint8_t device; + uint8_t function; + + bool afu_initialized; + bool asp_initialized; + bool mmio_is_mapped; + + fpga_properties filter; + fpga_token mmio_token; + fpga_handle mmio_handle; + fpga_token fme_token; + fpga_guid guid; + intel_opae_mmd::mmd_dma *mmd_dma; + std::mutex m_dma_mutex; + + // Helper functions + int read_mmio(void *host_addr, size_t dev_addr, size_t size); + int write_mmio(const void *host_addr, size_t dev_addr, size_t size); +}; + +#endif // MMD_DEVICE_H diff --git a/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_dma.cpp b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_dma.cpp new file mode 100644 index 0000000..6a4e13c --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_dma.cpp @@ -0,0 +1,573 @@ +// (c) 1992-2024 Intel Corporation. 
+// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words +// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. +// and/or other countries. Other marks and brands may be claimed as the property +// of others. See Trademarks on intel.com for full list of Intel trademarks or +// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) +// Your use of Intel Corporation's design tools, logic functions and other +// software and tools, and its AMPP partner logic functions, and any output +// files any of the foregoing (including device programming or simulation +// files), and any associated documentation or information are expressly subject +// to the terms and conditions of the Altera Program License Subscription +// Agreement, Intel MegaCore Function License Agreement, or other applicable +// license agreement, including, without limitation, that your use is for the +// sole purpose of programming logic devices manufactured by Intel and sold by +// Intel or its authorized distributors. Please refer to the applicable +// agreement for further details. + +#include <memory.h> +#include <sys/mman.h> +#include <cassert> +#include <chrono> +#include <cstdlib> +#include <cstring> +#include <iostream> +#include <unordered_map> + +#include <inttypes.h> +#include <sstream> + +#include "mmd_device.h" +#include "mmd_dma.h" +#include "mmd_helper.h" + +namespace intel_opae_mmd { + +/** mmd_dma class constructor + */ +mmd_dma::mmd_dma(fpga_handle fpga_handle_arg, int mmd_handle) : m_initialized(false), m_fpga_handle(fpga_handle_arg) { + MMD_DEBUG("DEBUG LOG : Constructing DMA \n"); + // Initialize shared buffer + auto res = fpgaPrepareBuffer(m_fpga_handle, DMA_BUFFER_SIZE, (void **)&dma_buf_ptr, &dma_buf_wsid, 0); + + assert(FPGA_OK == res && "Allocating DMA Buffer failed"); + + memset((void *)dma_buf_ptr, 0x0, DMA_BUFFER_SIZE); + + // Store virtual address of IO registers + res = fpgaGetIOAddress(m_fpga_handle, dma_buf_wsid, &dma_buf_iova); + assert(FPGA_OK == res && "getting dma DMA_BUF_IOVA failed"); + + m_initialized = true; +} + +/** mmd_dma destructor + * free-ing , releasing various resources created during object construction is a good idea + * it helps with system stability and reduces code bugs + */ +mmd_dma::~mmd_dma() { + MMD_DEBUG("DEBUG LOG : Destructing DMA \n"); + auto res = fpgaReleaseBuffer(m_fpga_handle, dma_buf_wsid); + assert(FPGA_OK == res && "Release DMA Buffer failed"); + m_initialized = false; +} + +// Called in dma_transfer() to send DMA descriptor +int mmd_dma::send_descriptor(uint64_t mmio_dst, dma_descriptor_t desc) { + // mmio requires 8 byte alignment + assert(mmio_dst % 8 == 0); + + fpgaWriteMMIO64(m_fpga_handle, 0, mmio_dst, desc.src_address); + MMD_DEBUG("Writing %lX to address %lX\n", desc.src_address, mmio_dst); + mmio_dst += 8; + fpgaWriteMMIO64(m_fpga_handle, 0, mmio_dst, desc.dest_address); + MMD_DEBUG("Writing %lX to address %lX\n", desc.dest_address, mmio_dst); + mmio_dst += 8; + fpgaWriteMMIO64(m_fpga_handle, 0, mmio_dst, desc.len); + MMD_DEBUG("Writing %X to address %lX\n", desc.len, mmio_dst); + mmio_dst += 8; + fpgaWriteMMIO64(m_fpga_handle, 0, mmio_dst, desc.control); + MMD_DEBUG("Writing %X to address %lX\n", desc.control, mmio_dst); + + return 0; +} + +// Use ASE to handle unaligned transfer and DMA to do aligned transfer. 
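+//
+// Illustrative example (exposition only, not driver logic): a 100-byte read from
+// device offset 0x1038 would be split into three pieces:
+//   1. ASE copies the 8 bytes from 0x1038 up to the 64-byte boundary at 0x1040,
+//   2. DMA moves the one fully aligned 64-byte line through the pinned buffer,
+//   3. ASE copies the remaining 28-byte tail.
+// Transfers larger than DMA_BUFFER_SIZE (16 KiB) are additionally processed in
+// DMA_BUFFER_SIZE chunks, since every DMA pass is staged through the single
+// pinned host buffer allocated in the constructor.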
+int mmd_dma::fpga_to_host(void *host_addr, uint64_t dev_src, size_t size) { + fpga_result res = FPGA_OK; + uint64_t count_left = size; + uint64_t aligned_addr = 0; + uint64_t align_bytes = 0; + uint64_t curr_dev_src = dev_src; + void *curr_host_addr = host_addr; + + if (dev_src % 64 != 0) { + // We use ASE to handle unaligned DMA transfer + MMD_DEBUG("DEBUG LOG : mmd_dma::fpga_to_host dev_src is non 64B aligned\n"); + if (count_left < 64) { + MMD_DEBUG("DEBUG LOG : mmd_dma::fpga_to_host dev_src count < 64\n"); + res = _ase_fpga_to_host(curr_dev_src, curr_host_addr, count_left); + assert(FPGA_OK == res && "_ase_fpga_to_host failed"); + return res; + } else { + aligned_addr = ((curr_dev_src / 64) + 1) * 64; + align_bytes = aligned_addr - curr_dev_src; + res = _ase_fpga_to_host(curr_dev_src, curr_host_addr, align_bytes); + assert(FPGA_OK == res && "_ase_fpga_to_host failed"); + + // Update the processed data + count_left -= align_bytes; + curr_dev_src += align_bytes; + curr_host_addr = (void *)(static_cast<char *>(curr_host_addr) + align_bytes); + } + } + + if (count_left) { + uint64_t dma_chunks = count_left / DMA_BUFFER_SIZE; + for (uint64_t i = 0; i < dma_chunks; i++) { + // constant size transfer + + uint64_t dev_dest = dma_buf_iova | DMA_HOST_MASK; + int len = ((DMA_BUFFER_SIZE - 1) / DMA_LINE_SIZE) + 1; // Ceiling of test_buffer_size / DMA_LINE_SIZE + + dma_transfer(curr_dev_src, dev_dest, len, ddr_to_host); + + // Copy data from shared buffer to host addr + memcpy(curr_host_addr, (void *)dma_buf_ptr, DMA_BUFFER_SIZE); + + memset((void *)dma_buf_ptr, 0x0, DMA_BUFFER_SIZE); + + // Update the curr source and dest + curr_host_addr = (void *)(static_cast<char *>(curr_host_addr) + DMA_BUFFER_SIZE); + curr_dev_src += DMA_BUFFER_SIZE; + } + + // Updated the count_left for the for loop + count_left -= (dma_chunks * DMA_BUFFER_SIZE); + + if (count_left) { + uint64_t dma_tx_bytes = (count_left / 64) * 64; + if (dma_tx_bytes != 0) { + assert(dma_tx_bytes <= DMA_BUFFER_SIZE && "Illegal transfer size\n"); + + uint64_t dev_dest = dma_buf_iova | DMA_HOST_MASK; + int len = ((dma_tx_bytes - 1) / DMA_LINE_SIZE) + 1; // Ceiling of test_buffer_size / DMA_LINE_SIZE + + dma_transfer(curr_dev_src, dev_dest, len, ddr_to_host); + + // Copy data from shared buffer to host addr + memcpy(curr_host_addr, (void *)dma_buf_ptr, dma_tx_bytes); + + memset((void *)dma_buf_ptr, 0x0, DMA_BUFFER_SIZE); + + // Update the address + curr_host_addr = (void *)(static_cast<char *>(curr_host_addr) + dma_tx_bytes); + curr_dev_src += dma_tx_bytes; + count_left -= dma_tx_bytes; + } + if (count_left) { + MMD_DEBUG("DEBUG LOG : mmd_dma::fpga_to_host count_left after DMA transfer is "); + MMD_DEBUG("%" PRIu64 "\n", count_left); + // Handle the rest unaligned transfer using ASE + res = _ase_fpga_to_host(curr_dev_src, curr_host_addr, count_left); + if (FPGA_OK != res) { + MMD_DEBUG("DEBUG LOG : mmd_dma::_ase_fpga_to_host failed\n"); + return -1; + } + count_left = 0; + + // No need to update address as the transaction is done. + } + } + } + assert(count_left==0 && "fpga_to_host failed"); + return 0; +} + +// Use ASE to handle unaligned transfer and DMA to do aligned transfer. 
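+//
+// Note on the write path: each aligned chunk is first memcpy'd from the caller's
+// buffer into the pinned buffer from fpgaPrepareBuffer(), and the descriptor's
+// source address is that buffer's IO virtual address (dma_buf_iova) with
+// DMA_HOST_MASK set (which, judging by the name, marks it as a host-side
+// address); unaligned head and tail bytes go through ASE as in fpga_to_host().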
+int mmd_dma::host_to_fpga(const void *host_addr, uint64_t dev_dest, size_t size) { + fpga_result res = FPGA_OK; + uint64_t count_left = size; + uint64_t aligned_addr = 0; + uint64_t align_bytes = 0; + uint64_t curr_dest = dev_dest; + const void *curr_host_addr = host_addr; + + if (dev_dest % 64 != 0) { + // We use ASE to handle unaligned DMA transfer + MMD_DEBUG("DEBUG LOG : mmd_dma::host_to_fpga dev_dest is non 64B aligned\n"); + if (count_left < 64) { + res = _ase_host_to_fpga(dev_dest, host_addr, count_left); + assert(FPGA_OK == res && "_ase_host_to_fpga failed"); + return res; + } else { + aligned_addr = ((dev_dest / 64) + 1) * 64; + align_bytes = aligned_addr - dev_dest; + res = _ase_host_to_fpga(dev_dest, host_addr, align_bytes); + assert(FPGA_OK == res && "_ase_host_to_fpga failed"); + + // Update the processed data + count_left -= align_bytes; + curr_dest += align_bytes; + curr_host_addr = (const void *)(static_cast<const char *>(curr_host_addr) + align_bytes); + } + } + + if (count_left) { + uint64_t dma_chunks = count_left / DMA_BUFFER_SIZE; + for (uint64_t i = 0; i < dma_chunks; i++) { + // constant size transfer + // Copy host_src value to the shared buffer + memcpy((void *)dma_buf_ptr, curr_host_addr, DMA_BUFFER_SIZE); + uint64_t dev_src = dma_buf_iova | DMA_HOST_MASK; + + int len = ((DMA_BUFFER_SIZE - 1) / DMA_LINE_SIZE) + 1; // Ceiling of test_buffer_size / DMA_LINE_SIZE + + dma_transfer(dev_src, curr_dest, len, host_to_ddr); + + memset((void *)dma_buf_ptr, 0x0, DMA_BUFFER_SIZE); + + // Update the curr source and dest + curr_host_addr = (const void *)(static_cast<const char *>(curr_host_addr) + DMA_BUFFER_SIZE); + curr_dest += DMA_BUFFER_SIZE; + } + + // Updated the count_left for the for loop + count_left -= (dma_chunks * DMA_BUFFER_SIZE); + + if (count_left) { + uint64_t dma_tx_bytes = (count_left / 64) * 64; + if (dma_tx_bytes != 0) { + assert(dma_tx_bytes <= DMA_BUFFER_SIZE && "Illegal transfer size\n"); + + // Copy host_src value to the shared buffer + memcpy((void *)dma_buf_ptr, curr_host_addr, dma_tx_bytes); + uint64_t dev_src = dma_buf_iova | DMA_HOST_MASK; + + int len = ((dma_tx_bytes - 1) / DMA_LINE_SIZE) + 1; // Ceiling of dma_tx_bytes / DMA_LINE_SIZE + dma_transfer(dev_src, curr_dest, len, host_to_ddr); + + memset((void *)dma_buf_ptr, 0x0, DMA_BUFFER_SIZE); + } + + // Update the address + curr_host_addr = (const void *)(static_cast<const char *>(curr_host_addr) + dma_tx_bytes); + curr_dest += dma_tx_bytes; + count_left -= dma_tx_bytes; + + if (count_left) { + MMD_DEBUG("DEBUG LOG : mmd_dma::host_to_fpga count_left after DMA transfer is "); + MMD_DEBUG("%" PRIu64 "\n", count_left); + // Handle the rest unaligned transfer using ASE + res = _ase_host_to_fpga(curr_dest, curr_host_addr, count_left); + assert(FPGA_OK == res && "_ase_host_to_fpga failed"); + count_left = 0; + } + } + } + assert(count_left==0 && "host_to_fpga failed"); + return 0; +} + +int mmd_dma::dma_transfer(uint64_t dev_src, uint64_t dev_dest, int len, dma_mode descriptor_mode) { + + // Get debug information for thread id + std::stringstream ss; + ss << std::this_thread::get_id(); + uint64_t id = std::stoull(ss.str()); + MMD_DEBUG("dma_transfer start current thread_id is %04lX\n", id); + + // Native DMA transfer requires 64 byte alignment + assert(dev_src % 64 == 0); + assert(dev_dest % 64 == 0); + + const uint64_t MASK_FOR_35BIT_ADDR = 0x7FFFFFFFF; + + dma_descriptor_t desc; + + MMD_DEBUG("DEBUG LOG : mmd_dma::dma_transfer starts\n"); + MMD_DEBUG("DEBUG LOG dev_dest = %04lX\n", dev_dest); + + 
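+  // Descriptor fields as used below: src/dest carry the low 35 bits of the
+  // source and destination addresses, len is the transfer length in
+  // DMA_LINE_SIZE (64-byte) lines, and control sets bit 31 (presumably the
+  // "go" bit) together with the direction encoded at MODE_SHIFT.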
desc.src_address = dev_src & MASK_FOR_35BIT_ADDR; + desc.dest_address = dev_dest & MASK_FOR_35BIT_ADDR; + desc.len = len; + desc.control = 0x80000000 | (descriptor_mode << MODE_SHIFT); + + const uint64_t DMA_DESC_BASE = 8 * DMA_CSR_IDX_SRC_ADDR; + const uint64_t DMA_STATUS_BASE = 8 * DMA_CSR_IDX_STATUS; + uint64_t mmio_data = 0; + + int desc_size = sizeof(desc); + + MMD_DEBUG("Descriptor size = %d\n", desc_size); + MMD_DEBUG("desc.src_address = %04lX\n", desc.src_address); + MMD_DEBUG("desc.dest_address = %04lX\n", desc.dest_address); + MMD_DEBUG("desc.len = %d\n", desc.len); + MMD_DEBUG("desc.control = %04X\n", desc.control); + MMD_DEBUG("descriptor_mode = %04X\n", descriptor_mode); + + // send descriptor + send_descriptor(DMA_DESC_BASE, desc); + + fpga_result r; + r = fpgaReadMMIO64(m_fpga_handle, 0, DMA_STATUS_BASE, &mmio_data); + MMD_DEBUG("DMA_STATUS_BASE before = %04lX\n", mmio_data); + if (FPGA_OK != r) return -1; + + // If the busy bit is empty, then we are done. + while ((mmio_data & 0x1) == 0x1) { + r = fpgaReadMMIO64(m_fpga_handle, 0, DMA_STATUS_BASE, &mmio_data); + assert(FPGA_OK == r); + } + MMD_DEBUG("dma_transfer end current thread_id is %04lX\n", id); + return 0; +} + +// Transfer "count" bytes from HOST to FPGA using Address span expander(ASE)- will internally make +// calls to handle unaligned and aligned MMIO writes. +fpga_result mmd_dma::_ase_host_to_fpga(uint64_t dev_dest, const void *src_ptr, uint64_t count) { + MMD_DEBUG("DEBUG LOG: _ase_host_to_fpga is being called\n "); + + MMD_DEBUG("DEBUG LOG : dev_dest is "); + MMD_DEBUG("%" PRIu64 "\n", dev_dest); + + assert(count < 64); // DLA only uses ASE transfer with less than 64 Byte transfer. + + fpga_result res = FPGA_OK; + uint64_t count_left = count; + uint64_t unaligned_size = 0; + + // For ASE window + uint64_t ase_window; + uint64_t ase_addr; + uint64_t dev_addr; + + const void *curr_src_ptr = src_ptr; + + if (count == 0) return res; + + if (dev_dest % 8 == 0) { + while (count > 0) { + ase_window = dev_dest & ~(0xfff); + ase_addr = (dev_dest & 0xfff); // only keep the lower 12 bits. + + uint64_t mmio_base_control = ASE_MMIO_BASE + ASE_MMIO_CTRL; + + MMD_DEBUG("DEBUG LOG : ase_window is "); + MMD_DEBUG("%" PRIu64 "\n", ase_window); + + // Write to ASE control + res = fpgaWriteMMIO64(m_fpga_handle, 0, mmio_base_control, ase_window); + assert(res == FPGA_OK && "Write to ASE control failed"); + + // Set final dev_addr + // dev_addr will be 8 byte aligned as long as dev_dest is 8 byte aligned. + dev_addr = ASE_MMIO_BASE + ASE_MMIO_WINDOW + ase_addr; + + assert(dev_addr % 8 == 0); + + MMD_DEBUG("DEBUG LOG : _ase_host_to_fpga count is "); + MMD_DEBUG("%" PRIu64 "\n", count); + + MMD_DEBUG("DEBUG LOG : dev addr is "); + MMD_DEBUG("%" PRIu64 "\n", dev_addr); + + size_t size = (count > 8) ? 8 : count; + mmd_helper::write_mmio(m_fpga_handle, curr_src_ptr, dev_addr, size); + + count -= size; + dev_dest += size; + curr_src_ptr = (const void *)(static_cast<const char *>(curr_src_ptr) + size); + } + + assert(count == 0); + + } else { + // First we need to handle the non byte aligned transfer + + MMD_DEBUG("DEBUG LOG : _ase_host_to_fpga count is "); + MMD_DEBUG("%" PRIu64 "\n", count); + + // Aligns address to 8 byte using dst masking method + unaligned_size = 8 - (dev_dest % 8); + if (unaligned_size > count_left) unaligned_size = count_left; + + // Write to the unaligned address + assert(unaligned_size < 8); + uint64_t shift = dev_dest % 8; + + // Write to ASE control to switch page. 
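+    // The address span expander exposes device memory through a 4 KiB MMIO
+    // window: the page (addr & ~0xfff) is written to ASE_MMIO_CTRL to select
+    // it, and the low 12 bits index into the window at
+    // ASE_MMIO_BASE + ASE_MMIO_WINDOW.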
+ ase_window = dev_dest & ~(0xfff); + + MMD_DEBUG("DEBUG LOG : ase_window in non-aligned is "); + MMD_DEBUG("%" PRIu64 "\n", ase_window); + + fpgaWriteMMIO64(m_fpga_handle, 0, ASE_MMIO_BASE + ASE_MMIO_CTRL, ase_window); + + // Get aligned dest address + uint64_t dev_aligned_addr = dev_dest - shift; + assert(dev_aligned_addr % 8 == 0); + + // read data from device memory with aligned dev dest + ase_addr = (dev_aligned_addr & 0xfff); + dev_addr = ASE_MMIO_BASE + ASE_MMIO_WINDOW + ase_addr; + uint64_t read_tmp = 0; + fpgaReadMMIO64(m_fpga_handle, 0, dev_addr, &read_tmp); + + // overlay our data, check if the shift is correct here + memcpy((reinterpret_cast<char *>(&read_tmp) + shift), src_ptr, unaligned_size); + + // Write back data to the device + fpgaWriteMMIO64(m_fpga_handle, 0, dev_addr, read_tmp); + + count_left -= unaligned_size; + + // Check if there is any byte left + if (count_left == 0) { + return res; + } + + // Now the dest address should be byte aligned now + // Start the regular ASE transfer + + const void *curr_src_ptr = (const void *)(static_cast<const char *>(src_ptr) + unaligned_size); + uint64_t next_dev_dest = dev_dest + unaligned_size; + + while (count_left > 0) { + ase_window = next_dev_dest & ~(0xfff); + ase_addr = (next_dev_dest & 0xfff); // only keep the lower 12 bits. + + MMD_DEBUG("DEBUG LOG : ase_window in non-aligned loop is "); + MMD_DEBUG("%" PRIu64 "\n", ase_window); + + // Write to ASE control + fpgaWriteMMIO64(m_fpga_handle, 0, ASE_MMIO_BASE + ASE_MMIO_CTRL, ase_window); + + // Set final dev_addr + dev_addr = ASE_MMIO_BASE + ASE_MMIO_WINDOW + ase_addr; + + assert(dev_addr % 8 == 0); + + size_t size = (count_left > 8) ? 8 : count_left; + mmd_helper::write_mmio(m_fpga_handle, + curr_src_ptr, + dev_addr, + size); + + count_left -= size; + next_dev_dest += size; + curr_src_ptr = (const void *)(static_cast<const char *>(curr_src_ptr) + size); + } + assert(count_left == 0); + } + + return FPGA_OK; +} + +// Transfer "count" bytes from FPGA to HOST using Address span expander(ASE)- will internally make +// calls to handle unaligned and aligned MMIO reads. +fpga_result mmd_dma::_ase_fpga_to_host(uint64_t dev_dest, void *host_ptr, uint64_t count) { + MMD_DEBUG("DEBUG LOG : _ase_fpga_to_host is being called\n "); + + assert(count < 64); + + fpga_result res = FPGA_OK; + uint64_t count_left = count; + uint64_t unaligned_size = 0; + + // For ASE window + + uint64_t ase_window; + uint64_t ase_addr; + uint64_t dev_addr; + + if (count == 0) return res; + + void *curr_host_ptr = host_ptr; + + if (dev_dest % 8 == 0) { + while (count > 0) { + ase_window = dev_dest & ~(0xfff); + ase_addr = (dev_dest & 0xfff); // only keep the lower 12 bits. + + MMD_DEBUG("DEBUG LOG : ase_window is "); + MMD_DEBUG("%" PRIu64 "\n", ase_window); + + // Write to ASE control to switch page. + fpgaWriteMMIO64(m_fpga_handle, 0, ASE_MMIO_BASE + ASE_MMIO_CTRL, ase_window); + + // Set final dev_addr + // dev_addr will be 8 byte aligned as long as dev_dest is 8 byte aligned. + dev_addr = ASE_MMIO_BASE + ASE_MMIO_WINDOW + ase_addr; + + assert(dev_addr % 8 == 0); + + size_t size = (count > 8) ? 
8 : count; + + mmd_helper::read_mmio(m_fpga_handle, curr_host_ptr, dev_addr, size); + + count -= size; + dev_dest += size; + curr_host_ptr = (void *)(static_cast<char *>(curr_host_ptr) + size); + } + + } else { + // First we need to handle the non byte aligned transfer + + // Aligns address to 8 byte using dst masking method + unaligned_size = 8 - (dev_dest % 8); + if (unaligned_size > count_left) unaligned_size = count_left; + + // Write to the unaligned address + assert(unaligned_size < 8); + uint64_t shift = dev_dest % 8; + + // Write to ASE control to switch page. + ase_window = dev_dest & ~(0xfff); + + MMD_DEBUG("DEBUG LOG : ase_window is "); + MMD_DEBUG("%" PRIu64 "\n", ase_window); + + fpgaWriteMMIO64(m_fpga_handle, 0, ASE_MMIO_BASE + ASE_MMIO_CTRL, ase_window); + + // Get aligned dest address + uint64_t dev_aligned_addr = dev_dest - shift; + assert(dev_aligned_addr % 8 == 0); + + // read data from device memory with aligned dev dest + ase_addr = (dev_aligned_addr & 0xfff); + dev_addr = ASE_MMIO_BASE + ASE_MMIO_WINDOW + ase_addr; + + uint64_t read_tmp = 0; + fpgaReadMMIO64(m_fpga_handle, 0, dev_addr, &read_tmp); + + // overlay our data + memcpy(host_ptr, (reinterpret_cast<char *>(&read_tmp) + shift), unaligned_size); + + count_left -= unaligned_size; + + // Check if there is any byte left + if (count_left == 0) { + return res; + } + + // Now the dest address should be byte aligned now + // Start the regular ASE transfer + curr_host_ptr = (void *)(static_cast<char *>(host_ptr) + unaligned_size); + uint64_t next_dev_dest = dev_dest + unaligned_size; + + while (count_left > 0) { + ase_window = next_dev_dest & ~(0xfff); + ase_addr = (next_dev_dest & 0xfff); // only keep the lower 12 bits. + + // Write to ASE control to switch page. + fpgaWriteMMIO64(m_fpga_handle, 0, ASE_MMIO_BASE + ASE_MMIO_CTRL, ase_window); + + // Set final dev_addr + dev_addr = ASE_MMIO_BASE + ASE_MMIO_WINDOW + ase_addr; + + assert(dev_addr % 8 == 0); + + size_t size = (count_left > 8) ? 8 : count_left; + mmd_helper::read_mmio(m_fpga_handle, curr_host_ptr, dev_addr, size); + + count_left -= size; + next_dev_dest += size; + curr_host_ptr = (void *)(static_cast<char *>(curr_host_ptr) + size); + } + + assert(count_left == 0); + } + return FPGA_OK; +} +} // namespace intel_opae_mmd diff --git a/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_dma.h b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_dma.h new file mode 100644 index 0000000..a2841b1 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_dma.h @@ -0,0 +1,89 @@ +// (c) 1992-2024 Intel Corporation. +// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words +// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. +// and/or other countries. Other marks and brands may be claimed as the property +// of others. 
See Trademarks on intel.com for full list of Intel trademarks or +// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) +// Your use of Intel Corporation's design tools, logic functions and other +// software and tools, and its AMPP partner logic functions, and any output +// files any of the foregoing (including device programming or simulation +// files), and any associated documentation or information are expressly subject +// to the terms and conditions of the Altera Program License Subscription +// Agreement, Intel MegaCore Function License Agreement, or other applicable +// license agreement, including, without limitation, that your use is for the +// sole purpose of programming logic devices manufactured by Intel and sold by +// Intel or its authorized distributors. Please refer to the applicable +// agreement for further details. +#ifndef MMD_DMA_H_ +#define MMD_DMA_H_ + +#include <opae/fpga.h> +#include <poll.h> + +#include <atomic> +#include <chrono> +#include <condition_variable> +#include <mutex> +#include <queue> +#include <thread> +#include <unordered_map> + +#include "aocl_mmd.h" +#include "mmd_helper.h" + +#define DMA_CSR_IDX_SRC_ADDR 0x5 +#define DMA_CSR_IDX_STATUS 0x9 +#define MODE_SHIFT 26 +// For now limits to 16K to avoid DMA transfer hang in hw, further testing required to increase the value. +#define DMA_BUFFER_SIZE (1024 * 16) +#define DMA_LINE_SIZE 64 +#define DMA_HOST_MASK 0x2000000000000 + +#define ASE_MMIO_BASE 0x20000 +#define ASE_MMIO_CTRL 0x200 +#define ASE_MMIO_WINDOW 0x1000 + +namespace intel_opae_mmd { + +enum dma_mode { stand_by = 0x0, host_to_ddr = 0x1, ddr_to_host = 0x2, ddr_to_ddr = 0x3 }; + +struct dma_descriptor_t { + uint64_t src_address; + uint64_t dest_address; + uint32_t len; + uint32_t control; +}; + +class mmd_dma final { + public: + mmd_dma(fpga_handle fpga_handle_arg, int mmd_handle); + ~mmd_dma(); + + bool initialized() { return m_initialized; } + + int fpga_to_host(void *host_addr, uint64_t dev_src, size_t size); + int host_to_fpga(const void *host_addr, uint64_t dev_dest, size_t size); + int dma_transfer(uint64_t dev_src, uint64_t dev_dest, int len, dma_mode descriptor_mode); + fpga_result _ase_host_to_fpga(uint64_t dev_dest, const void *src_ptr, uint64_t count); + fpga_result _ase_fpga_to_host(uint64_t dev_dest, void *host_ptr, uint64_t count); + mmd_dma(mmd_dma &other) = delete; + mmd_dma &operator=(const mmd_dma &other) = delete; + + private: + // Helper functions + int send_descriptor(uint64_t mmio_dst, dma_descriptor_t desc); + // Member variables + bool m_initialized; + fpga_handle m_fpga_handle; + + // Shared buffer in host memory + uint64_t *dma_buf_ptr = NULL; + // Workspace ID used by OPAE to identify buffer + uint64_t dma_buf_wsid; + // IO virtual address + uint64_t dma_buf_iova; +}; + +}; // namespace intel_opae_mmd + +#endif // MMD_DMA_H_ diff --git a/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_helper.cpp b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_helper.cpp new file mode 100644 index 0000000..4af482a --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_helper.cpp @@ -0,0 +1,163 @@ +// (c) 1992-2024 Intel Corporation. +// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words +// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. +// and/or other countries. Other marks and brands may be claimed as the property +// of others. 
See Trademarks on intel.com for full list of Intel trademarks or +// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) +// Your use of Intel Corporation's design tools, logic functions and other +// software and tools, and its AMPP partner logic functions, and any output +// files any of the foregoing (including device programming or simulation +// files), and any associated documentation or information are expressly subject +// to the terms and conditions of the Altera Program License Subscription +// Agreement, Intel MegaCore Function License Agreement, or other applicable +// license agreement, including, without limitation, that your use is for the +// sole purpose of programming logic devices manufactured by Intel and sold by +// Intel or its authorized distributors. Please refer to the applicable +// agreement for further details. + +#include "mmd_helper.h" +#include <inttypes.h> + +namespace mmd_helper { + +int read_mmio(fpga_handle mmio_handle, void *host_addr, size_t mmio_addr, size_t size) { + fpga_result res = FPGA_OK; + + MMD_DEBUG("DEBUG LOG : Device::read_mmio start: host_addr : %p\t mmio_addr : 0x%zx\t size : 0x%zx\n", + host_addr, + mmio_addr, + size); + + if (mmio_addr % 4 != 0) { + MMD_DEBUG("DEBUG LOG : ead_mmio function doesn't support non 4 Byte aligned mmio_addr due to OPAE\n"); + return -1; + } + + uint64_t *host_addr64 = static_cast<uint64_t *>(host_addr); + + while (size >= 8) { + MMD_DEBUG("DEBUG LOG : Using fpgaReadMMIO64() host_addr : %p\t mmio_addr : 0x%zx\t size : 0x8\n", + host_addr64, + mmio_addr); + res = fpgaReadMMIO64(mmio_handle, 0, mmio_addr, host_addr64); + if (res != FPGA_OK) { + MMD_DEBUG( + "DEBUG LOG : Error in read_mmio() host_addr : %p\t mmio_addr : 0x%zx\t size : 0x8\n", host_addr64, mmio_addr); + return -1; + } + MMD_DEBUG("DEBUG LOG : the host_addr64 value is "); + MMD_DEBUG("%" PRIu64 "\n", *host_addr64); + host_addr64 += 1; + mmio_addr += 8; + size -= 8; + } + + uint32_t *host_addr32 = reinterpret_cast<uint32_t *>(host_addr64); + while (size >= 4) { + MMD_DEBUG("DEBUG LOG : Using fpgaReadMMIO32() host_addr : %p\t mmio_addr : 0x%zx\t size : 0x4\n", + host_addr32, + mmio_addr); + res = fpgaReadMMIO32(mmio_handle, 0, mmio_addr, host_addr32); + if (res != FPGA_OK) { + MMD_DEBUG( + "DEBUG LOG : Error in read_mmio() host_addr : %p\t mmio_addr : 0x%zx\t size : 0x4\n", host_addr32, mmio_addr); + return -1; + } + host_addr32 += 1; + mmio_addr += 4; + size -= 4; + } + + if (size > 0) { + uint32_t read_data; + MMD_DEBUG("DEBUG LOG : Using fpgaReadMMIO32() host_addr : %p\t mmio_addr : 0x%zx\t size : 0x%zx\n", + host_addr, + mmio_addr, + size); + res = fpgaReadMMIO32(mmio_handle, 0, mmio_addr, &read_data); + if (res != FPGA_OK) { + MMD_DEBUG("DEBUG LOG : Error in read_mmio() host_addr : %p\t mmio_addr : 0x%zx\t size : 0x%zx\n", + host_addr, + mmio_addr, + size); + MMD_DEBUG("result is %d \n", res); + return -1; + } + + memcpy(host_addr32, &read_data, size); + } + + return res; +} + +int write_mmio(fpga_handle mmio_handle, const void *host_addr, size_t mmio_addr, size_t size) { + fpga_result res = FPGA_OK; + + MMD_DEBUG("DEBUG LOG : Device::write_mmio start: host_addr : %p\t mmio_addr : 0x%zx\t size : 0x%zx\n", + host_addr, + mmio_addr, + size); + + const uint64_t *host_addr64 = static_cast<const uint64_t *>(host_addr); + while (size >= 8) { + MMD_DEBUG("DEBUG LOG : Using fpgaWriteMMIO64() host_addr : %p\t mmio_addr : 0x%zx\t \n", + host_addr64, + mmio_addr); + res = fpgaWriteMMIO64(mmio_handle, 0, mmio_addr, 
*host_addr64); + if (res != FPGA_OK) { + MMD_DEBUG("DEBUG LOG : Error in write_mmio() host_addr : %p\t mmio_addr : 0x%zx\t \n", + host_addr64, + mmio_addr); + return -1; + } + host_addr64 += 1; + mmio_addr += 8; + size -= 8; + } + + const uint32_t *host_addr32 = reinterpret_cast<const uint32_t *>(host_addr64); + + while (size >= 4) { + MMD_DEBUG("DEBUG LOG : Using fpgaWriteMMIO32() host_addr : %p\t mmio_addr : 0x%zx\t \n", + host_addr32, + mmio_addr); + res = fpgaWriteMMIO32(mmio_handle, 0, mmio_addr, *host_addr32); + if (res != FPGA_OK) { + MMD_DEBUG("DEBUG LOG : Error in write_mmio() host_addr : %p\t mmio_addr : 0x%zx\t\n", + host_addr32, + mmio_addr); + return -1; + } + host_addr32 += 1; + mmio_addr += 4; + size -= 4; + } + + while (size > 0) { + MMD_DEBUG("DEBUG LOG : Using fpgaWriteMMIO32() host_addr : %p\t mmio_addr : 0x%zx\t size : 0x%zx\n", + host_addr32, + mmio_addr, + size); + uint32_t tmp_data32 = 0; + fpgaReadMMIO32(mmio_handle, 0, mmio_addr, &tmp_data32); // First read the data back + size_t chunk_size = (size >= 4) ? 4 : size; + + memcpy(&tmp_data32, host_addr32, chunk_size); // Apply our data overlay + + res = fpgaWriteMMIO32(mmio_handle, 0, mmio_addr, tmp_data32); + if (res != FPGA_OK) { + MMD_DEBUG("DEBUG LOG : Error in write_mmio() host_addr : %p\t mmio_addr : 0x%zx\t size : 0x%zx\n", + host_addr32, + mmio_addr, + size); + return -1; + } + host_addr32 += 1; + mmio_addr += chunk_size; + size -= chunk_size; + } + + return 0; +} + +}; // namespace mmd_helper diff --git a/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_helper.h b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_helper.h new file mode 100644 index 0000000..b7e2667 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_helper.h @@ -0,0 +1,41 @@ +// (c) 1992-2024 Intel Corporation. +// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words +// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. +// and/or other countries. Other marks and brands may be claimed as the property +// of others. See Trademarks on intel.com for full list of Intel trademarks or +// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) +// Your use of Intel Corporation's design tools, logic functions and other +// software and tools, and its AMPP partner logic functions, and any output +// files any of the foregoing (including device programming or simulation +// files), and any associated documentation or information are expressly subject +// to the terms and conditions of the Altera Program License Subscription +// Agreement, Intel MegaCore Function License Agreement, or other applicable +// license agreement, including, without limitation, that your use is for the +// sole purpose of programming logic devices manufactured by Intel and sold by +// Intel or its authorized distributors. Please refer to the applicable +// agreement for further details. + +#ifndef MMD_HELPER_H +#define MMD_HELPER_H + +#include <opae/fpga.h> +#include <stdarg.h> + +inline void MMD_DEBUG(const char *format, ...) 
{ + if (std::getenv("MMD_ENABLE_DEBUG")) { + va_list arglist; + va_start(arglist, format); + vprintf(format, arglist); + va_end(arglist); + fflush(stdout); + } +} + +namespace mmd_helper { + +int read_mmio(fpga_handle mmio_handle, void *host_addr, size_t mmio_addr, size_t size); +int write_mmio(fpga_handle mmio_handle, const void *host_addr, size_t mmio_addr, size_t size); + +}; // namespace mmd_helper + +#endif // MMD_HELPER_H diff --git a/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/include/aocl_mmd.h b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/include/aocl_mmd.h new file mode 100644 index 0000000..16992da --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/include/aocl_mmd.h @@ -0,0 +1,377 @@ +// Copyright 2022 Intel Corporation +// SPDX-License-Identifier: MIT + +#ifndef AOCL_MMD_H +#define AOCL_MMD_H + +/* TODO: this file comes from OpenCL SDK and should be formatted there first */ +/* clang-format off */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* Support for memory mapped ACL devices. + * + * Typical API lifecycle, from the perspective of the caller. + * + * 1. aocl_mmd_open must be called first, to provide a handle for further + * operations. + * + * 2. The interrupt and status handlers must be set. + * + * 3. Read and write operations are performed. + * + * 4. aocl_mmd_close may be called to shut down the device. No further + * operations are permitted until a subsequent aocl_mmd_open call. + * + * aocl_mmd_get_offline_info can be called anytime including before + * open. aocl_mmd_get_info can be called anytime between open and close. + */ + +// #ifndef AOCL_MMD_CALL +// #if defined(_WIN32) +// #define AOCL_MMD_CALL __declspec(dllimport) +// #else +// #define AOCL_MMD_CALL +// #endif +// #endif + +#ifndef AOCL_MMD_CALL +#if defined(_WIN32) +#define AOCL_MMD_CALL __declspec(dllimport) +#else +#define AOCL_MMD_CALL __attribute__((visibility ("default"))) +#endif +#endif + +#ifndef WEAK +#if defined(_WIN32) +#define WEAK +#else +#define WEAK __attribute__((weak)) +#endif +#endif + +#ifdef __cplusplus +#include <cstddef> //size_t +#else +#include <stddef.h> //size_t +#endif + +/* The MMD API's version - the runtime expects this string when + * AOCL_MMD_VERSION is queried. This changes only if the API has changed */ +#define AOCL_MMD_VERSION_STRING "20.3" + +/* Memory types that can be supported - bitfield. Other than physical memory + * these types closely align with the OpenCL SVM types. + * + * AOCL_MMD_PHYSICAL_MEMORY - The vendor interface includes IP to communicate + * directly with physical memory such as DDR, QDR, etc. + * + * AOCL_MMD_SVM_COARSE_GRAIN_BUFFER - The vendor interface includes support for + * caching SVM pointer data and requires explicit function calls from the user + * to synchronize the cache between the host processor and the FPGA. This level + * of SVM is not currently supported by Altera except as a subset of + * SVM_FINE_GAIN_SYSTEM support. + * + * AOCL_MMD_SVM_FINE_GRAIN_BUFFER - The vendor interface includes support for + * caching SVM pointer data and requires additional information from the user + * and/or host runtime that can be collected during pointer allocation in order + * to synchronize the cache between the host processor and the FPGA. Once this + * additional data is provided for an SVM pointer, the vendor interface handles + * cache synchronization between the host processor & the FPGA automatically. 
+ * This level of SVM is not currently supported by Altera except as a subset + * of SVM_FINE_GRAIN_SYSTEM support. + * + * AOCL_MMD_SVM_FINE_GRAIN_SYSTEM - The vendor interface includes support for + * caching SVM pointer data and does not require any additional information to + * synchronize the cache between the host processor and the FPGA. The vendor + * interface handles cache synchronization between the host processor & the + * FPGA automatically for all SVM pointers. This level of SVM support is + * currently under development by Altera and some features may not be fully + * supported. + */ +#define AOCL_MMD_PHYSICAL_MEMORY (1 << 0) +#define AOCL_MMD_SVM_COARSE_GRAIN_BUFFER (1 << 1) +#define AOCL_MMD_SVM_FINE_GRAIN_BUFFER (1 << 2) +#define AOCL_MMD_SVM_FINE_GRAIN_SYSTEM (1 << 3) + +/* program modes - bitfield + * + * AOCL_MMD_PROGRAM_PRESERVE_GLOBAL_MEM - preserve contents of global memory + * when this bit is set to 1. If programming can't occur without preserving + * global memory contents, the program function must fail, in which case the + * runtime may re-invoke program with this bit set to 0, allowing programming + * to occur even if doing so destroys global memory contents. + * + * more modes are reserved for stacking on in the future + */ +#define AOCL_MMD_PROGRAM_PRESERVE_GLOBAL_MEM (1 << 0) +typedef int aocl_mmd_program_mode_t; + + +typedef void* aocl_mmd_op_t; + +typedef struct { + unsigned lo; /* 32 least significant bits of time value. */ + unsigned hi; /* 32 most significant bits of time value. */ +} aocl_mmd_timestamp_t; + + +/* Defines the set of characteristics that can be probed about the board before + * opening a device. The type of data returned by each is specified in + * parentheses in the adjacent comment. + * + * AOCL_MMD_NUM_BOARDS and AOCL_MMD_BOARD_NAMES + * These two fields can be used to implement multi-device support. The MMD + * layer may have a list of devices it is capable of interacting with, each + * identified with a unique name. The length of the list should be returned + * in AOCL_MMD_NUM_BOARDS, and the names of these devices returned in + * AOCL_MMD_BOARD_NAMES. The OpenCL runtime will try to call aocl_mmd_open + * for each board name returned in AOCL_MMD_BOARD_NAMES. + */ +typedef enum { + AOCL_MMD_VERSION = 0, /* Version of MMD (char*)*/ + AOCL_MMD_NUM_BOARDS = 1, /* Number of candidate boards (int)*/ + AOCL_MMD_BOARD_NAMES = 2, /* Names of boards available delimiter=; (char*)*/ + AOCL_MMD_VENDOR_NAME = 3, /* Name of vendor (char*) */ + AOCL_MMD_VENDOR_ID = 4, /* An integer ID for the vendor (int) */ + AOCL_MMD_USES_YIELD = 5, /* 1 if yield must be called to poll hw (int) */ + /* The following can be combined in a bit field: + * AOCL_MMD_PHYSICAL_MEMORY, AOCL_MMD_SVM_COARSE_GRAIN_BUFFER, AOCL_MMD_SVM_FINE_GRAIN_BUFFER, AOCL_MMD_SVM_FINE_GRAIN_SYSTEM. + * Prior to 14.1, all existing devices supported physical memory and no types of SVM memory, so this + * is the default when this operation returns '0' for board MMDs with a version prior to 14.1 + */ + AOCL_MMD_MEM_TYPES_SUPPORTED = 6, +} aocl_mmd_offline_info_t; + + +/** Possible capabilities to return from AOCL_MMD_*_MEM_CAPABILITIES query */ +/** + * If not set allocation function is not supported, even if other capabilities are set. + */ +#define AOCL_MMD_MEM_CAPABILITY_SUPPORTED (1 << 0) +/** + * Supports atomic access to the memory by either the host or device. 
+ */ +#define AOCL_MMD_MEM_CAPABILITY_ATOMIC (1 << 1) +/** + * Supports concurrent access to the memory either by host or device if the + * accesses are not on the same block. Block granularity is defined by + * AOCL_MMD_*_MEM_CONCURRENT_GRANULARITY., blocks are aligned to this + * granularity + */ +#define AOCL_MMD_MEM_CAPABILITY_CONCURRENT (1 << 2) +/** + * Memory can be accessed by multiple devices at the same time. + */ +#define AOCL_MMD_MEM_CAPABILITY_P2P (1 << 3) + + +/* Defines the set of characteristics that can be probed about the board after + * opening a device. This can involve communication to the device + * + * AOCL_MMD_NUM_KERNEL_INTERFACES - The number of kernel interfaces, usually 1 + * + * AOCL_MMD_KERNEL_INTERFACES - the handle for each kernel interface. + * param_value will have size AOCL_MMD_NUM_KERNEL_INTERFACES * sizeof int + * + * AOCL_MMD_PLL_INTERFACES - the handle for each pll associated with each + * kernel interface. If a kernel interface is not clocked by acl_kernel_clk + * then return -1 + * + * */ +typedef enum { + AOCL_MMD_NUM_KERNEL_INTERFACES = 1, /* Number of Kernel interfaces (int) */ + AOCL_MMD_KERNEL_INTERFACES = 2, /* Kernel interface (int*) */ + AOCL_MMD_PLL_INTERFACES = 3, /* Kernel clk handles (int*) */ + AOCL_MMD_MEMORY_INTERFACE = 4, /* Global memory handle (int) */ + AOCL_MMD_TEMPERATURE = 5, /* Temperature measurement (float) */ + AOCL_MMD_PCIE_INFO = 6, /* PCIe information (char*) */ + AOCL_MMD_BOARD_NAME = 7, /* Name of board (char*) */ + AOCL_MMD_BOARD_UNIQUE_ID = 8, /* Unique ID of board (int) */ + AOCL_MMD_CONCURRENT_READS = 9, /* # of parallel reads; 1 is serial*/ + AOCL_MMD_CONCURRENT_WRITES = 10, /* # of parallel writes; 1 is serial*/ + AOCL_MMD_CONCURRENT_READS_OR_WRITES = 11, /* total # of concurrent operations read + writes*/ + AOCL_MMD_MIN_HOST_MEMORY_ALIGNMENT = 12, /* Min alignment that the ASP supports for host allocations (size_t) */ + AOCL_MMD_HOST_MEM_CAPABILITIES = 13, /* Capabilities of aocl_mmd_host_alloc() (unsigned int)*/ + AOCL_MMD_SHARED_MEM_CAPABILITIES = 14, /* Capabilities of aocl_mmd_shared_alloc (unsigned int)*/ + AOCL_MMD_DEVICE_MEM_CAPABILITIES = 15, /* Capabilities of aocl_mmd_device_alloc (unsigned int)*/ + AOCL_MMD_HOST_MEM_CONCURRENT_GRANULARITY = 16, /*(size_t)*/ + AOCL_MMD_SHARED_MEM_CONCURRENT_GRANULARITY = 17, /*(size_t)*/ + AOCL_MMD_DEVICE_MEM_CONCURRENT_GRANULARITY = 18, /*(size_t)*/ +} aocl_mmd_info_t; + +typedef struct { + unsigned long long int exception_type; + void *user_private_info; + size_t user_cb; +}aocl_mmd_interrupt_info; + +typedef void (*aocl_mmd_interrupt_handler_fn)( int handle, void* user_data ); +typedef void (*aocl_mmd_device_interrupt_handler_fn)( int handle, aocl_mmd_interrupt_info* data_in, void* user_data ); +typedef void (*aocl_mmd_status_handler_fn)( int handle, void* user_data, aocl_mmd_op_t op, int status ); + + +/* Get information about the board using the enum aocl_mmd_offline_info_t for + * offline info (called without a handle), and the enum aocl_mmd_info_t for + * info specific to a certain board. + * Arguments: + * + * requested_info_id - a value from the aocl_mmd_offline_info_t enum + * + * param_value_size - size of the param_value field in bytes. This should + * match the size of the return type expected as indicated in the enum + * definition. For example, the AOCL_MMD_TEMPERATURE returns a float, so + * the param_value_size should be set to sizeof(float) and you should + * expect the same number of bytes returned in param_size_ret. 
+ * + * param_value - pointer to the variable that will receive the returned info + * + * param_size_ret - receives the number of bytes of data actually returned + * + * Returns: a negative value to indicate error. + */ +AOCL_MMD_CALL int aocl_mmd_get_offline_info( + aocl_mmd_offline_info_t requested_info_id, + size_t param_value_size, + void* param_value, + size_t* param_size_ret ) WEAK; + +AOCL_MMD_CALL int aocl_mmd_get_info( + int handle, + aocl_mmd_info_t requested_info_id, + size_t param_value_size, + void* param_value, + size_t* param_size_ret ) WEAK; + +/* Open and initialize the named device. + * + * The name is typically one specified by the AOCL_MMD_BOARD_NAMES offline + * info. + * + * Arguments: + * name - open the board with this name (provided as a C-style string, + * i.e. NUL terminated ASCII.) + * + * Returns: the non-negative integer handle for the board, otherwise a + * negative value to indicate error. Upon receiving the error, the OpenCL + * runtime will proceed to open other known devices, hence the MMD mustn't + * exit the application if an open call fails. + */ +AOCL_MMD_CALL int aocl_mmd_open(const char *name) WEAK; + +/* Close an opened device, by its handle. + * Returns: 0 on success, negative values on error. + */ +AOCL_MMD_CALL int aocl_mmd_close(int handle) WEAK; + +/* Set the interrupt handler for the opened device. + * The interrupt handler is called whenever the client needs to be notified + * of an asynchronous event signaled by the device internals. + * For example, the kernel has completed or is stalled. + * + * Important: Interrupts from the kernel must be ignored until this handler is + * set + * + * Arguments: + * fn - the callback function to invoke when a kernel interrupt occurs + * user_data - the data that should be passed to fn when it is called. + * + * Returns: 0 if successful, negative on error + */ +AOCL_MMD_CALL int aocl_mmd_set_interrupt_handler( int handle, aocl_mmd_interrupt_handler_fn fn, void* user_data ) WEAK; + +/* Set the operation status handler for the opened device. + * The operation status handler is called with + * status 0 when the operation has completed successfully. + * status negative when the operation completed with errors. + * + * Arguments: + * fn - the callback function to invoke when a status update is to be + * performed. + * user_data - the data that should be passed to fn when it is called. + * + * Returns: 0 if successful, negative on error + */ +AOCL_MMD_CALL int aocl_mmd_set_status_handler( int handle, aocl_mmd_status_handler_fn fn, void* user_data ) WEAK; + +/* Read, write and copy operations on a single interface. + * If op is NULL + * - Then these calls must block until the operation is complete. + * - The status handler is not called for this operation. + * + * If op is non-NULL, then: + * - These may be non-blocking calls + * - The status handler must be called upon completion, with status 0 + * for success, and a negative value for failure. + * + * Arguments: + * op - the operation object used to track this operations progress + * + * len - the size in bytes to transfer + * + * src - the host buffer being read from + * + * dst - the host buffer being written to + * + * mmd_interface - the handle to the interface being accessed. E.g. To + * access global memory this handle will be whatever is returned by + * aocl_mmd_get_info when called with AOCL_MMD_MEMORY_INTERFACE. + * + * offset/src_offset/dst_offset - the byte offset within the interface that + * the transfer will begin at. 
+ * + * The return value is 0 if the operation launch was successful, and + * negative otherwise. + */ +AOCL_MMD_CALL int aocl_mmd_read( + int handle, + aocl_mmd_op_t op, + size_t len, + void* dst, + int mmd_interface, size_t offset) WEAK; +AOCL_MMD_CALL int aocl_mmd_write( + int handle, + aocl_mmd_op_t op, + size_t len, + const void* src, + int mmd_interface, size_t offset ) WEAK; + +/** Error values*/ +#define AOCL_MMD_ERROR_SUCCESS 0 +#define AOCL_MMD_ERROR_INVALID_HANDLE -1 +#define AOCL_MMD_ERROR_OUT_OF_MEMORY -2 +#define AOCL_MMD_ERROR_UNSUPPORTED_ALIGNMENT -3 +#define AOCL_MMD_ERROR_UNSUPPORTED_PROPERTY -4 +#define AOCL_MMD_ERROR_INVALID_POINTER -5 +#define AOCL_MMD_ERROR_INVALID_MIGRATION_SIZE -6 + +// CoreDLA modifications +// To support multiple different FPGA boards, anything board specific must be implemented in a +// board-specific MMD instead of the CoreDLA runtime layer. +#ifdef DLA_MMD +#include <cstdint> +// Query functions to get board-specific values +AOCL_MMD_CALL int dla_mmd_get_max_num_instances() WEAK; +AOCL_MMD_CALL uint64_t dla_mmd_get_ddr_size_per_instance() WEAK; +AOCL_MMD_CALL double dla_mmd_get_ddr_clock_freq() WEAK; + +// Wrappers around CSR and DDR reads and writes to abstract away board-specific offsets +AOCL_MMD_CALL int dla_mmd_csr_write(int handle, int instance, uint64_t addr, const uint32_t* data) WEAK; +AOCL_MMD_CALL int dla_mmd_csr_read(int handle, int instance, uint64_t addr, uint32_t* data) WEAK; +AOCL_MMD_CALL int dla_mmd_ddr_write(int handle, int instance, uint64_t addr, uint64_t length, const void* data) WEAK; +AOCL_MMD_CALL int dla_mmd_ddr_read(int handle, int instance, uint64_t addr, uint64_t length, void* data) WEAK; + +// Get the clk_dla PLL clock frequency in MHz, returns a negative value if there is an error +AOCL_MMD_CALL double dla_mmd_get_coredla_clock_freq(int handle) WEAK; + +#endif + +#ifdef __cplusplus +} +#endif + +/* clang-format on */ +#endif |
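
A minimal sketch of the open / set-handlers / read-write / close lifecycle described in the header above, from a hypothetical caller's point of view. The board name string, buffer size, and offsets are illustrative assumptions; only the function signatures, the callback types, and the AOCL_MMD_MEMORY_INTERFACE query come from aocl_mmd.h.

#include <cstdio>
#include <vector>
#include "aocl_mmd.h"

// Kernel interrupt callback: notified when the device signals an asynchronous event.
static void on_irq(int handle, void *user_data) { (void)handle; (void)user_data; }

// Status callback: invoked with status 0 on success, negative on error
// (blocking calls with op == NULL never reach it).
static void on_status(int handle, void *user_data, aocl_mmd_op_t op, int status) {
  (void)user_data;
  std::printf("handle %d: op %p completed with status %d\n", handle, op, status);
}

int main() {
  // Board names follow the "ofs_<object_id>" pattern parsed by the MMD;
  // the id used here is a made-up placeholder.
  int handle = aocl_mmd_open("ofs_12345678");
  if (handle < 0) return 1;  // e.g. ASP not loaded or ASP init failed

  aocl_mmd_set_interrupt_handler(handle, on_irq, nullptr);
  aocl_mmd_set_status_handler(handle, on_status, nullptr);

  // Ask the MMD which interface handle addresses global (device) memory.
  int mem_interface = 0;
  size_t ret_size = 0;
  aocl_mmd_get_info(handle, AOCL_MMD_MEMORY_INTERFACE, sizeof(mem_interface),
                    &mem_interface, &ret_size);

  // Blocking transfers: passing op == NULL makes the call block until completion.
  std::vector<char> buf(4096, 0x5a);
  aocl_mmd_write(handle, NULL, buf.size(), buf.data(), mem_interface, 0x0);
  aocl_mmd_read(handle, NULL, buf.size(), buf.data(), mem_interface, 0x0);

  return aocl_mmd_close(handle);
}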
