diff options
Diffstat (limited to 'python/openvino/runtime/coredla_device')
118 files changed, 26443 insertions, 0 deletions
diff --git a/python/openvino/runtime/coredla_device/inc/batch_job.h b/python/openvino/runtime/coredla_device/inc/batch_job.h new file mode 100644 index 0000000..76fd968 --- /dev/null +++ b/python/openvino/runtime/coredla_device/inc/batch_job.h @@ -0,0 +1,31 @@ +// Copyright 2020-2023 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. + +#ifndef BATCH_JOB_H +#define BATCH_JOB_H + +class BatchJob { + public: + // @param inputArray - ptr to CPU array containing input data to be copied to DDR + // blocking function + virtual void LoadInputFeatureToDDR(void* inputArray) = 0; + // @param outputArray - ptr to CPU array where the output data in DDR is copied into + // outputArray must be allocated by the caller (size >= output_size_ddr) + // blocking function + virtual void ReadOutputFeatureFromDDR(void* outputArray) const = 0; + virtual void ScheduleInputFeature() const = 0; + virtual void StartDla() = 0; + virtual ~BatchJob() {} +}; + +#endif diff --git a/python/openvino/runtime/coredla_device/inc/coredla_batch_job.h b/python/openvino/runtime/coredla_device/inc/coredla_batch_job.h new file mode 100644 index 0000000..7d91f0e --- /dev/null +++ b/python/openvino/runtime/coredla_device/inc/coredla_batch_job.h @@ -0,0 +1,88 @@ +// Copyright 2020-2023 Intel Corporation. 
+// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. + +#pragma once + +#include "batch_job.h" // BatchJob +#include "mmd_wrapper.h" // MmdWrapper + +// TODO:integrate with dla compiler later +// #include "dla_types.h" +// #include "compiled_result_runtime_required_elements.h" + +#include <cstdint> // uint64_t +#include <memory> // std::unique_ptr + +class StreamControllerComms; + +// BatchJob represents one batch execution +// Contains input/output address and size in DDR for one batch +// Contains functions to write feature data to DDR, start DLA and read output data from DDR +class CoreDlaBatchJob : public BatchJob { + private: + // MMD access is required to handshake with CSR and transfer data between host/device memory + MmdWrapper* mmdWrapper_; + int instance_; + // size and address of graph config data allocated in DDR + uint64_t totalConfigWords_; + uint64_t configBaseAddrDDR_; + // size and address of input and output data allocated in DDR for 1 batch + uint64_t inputAddrDDR_; + uint64_t outputAddrDDR_; + uint64_t inputSizeDDR_; + uint64_t outputSizeDDR_; + const bool enableIstream_; + const bool enableOstream_; + uint64_t lastJobQueueNumber_; + + std::shared_ptr<StreamControllerComms> spStreamControllerComms_; + + CoreDlaBatchJob(MmdWrapper* mmdWrapper, + uint64_t totalConfigWords, + uint64_t configBaseAddrDDR, + uint64_t inputAddrDDR, + uint64_t outputAddrDDR, + uint64_t inputSizeDDR, + uint64_t outputSizeDDR, + const bool enableIstream, + 
const bool enableOstream, + int instance, + std::shared_ptr<StreamControllerComms> spStreamControllerComms); + + public: + CoreDlaBatchJob(const CoreDlaBatchJob&) = delete; + CoreDlaBatchJob(CoreDlaBatchJob&) = delete; + CoreDlaBatchJob& operator=(const CoreDlaBatchJob&) = delete; + static std::unique_ptr<BatchJob> MakeUnique(MmdWrapper* mmdWrapper, + uint64_t totalConfigWords, + uint64_t configBaseAddrDDR, + uint64_t inputAddrDDR, + uint64_t outputAddrDDR, + uint64_t inputSizeDDR, + uint64_t outputSizeDDR, + const bool enableIstream, + const bool enableOstream, + int instance, + std::shared_ptr<StreamControllerComms> spStreamControllerComms); + // @param inputArray - ptr to CPU array containing input data tp be copied to DDR + // blocking function + void LoadInputFeatureToDDR(void* inputArray) override; + void ScheduleInputFeature() const override; + + // Starts DLA by writing to CSR in DLA DMA; the DDR addresses of graph config and input data + void StartDla() override; + // @param outputArray - ptr to CPU array where the output data in DDR is copied into + // outputArray must be allocated by the caller (size >= output_size_ddr) + // blocking function + void ReadOutputFeatureFromDDR(void* outputArray) const override; +}; diff --git a/python/openvino/runtime/coredla_device/inc/coredla_device.h b/python/openvino/runtime/coredla_device/inc/coredla_device.h new file mode 100644 index 0000000..2a04fa8 --- /dev/null +++ b/python/openvino/runtime/coredla_device/inc/coredla_device.h @@ -0,0 +1,144 @@ +// Copyright 2020-2023 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. 
+// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. + +#pragma once + +#include "compiled_result.h" //dla::CompiledResult +#include "device.h" //Device +#include "device_memory_allocator.h" //DeviceMemoryAllocator +#include "graph_job.h" //GraphJob +#include "mmd_wrapper.h" //MmdWrapper + +#include <condition_variable> //std::condition_variable +#include <cstdint> //uint64_t +#include <map> //std::map +#include <memory> //std::unique_ptr +#include <mutex> //std::mutex +#include <vector> //std::vector + +class StreamControllerComms; + +// The interface of the interrupt service routine dictates that all the data the ISR needs must be passed in through +// one pointer of type void *. Package it up here. WaitForDla() uses jobsWaited and jobsFinished to determine if a job +// has already finished or it still needs wait. The ISR only updates jobsFinished, so jobsWaited is only a member of +// CoreDlaDevice. The mutex and condition variable are used to synchronize between InterruptServiceRoutine() and +// WaitForDla(). All of these are replicated per CoreDLA IP instance, hence the use of vector. +// base_multiplier and prevCount are used to handle the jobsFinished wrap-around that could happen in the hardware CSR +// as the CSR is only 32-bit wide but the jobsFinished is 64-bit wide +struct InterruptServiceRoutineData { + MmdWrapper* mmdWrapper; + std::vector<uint64_t> jobsFinished; + std::vector<uint32_t> base_multiplier; + std::vector<uint32_t> prevCount; + std::vector<uint32_t> desc_queue_diag; + std::vector<std::mutex> isrMutex; + std::vector<std::condition_variable> isrCondVar; +}; + +/*! 
DlaDevice class represents a DLA device mapped using the MMD + OPAE SW stack + * On construction, dynamically loads MMD library at runtime and initialized the state of MMD + * Implememts functions that wrap various MMD calls to read/write to DDR/CSR and process HW interrupts + */ +class CoreDlaDevice : public Device { + public: + GraphJob* CreateGraphJob(const dla::CompiledResult* compiledResult, +#ifndef USE_OLD_COREDLA_DEVICE + size_t numPipelines, +#else + uint64_t numPipelines, +#endif + int instance, + std::string AES_key, + std::string IV_key, + bool encryption_enabled, + // This param is unused for HW runtime! So why inlcude it? CoreDLA utilizes base pointers + // for both HW and SW emulator runtime. The software emulator has output file where as currently the + // HW runtime does not. + const std::string export_dir, + const std::string parameter_rom_export_dir); + // Return number of DLA jobs completed till now + // Used for debugging + int GetNumInferencesCompleted(int instance) const override { return isrData_.jobsFinished.at(instance); } + // Must be called when there are no active jobs on DLA + // Returns the total time taken by DLA jobs on hardware (in milliseconds) + double GetActiveHWTimeMs(int instance) const override; + // Must be called when there are no active jobs on DLA + // Returns the average of time taken per job (in milliseconds) + // Avg Time per job < Active Time + double GetAvgHWTimePerJobMs(uint64_t num_jobs, int instance) const override; + // Must be called when there are no active jobs on DLA + // Returns the number of memory read made by the input feature reader + uint64_t GetNumInputFeatureMemoryReads(int instance) const override; + // Must be called when there are no active jobs on DLA + // Returns the number of memory read made by the filter reader + uint64_t GetNumFilterMemoryReads(int instance) const override; + // Must be called when there are no active jobs on DLA + // Returns the number of memory writes made by the output 
feature writer + uint64_t GetNumOutputFeatureMemoryWrites(int instance) const override; + + private: + // Read one 32-bit value from the debug network, return value indicates whether read was successful. A read can fail + // if the module number and address have not been implemented. The debug network is fault tolerant to both read + // requests never being accepted as well as read responses never being produced. + bool ReadDebugCsr(uint32_t moduleNum, uint32_t address, int instance, uint32_t& readData, bool verbose = false) const; + +#ifndef USE_OLD_COREDLA_DEVICE + // Must be called when there are no active jobs on DLA + // Returns total number of clocks by DLA jobs on hardware. + uint64_t GetClocksActive(int instance) const; + + // Must be called when there are no active jobs on DLA + // Returns the clocks of all jobs + uint64_t GetClocksAllJobs(int instance) const; +#endif + + uint64_t GetNumInputFeatureMemoryReadsTotal(int instance) const; + + uint64_t GetNumFilterMemoryReadsTotal(int instance) const; + + uint64_t GetNumOutputFeatureMemoryWritesTotal(int instance) const; + + public: + // Modules attached to the debug network have a ROM to specify the offset and description of the registers. Traverse + // this ROM, then return a map of key/value pairs, where the key is a human readable string describing what kind of + // information the debug register contains, and the value is the data of the debug register. 
+ DebugNetworkData ReadDebugNetwork(int instance) const override; + + CoreDlaDevice(uint32_t waitForDlaTimeoutSeconds); + ~CoreDlaDevice(); + int GetSizeCsrDescriptorQueue() const override; + double GetCoreDlaClockFreq() const override; + int GetNumInstances() const override { return numInstances_; } + void WaitForDla(int instance, size_t threadId = 0, std::function<bool()> isCancelled = nullptr) override; // threadId is optional and for debugging purpose only + std::string SchedulerGetStatus() const override; + bool InitializeScheduler(uint32_t sourceBufferSize, uint32_t dropSourceBuffers, uint32_t numInferenceRequests, + const std::string source_fifo_file="") override; + + private: + std::unique_ptr<DeviceMemoryAllocator[]> ddrAllocator_; + std::vector<std::unique_ptr<GraphJob>> allGraphJobs_; + int numInstances_; + MmdWrapper mmdWrapper_; + InterruptServiceRoutineData isrData_; + std::vector<uint64_t> jobsWaited_; +#ifndef USE_OLD_COREDLA_DEVICE + std::vector<uint64_t> startClocksActive; + std::vector<uint64_t> startClockAllJobs; +#endif + std::vector<uint64_t> startNumInputFeatureMemoryReads; + std::vector<uint64_t> startNumFilterMemoryReads; + std::vector<uint64_t> startNumOutputFeatureMemoryWrites; + std::shared_ptr<StreamControllerComms> spStreamControllerComms_; + bool runtimePolling_; + uint32_t waitForDlaTimeoutSeconds_; +}; diff --git a/python/openvino/runtime/coredla_device/inc/coredla_graph_job.h b/python/openvino/runtime/coredla_device/inc/coredla_graph_job.h new file mode 100644 index 0000000..3dc91bc --- /dev/null +++ b/python/openvino/runtime/coredla_device/inc/coredla_graph_job.h @@ -0,0 +1,83 @@ +// Copyright 2020-2023 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). 
Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. + +#pragma once + +#include "compiled_result.h" //dla::CompiledResult +#include "coredla_batch_job.h" //BatchJob +#include "device.h" //DLA_LOG +#include "device_memory_allocator.h" //DeviceMemoryAllocator +#include "graph_job.h" //GraphJob +#include "mmd_wrapper.h" //MmdWrapper + +// TODO:integrate with dla compiler later +//#include "dla_types.h" +//#include "compiled_result_runtime_required_elements.h" + +#include <cstdint> //uint64_t +#include <memory> //std::unique_ptr +#include <mutex> //std::mutex +#include <vector> //std::vector + +/*! GraphJob is a DLA compiled graph loaded onto a device + * Initialized with DlaDevice object + * GraphJob allocates space in DDR for filter, bias, config, inputs and outputs + * It provides handle to "batch job" objects that are used to load input and start DLA for one batch + */ + +class CoreDlaGraphJob : public GraphJob { + public: + // Function to construct and return a unique pointer GraphJob object to the runtime user + // TODO: Provide DLA compiled result object which will contain all the necessary rutime elements as below + // @param configFilterBiasBufferSizeDDR - total size of the constants - config, filter and bias + // @param configFilterBiasBuffer - ptr to one contigous CPU array for config, filter and bias (obtained from DLA + // compiler's output) + // @param totalConfigWords - size of config data in words (size of 1 config word is defined in dla_device.h + // "CONFIG_READER_DATA_BYTES") + // @param intermediateBufferSizeDDR - size of the buffer space required in DDR for feature data of intermediate layers + // @param inputSizeDDR - size of 
one batch input data in DDR. Multiple images in one batch should be contigously + // placed + // @param outputSizeDDR - size of one batch output data in DDR + // @param numPipelines - number of I/O bufffer pairs created for CPU-FPGA pipelining of multiple batch runs + // @param spStreamControllerComms - optional interface to stream controller + static std::unique_ptr<GraphJob> MakeUnique(DeviceMemoryAllocator* ddrBufferAllocator, + MmdWrapper* mmdWrapper, + const dla::CompiledResult* compiled_result, + uint64_t numPipelines, + int instance, + std::shared_ptr<StreamControllerComms> spStreamControllerComms); + // Returns an unused batch job object + // If all batch jobs are used, returns null + // Increments batchJobsRequested_ + // Thread safe + BatchJob* GetBatchJob(); + CoreDlaGraphJob(const GraphJob&) = delete; + CoreDlaGraphJob(CoreDlaGraphJob&) = delete; + CoreDlaGraphJob& operator=(const CoreDlaGraphJob&) = delete; + + private: + uint64_t configFilterBiasBufferSizeDDR_; + uint64_t intermediateBufferSizeDDR_; + DeviceMemoryAllocator* ddrBufferAllocator_; + MmdWrapper* mmdWrapper_; + std::vector<std::unique_ptr<BatchJob>> batchJobs_; + unsigned int batchJobsRequested_; + unsigned int instance_; + std::mutex graphJobMutex; + CoreDlaGraphJob(DeviceMemoryAllocator* ddrBufferAllocator, + MmdWrapper* mmdWrapper, + const dla::CompiledResult* compiledResult, + uint64_t numPipelines, + int instance, + std::shared_ptr<StreamControllerComms> spStreamControllerComms); +}; diff --git a/python/openvino/runtime/coredla_device/inc/device.h b/python/openvino/runtime/coredla_device/inc/device.h new file mode 100644 index 0000000..e506578 --- /dev/null +++ b/python/openvino/runtime/coredla_device/inc/device.h @@ -0,0 +1,81 @@ +// Copyright 2020-2023 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). 
Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. + +#ifndef DEVICE_H +#define DEVICE_H + +#include <functional> +#include <map> +#include <memory> +#include <string> +#include <vector> + +#include "dla_runtime_log.h" + +using namespace std; +using DebugNetworkData = std::map<std::string, uint64_t>; + +// dla log macro +#define DLA_LOG(fmt, ...) printf(fmt, ##__VA_ARGS__); +#define DLA_ERROR(fmt, ...) printf(fmt, ##__VA_ARGS__); + +class GraphJob; +class arch_params; +namespace dla { +class CompiledResult; +} +class Device { + public: + static unique_ptr<Device> MakeUnique(const arch_params* archParams, uint32_t waitForDlaTimeoutSeconds); + virtual GraphJob* CreateGraphJob(const dla::CompiledResult* compiledResult, + size_t numPipelines, + int instance, + std::string AES_key, + std::string IV_key, + bool encryption_enabled, + const std::string export_dir, + const std::string parameter_rom_export_dir) = 0; + // Return number of DLA jobs completed till now + // Used for debugging + virtual int GetNumInferencesCompleted(int instance) const = 0; + // Must be called when there are no active jobs on DLA + // Returns the total time taken by DLA jobs on hardware (in milliseconds) + virtual double GetActiveHWTimeMs(int instance) const = 0; + // Must be called when there are no active jobs on DLA + // Returns the average of time taken per job (in milliseconds) + // Avg Time per job < Active Time + virtual double GetAvgHWTimePerJobMs(uint64_t num_jobs, int instance) const = 0; + // Must be called when there are no active jobs on DLA + // Returns the number of memory read made by the input feature reader + virtual uint64_t 
GetNumInputFeatureMemoryReads(int instance) const = 0; + // Must be called when there are no active jobs on DLA + // Returns the number of memory read made by the filter reader + virtual uint64_t GetNumFilterMemoryReads(int instance) const = 0; + // Must be called when there are no active jobs on DLA + // Returns the number of memory writes made by the output feature writer + virtual uint64_t GetNumOutputFeatureMemoryWrites(int instance) const = 0; + // Waits for a job to finish on specified instance + virtual void WaitForDla(int instance, size_t threadId = 0, std::function<bool()> isCancelled = nullptr) = 0; + virtual int GetNumInstances() const = 0; + virtual double GetCoreDlaClockFreq() const = 0; + virtual int GetSizeCsrDescriptorQueue() const = 0; + virtual std::string SchedulerGetStatus() const = 0; + virtual bool InitializeScheduler(uint32_t sourceBufferSize, + uint32_t dropSourceBuffers, + uint32_t numInferenceRequests, + const std::string source_fifo_file="") = 0; + virtual DebugNetworkData ReadDebugNetwork(int instance) const = 0; + virtual ~Device(){} +}; + +#endif // DEVICE_H diff --git a/python/openvino/runtime/coredla_device/inc/device_memory_allocator.h b/python/openvino/runtime/coredla_device/inc/device_memory_allocator.h new file mode 100644 index 0000000..adc0a71 --- /dev/null +++ b/python/openvino/runtime/coredla_device/inc/device_memory_allocator.h @@ -0,0 +1,61 @@ +// Copyright 2020 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. 
+// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. + +#pragma once + +#include "mmd_wrapper.h" //MmdWrapper + +#include <cstdint> //uint64_t + +/*! DeviceMemoryAllocator class allocates multiple DLA graph buffers in DDR + * Each graph is expected to have one contigous buffer containing all data (config, filter, bias, I/O) + * A graph buffer is allocated in DDR from right to left + * A scratchpad space is allocated in DDR to be shared across all graphs for intermediate feature data + * This intermediate buffer space is allocated from left to right (starting address is 0) + * and is expanded based on graph's requirement + */ +class DeviceMemoryAllocator { + public: + void Initialize(uint64_t totalSize, MmdWrapper *mmdWrapper); + ~DeviceMemoryAllocator(); + + // Buffers that can be shared across multiple graphs may grow in size after they are allocated. The intermediate + // buffer is an example of this. We have decided to allocate this at the lowest address and let it grow upwards. + // @param bufferSize - the size of the buffer in bytes + // @param instance - there can be multiple instances of DLA on FPGA, specify which DLA instance is this buffer for + void AllocateSharedBuffer(uint64_t bufferSize, int instance); + + // Buffers that are private to one graph will not change in size after allocation. The config/filter buffer is + // an example of this. We have decided to allocate this at the upper address and allocate downwards from there. + // Hardware requires the starting address of each buffer to have some alignment, and the allocator will add + // as much padding as needed to ensure this. Each contiguous section in device memory should have its own call + // to the allocator. 
+ // @param bufferSize - the size of the buffer in bytes + // @param bufferAlignment - specify how much address alignment is needed for this buffer, must be a power of 2 + // @param bufferAddr - the allocator indicates where it placed this buffer + void AllocatePrivateBuffer(uint64_t bufferSize, uint64_t bufferAlignment, uint64_t &bufferAddr); + + // Clears whole DDR space including the intermediate buffer + void Clear(); + + private: + // total DDR size (BSP parameter) + uint64_t totalGlobalMemSize_; + // For access to MMD + MmdWrapper *mmdWrapper_; + // current starting address of allocated graph buffer region + // graph buffers are allocated right to left + uint64_t currentStartAddressGraphBufferSpace_; + // current maximum allocated size for intermediate data + uint64_t currentIntermediateMaxBufferSizeAllocated_; +}; diff --git a/python/openvino/runtime/coredla_device/inc/dla_dma_constants.h b/python/openvino/runtime/coredla_device/inc/dla_dma_constants.h new file mode 100644 index 0000000..13fb56b --- /dev/null +++ b/python/openvino/runtime/coredla_device/inc/dla_dma_constants.h @@ -0,0 +1,27 @@ +// Copyright 2020 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. 
+ +#pragma once + +// save a copy +#pragma push_macro("localparam") + +// convert the syntax of verilog into C++, replace "localparam int MY_VAR = 123;" with "constexpr int MY_VAR = 123;" +#undef localparam +#define localparam constexpr + +// include the verilog header +#include "dla_dma_constants.svh" + +// undo the syntax change +#pragma pop_macro("localparam") diff --git a/python/openvino/runtime/coredla_device/inc/graph_job.h b/python/openvino/runtime/coredla_device/inc/graph_job.h new file mode 100644 index 0000000..b04dde1 --- /dev/null +++ b/python/openvino/runtime/coredla_device/inc/graph_job.h @@ -0,0 +1,28 @@ +// Copyright 2020-2023 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. + +#ifndef GRAPH_JOB_H +#define GRAPH_JOB_H + +#include "batch_job.h" +using namespace std; +class GraphJob { + public: + // Returns an unused batch job object + // If all batch jobs are used, returns null + virtual BatchJob* GetBatchJob() = 0; + + virtual ~GraphJob(){} +}; + +#endif diff --git a/python/openvino/runtime/coredla_device/inc/mmd_wrapper.h b/python/openvino/runtime/coredla_device/inc/mmd_wrapper.h new file mode 100644 index 0000000..4014454 --- /dev/null +++ b/python/openvino/runtime/coredla_device/inc/mmd_wrapper.h @@ -0,0 +1,63 @@ +// Copyright 2020 Intel Corporation. 
+// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. + +#pragma once + +#include <cstdint> //uint32_t + +using interrupt_service_routine_signature = void (*)(int handle, void *data); + +class MmdWrapper { + public: + MmdWrapper(); + // Note that ~MmdWrapper() can call std::exit(1) if aocl_mmd_close() + // fails. Ideally we would find some way to re-order the code so that it + // can throw an exception (before calling the destructor) if aocl_mmd_close() + // fails. + ~MmdWrapper(); + + // class cannot be copied + MmdWrapper(const MmdWrapper &) = delete; + MmdWrapper &operator=(const MmdWrapper &) = delete; + + // Register a function to run as the interrupt service routine + void RegisterISR(interrupt_service_routine_signature func, void *data) const; + + // 32-bit handshake with each CSR + void WriteToCsr(int instance, uint32_t addr, uint32_t data) const; + uint32_t ReadFromCsr(int instance, uint32_t addr) const; + + // Copy data between host and device memory + void WriteToDDR(int instance, uint64_t addr, uint64_t length, const void *data) const; + void ReadFromDDR(int instance, uint64_t addr, uint64_t length, void *data) const; + + // If the mmd layer supports accesses to the STREAM CONTROLLER + bool bIsStreamControllerValid(int instance) const; + + // 32-bit handshake with each Stream Controller CSR + void WriteToStreamController(int instance, uint32_t addr, uint64_t length, const void *data) const; + void ReadFromStreamController(int instance, 
uint32_t addr, uint64_t length, void *data) const; + + // Provide read-only access to board-specific constants + int GetMaxInstances() const { return maxInstances_; } + uint64_t GetDDRSizePerInstance() const { return ddrSizePerInstance_; } + double GetCoreDlaClockFreq() const { return coreDlaClockFreq_; } + double GetDDRClockFreq() const { return ddrClockFreq_; } + + private: + int handle_; + int maxInstances_; + uint64_t ddrSizePerInstance_; + double coreDlaClockFreq_; + double ddrClockFreq_; +}; diff --git a/python/openvino/runtime/coredla_device/inc/stream_controller_comms.h b/python/openvino/runtime/coredla_device/inc/stream_controller_comms.h new file mode 100644 index 0000000..e2fcdfc --- /dev/null +++ b/python/openvino/runtime/coredla_device/inc/stream_controller_comms.h @@ -0,0 +1,69 @@ +// Copyright 2023 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. 
+ +#pragma once +#include <mutex> +#include <string> +#include <vector> +#include "mmd_wrapper.h" +#include "stream_controller_messages.h" + +template <class T> +struct Payload : public T { + void* GetPayload() { return this; } + size_t GetSize() { return sizeof(*this); } +}; + +class BusyFlag { + public: + bool Lock(); + void Release(); + + private: + std::recursive_mutex _mutex; + bool _busy = false; +}; + +class BusyCheck { + public: + BusyCheck(BusyFlag& busyFlag); + ~BusyCheck(); + operator bool(); + + private: + BusyFlag& _busyFlag; + bool _haveLocked; +}; + +class StreamControllerComms { + public: + StreamControllerComms(); + bool IsPresent(); + Payload<StatusMessagePayload> GetStatus(); + std::string GetStatusString(Payload<StatusMessagePayload>& statusPayload); + bool ScheduleItems(std::vector<Payload<CoreDlaJobPayload>> items); + bool Ping(); + bool Initialize(uint32_t sourceBufferSize, uint32_t dropSourceBuffers, uint32_t numInferenceRequests); + + private: + bool StatusMessageHandler(uint32_t payloadOffset); + MessageType ReceiveMessage(); + bool SendMessage(MessageType, void* pPayload = nullptr, size_t size = 0); + MmdWrapper _mmdWrapper; + uint32_t _lastReceiveSequenceID = 0; + uint32_t _sendSequenceID = 0; + uint32_t _numBadMessages = 0; + const int _streamControllerInstance = 0; + Payload<StatusMessagePayload> _receivedStatusMessage; + BusyFlag _busyFlag; +}; diff --git a/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/CMakeLists.txt b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/CMakeLists.txt new file mode 100644 index 0000000..445a304 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/CMakeLists.txt @@ -0,0 +1,62 @@ +# (C) 2017 Intel Corporation. All rights reserved. 
# Your use of Intel Corporation's design tools, logic functions and other
# software and tools, and its AMPP partner logic functions, and any output
# files any of the foregoing (including device programming or simulation
# files), and any associated documentation or information are expressly subject
# to the terms and conditions of the Intel Program License Subscription
# Agreement, Intel MegaCore Function License Agreement, or other applicable
# license agreement, including, without limitation, that your use is for the
# sole purpose of programming logic devices manufactured by Intel and sold by
# Intel or its authorized distributors. Please refer to the applicable
# agreement for further details.

cmake_minimum_required(VERSION 2.8.12)
project(mmd)

# AFU identifier baked into the MMD so it can locate the DLA accelerator.
add_definitions(-DI_DK_AFU_ID="11446C9D-AA42-4085-9B3D-4EEF9429A4AD")

# Make the bundled FindOPAE/FindNUMA modules discoverable.
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules")

find_package(OPAE REQUIRED)
find_package(NUMA REQUIRED)

# DLA specific modifications made to the MMD
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDLA_MMD")

enable_language(C ASM)

set(ASM_OPTIONS "-x assembler-with-cpp")
if(${CMAKE_C_COMPILER_ID} STREQUAL "Clang")
  # clang's integrated assembler does not accept these sources; fall back to
  # the system assembler.
  set(ASM_OPTIONS "${ASM_OPTIONS} -no-integrated-as")
endif()

set(CMAKE_ASM_FLAGS "${CFLAGS} ${ASM_OPTIONS}")

set(MMD_SRC
  ./host/mmd.cpp
  ./host/mmd_device.cpp
  ./host/mmd_dma.cpp
  ./host/mmd_helper.cpp
  ./host/kernel_interrupt.cpp
)

# Add a shared library target called intel_opae_mmd
# and build it from the MMD_SRC files
add_library(intel_opae_mmd SHARED ${MMD_SRC})

# Specify the include directories to be used when compiling intel_opae_mmd library
target_include_directories(intel_opae_mmd PUBLIC
  ${CMAKE_CURRENT_SOURCE_DIR}/include
  )

# Specify libraries needed when linking the intel_opae_mmd library
target_link_libraries(intel_opae_mmd
  libopae-c
  libnuma
)

# Set the installation rules for the project
install(TARGETS intel_opae_mmd
  LIBRARY DESTINATION lib
  COMPONENT intel_opae_mmd
)
diff --git a/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/cmake/modules/FindNUMA.cmake b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/cmake/modules/FindNUMA.cmake
new file mode 100755
index 0000000..c981150
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/cmake/modules/FindNUMA.cmake
@@ -0,0 +1,34 @@
# - Try to find libnuma
# Once done will define:
#
#  NUMA_FOUND - system has libnuma
#  NUMA_INCLUDE_DIRS - include directory with numa.h
#  NUMA_LIBRARIES - link with this for libnuma

find_path(NUMA_INCLUDE_DIRS
  NAMES numa.h
  PATHS
    ${LIBNUMA_ROOT}/include
    /usr/include
    /p/psg/swip/dla/resources/numactl/2.0.16/include

  )

find_library(NUMA_LIBRARIES
  NAMES numa
  PATHS
    ${LIBNUMA_ROOT}/lib
    ${LIBNUMA_ROOT}/lib64
    /usr/lib
    /usr/lib64
    /p/psg/swip/dla/resources/numactl/2.0.16/lib

  )

# BUGFIX: FindPackageHandleStandardArgs is not implicitly available inside a
# find module; this previously only worked because another module (e.g.
# FindPkgConfig via FindOPAE) happened to include it first. Include it
# explicitly so the module is self-contained.
include(FindPackageHandleStandardArgs)

FIND_PACKAGE_HANDLE_STANDARD_ARGS(NUMA
  REQUIRED_VARS NUMA_INCLUDE_DIRS NUMA_LIBRARIES)

add_library(libnuma IMPORTED SHARED)
set_target_properties(libnuma PROPERTIES
  IMPORTED_LOCATION ${NUMA_LIBRARIES}
  INTERFACE_INCLUDE_DIRECTORIES ${NUMA_INCLUDE_DIRS})
diff --git a/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/cmake/modules/FindOPAE.cmake b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/cmake/modules/FindOPAE.cmake
new file mode 100755
index 0000000..6395d7c
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/cmake/modules/FindOPAE.cmake
@@ -0,0 +1,44 @@
# - Try to find libintelfpga
# Once done, this will define
#
#  libopae-c_FOUND - system has libopae-c
#  libopae-c_INCLUDE_DIRS - the libopae-c include directories
#  libopae-c_LIBRARIES - link these to use libopae-c

find_package(PkgConfig)
pkg_check_modules(PC_OPAE QUIET opae-c)

# Use pkg-config to get hints about paths
execute_process(COMMAND pkg-config --cflags opae-c --silence-errors
  COMMAND cut -d I -f 2
  OUTPUT_VARIABLE OPAE-C_PKG_CONFIG_INCLUDE_DIRS)
set(OPAE-C_PKG_CONFIG_INCLUDE_DIRS "${OPAE-C_PKG_CONFIG_INCLUDE_DIRS}" CACHE STRING "Compiler flags for OPAE-C library")

# Include dir
find_path(libopae-c_INCLUDE_DIRS
  NAMES opae/fpga.h
  PATHS ${LIBOPAE-C_ROOT}/include
        ${OPAE-C_PKG_CONFIG_INCLUDE_DIRS}
        /usr/local/include
        /usr/include
        ${CMAKE_EXTRA_INCLUDES})

# The library itself
find_library(libopae-c_LIBRARIES
  NAMES opae-c
  PATHS ${LIBOPAE-C_ROOT}/lib
        ${LIBOPAE-C_ROOT}/lib64
        /usr/local/lib
        /usr/lib
        /lib
        /usr/lib/x86_64-linux-gnu
        ${CMAKE_EXTRA_LIBS})

# BUGFIX: include FindPackageHandleStandardArgs explicitly (same reasoning as
# in FindNUMA.cmake) instead of relying on FindPkgConfig having loaded it.
include(FindPackageHandleStandardArgs)

FIND_PACKAGE_HANDLE_STANDARD_ARGS(OPAE
  REQUIRED_VARS libopae-c_LIBRARIES libopae-c_INCLUDE_DIRS)

add_library(libopae-c IMPORTED SHARED)
set_target_properties(libopae-c PROPERTIES
  IMPORTED_LOCATION ${libopae-c_LIBRARIES}
  INTERFACE_INCLUDE_DIRECTORIES ${libopae-c_INCLUDE_DIRS})

diff --git a/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/kernel_interrupt.cpp b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/kernel_interrupt.cpp
new file mode 100644
index 0000000..97882d4
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/kernel_interrupt.cpp
@@ -0,0 +1,257 @@
// (c) 1992-2024 Intel Corporation.
// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words
// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S.
// and/or other countries. Other marks and brands may be claimed as the property
// of others.
// See Trademarks on intel.com for full list of Intel trademarks or
// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera)
// Your use of Intel Corporation's design tools, logic functions and other
// software and tools, and its AMPP partner logic functions, and any output
// files any of the foregoing (including device programming or simulation
// files), and any associated documentation or information are expressly subject
// to the terms and conditions of the Altera Program License Subscription
// Agreement, Intel MegaCore Function License Agreement, or other applicable
// license agreement, including, without limitation, that your use is for the
// sole purpose of programming logic devices manufactured by Intel and sold by
// Intel or its authorized distributors. Please refer to the applicable
// agreement for further details.

#include "kernel_interrupt.h"

#include <poll.h>
#include <sys/eventfd.h>

#include <cassert>
#include <cerrno>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <thread>

#include "mmd_device.h"

using namespace intel_opae_mmd;

static const int mmd_kernel_interrupt_line_num = 1;
static const uint32_t enable_int_mask = 0x00000001;
static const uint32_t disable_int_mask = 0x00000000;

bool KernelInterrupt::enable_thread = false;

static const int debug_log_level = 0;

// TODO: use consistent function throughout MMD for controlling debug
// messages. This debug_print function is from OFS.
+static void debug_print(std::string &err_msg, int msglog) { + if (debug_log_level >= msglog) { + std::cerr << "KernelInterrupt: " << err_msg << std::endl; + } +} + +static inline void check_result(fpga_result res, const char *err_str) { + if (res == FPGA_OK) { + return; + } + std::string opae_err_str = + std::string("KernelInterrupt: ") + std::string(err_str) + std::string(": ") + std::string(fpgaErrStr(res)); +} + +/** KernelInterrupt constructor + */ +KernelInterrupt::KernelInterrupt(fpga_handle fpga_handle_arg, int mmd_handle) + : m_work_thread_active(false), + m_eventfd(0), + m_kernel_interrupt_fn(nullptr), + m_kernel_interrupt_user_data(nullptr), + m_fpga_handle(fpga_handle_arg), + m_mmd_handle(mmd_handle), + m_event_handle(nullptr) { + if (std::getenv("MMD_ENABLE_DEBUG")) { + MMD_DEBUG("DEBUG LOG : KernelInterrupt Constructor\n"); + } + set_member_for_interrupts(); + enable_interrupts(); +} + +/** KernelInterrupt destructor + * calls disable_interrupts() + */ +KernelInterrupt::~KernelInterrupt() { + if (std::getenv("MMD_ENABLE_DEBUG")) { + MMD_DEBUG("DEBUG LOG : KernelInterrupt Destructor\n"); + } + try { + disable_interrupts(); + } catch (...) 
{ + std::string err("destructor error"); + debug_print(err, 0); + } +} + +/** disable_interrupts() function is used in KernelInterrupt destructor + * if interupt not enabled , !enable_thread + * then disable interrupt mask + * else if interrupts are used, + * call noftify_work_thread(), join the thread + * we call OPAE API fpgaUnregisterEvent() to unregister FPGA event, + * it tells driver caller is no longer interested in notification for event associated with m_event_handle + * we call OPAE API fpgaDestroyEventHandle() to free resources + */ +void KernelInterrupt::disable_interrupts() { + if (!enable_thread) { + if (std::getenv("MMD_ENABLE_DEBUG")) { + MMD_DEBUG("DEBUG LOG : KernelInterrupt disabling interrupts\n"); + } + assert(m_work_thread_active == false); + return; + } + + m_work_thread_active = false; + notify_work_thread(); + m_work_thread->join(); + + if (m_event_handle != nullptr) { + fpga_result res; + + res = fpgaUnregisterEvent(m_fpga_handle, FPGA_EVENT_INTERRUPT, m_event_handle); + check_result(res, "error fpgaUnregisterEvent"); + + res = fpgaDestroyEventHandle(&m_event_handle); + check_result(res, "error fpgaDestroyEventHandle"); + } + if (std::getenv("MMD_ENABLE_DEBUG")) { + MMD_DEBUG("DEBUG LOG : KernelInterrupt disabling interrupts\n"); + } +} + +/** notify_work_thread() function is called by disable_interrupts() function + * eventfd object created by OPAE API fpgaGetOSObjectFromEventHandle() , m_eventfd, + * can be used as an event wait/notify mechanism by user space applications and by kernel, + * to notify user space applications of events + * every time write() is performed on eventfd, + * the value of uint64_t being written is added to count and wakeup is performed. 
+ * We dont use read() below but read() will return count value to user space and reset count to 0 + */ +void KernelInterrupt::notify_work_thread() { + uint64_t val = 1; + ssize_t res = write(m_eventfd, &val, sizeof(val)); + if (res < 0) { + std::cerr << "Warning: KernelInterrupts::notify_work_thread()" + " write to eventfd failed: " + << strerror(errno) << std::endl; + } +} + +/** enable_interrupts() function is called by Kernel Interrupt constructor + * if interrupt is not enabled it will disable interrupt mask , set thread active as false and return + * if interrupt is enabled, it will use OPAE APIs to create event handle fpgaCreateEventHandle() + * OPAE event APIs provide functions for handling asynchronous events such as errors and interrupts + * Associated with every event a process has registered for is an fpga_event_handle, + * which encapsulates OS specific data structure for event objects + * On Linux fpga_event_handle can be used as file descriptor + * and passed to select(), poll() and similar functions to wait for asynchronous events + * OPAE API fpgaRegisterEvent() is used to tell driver that caller is interested in notification for event specified + * OPAE API fpgaGetOSObjectFromEventHandle() checks validity of event handle and + * gets OS object used to subscribe and unsubscribe to events + * we create a thread and call work_thread() + */ +void KernelInterrupt::enable_interrupts() { + if (!enable_thread) { + if (std::getenv("MMD_ENABLE_DEBUG")) { + MMD_DEBUG("DEBUG LOG : KernelInterrupt enabling interrupts\n"); + } + m_work_thread_active = false; + return; + } + + fpga_result res; + + res = fpgaCreateEventHandle(&m_event_handle); + check_result(res, "error creating event handle"); + + res = fpgaRegisterEvent(m_fpga_handle, FPGA_EVENT_INTERRUPT, m_event_handle, mmd_kernel_interrupt_line_num); + check_result(res, "error registering event"); + + res = fpgaGetOSObjectFromEventHandle(m_event_handle, &m_eventfd); + check_result(res, "error getting event 
file handle"); + + m_work_thread_active = true; + m_work_thread = std::unique_ptr<std::thread>(new std::thread([this] { this->work_thread(); })); + if (std::getenv("MMD_ENABLE_DEBUG")) { + MMD_DEBUG("DEBUG LOG : KernelInterrupt enabling interrupts\n"); + } +} + +/** work_thread() is called from enable_interrupts() function while creating new thread + * it calls wait_for_event(), disables interrupt mask + * creates lock_guard with m_mutex, calls kernel interrupt function and then enables interrupt mask + */ +void KernelInterrupt::work_thread() { + while (m_work_thread_active) { + wait_for_event(); + std::lock_guard<std::mutex> lock(m_mutex); + if (m_kernel_interrupt_fn != nullptr) { + m_kernel_interrupt_fn(m_mmd_handle, m_kernel_interrupt_user_data); + } + } +} + +/** wait_for_event() is called from work_thread() function + * it uses poll() function to wait for event on a file descriptor, + * the m_event_fd file descriptor which we got from fpgaOSObjectFromEventHandle() + * poll() uses pollfd struct, which inncludes + * fd - file descriptor, events - requested events, revents - returned events + * timeout argument in poll() specifies number of milliseconds, + * poll() will block waiting for file descriptor + * On success, poll() returns a nonnegative value which is the + * number of elements in the pollfds whose revents fields have been + * set to a nonzero value (indicating an event or an error). A + * return value of zero indicates that the system call timed out + * before any file descriptors became read + */ +void KernelInterrupt::wait_for_event() { + // Use timeout when polling eventfd because sometimes interrupts are missed. + // This may be caused by knonw race condition with runtime, or there may + // be occasional events lost from OPAE. 
+ + MMD_DEBUG("DEBUG LOG : KernelInterrupt waiting for event using poll()\n"); + const int timeout_ms = 250; + struct pollfd pfd = {.fd = m_eventfd, .events = POLLIN, .revents = 0}; + int num_events = poll(&pfd, 1, timeout_ms); + if (num_events <= 0) { + std::string err(num_events < 0 ? strerror(errno) : "timed out"); + std::string err_str("poll(): "); + debug_print(err_str.append(err), 1); + } else if (pfd.revents != POLLIN) { + std::string err("poll error num: ", pfd.revents); + debug_print(err, 0); + } else { + uint64_t val = 0; + ssize_t bytes_read = read(pfd.fd, &val, sizeof(val)); + if (bytes_read < 0) { + std::string err(strerror(errno)); + std::string err_str("read: "); + debug_print(err_str.append(err), 1); + } + } +} + +void KernelInterrupt::set_kernel_interrupt(aocl_mmd_interrupt_handler_fn fn, void *user_data) { + MMD_DEBUG("DEBUG LOG : KernelInterrupt setting kernel interrupt\n"); + std::lock_guard<std::mutex> lock(m_mutex); + m_kernel_interrupt_fn = fn; + m_kernel_interrupt_user_data = user_data; +} + +/** Configure interrupts + * set_member_for_interrupts() called from KernelInterrupts constructor + */ +void KernelInterrupt::set_member_for_interrupts() { + static bool initialized = false; + if (initialized) { + return; + } + // Use interrupts + MMD_DEBUG("DEBUG LOG : Using interrupts\n"); + + enable_thread = true; + initialized = true; +} diff --git a/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/kernel_interrupt.h b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/kernel_interrupt.h new file mode 100644 index 0000000..9ea6e68 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/kernel_interrupt.h @@ -0,0 +1,68 @@ +// (c) 1992-2024 Intel Corporation. +// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words +// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. +// and/or other countries. 
Other marks and brands may be claimed as the property +// of others. See Trademarks on intel.com for full list of Intel trademarks or +// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) +// Your use of Intel Corporation's design tools, logic functions and other +// software and tools, and its AMPP partner logic functions, and any output +// files any of the foregoing (including device programming or simulation +// files), and any associated documentation or information are expressly subject +// to the terms and conditions of the Altera Program License Subscription +// Agreement, Intel MegaCore Function License Agreement, or other applicable +// license agreement, including, without limitation, that your use is for the +// sole purpose of programming logic devices manufactured by Intel and sold by +// Intel or its authorized distributors. Please refer to the applicable +// agreement for further details. + +#ifndef KERNEL_INTERRUPT_H_ +#define KERNEL_INTERRUPT_H_ + +#include <opae/fpga.h> + +#include <atomic> +#include <chrono> +#include <mutex> +#include <thread> + +#include "aocl_mmd.h" + +namespace intel_opae_mmd { + +class KernelInterrupt final { + public: + KernelInterrupt(fpga_handle fpga_handle_arg, int mmd_handle); + ~KernelInterrupt(); + + void enable_interrupts(); + void disable_interrupts(); + void set_kernel_interrupt(aocl_mmd_interrupt_handler_fn fn, void *user_data); + + KernelInterrupt(const KernelInterrupt &) = delete; + KernelInterrupt &operator=(const KernelInterrupt &) = delete; + KernelInterrupt(KernelInterrupt &&) = delete; + KernelInterrupt &operator=(KernelInterrupt &&) = delete; + + private: + static void set_member_for_interrupts(); + + void notify_work_thread(); + void wait_for_event(); + void work_thread(); + + static bool enable_thread; + + std::mutex m_mutex; + std::unique_ptr<std::thread> m_work_thread; + std::atomic<bool> m_work_thread_active; + int m_eventfd; + aocl_mmd_interrupt_handler_fn 
m_kernel_interrupt_fn; + void *m_kernel_interrupt_user_data; + fpga_handle m_fpga_handle; + int m_mmd_handle; + fpga_event_handle m_event_handle; +}; + +}; // namespace intel_opae_mmd + +#endif // KERNEL_INTERRUPT_H_ diff --git a/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd.cpp b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd.cpp new file mode 100644 index 0000000..58cd8e0 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd.cpp @@ -0,0 +1,830 @@ +// (c) 1992-2024 Intel Corporation. +// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words +// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. +// and/or other countries. Other marks and brands may be claimed as the property +// of others. See Trademarks on intel.com for full list of Intel trademarks or +// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) +// Your use of Intel Corporation's design tools, logic functions and other +// software and tools, and its AMPP partner logic functions, and any output +// files any of the foregoing (including device programming or simulation +// files), and any associated documentation or information are expressly subject +// to the terms and conditions of the Altera Program License Subscription +// Agreement, Intel MegaCore Function License Agreement, or other applicable +// license agreement, including, without limitation, that your use is for the +// sole purpose of programming logic devices manufactured by Intel and sold by +// Intel or its authorized distributors. Please refer to the applicable +// agreement for further details. + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <zlib.h> + +#include <linux/mman.h> +#include <sys/mman.h> + +// On some systems MAP_HUGE_2MB is not defined. 
It should be defined for all +// platforms that DCP supports, but we also want ability to compile MMD on +// CentOS 6 systems. +#ifndef MAP_HUGE_SHIFT +#define MAP_HUGE_SHIFT 26 +#endif + +#ifndef MAP_HUGE_2MB +#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT) +#endif + +#ifndef MAP_HUGE_1GB +#define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT) +#endif + +#include <algorithm> +#include <cassert> +#include <cstdio> +#include <iomanip> +#include <iostream> +#include <map> +#include <sstream> +#include <unordered_map> +#include <vector> +#ifdef DLA_MMD +#include <chrono> +#include <thread> +#endif + +#include "aocl_mmd.h" +#include "mmd_device.h" + +bool diagnose = 0; + +/** If the MMD is loaded dynamically, destructors in the MMD will execute before + * the destructors in the runtime upon program termination. The DeviceMapManager + * guards accesses to the device/handle maps to make sure the runtime doesn't + * get to reference them after MMD destructors have been called. Destructor + * makes sure that all devices are closed at program termination regardless of + * what the runtime does. Implemented as a singleton. + */ +class DeviceMapManager final { + public: + /** C++ std map data structure to keep track of + * object id -> handle and handle -> device + */ + typedef std::map<int, Device *> t_handle_to_dev_map; + typedef std::map<uint64_t, int> t_id_to_handle_map; + + static const int SUCCESS = 0; + static const int FAILURE = -1; + + /** Returns handle and device pointer to the device with the specified name + * Creates a new entry for this device if it doesn't already exist + * Return 0 on success, -1 on failure + */ + int get_or_create_device(const char *board_name, int *handle, Device **device); + + /** Return obj id based on ASP name.*/ + uint64_t id_from_name(const char *board_name); + + /** Return MMD handle based on obj id. 
Returned value is negative if board + * doesn't exist + */ + inline int handle_from_id(uint64_t obj_id); + + /** Return pointer to device based on MMD handle. Returned value is null + * if board doesn't exist + */ + Device *device_from_handle(int handle); + + /** Closes specified device if it exists */ + void close_device_if_exists(int handle); + + /* Returns a reference to the class singleton */ + static DeviceMapManager &get_instance() { + static DeviceMapManager instance; + return instance; + } + + DeviceMapManager(DeviceMapManager const &) = delete; + void operator=(DeviceMapManager const &) = delete; + ~DeviceMapManager() { + // delete all allocated Device* entries + while (handle_to_dev_map->size() > 0) { + int handle = handle_to_dev_map->begin()->first; + aocl_mmd_close(handle); +#ifdef SIM + std::cout << "# mmd.cpp: When destroying DeviceMapManager in ASE, assume it worked.\n"; + break; +#endif + MMD_DEBUG("DEBUG LOG : In DeviceMapManager destructor, closing device with handle %d \n", handle); + } + delete handle_to_dev_map; + delete id_to_handle_map; + handle_to_dev_map = nullptr; + id_to_handle_map = nullptr; + } + + private: + DeviceMapManager() { + handle_to_dev_map = new t_handle_to_dev_map(); + id_to_handle_map = new t_id_to_handle_map(); + + MMD_DEBUG("DEBUG LOG : Constructing DeviceMapManager object\n"); + } + t_handle_to_dev_map *handle_to_dev_map = nullptr; + t_id_to_handle_map *id_to_handle_map = nullptr; +}; +static DeviceMapManager &device_manager = DeviceMapManager::get_instance(); + +/** Returns handle and device pointer to the device with the specified name + * Creates a new entry for this device if it doesn't already exist + * Return 0 on success, -1 on failure + */ +int DeviceMapManager::get_or_create_device(const char *board_name, int *handle, Device **device) { + int _handle = MMD_INVALID_PARAM; + Device *_device = nullptr; + + if (id_to_handle_map == nullptr || handle_to_dev_map == nullptr) { + MMD_DEBUG( + "DEBUG LOG : Failure in 
DeviceMapManager::get_or_create_device,id_to_handle_map or handle_to_dev_map is " + "NULL\n"); + return DeviceMapManager::FAILURE; + } + + uint64_t obj_id = id_from_name(board_name); + if (!obj_id) { + MMD_DEBUG("DEBUG LOG : Failure in DeviceMapManager::get_or_create_device. obj_id : %ld \n", obj_id); + return false; + } + if (id_to_handle_map->count(obj_id) == 0) { + try { + _device = new Device(obj_id); + _handle = _device->get_mmd_handle(); + id_to_handle_map->insert({obj_id, _handle}); + handle_to_dev_map->insert({_handle, _device}); + } catch (std::runtime_error &e) { + MMD_DEBUG("DEBUG LOG : Failure in DeviceMapManager::get_or_create_device %s\n", e.what()); + delete _device; + return DeviceMapManager::FAILURE; + } + MMD_DEBUG("DEBUG LOG : Success in creating new device object handle : %d \n", _handle); + } else { + _handle = id_to_handle_map->at(obj_id); + _device = handle_to_dev_map->at(_handle); + MMD_DEBUG("DEBUG LOG : Success in retrieving device metadata(handle , object) , handle : %d\n", _handle); + } + + (*handle) = _handle; + (*device) = _device; + + MMD_DEBUG("DEBUG LOG : Success in creating new device object , handle : %d\n", _handle); + return DeviceMapManager::SUCCESS; +} + +/** Return obj id based on ASP name.*/ +uint64_t DeviceMapManager::id_from_name(const char *board_name) { + uint64_t obj_id = 0; + if (Device::parse_board_name(board_name, obj_id)) { + MMD_DEBUG("DEBUG LOG : Success in retrieving object id from board name\n"); + return obj_id; + } else { + MMD_DEBUG("DEBUG LOG : Failed to retrieve object id from board name\n"); + return 0; + } +} + +/** Return MMD handle based on obj id. 
Returned value is negative if board + * doesn't exist + */ +inline int DeviceMapManager::handle_from_id(uint64_t obj_id) { + int handle = MMD_INVALID_PARAM; + if (id_to_handle_map) { + auto it = id_to_handle_map->find(obj_id); + if (it != id_to_handle_map->end()) { + handle = it->second; + } + MMD_DEBUG("DEBUG LOG : Success in retrieving handle from object id. handle : %d \n", handle); + } else { + MMD_DEBUG("DEBUG LOG : Failed to retrieve handle from object id \n"); + } + return handle; +} + +/** Return pointer to device based on MMD handle. Returned value is null + * if board doesn't exist + */ +Device *DeviceMapManager::device_from_handle(int handle) { + Device *dev = nullptr; + if (handle_to_dev_map) { + auto it = handle_to_dev_map->find(handle); + if (it != handle_to_dev_map->end()) { + return it->second; + } + MMD_DEBUG("DEBUG LOG : Success in retrieving device from handle. handle : %d \n", handle); + } else { + MMD_DEBUG("DEBUG LOG : Failed to retrieve device from handle\n"); + } + return dev; +} + +/** Closes specified device if it exists */ +void DeviceMapManager::close_device_if_exists(int handle) { + if (handle_to_dev_map) { + if (handle_to_dev_map->count(handle) > 0) { + Device *dev = handle_to_dev_map->at(handle); + uint64_t obj_id = dev->get_fpga_obj_id(); + delete dev; + + handle_to_dev_map->erase(handle); + id_to_handle_map->erase(obj_id); + MMD_DEBUG("DEBUG LOG : Closing device with handle : %d\n", handle); + } else { + MMD_DEBUG("DEBUG LOG : Nothing to close. 
Device with handle : %d already closed\n", handle); + } + } else { + MMD_DEBUG("DEBUG LOG : Error, no handle to device map entry found for handle : %d \n", handle); + } +} + +/** Interface for checking if AFU has ASP loaded */ +bool mmd_asp_loaded(const char *name) { + uint64_t obj_id = device_manager.id_from_name(name); + if (!obj_id) { + MMD_DEBUG("DEBUG LOG : Error, no object id found for board : %s \n", name); + return false; + } + + int handle = device_manager.handle_from_id(obj_id); + if (handle > 0) { + Device *dev = device_manager.device_from_handle(handle); + if (dev) { + MMD_DEBUG("DEBUG LOG : ASP loaded for handle : %d \n", handle); + return dev->asp_loaded(); + } else { + MMD_DEBUG("DEBUG LOG : ASP not loaded for handle : %d \n", handle); + return false; + } + } else { + bool asp_loaded = false; + try { + Device dev(obj_id); + asp_loaded = dev.asp_loaded(); + } catch (std::runtime_error &e) { + MMD_DEBUG("DEBUG LOG : ASP not loaded for handle : %d , %s\n", handle, e.what()); + return false; + } + + MMD_DEBUG("DEBUG LOG : ASP loaded : %d (0 - not loaded , 1 - loaded) for handle : %d \n", asp_loaded, handle); + return asp_loaded; + } +} + +/** Function called as part of aocl_mmd_get_offline_info() + * to determine number of baords in system + */ +static unsigned int get_offline_num_acl_boards(const char *asp_uuid) { + bool asp_only = true; + fpga_guid guid; + fpga_result res = FPGA_OK; + uint32_t num_matches = 0; + bool ret_err = false; + fpga_properties filter = NULL; + + if (uuid_parse(asp_uuid, guid) < 0) { + MMD_DEBUG("Error parsing guid '%s'\n", asp_uuid); + ret_err = true; + goto out; + } + + res = fpgaGetProperties(NULL, &filter); + if (res != FPGA_OK) { + MMD_DEBUG("Error creating properties object: %s\n", fpgaErrStr(res)); + ret_err = true; + goto out; + } + + if (asp_only) { + res = fpgaPropertiesSetGUID(filter, guid); + if (res != FPGA_OK) { + MMD_DEBUG("Error setting GUID: %s\n", fpgaErrStr(res)); + ret_err = true; + goto out; + } + } + + res 
= fpgaPropertiesSetObjectType(filter, FPGA_ACCELERATOR); + if (res != FPGA_OK) { + MMD_DEBUG("Error setting object type: %s\n", fpgaErrStr(res)); + ret_err = true; + goto out; + } + + res = fpgaEnumerate(&filter, 1, NULL, 0, &num_matches); + if (res != FPGA_OK) { + MMD_DEBUG("Error enumerating AFCs: %s\n", fpgaErrStr(res)); + ret_err = true; + goto out; + } + +out: + if (filter) fpgaDestroyProperties(&filter); + + if (ret_err) { + return MMD_AOCL_ERR; + } else { + return num_matches; + } +} + +/** Function called as part of aocl_mmd_get_offline_info() + * to determine names of boards in the system + */ +static bool get_offline_board_names(std::string &boards, bool asp_only = true) { + boards = "dla_agx7_ofs_board"; + return true; +} + +// Macros used for acol_mmd_get_offline_info and aocl_mmd_get_info +#define RESULT_INT(X) \ + { \ + *((int *)param_value) = X; \ + if (param_size_ret) *param_size_ret = sizeof(int); \ + } +#define RESULT_SIZE_T(X) \ + { \ + *((size_t *)param_value) = X; \ + if (param_size_ret) *param_size_ret = sizeof(size_t); \ + } + +#define RESULT_STR(X) \ + do { \ + unsigned Xlen = strnlen(X, 4096) + 1; \ + unsigned Xcpylen = (param_value_size <= Xlen) ? param_value_size : Xlen; \ + memcpy((void *)param_value, X, Xcpylen); \ + if (param_size_ret) *param_size_ret = Xcpylen; \ + } while (0) + +/** Get information about the board using the enum aocl_mmd_offline_info_t for + * offline info (called without a handle), and the enum aocl_mmd_info_t for + * info specific to a certain board. + * Arguments: + * + * requested_info_id - a value from the aocl_mmd_offline_info_t enum + * + * param_value_size - size of the param_value field in bytes. This should + * match the size of the return type expected as indicated in the enum + * definition. + * + * param_value - pointer to the variable that will receive the returned info + * + * param_size_ret - receives the number of bytes of data actually returned + * + * Returns: a negative value to indicate error. 
 */

// From DLA perspective, only AOCL_MMD_BOARD_NAMES info we care
int aocl_mmd_get_offline_info(aocl_mmd_offline_info_t requested_info_id,
                              size_t param_value_size,
                              void *param_value,
                              size_t *param_size_ret) {
  /** aocl_mmd_get_offline_info can be called many times by the runtime
   * and it is expensive to query the system. Only compute values first
   * time aocl_mmd_get_offline_info called future iterations use saved results
   */
  static bool initialized = false;
  static int mem_type_info;
  static unsigned int num_acl_boards;
  static std::string boards;
  static bool success;

  if (!initialized) {
    // One-time probe: count matching AFUs and fetch the (fixed) board name.
    mem_type_info = (int)AOCL_MMD_PHYSICAL_MEMORY;
    num_acl_boards = get_offline_num_acl_boards(I_DK_AFU_ID);
    success = get_offline_board_names(boards, true);
    initialized = true;
  }

  // Each case writes the answer into param_value via the RESULT_* macros.
  switch (requested_info_id) {
    case AOCL_MMD_VERSION:
      RESULT_STR(AOCL_MMD_VERSION_STRING);
      break;
    case AOCL_MMD_NUM_BOARDS: {
      RESULT_INT(num_acl_boards);
      break;
    }
    case AOCL_MMD_VENDOR_NAME:
      RESULT_STR("Intel Corp");
      break;
    case AOCL_MMD_BOARD_NAMES: {
      if (success) {
        RESULT_STR(boards.c_str());
      } else {
        return MMD_AOCL_ERR;
      }
      break;
    }
    case AOCL_MMD_VENDOR_ID:
      RESULT_INT(0);
      break;
    case AOCL_MMD_USES_YIELD:
      RESULT_INT(0);
      break;
    case AOCL_MMD_MEM_TYPES_SUPPORTED:
      RESULT_INT(mem_type_info);
      break;
  }

  return 0;
}

/** Get information about the board using the enum aocl_mmd_info_t for
 * info specific to a certain board.
 * Arguments:
 *
 *   requested_info_id - a value from the aocl_mmd_info_t enum
 *
 *   param_value_size - size of the param_value field in bytes. This should
 *   match the size of the return type expected as indicated in the enum
 *   definition. For example, the AOCL_MMD_TEMPERATURE returns a float, so
 *   the param_value_size should be set to sizeof(float) and you should
 *   expect the same number of bytes returned in param_size_ret.
 *
 *   param_value - pointer to the variable that will receive the returned info
 *
 *   param_size_ret - receives the number of bytes of data actually returned
 *
 * Returns: a negative value to indicate error.
 */
int aocl_mmd_get_info(
    int handle, aocl_mmd_info_t requested_info_id, size_t param_value_size, void *param_value, size_t *param_size_ret) {
  MMD_DEBUG("DEBUG LOG : called aocl_mmd_get_info\n");
  Device *dev = device_manager.device_from_handle(handle);
  // Unknown handle: return 0 without touching param_value (original behavior).
  if (dev == NULL) return 0;

  assert(param_value);
  switch (requested_info_id) {
    case AOCL_MMD_BOARD_NAME: {
      std::ostringstream board_name;
      board_name << "Intel OFS Platform"
                 << " (" << dev->get_dev_name() << ")";
      RESULT_STR(board_name.str().c_str());
      break;
    }
    case AOCL_MMD_NUM_KERNEL_INTERFACES:
      RESULT_INT(1);
      break;
    case AOCL_MMD_KERNEL_INTERFACES:
      RESULT_INT(AOCL_MMD_KERNEL);
      break;
#ifdef SIM
    // NOTE(review): both branches of this #ifdef are identical; the SIM/real
    // split looks historical -- confirm before collapsing it.
    case AOCL_MMD_PLL_INTERFACES:
      RESULT_INT(-1);
      break;
#else
    case AOCL_MMD_PLL_INTERFACES:
      RESULT_INT(-1);
      break;
#endif
    case AOCL_MMD_MEMORY_INTERFACE:
      RESULT_INT(AOCL_MMD_MEMORY);
      break;
    case AOCL_MMD_PCIE_INFO: {
      RESULT_STR(dev->get_bdf().c_str());
      break;
    }
    case AOCL_MMD_BOARD_UNIQUE_ID:
      RESULT_INT(0);
      break;
    case AOCL_MMD_TEMPERATURE: {
      // Only answer when the caller passed a buffer of the documented size.
      if (param_value_size == sizeof(float)) {
        float *ptr = static_cast<float *>(param_value);
        *ptr = dev->get_temperature();
        if (param_size_ret) *param_size_ret = sizeof(float);
      }
      break;
    }
    case AOCL_MMD_CONCURRENT_READS:
      RESULT_INT(1);
      break;
    case AOCL_MMD_CONCURRENT_WRITES:
      RESULT_INT(1);
      break;
    case AOCL_MMD_CONCURRENT_READS_OR_WRITES:
      RESULT_INT(2);
      break;

    case AOCL_MMD_MIN_HOST_MEMORY_ALIGNMENT:
      RESULT_SIZE_T(64);
      break;

    case AOCL_MMD_HOST_MEM_CAPABILITIES: {
      RESULT_INT(0);
      break;
    }
    case AOCL_MMD_SHARED_MEM_CAPABILITIES: {
      RESULT_INT(0);
      break;
    }

    case AOCL_MMD_DEVICE_MEM_CAPABILITIES:
      RESULT_INT(0);
      break;
    case AOCL_MMD_HOST_MEM_CONCURRENT_GRANULARITY:
      RESULT_SIZE_T(0);
      break;
    case AOCL_MMD_SHARED_MEM_CONCURRENT_GRANULARITY:
      RESULT_SIZE_T(0);
      break;
    case AOCL_MMD_DEVICE_MEM_CONCURRENT_GRANULARITY:
      RESULT_SIZE_T(0);
      break;
  }
  return 0;
}

// NOTE(review): RESULT_SIZE_T is left defined here while the other two macros
// are undefined; it may be used later in the file -- confirm before adding an
// #undef.
#undef RESULT_INT
#undef RESULT_STR

/** Set the interrupt handler for the opened device.
 * The interrupt handler is called whenever the client needs to be notified
 * of an asynchronous event signaled by the device internals.
 * For example, the kernel has completed or is stalled.
 *
 * Important: Interrupts from the kernel must be ignored until this handler is
 * set
 *
 * Arguments:
 *   fn - the callback function to invoke when a kernel interrupt occurs
 *   user_data - the data that should be passed to fn when it is called.
 *
 * Returns: 0 if successful, negative on error
 */
int AOCL_MMD_CALL aocl_mmd_set_interrupt_handler(int handle, aocl_mmd_interrupt_handler_fn fn, void *user_data) {
  Device *dev = device_manager.device_from_handle(handle);
  if (dev) {
    dev->set_kernel_interrupt(fn, user_data);
    MMD_DEBUG("DEBUG LOG : Set kernel interrupt handler for device handle : %d\n", handle);
  } else {
    MMD_DEBUG("DEBUG LOG : Error setting kernel interrupt handler for device handle : %d\n", handle);
    return MMD_AOCL_ERR;
  }
  return 0;
}

/** Set the operation status handler for the opened device.
 * The operation status handler is called with
 * status 0 when the operation has completed successfully.
 * status negative when the operation completed with errors.
 *
 * Arguments:
 *   fn - the callback function to invoke when a status update is to be
 *   performed.
 *   user_data - the data that should be passed to fn when it is called.
 *
 * Returns: 0 if successful, negative on error
 */

int AOCL_MMD_CALL aocl_mmd_set_status_handler(int handle, aocl_mmd_status_handler_fn fn, void *user_data) {
  Device *dev = device_manager.device_from_handle(handle);
  // Note: unlike the interrupt-handler setter, an unknown handle is not an
  // error here; the call is silently a no-op returning 0.
  if (dev) {
    dev->set_status_handler(fn, user_data);
    MMD_DEBUG("DEBUG LOG : Set status handler for device handle : %d\n", handle);
  }
  return 0;
}

/** Host to device-global-memory write (HOST DDR -> FPGA DDR)
 * If op is NULL
 *    - Then these calls must block until the operation is complete.
 *    - The status handler is not called for this operation.
 *
 * If op is non-NULL, then:
 *    - These may be non-blocking calls
 *    - The status handler must be called upon completion, with status 0
 *    for success, and a negative value for failure.
 *
 * Arguments:
 *   op - the operation object used to track this operations progress
 *
 *   len - the size in bytes to transfer
 *
 *   src - the host buffer being read from
 *
 *   dst - the host buffer being written to
 *
 *   mmd_interface - the handle to the interface being accessed. E.g. To
 *   access global memory this handle will be whatever is returned by
 *   aocl_mmd_get_info when called with AOCL_MMD_MEMORY_INTERFACE.
 *
 *   offset/src_offset/dst_offset - the byte offset within the interface that
 *   the transfer will begin at.
 *
 * The return value is 0 if the operation launch was successful, and
 * negative otherwise.
+ */ +int AOCL_MMD_CALL +aocl_mmd_write(int handle, aocl_mmd_op_t op, size_t len, const void *src, int mmd_interface, size_t offset) { + MMD_DEBUG( + "DEBUG LOG : aocl_mmd_write: handle : %d\t operation : %p\t len : 0x%zx\t src : %p\t mmd_interface : %d\t offset " + ": 0x%zx\n", + handle, + op, + len, + src, + mmd_interface, + offset); + Device *dev = device_manager.device_from_handle(handle); + if (dev){ + return dev->write_block(op, mmd_interface, src, offset, len); + } + else { + MMD_DEBUG("DEBUG LOG : Error in aocl_mmd_write , device not found for handle : %d\n", handle); + return -1; + } +} + +/** Host reading from device-global-memory (FPGA DDR -> HOST DDR) + * If op is NULL + * - Then these calls must block until the operation is complete. + * - The status handler is not called for this operation. + * + * If op is non-NULL, then: + * - These may be non-blocking calls + * - The status handler must be called upon completion, with status 0 + * for success, and a negative value for failure. + * + * Arguments: + * op - the operation object used to track this operations progress + * + * len - the size in bytes to transfer + * + * src - the host buffer being read from + * + * dst - the host buffer being written to + * + * mmd_interface - the handle to the interface being accessed. E.g. To + * access global memory this handle will be whatever is returned by + * aocl_mmd_get_info when called with AOCL_MMD_MEMORY_INTERFACE. + * + * offset/src_offset/dst_offset - the byte offset within the interface that + * the transfer will begin at. + * + * The return value is 0 if the operation launch was successful, and + * negative otherwise. 
+ */ + +int AOCL_MMD_CALL aocl_mmd_read(int handle, aocl_mmd_op_t op, size_t len, void *dst, int mmd_interface, size_t offset) { + MMD_DEBUG( + "DEBUG LOG : aocl_mmd_read: handle : %d\t operation : %p\t len : 0x%zx\t dst : %p\t mmd_interface : %d\t offset " + ": 0x%zx\n", + handle, + op, + len, + dst, + mmd_interface, + offset); + Device *dev = device_manager.device_from_handle(handle); + if (dev){ + return dev->read_block(op, mmd_interface, dst, offset, len); + } + else { + MMD_DEBUG("DEBUG LOG : Error in aocl_mmd_read , device not found for handle : %d\n", handle); + return -1; + } +} + +/** Open and initialize the named device. + * + * The name is typically one specified by the AOCL_MMD_BOARD_NAMES offline + * info. + * + * Arguments: + * name - open the board with this name (provided as a C-style string, + * i.e. NUL terminated ASCII.) + * + * Returns: the non-negative integer handle for the board, otherwise a + * negative value to indicate error. Upon receiving the error, the OpenCL + * runtime will proceed to open other known devices, hence the MMD mustn't + * exit the application if an open call fails. 
+ */ + +int AOCL_MMD_CALL aocl_mmd_open(const char *name) { + + MMD_DEBUG("DEBUG LOG : aocl_mmd_open, Opening device: %s\n", name); + + uint64_t obj_id = device_manager.id_from_name(name); + if (!obj_id) { + MMD_DEBUG("DEBUG LOG : Error while aocl_mmd_open, object id not found for board : %s\n", name); + return MMD_INVALID_PARAM; + } + + int handle; + Device *dev = nullptr; + if (device_manager.get_or_create_device(name, &handle, &dev) != DeviceMapManager::SUCCESS) { + if (std::getenv("MMD_PROGRAM_DEBUG") || std::getenv("MMD_DMA_DEBUG") || std::getenv("MMD_ENABLE_DEBUG")) { + MMD_DEBUG("DEBUG LOG : Error while aocl_mmd_open, device not found for board : %s\n", name); + } + return MMD_AOCL_ERR; + } + + assert(dev); + if (dev->asp_loaded()) { + if (!dev->initialize_asp()) { + MMD_DEBUG("DEBUG LOG : Error while aocl_mmd_open, Error initializing asp for board : %s\n", name); + return MMD_ASP_INIT_FAILED; + } + } else { + MMD_DEBUG("DEBUG LOG : Error while aocl_mmd_open, asp not loaded for board : %s\n", name); + return MMD_ASP_NOT_LOADED; + } + MMD_DEBUG("end of aocl_mmd_open \n"); + MMD_DEBUG("DEBUG LOG : Success aocl_mmd_open for board : %s, handle : %d \n", name, handle); + return handle; +} + +/** Close an opened device, by its handle. + * Returns: 0 on success, negative values on error. + */ +int AOCL_MMD_CALL aocl_mmd_close(int handle) { +#ifndef SIM + device_manager.close_device_if_exists(handle); +#else + std::cout << "# mmd.cpp: During simulation (ASE) we are not closing the device.\n"; +#endif + return 0; +} + +// CoreDLA modifications +// To support multiple different FPGA boards, anything board specific must be implemented in a +// board-specific MMD instead of the CoreDLA runtime layer. 
#ifdef DLA_MMD
// Query functions to get board-specific values

// Maximum number of DLA IP instances this board variant supports.
AOCL_MMD_CALL int dla_mmd_get_max_num_instances() { return 4; }

// DLA can only uses 4GB DDR as of 2024.2
AOCL_MMD_CALL uint64_t dla_mmd_get_ddr_size_per_instance() { return 1ULL << 32; }

// DDR clock frequency in MHz; board-dependent at compile time.
AOCL_MMD_CALL double dla_mmd_get_ddr_clock_freq() {
  #ifdef USE_N6001_BOARD
  return 300.0; // MHz
  #else
  return 333.333333; // MHz
  #endif
}

// Helper functions for the wrapper functions around CSR and DDR
// CSR windows start at 0x10000 and are spaced 0x800 bytes apart per instance.
uint64_t dla_get_raw_csr_address(int instance, uint64_t addr) { return 0x10000 + (0x800 * instance) + addr; }
// Per-instance DDR aperture stride: 4GB on N6001, 8GB otherwise.
// NOTE(review): the non-N6001 stride (1ULL << 33) is double the 4GB
// per-instance size reported above — presumably the address map leaves a
// gap between instances; confirm against the board's memory map.
uint64_t dla_get_raw_ddr_address(int instance, uint64_t addr) {
  #ifdef USE_N6001_BOARD
  return (1ULL << 32) * instance + addr;
  #else
  return (1ULL << 33) * instance + addr;
  #endif
}

// Wrappers around CSR and DDR reads and writes to abstract away board-specific offsets.
// All four wrappers issue blocking transfers (op == NULL) and propagate the
// aocl_mmd_read/write return code (0 on success, negative on error).
AOCL_MMD_CALL int dla_mmd_csr_write(int handle, int instance, uint64_t addr, const uint32_t *data) {
  return aocl_mmd_write(
      handle, NULL, sizeof(uint32_t), data, AOCL_MMD_DLA_CSR, dla_get_raw_csr_address(instance, addr));
}

AOCL_MMD_CALL int dla_mmd_csr_read(int handle, int instance, uint64_t addr, uint32_t *data) {
  return aocl_mmd_read(handle, NULL, sizeof(uint32_t), data, AOCL_MMD_DLA_CSR, dla_get_raw_csr_address(instance, addr));
}

AOCL_MMD_CALL int dla_mmd_ddr_write(int handle, int instance, uint64_t addr, uint64_t length, const void *data) {
  return aocl_mmd_write(handle, NULL, length, data, AOCL_MMD_MEMORY, dla_get_raw_ddr_address(instance, addr));
}

AOCL_MMD_CALL int dla_mmd_ddr_read(int handle, int instance, uint64_t addr, uint64_t length, void *data) {
  return aocl_mmd_read(handle, NULL, length, data, AOCL_MMD_MEMORY, dla_get_raw_ddr_address(instance, addr));
}

// Measure the DLA clock frequency (MHz) by running a hardware counter on
// clk_dla for ~10ms of wall-clock time and dividing ticks by elapsed time.
// NOTE(review): the aocl_mmd_write/read return codes are only checked via
// assert(), which compiles out under NDEBUG — a failed CSR access would then
// silently yield a bogus frequency; consider explicit error handling.
AOCL_MMD_CALL double dla_mmd_get_coredla_clock_freq(int handle) {
  constexpr uint64_t hw_timer_address = 0x37000;
  const uint32_t start_bit = 1;
  const uint32_t stop_bit = 2;

  // Send the start command to the hardware counter
  std::chrono::high_resolution_clock::time_point time_before = std::chrono::high_resolution_clock::now();
  int status = aocl_mmd_write(handle, NULL, sizeof(uint32_t), &start_bit, AOCL_MMD_DLA_CSR, hw_timer_address);
  assert(status == 0);

  // Unlikely to sleep for exactly 10 milliseconds, but it doesn't matter since we use a high resolution clock to
  // determine the amount of time between the start and stop commands for the hardware counter
  std::this_thread::sleep_for(std::chrono::milliseconds(10));

  // Send the stop command to the hardware counter
  std::chrono::high_resolution_clock::time_point time_after = std::chrono::high_resolution_clock::now();
  status = aocl_mmd_write(handle, NULL, sizeof(uint32_t), &stop_bit, AOCL_MMD_DLA_CSR, hw_timer_address);
  assert(status == 0);

  // Read back the value of the counter
  uint32_t counter = 0;
  status = aocl_mmd_read(handle, NULL, sizeof(uint32_t), &counter, AOCL_MMD_DLA_CSR, hw_timer_address);
  assert(status == 0);

  // Calculate the clock frequency of the counter, which is running on clk_dla
  double elapsed_seconds = std::chrono::duration_cast<std::chrono::duration<double>>(time_after - time_before).count();
  return 1.0e-6 * counter / elapsed_seconds;  // 1.0e-6 is to convert to MHz
}
#endif
diff --git a/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_device.cpp b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_device.cpp
new file mode 100644
index 0000000..dd4ca42
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_device.cpp
@@ -0,0 +1,448 @@
// (c) 1992-2024 Intel Corporation.
// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words
// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S.
// and/or other countries. Other marks and brands may be claimed as the property
// of others.
See Trademarks on intel.com for full list of Intel trademarks or +// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) +// Your use of Intel Corporation's design tools, logic functions and other +// software and tools, and its AMPP partner logic functions, and any output +// files any of the foregoing (including device programming or simulation +// files), and any associated documentation or information are expressly subject +// to the terms and conditions of the Altera Program License Subscription +// Agreement, Intel MegaCore Function License Agreement, or other applicable +// license agreement, including, without limitation, that your use is for the +// sole purpose of programming logic devices manufactured by Intel and sold by +// Intel or its authorized distributors. Please refer to the applicable +// agreement for further details. + +#include <assert.h> +#include <numa.h> + +#include <inttypes.h> +#include <string.h> +#include <unistd.h> +#include <fstream> +#include <iomanip> +#include <iostream> +#include <limits> +#include <sstream> + +#include "mmd_device.h" +#include "mmd_helper.h" + +int Device::next_mmd_handle{1}; + +/** + * The Device object is created for each device/board opened and + * it has methods to interact with fpga device. + * The entry point for Device is in DeviceMapManager Class + * which maintains mapping between device names and handles. + * Device Object is foundation for interacting with device. 
 */
Device::Device(uint64_t obj_id)
    : fpga_obj_id(obj_id),
      kernel_interrupt_thread(NULL),
      event_update(NULL),
      event_update_user_data(NULL),
      enable_set_numa(false),
      fme_sysfs_temp_initialized(false),
      bus(0),
      device(0),
      function(0),
      afu_initialized(false),
      asp_initialized(false),
      mmio_is_mapped(false),
      filter(NULL),
      mmio_token(NULL),
      mmio_handle(NULL),
      fme_token(NULL),
      guid(),
      mmd_dma(NULL) {
  // Note that this constructor is not thread-safe because next_mmd_handle
  // is shared between all class instances
  MMD_DEBUG("DEBUG LOG : Constructing Device object\n");

  // Hand out monotonically increasing handles, wrapping back to 1 at
  // INT_MAX (handle 0 and negatives are reserved for error codes).
  mmd_handle = next_mmd_handle;
  if (next_mmd_handle == std::numeric_limits<int>::max())
    next_mmd_handle = 1;
  else
    next_mmd_handle++;

  // NOTE(review): this local `filter` shadows the member `filter` (which
  // stays NULL); the destructor's fpgaDestroyProperties(&filter) branch can
  // therefore never fire. Harmless here because the local is destroyed
  // below, but the member appears to be dead — confirm and remove.
  fpga_properties filter = NULL;
  uint32_t num_matches;
  fpga_result r;

  // Set up a filter that will search for an accelerator
  fpgaGetProperties(NULL, &filter);
  fpgaPropertiesSetObjectType(filter, FPGA_ACCELERATOR);

  // Add the desired UUID to the filter
  uuid_parse(I_DK_AFU_ID, guid);
  fpgaPropertiesSetGUID(filter, guid);

  // Do the search across the available FPGA contexts
  num_matches = 1;
  fpgaEnumerate(&filter, 1, &mmio_token, 1, &num_matches);

  fpgaPropertiesGetParent(filter, &fme_token);

  // Not needed anymore so we destroy the filter
  fpgaDestroyProperties(&filter);

  if (num_matches < 1) {
    throw std::runtime_error(std::string("Cannot find accelerator"));
  }

  // Open accelerator
  r = fpgaOpen(mmio_token, &mmio_handle, 0);
  assert(FPGA_OK == r);

  // While the token is available, check whether it is for HW
  // or for ASE simulation.
  fpga_properties accel_props;
  uint16_t vendor_id, dev_id;
  fpgaGetProperties(mmio_token, &accel_props);
  fpgaPropertiesGetVendorID(accel_props, &vendor_id);
  fpgaPropertiesGetDeviceID(accel_props, &dev_id);

  afu_initialized = true;
  MMD_DEBUG("DEBUG LOG : Done constructing Device object\n");
}

/** Return true if board name parses correctly, false if it does not
 * Return the parsed object_id in obj_id as an [out] parameter
 * Expected name format: ASP_NAME ("ofs_") followed by a hex object id.
 */
bool Device::parse_board_name(const char *board_name_str, uint64_t &obj_id) {
  MMD_DEBUG("DEBUG LOG : Parsing board name\n");
  std::string prefix(ASP_NAME);
  std::string board_name(board_name_str);

  obj_id = 0;
  // NOTE(review): this rejects only names that are BOTH too short AND
  // prefix-mismatched; a long name with the wrong prefix falls through to
  // the substr/stol below. Presumably the intent was `||` — verify against
  // callers before changing.
  if (board_name.length() <= prefix.length() && board_name.compare(0, prefix.length(), prefix)) {
    MMD_DEBUG("DEBUG LOG : Error parsing device name '%s'\n", board_name_str);
    return false;
  }

  // Parse the hex object id that follows the prefix.
  std::string device_num_str = board_name.substr(prefix.length());
  obj_id = std::stol(device_num_str, 0, 16);

  // Assume that OPAE does not use 0 as a valid object ID. This is true for now
  // but relies somewhat on an implementaion dependent feature.
  assert(obj_id > 0);
  return true;
}

/** initialize_asp() function is used in aocl_mmd_open() API
 * It resets AFC and reinitializes DMA, Kernel Interrupts if in use
 * Idempotent: returns true immediately if already initialized.
 * Returns false on MMIO-map, DMA-init, or interrupt-thread failure.
 */
bool Device::initialize_asp() {
  MMD_DEBUG("DEBUG LOG : Initializing ASP ... \n");
  if (asp_initialized) {
    MMD_DEBUG("DEBUG LOG : ASP already initialized \n");
    return true;
  }

  fpga_result res = fpgaMapMMIO(mmio_handle, 0, NULL);
  if (res != FPGA_OK) {
    MMD_DEBUG("Error mapping MMIO space: %s\n", fpgaErrStr(res));
    return false;
  }
  mmio_is_mapped = true;

  // Trigger an user reset
  uint64_t reset = 1;
  fpgaWriteMMIO64(mmio_handle, 0, 0x40000, reset);

  AFU_RESET_DELAY();

  // DMA performance is heavily dependent on the memcpy operation that transfers
  // data from user allocated buffer to the pinned buffer that is used for
  // DMA. On some machines with multiple NUMA nodes it is critical for
  // performance that the pinned buffer is located on the NUMA node as the
  // threads that performs the DMA operation.
  //
  // The performance also improves slighlty if the DMA threads are on the same
  // NUMA node as the FPGA PCI device.
  //
  // This code pins memory allocation to occur from FPGA NUMA node prior to
  // initializing the DMA buffers. It also pins all threads in the process
  // to run on this same node.
  struct bitmask *mask = NULL;
  if (enable_set_numa) {
    mask = numa_parse_nodestring(fpga_numa_node.c_str());
    numa_set_membind(mask);
    int ret = numa_run_on_node_mask_all(mask);
    if (ret < 0) {
      fprintf(stderr, " Error setting NUMA node mask\n");
    }
  }

  MMD_DEBUG("DEBUG LOG : Initializing HOST -> FPGA DMA channel \n");

  // NOTE(review): on failure mmd_dma is deleted but left dangling (not
  // reset to NULL), and the destructor would double-delete it — confirm
  // callers destroy the Device immediately after a false return.
  mmd_dma = new intel_opae_mmd::mmd_dma(mmio_handle, mmd_handle);
  if (!mmd_dma->initialized()) {
    MMD_DEBUG("DEBUG LOG : Error initializing DMA channel \n");
    delete mmd_dma;
    return false;
  }

  // Turn off membind restriction in order to allow future allocation to
  // occur on different NUMA nodes if needed. Hypothesis is that only
  // the pinned buffers are performance critical for the memcpy. Other
  // allocations in the process can occur on other NUMA nodes if needed.
  if (enable_set_numa) {
    numa_set_membind(numa_nodes_ptr);
    numa_free_nodemask(mask);
  }

// Do not enable interrupt if polling mode is enabled in the DLA runtime.
#ifndef COREDLA_RUNTIME_POLLING
  try {
    kernel_interrupt_thread = new intel_opae_mmd::KernelInterrupt(mmio_handle, mmd_handle);
  } catch (const std::system_error &e) {
    std::cerr << "Error initializing kernel interrupt thread: " << e.what() << e.code() << std::endl;
    return false;
  } catch (const std::exception &e) {
    std::cerr << "Error initializing kernel interrupt thread: " << e.what() << std::endl;
    return false;
  }
#endif

  asp_initialized = true;
  MMD_DEBUG("DEBUG LOG : ASP Initialized ! \n");
  return asp_initialized;
}

/** Device Class Destructor implementation
 * Properly releasing and free-ing memory
 * part of best coding practices and help
 * with stable system performance and
 * helps reduce bugs
 * Tears down, in order: interrupt thread, DMA engine, MMIO mapping,
 * accelerator handle, token, and properties filter.
 */
Device::~Device() {
  MMD_DEBUG("DEBUG LOG : Destructing Device object \n");
  int num_errors = 0;

  if (kernel_interrupt_thread != nullptr) {
    delete kernel_interrupt_thread;
    kernel_interrupt_thread = NULL;
  }

  if (mmd_dma) {
    delete mmd_dma;
    mmd_dma = NULL;
  }

  if (mmio_is_mapped) {
    if (fpgaUnmapMMIO(mmio_handle, 0)) {
      MMD_DEBUG("DEBUG LOG : fpgaUnmapMMIO failed\n");
      num_errors++;
    }
  }

  if (mmio_handle) {
    if (fpgaClose(mmio_handle) != FPGA_OK) {
      MMD_DEBUG("DEBUG LOG : fpgaClose mmio_handle failed\n");
      num_errors++;
    }
  }

  if (mmio_token) {
    if (fpgaDestroyToken(&mmio_token) != FPGA_OK) {
      MMD_DEBUG("DEBUG LOG : fpgaDestroyToken mmio_token failed\n");
      num_errors++;
    }
  }

  if (filter) {
    if (fpgaDestroyProperties(&filter) != FPGA_OK) {
      MMD_DEBUG("DEBUG LOG : fpgaDestroyProperties filter failed\n");
      num_errors++;
    }
  }

  if (num_errors > 0) {
    MMD_DEBUG("DEBUG LOG : Error freeing resources in Device destructor\n");
  }
}

/** asp_loaded() function which checks if asp is loaded on board
 * it is used in aocl_mmd_open() API
 * Compares the AFU GUID reported by OPAE against the expected I_DK_AFU_ID.
 */
bool Device::asp_loaded() {
  fpga_guid pci_guid;
  fpga_guid afu_guid;
  fpga_properties prop;
  fpga_result res;

  if (uuid_parse(I_DK_AFU_ID, pci_guid) < 0) {
    MMD_DEBUG("DEBUG LOG : Error parsing guid\n");
    return false;
  }

  res = fpgaGetProperties(mmio_token, &prop);
  if (res != FPGA_OK) {
    MMD_DEBUG("DEBUG LOG : Error reading properties: %s \n", fpgaErrStr(res));
    fpgaDestroyProperties(&prop);
    return false;
  }

  // NOTE(review): this null check runs after mmio_token has already been
  // passed to fpgaGetProperties above — presumably it should precede it.
  if (!mmio_token) {
    fpgaDestroyProperties(&prop);
    MMD_DEBUG("DEBUG LOG : Error reading the mmio_token\n");
    return false;
  }

  res = fpgaPropertiesGetGUID(prop, &afu_guid);
  if (res != FPGA_OK) {
    MMD_DEBUG("DEBUG LOG : Error reading GUID \n");
    fpgaDestroyProperties(&prop);
    return false;
  }

  fpgaDestroyProperties(&prop);
  if (uuid_compare(pci_guid, afu_guid) == 0) {
    MMD_DEBUG("DEBUG LOG : asp loaded : true \n");
    return true;
  } else {
    MMD_DEBUG("DEBUG LOG : asp loaded : false \n");
    return false;
  }
}

/** get_bdf() function is called
 * in aocl_mmd_get_info() API
 * Formats the PCI bus/device/function members as "bb:dd.f" in hex.
 */
std::string Device::get_bdf() {
  std::ostringstream bdf;
  bdf << std::setfill('0') << std::setw(2) << std::hex << unsigned(bus) << ":" << std::setfill('0') << std::setw(2)
      << std::hex << unsigned(device) << "." << std::hex << unsigned(function);

  return bdf.str();
}

/** get_temperature() function is called
 * in aocl_mmd_get_info() API
 * We currently use hardcoded paths to retrieve temperature information
 * We will replace with OPAE APIs in future
 * Returns -999 when the hwmon sysfs node cannot be read.
 */
float Device::get_temperature() {
  if (std::getenv("MMD_ENABLE_DEBUG")) {
    MMD_DEBUG("DEBUG LOG : Reading temperature ... \n");
  }
  float temp = 0;
  fpga_object obj;
  const char *name;
  name = "dfl_dev.*/spi_master/spi*/spi*.*/*-hwmon.*.auto/hwmon/hwmon*/temp1_input";
  fpga_result res;
  res = fpgaTokenGetObject(fme_token, name, &obj, FPGA_OBJECT_GLOB);
  if (res != FPGA_OK) {
    MMD_DEBUG("DEBUG LOG : Error reading temperature monitor from BMC :");
    MMD_DEBUG(" %s \n", fpgaErrStr(res));
    temp = -999;
    return temp;
  }

  uint64_t value = 0;
  fpgaObjectRead64(obj, &value, FPGA_OBJECT_SYNC);
  fpgaDestroyObject(&obj);
  // sysfs reports millidegrees C; integer division here truncates the
  // fractional degree before the float conversion.
  temp = value / 1000;
  return temp;
}

/** set_kernel_interrupt() function is used in aocl_mmd_set_interrupt_handler() API
 * No-op when the interrupt thread was not created (polling builds).
 */
void Device::set_kernel_interrupt(aocl_mmd_interrupt_handler_fn fn, void *user_data) {
  MMD_DEBUG("DEBUG LOG : Device::set_kernel_interrupt() \n");
  if (kernel_interrupt_thread) {
    kernel_interrupt_thread->set_kernel_interrupt(fn, user_data);
  }
}

/** set_status_handler() function is used in aocl_mmd_set_status_handler() API
 * Stores the runtime's completion callback and its opaque user data.
 */
void Device::set_status_handler(aocl_mmd_status_handler_fn fn, void *user_data) {
  MMD_DEBUG("DEBUG LOG : Device::set_status_handler() \n");
  event_update = fn;
  event_update_user_data = user_data;
}

/** event_update_fn() is used in read_block(), write_block(), copy_block() functions
 * OPAE provides event API for handling asynchronous events sucj as errors and interrupts
 * under the hood those are used
 * NOTE(review): event_update is invoked unconditionally — if no status
 * handler was ever registered this calls a NULL pointer; confirm the
 * runtime always sets one before issuing non-blocking ops.
 */
void Device::event_update_fn(aocl_mmd_op_t op, int status) {
  MMD_DEBUG("DEBUG LOG : Device::event_update_fn() \n");
  event_update(mmd_handle, event_update_user_data, op, status);
}

/** read_block() is used in aocl_mmd_read() API
 * as name suggests its used for fpga->host DMA and MMIO transfers
 * Routing: AOCL_MMD_MEMORY -> DMA (serialized by m_dma_mutex),
 * AOCL_MMD_DLA_CSR -> 4-byte MMIO read, anything else -> MMIO at
 * (mmd_interface + offset).
 */
int Device::read_block(aocl_mmd_op_t op, int mmd_interface, void *host_addr, size_t offset, size_t size) {
  MMD_DEBUG("DEBUG LOG : Device::read_block()\n");
  int res;

  // The mmd_interface is defined as the base address of the MMIO write. Access
  // to memory requires special functionality. Otherwise do direct MMIO read.

  if (mmd_interface == AOCL_MMD_MEMORY) {
    std::unique_lock<std::mutex> dma_mutex_lock(m_dma_mutex);
    MMD_DEBUG("DEBUG LOG : Using DMA to read block\n");
    res = mmd_dma->fpga_to_host(host_addr, (uint64_t)offset, size);
  } else if (mmd_interface == AOCL_MMD_DLA_CSR) {
    assert(size == 4);  // DLA CSR read should be always size ==4 as of 2024.2
    MMD_DEBUG("DEBUG LOG : Using MMIO to read block in the DLA CSR space\n");
    res = read_mmio(host_addr, offset, size);
  } else {
    MMD_DEBUG("DEBUG LOG : Using MMIO to read block\n");
    res = read_mmio(host_addr, mmd_interface + offset, size);

    // NOTE(review): the status handler is only invoked on this generic MMIO
    // path — non-blocking ops routed to DMA or DLA_CSR never get a
    // completion callback; confirm callers only pass op on this path.
    if (op) {
      this->event_update_fn(op, res);
    }
  }
  return res;
}

/** write_block() is used in aocl_mmd_write() API
 * as name suggests its used for DMA and MMIO transfers
 * Mirror image of read_block(); same interface routing rules.
 */
int Device::write_block(aocl_mmd_op_t op, int mmd_interface, const void *host_addr, size_t offset, size_t size) {
  MMD_DEBUG("DEBUG LOG : Device::write_block()\n");
  int res;

  // The mmd_interface is defined as the base address of the MMIO write. Access
  // to memory requires special functionality. Otherwise do direct MMIO write
  if (mmd_interface == AOCL_MMD_MEMORY) {
    std::unique_lock<std::mutex> dma_mutex_lock(m_dma_mutex);
    MMD_DEBUG("DEBUG LOG : Using DMA to write block\n");
    res = mmd_dma->host_to_fpga(host_addr, (uint64_t)offset, size);
  } else if (mmd_interface == AOCL_MMD_DLA_CSR) {
    assert(size == 4);  // DLA CSR read should be always size ==4 as of 2024.2
    // NOTE(review): copy-pasted log text — this is the WRITE path but the
    // message says "read block"; fix the string in a behavior-changing pass.
    MMD_DEBUG("DEBUG LOG : Using MMIO to read block in the DLA CSR space\n");
    res = write_mmio(host_addr, offset, size);
  } else {
    MMD_DEBUG("DEBUG LOG : Using MMIO to write block\n");
    res = write_mmio(host_addr, mmd_interface + offset, size);
    if (op) {
      this->event_update_fn(op, res);
    }
  }

  return res;
}

/** read_mmio() is used in read_block() function
 * it uses OPAE APIs fpgaReadMMIO64() and fpgaReadMMIO32()
 */
int Device::read_mmio(void *host_addr, size_t mmio_addr, size_t size) {
  return mmd_helper::read_mmio(mmio_handle, host_addr, mmio_addr, size);
}

/** write_mmio() is used in write_block() function
 * it uses OPAE APIs fpgaWriteMMIO64() and fpgaWriteMMIO32()
 */
int Device::write_mmio(const void *host_addr, size_t mmio_addr, size_t size) {
  return mmd_helper::write_mmio(mmio_handle, host_addr, mmio_addr, size);
}
diff --git a/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_device.h b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_device.h
new file mode 100644
index 0000000..1cded83
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_device.h
@@ -0,0 +1,151 @@
// (c) 1992-2024 Intel Corporation.
// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words
// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S.
// and/or other countries. Other marks and brands may be claimed as the property
// of others.
// See Trademarks on intel.com for full list of Intel trademarks or
// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera)
// Your use of Intel Corporation's design tools, logic functions and other
// software and tools, and its AMPP partner logic functions, and any output
// files any of the foregoing (including device programming or simulation
// files), and any associated documentation or information are expressly subject
// to the terms and conditions of the Altera Program License Subscription
// Agreement, Intel MegaCore Function License Agreement, or other applicable
// license agreement, including, without limitation, that your use is for the
// sole purpose of programming logic devices manufactured by Intel and sold by
// Intel or its authorized distributors. Please refer to the applicable
// agreement for further details.

#ifndef MMD_DEVICE_H
#define MMD_DEVICE_H

#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <string>

#include <opae/fpga.h>

#include <uuid/uuid.h>

#include "aocl_mmd.h"
#include "mmd_dma.h"
#include "mmd_helper.h"

#include "kernel_interrupt.h"

// Tune delay for simulation or HW. Eventually delay
// should be removed for HW, may still be needed for ASE simulation
#ifdef SIM
#define DELAY_MULTIPLIER 100
#else
#define DELAY_MULTIPLIER 1
#endif

// Most AOCL_MMD_CALL functions return negative number in case of error,
// MMD_AOCL_ERR is used to indicate an error from the MMD that is being
// returned to the runtime. Simply set to -2 for now since neither interface
// defines a meaning to return codes for errors.
#define MMD_AOCL_ERR -1

// NOTE: some of the code relies on invalid handle returning -1
// future TODO eliminate dependency on specific error values
#define MMD_INVALID_PARAM -1

// Our diagnostic script relies on handle values < -1 to determine when
// a valid device is present but a functioning ASP is not loaded.
#define MMD_ASP_NOT_LOADED -2
#define MMD_ASP_INIT_FAILED -3

// Delay settings
// MMIO_DELAY intentionally expands to nothing on current hardware.
#define MMIO_DELAY()
#define YIELD_DELAY() usleep(1 * DELAY_MULTIPLIER)
#define OPENCL_SW_RESET_DELAY() usleep(5000 * DELAY_MULTIPLIER)
#define AFU_RESET_DELAY() usleep(20000 * DELAY_MULTIPLIER)

#define KERNEL_SW_RESET_BASE (AOCL_MMD_KERNEL + 0x30)

// Board-name prefix expected by Device::parse_board_name ("ofs_<hex-id>").
#define ASP_NAME "ofs_"

#define SVM_MMD_MPF 0x24000

#define SVM_DDR_OFFSET 0x1000000000000
#define PCI_DDR_OFFSET 0

enum {
  // IRQ offsets no longer exist in DLA hardware (removed from board.qsys)
  AOCL_IRQ_POLLING_BASE = 0x0100,  // CSR to polling interrupt status
  AOCL_IRQ_MASKING_BASE = 0x0108,  // CSR to set/unset interrupt mask
  // mmd_interface selectors used by read_block()/write_block() routing.
  AOCL_MMD_KERNEL = 0,
  AOCL_MMD_MEMORY = 1,
  AOCL_MMD_DLA_CSR = 2,
};

enum AfuStatu { MMD_INVALID_ID = 0, MMD_ASP, MMD_AFU };

// One Device per opened board; owns the OPAE handles, the DMA engine and
// the optional kernel-interrupt thread. Non-copyable; created/destroyed
// via DeviceMapManager. NOTE(review): std::mutex is used below but <mutex>
// is not included here directly — presumably pulled in via mmd_dma.h;
// confirm, or include it explicitly.
class Device final {
 public:
  Device(uint64_t);
  Device(const Device &) = delete;
  Device &operator=(const Device &) = delete;
  ~Device();

  static bool parse_board_name(const char *board_name, uint64_t &obj_id);

  int get_mmd_handle() { return mmd_handle; }
  uint64_t get_fpga_obj_id() { return fpga_obj_id; }
  std::string get_dev_name() { return mmd_dev_name; }
  std::string get_bdf();
  float get_temperature();

  bool initialize_asp();
  void set_kernel_interrupt(aocl_mmd_interrupt_handler_fn fn, void *user_data);
  void set_status_handler(aocl_mmd_status_handler_fn fn, void *user_data);
  void event_update_fn(aocl_mmd_op_t op, int status);
  bool asp_loaded();

  int read_block(aocl_mmd_op_t op, int mmd_interface, void *host_addr, size_t dev_addr, size_t size);
  int write_block(aocl_mmd_op_t op, int mmd_interface, const void *host_addr, size_t dev_addr, size_t size);

 private:
  // Shared across all instances; see constructor note about thread-safety.
  static int next_mmd_handle;

  int mmd_handle;
  uint64_t fpga_obj_id;
  std::string mmd_dev_name;
  intel_opae_mmd::KernelInterrupt *kernel_interrupt_thread;  // NULL in polling builds
  aocl_mmd_status_handler_fn event_update;
  void *event_update_user_data;

  // NUMA pinning state used during DMA buffer initialization.
  std::string fpga_numa_node;
  bool enable_set_numa;
  bool fme_sysfs_temp_initialized;
  void initialize_fme_sysfs();
  void initialize_local_cpus_sysfs();
  bool find_dma_dfh_offsets();

  uint8_t bus;
  uint8_t device;
  uint8_t function;

  bool afu_initialized;
  bool asp_initialized;
  bool mmio_is_mapped;

  fpga_properties filter;
  fpga_token mmio_token;
  fpga_handle mmio_handle;
  fpga_token fme_token;
  fpga_guid guid;
  intel_opae_mmd::mmd_dma *mmd_dma;
  std::mutex m_dma_mutex;  // serializes DMA transfers in read_block/write_block

  // Helper functions
  int read_mmio(void *host_addr, size_t dev_addr, size_t size);
  int write_mmio(const void *host_addr, size_t dev_addr, size_t size);
};

#endif  // MMD_DEVICE_H
diff --git a/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_dma.cpp b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_dma.cpp
new file mode 100644
index 0000000..6a4e13c
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_dma.cpp
@@ -0,0 +1,573 @@
// (c) 1992-2024 Intel Corporation.
// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words
// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S.
// and/or other countries. Other marks and brands may be claimed as the property
// of others.
See Trademarks on intel.com for full list of Intel trademarks or +// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) +// Your use of Intel Corporation's design tools, logic functions and other +// software and tools, and its AMPP partner logic functions, and any output +// files any of the foregoing (including device programming or simulation +// files), and any associated documentation or information are expressly subject +// to the terms and conditions of the Altera Program License Subscription +// Agreement, Intel MegaCore Function License Agreement, or other applicable +// license agreement, including, without limitation, that your use is for the +// sole purpose of programming logic devices manufactured by Intel and sold by +// Intel or its authorized distributors. Please refer to the applicable +// agreement for further details. + +#include <memory.h> +#include <sys/mman.h> +#include <cassert> +#include <chrono> +#include <cstdlib> +#include <cstring> +#include <iostream> +#include <unordered_map> + +#include <inttypes.h> +#include <sstream> + +#include "mmd_device.h" +#include "mmd_dma.h" +#include "mmd_helper.h" + +namespace intel_opae_mmd { + +/** mmd_dma class constructor + */ +mmd_dma::mmd_dma(fpga_handle fpga_handle_arg, int mmd_handle) : m_initialized(false), m_fpga_handle(fpga_handle_arg) { + MMD_DEBUG("DEBUG LOG : Constructing DMA \n"); + // Initialize shared buffer + auto res = fpgaPrepareBuffer(m_fpga_handle, DMA_BUFFER_SIZE, (void **)&dma_buf_ptr, &dma_buf_wsid, 0); + + assert(FPGA_OK == res && "Allocating DMA Buffer failed"); + + memset((void *)dma_buf_ptr, 0x0, DMA_BUFFER_SIZE); + + // Store virtual address of IO registers + res = fpgaGetIOAddress(m_fpga_handle, dma_buf_wsid, &dma_buf_iova); + assert(FPGA_OK == res && "getting dma DMA_BUF_IOVA failed"); + + m_initialized = true; +} + +/** mmd_dma destructor + * free-ing , releasing various resources created during object construction is a good idea + * it 
helps with system stability and reduces code bugs
+ */
+mmd_dma::~mmd_dma() {
+  MMD_DEBUG("DEBUG LOG : Destructing DMA \n");
+  // Release the pinned shared buffer allocated with fpgaPrepareBuffer in the
+  // constructor.
+  // NOTE(review): assert() is compiled out under NDEBUG, so a failing
+  // fpgaReleaseBuffer would go unnoticed in release builds - confirm intended.
+  auto res = fpgaReleaseBuffer(m_fpga_handle, dma_buf_wsid);
+  assert(FPGA_OK == res && "Release DMA Buffer failed");
+  m_initialized = false;
+}
+
+// Called in dma_transfer() to send DMA descriptor
+// Writes the four descriptor words (src_address, dest_address, len, control)
+// to four consecutive 8-byte MMIO slots starting at mmio_dst.
+// @param mmio_dst - MMIO offset of the first descriptor slot (8B aligned)
+// @param desc - descriptor to push to the DMA engine
+// @return always 0 (MMIO write results are not checked here)
+int mmd_dma::send_descriptor(uint64_t mmio_dst, dma_descriptor_t desc) {
+  // mmio requires 8 byte alignment
+  assert(mmio_dst % 8 == 0);
+
+  fpgaWriteMMIO64(m_fpga_handle, 0, mmio_dst, desc.src_address);
+  MMD_DEBUG("Writing %lX to address %lX\n", desc.src_address, mmio_dst);
+  mmio_dst += 8;
+  fpgaWriteMMIO64(m_fpga_handle, 0, mmio_dst, desc.dest_address);
+  MMD_DEBUG("Writing %lX to address %lX\n", desc.dest_address, mmio_dst);
+  mmio_dst += 8;
+  fpgaWriteMMIO64(m_fpga_handle, 0, mmio_dst, desc.len);
+  MMD_DEBUG("Writing %X to address %lX\n", desc.len, mmio_dst);
+  mmio_dst += 8;
+  fpgaWriteMMIO64(m_fpga_handle, 0, mmio_dst, desc.control);
+  MMD_DEBUG("Writing %X to address %lX\n", desc.control, mmio_dst);
+
+  return 0;
+}
+
+// Use ASE to handle unaligned transfer and DMA to do aligned transfer.
+// Copy `size` bytes from device DDR address `dev_src` into host memory at
+// `host_addr`.  Strategy (mirrors host_to_fpga):
+//   1. If dev_src is not 64B aligned, read the leading fragment via the ASE
+//      MMIO window (_ase_fpga_to_host) until the device address is aligned.
+//   2. Move full DMA_BUFFER_SIZE chunks, then the largest 64B-multiple tail,
+//      through the pinned bounce buffer (dma_buf_ptr) with dma_transfer().
+//   3. Read any remaining <64B tail via ASE again.
+// @return 0 on success, -1 if the final ASE read fails.
+// NOTE(review): earlier failures are checked only with assert(), which is a
+// no-op under NDEBUG; dma_transfer's return value is ignored.
+int mmd_dma::fpga_to_host(void *host_addr, uint64_t dev_src, size_t size) {
+  fpga_result res = FPGA_OK;
+  uint64_t count_left = size;
+  uint64_t aligned_addr = 0;
+  uint64_t align_bytes = 0;
+  uint64_t curr_dev_src = dev_src;
+  void *curr_host_addr = host_addr;
+
+  if (dev_src % 64 != 0) {
+    // We use ASE to handle unaligned DMA transfer
+    MMD_DEBUG("DEBUG LOG : mmd_dma::fpga_to_host dev_src is non 64B aligned\n");
+    if (count_left < 64) {
+      MMD_DEBUG("DEBUG LOG : mmd_dma::fpga_to_host dev_src count < 64\n");
+      res = _ase_fpga_to_host(curr_dev_src, curr_host_addr, count_left);
+      assert(FPGA_OK == res && "_ase_fpga_to_host failed");
+      return res;
+    } else {
+      // Round the device address up to the next 64B boundary and pull the
+      // fragment in between via ASE.
+      aligned_addr = ((curr_dev_src / 64) + 1) * 64;
+      align_bytes = aligned_addr - curr_dev_src;
+      res = _ase_fpga_to_host(curr_dev_src, curr_host_addr, align_bytes);
+      assert(FPGA_OK == res && "_ase_fpga_to_host failed");
+
+      // Update the processed data
+      count_left -= align_bytes;
+      curr_dev_src += align_bytes;
+      curr_host_addr = (void *)(static_cast<char *>(curr_host_addr) + align_bytes);
+    }
+  }
+
+  if (count_left) {
+    uint64_t dma_chunks = count_left / DMA_BUFFER_SIZE;
+    for (uint64_t i = 0; i < dma_chunks; i++) {
+      // constant size transfer
+
+      // Destination is the host bounce buffer (IOVA with the host bit set).
+      uint64_t dev_dest = dma_buf_iova | DMA_HOST_MASK;
+      int len = ((DMA_BUFFER_SIZE - 1) / DMA_LINE_SIZE) + 1;  // Ceiling of test_buffer_size / DMA_LINE_SIZE
+
+      dma_transfer(curr_dev_src, dev_dest, len, ddr_to_host);
+
+      // Copy data from shared buffer to host addr
+      memcpy(curr_host_addr, (void *)dma_buf_ptr, DMA_BUFFER_SIZE);
+
+      memset((void *)dma_buf_ptr, 0x0, DMA_BUFFER_SIZE);
+
+      // Update the curr source and dest
+      curr_host_addr = (void *)(static_cast<char *>(curr_host_addr) + DMA_BUFFER_SIZE);
+      curr_dev_src += DMA_BUFFER_SIZE;
+    }
+
+    // Updated the count_left for the for loop
+    count_left -= (dma_chunks * DMA_BUFFER_SIZE);
+
+    if (count_left) {
+      // Largest 64B multiple that still fits in one bounce-buffer pass.
+      uint64_t dma_tx_bytes = (count_left / 64) * 64;
+      if (dma_tx_bytes != 0) {
+        assert(dma_tx_bytes <= DMA_BUFFER_SIZE && "Illegal transfer size\n");
+
+        uint64_t dev_dest = dma_buf_iova | DMA_HOST_MASK;
+        int len = ((dma_tx_bytes - 1) / DMA_LINE_SIZE) + 1;  // Ceiling of test_buffer_size / DMA_LINE_SIZE
+
+        dma_transfer(curr_dev_src, dev_dest, len, ddr_to_host);
+
+        // Copy data from shared buffer to host addr
+        memcpy(curr_host_addr, (void *)dma_buf_ptr, dma_tx_bytes);
+
+        memset((void *)dma_buf_ptr, 0x0, DMA_BUFFER_SIZE);
+
+        // Update the address
+        curr_host_addr = (void *)(static_cast<char *>(curr_host_addr) + dma_tx_bytes);
+        curr_dev_src += dma_tx_bytes;
+        count_left -= dma_tx_bytes;
+      }
+      if (count_left) {
+        MMD_DEBUG("DEBUG LOG : mmd_dma::fpga_to_host count_left after DMA transfer is ");
+        MMD_DEBUG("%" PRIu64 "\n", count_left);
+        // Handle the rest unaligned transfer using ASE
+        res = _ase_fpga_to_host(curr_dev_src, curr_host_addr, count_left);
+        if (FPGA_OK != res) {
+          MMD_DEBUG("DEBUG LOG : mmd_dma::_ase_fpga_to_host failed\n");
+          return -1;
+        }
+        count_left = 0;
+
+        // No need to update address as the transaction is done.
+      }
+    }
+  }
+  assert(count_left==0 && "fpga_to_host failed");
+  return 0;
+}
+
+// Use ASE to handle unaligned transfer and DMA to do aligned transfer.
+int mmd_dma::host_to_fpga(const void *host_addr, uint64_t dev_dest, size_t size) { + fpga_result res = FPGA_OK; + uint64_t count_left = size; + uint64_t aligned_addr = 0; + uint64_t align_bytes = 0; + uint64_t curr_dest = dev_dest; + const void *curr_host_addr = host_addr; + + if (dev_dest % 64 != 0) { + // We use ASE to handle unaligned DMA transfer + MMD_DEBUG("DEBUG LOG : mmd_dma::host_to_fpga dev_dest is non 64B aligned\n"); + if (count_left < 64) { + res = _ase_host_to_fpga(dev_dest, host_addr, count_left); + assert(FPGA_OK == res && "_ase_host_to_fpga failed"); + return res; + } else { + aligned_addr = ((dev_dest / 64) + 1) * 64; + align_bytes = aligned_addr - dev_dest; + res = _ase_host_to_fpga(dev_dest, host_addr, align_bytes); + assert(FPGA_OK == res && "_ase_host_to_fpga failed"); + + // Update the processed data + count_left -= align_bytes; + curr_dest += align_bytes; + curr_host_addr = (const void *)(static_cast<const char *>(curr_host_addr) + align_bytes); + } + } + + if (count_left) { + uint64_t dma_chunks = count_left / DMA_BUFFER_SIZE; + for (uint64_t i = 0; i < dma_chunks; i++) { + // constant size transfer + // Copy host_src value to the shared buffer + memcpy((void *)dma_buf_ptr, curr_host_addr, DMA_BUFFER_SIZE); + uint64_t dev_src = dma_buf_iova | DMA_HOST_MASK; + + int len = ((DMA_BUFFER_SIZE - 1) / DMA_LINE_SIZE) + 1; // Ceiling of test_buffer_size / DMA_LINE_SIZE + + dma_transfer(dev_src, curr_dest, len, host_to_ddr); + + memset((void *)dma_buf_ptr, 0x0, DMA_BUFFER_SIZE); + + // Update the curr source and dest + curr_host_addr = (const void *)(static_cast<const char *>(curr_host_addr) + DMA_BUFFER_SIZE); + curr_dest += DMA_BUFFER_SIZE; + } + + // Updated the count_left for the for loop + count_left -= (dma_chunks * DMA_BUFFER_SIZE); + + if (count_left) { + uint64_t dma_tx_bytes = (count_left / 64) * 64; + if (dma_tx_bytes != 0) { + assert(dma_tx_bytes <= DMA_BUFFER_SIZE && "Illegal transfer size\n"); + + // Copy host_src value to the shared 
buffer + memcpy((void *)dma_buf_ptr, curr_host_addr, dma_tx_bytes); + uint64_t dev_src = dma_buf_iova | DMA_HOST_MASK; + + int len = ((dma_tx_bytes - 1) / DMA_LINE_SIZE) + 1; // Ceiling of dma_tx_bytes / DMA_LINE_SIZE + dma_transfer(dev_src, curr_dest, len, host_to_ddr); + + memset((void *)dma_buf_ptr, 0x0, DMA_BUFFER_SIZE); + } + + // Update the address + curr_host_addr = (const void *)(static_cast<const char *>(curr_host_addr) + dma_tx_bytes); + curr_dest += dma_tx_bytes; + count_left -= dma_tx_bytes; + + if (count_left) { + MMD_DEBUG("DEBUG LOG : mmd_dma::host_to_fpga count_left after DMA transfer is "); + MMD_DEBUG("%" PRIu64 "\n", count_left); + // Handle the rest unaligned transfer using ASE + res = _ase_host_to_fpga(curr_dest, curr_host_addr, count_left); + assert(FPGA_OK == res && "_ase_host_to_fpga failed"); + count_left = 0; + } + } + } + assert(count_left==0 && "host_to_fpga failed"); + return 0; +} + +int mmd_dma::dma_transfer(uint64_t dev_src, uint64_t dev_dest, int len, dma_mode descriptor_mode) { + + // Get debug information for thread id + std::stringstream ss; + ss << std::this_thread::get_id(); + uint64_t id = std::stoull(ss.str()); + MMD_DEBUG("dma_transfer start current thread_id is %04lX\n", id); + + // Native DMA transfer requires 64 byte alignment + assert(dev_src % 64 == 0); + assert(dev_dest % 64 == 0); + + const uint64_t MASK_FOR_35BIT_ADDR = 0x7FFFFFFFF; + + dma_descriptor_t desc; + + MMD_DEBUG("DEBUG LOG : mmd_dma::dma_transfer starts\n"); + MMD_DEBUG("DEBUG LOG dev_dest = %04lX\n", dev_dest); + + desc.src_address = dev_src & MASK_FOR_35BIT_ADDR; + desc.dest_address = dev_dest & MASK_FOR_35BIT_ADDR; + desc.len = len; + desc.control = 0x80000000 | (descriptor_mode << MODE_SHIFT); + + const uint64_t DMA_DESC_BASE = 8 * DMA_CSR_IDX_SRC_ADDR; + const uint64_t DMA_STATUS_BASE = 8 * DMA_CSR_IDX_STATUS; + uint64_t mmio_data = 0; + + int desc_size = sizeof(desc); + + MMD_DEBUG("Descriptor size = %d\n", desc_size); + MMD_DEBUG("desc.src_address = 
%04lX\n", desc.src_address); + MMD_DEBUG("desc.dest_address = %04lX\n", desc.dest_address); + MMD_DEBUG("desc.len = %d\n", desc.len); + MMD_DEBUG("desc.control = %04X\n", desc.control); + MMD_DEBUG("descriptor_mode = %04X\n", descriptor_mode); + + // send descriptor + send_descriptor(DMA_DESC_BASE, desc); + + fpga_result r; + r = fpgaReadMMIO64(m_fpga_handle, 0, DMA_STATUS_BASE, &mmio_data); + MMD_DEBUG("DMA_STATUS_BASE before = %04lX\n", mmio_data); + if (FPGA_OK != r) return -1; + + // If the busy bit is empty, then we are done. + while ((mmio_data & 0x1) == 0x1) { + r = fpgaReadMMIO64(m_fpga_handle, 0, DMA_STATUS_BASE, &mmio_data); + assert(FPGA_OK == r); + } + MMD_DEBUG("dma_transfer end current thread_id is %04lX\n", id); + return 0; +} + +// Transfer "count" bytes from HOST to FPGA using Address span expander(ASE)- will internally make +// calls to handle unaligned and aligned MMIO writes. +fpga_result mmd_dma::_ase_host_to_fpga(uint64_t dev_dest, const void *src_ptr, uint64_t count) { + MMD_DEBUG("DEBUG LOG: _ase_host_to_fpga is being called\n "); + + MMD_DEBUG("DEBUG LOG : dev_dest is "); + MMD_DEBUG("%" PRIu64 "\n", dev_dest); + + assert(count < 64); // DLA only uses ASE transfer with less than 64 Byte transfer. + + fpga_result res = FPGA_OK; + uint64_t count_left = count; + uint64_t unaligned_size = 0; + + // For ASE window + uint64_t ase_window; + uint64_t ase_addr; + uint64_t dev_addr; + + const void *curr_src_ptr = src_ptr; + + if (count == 0) return res; + + if (dev_dest % 8 == 0) { + while (count > 0) { + ase_window = dev_dest & ~(0xfff); + ase_addr = (dev_dest & 0xfff); // only keep the lower 12 bits. 
+ + uint64_t mmio_base_control = ASE_MMIO_BASE + ASE_MMIO_CTRL; + + MMD_DEBUG("DEBUG LOG : ase_window is "); + MMD_DEBUG("%" PRIu64 "\n", ase_window); + + // Write to ASE control + res = fpgaWriteMMIO64(m_fpga_handle, 0, mmio_base_control, ase_window); + assert(res == FPGA_OK && "Write to ASE control failed"); + + // Set final dev_addr + // dev_addr will be 8 byte aligned as long as dev_dest is 8 byte aligned. + dev_addr = ASE_MMIO_BASE + ASE_MMIO_WINDOW + ase_addr; + + assert(dev_addr % 8 == 0); + + MMD_DEBUG("DEBUG LOG : _ase_host_to_fpga count is "); + MMD_DEBUG("%" PRIu64 "\n", count); + + MMD_DEBUG("DEBUG LOG : dev addr is "); + MMD_DEBUG("%" PRIu64 "\n", dev_addr); + + size_t size = (count > 8) ? 8 : count; + mmd_helper::write_mmio(m_fpga_handle, curr_src_ptr, dev_addr, size); + + count -= size; + dev_dest += size; + curr_src_ptr = (const void *)(static_cast<const char *>(curr_src_ptr) + size); + } + + assert(count == 0); + + } else { + // First we need to handle the non byte aligned transfer + + MMD_DEBUG("DEBUG LOG : _ase_host_to_fpga count is "); + MMD_DEBUG("%" PRIu64 "\n", count); + + // Aligns address to 8 byte using dst masking method + unaligned_size = 8 - (dev_dest % 8); + if (unaligned_size > count_left) unaligned_size = count_left; + + // Write to the unaligned address + assert(unaligned_size < 8); + uint64_t shift = dev_dest % 8; + + // Write to ASE control to switch page. 
+ ase_window = dev_dest & ~(0xfff); + + MMD_DEBUG("DEBUG LOG : ase_window in non-aligned is "); + MMD_DEBUG("%" PRIu64 "\n", ase_window); + + fpgaWriteMMIO64(m_fpga_handle, 0, ASE_MMIO_BASE + ASE_MMIO_CTRL, ase_window); + + // Get aligned dest address + uint64_t dev_aligned_addr = dev_dest - shift; + assert(dev_aligned_addr % 8 == 0); + + // read data from device memory with aligned dev dest + ase_addr = (dev_aligned_addr & 0xfff); + dev_addr = ASE_MMIO_BASE + ASE_MMIO_WINDOW + ase_addr; + uint64_t read_tmp = 0; + fpgaReadMMIO64(m_fpga_handle, 0, dev_addr, &read_tmp); + + // overlay our data, check if the shift is correct here + memcpy((reinterpret_cast<char *>(&read_tmp) + shift), src_ptr, unaligned_size); + + // Write back data to the device + fpgaWriteMMIO64(m_fpga_handle, 0, dev_addr, read_tmp); + + count_left -= unaligned_size; + + // Check if there is any byte left + if (count_left == 0) { + return res; + } + + // Now the dest address should be byte aligned now + // Start the regular ASE transfer + + const void *curr_src_ptr = (const void *)(static_cast<const char *>(src_ptr) + unaligned_size); + uint64_t next_dev_dest = dev_dest + unaligned_size; + + while (count_left > 0) { + ase_window = next_dev_dest & ~(0xfff); + ase_addr = (next_dev_dest & 0xfff); // only keep the lower 12 bits. + + MMD_DEBUG("DEBUG LOG : ase_window in non-aligned loop is "); + MMD_DEBUG("%" PRIu64 "\n", ase_window); + + // Write to ASE control + fpgaWriteMMIO64(m_fpga_handle, 0, ASE_MMIO_BASE + ASE_MMIO_CTRL, ase_window); + + // Set final dev_addr + dev_addr = ASE_MMIO_BASE + ASE_MMIO_WINDOW + ase_addr; + + assert(dev_addr % 8 == 0); + + size_t size = (count_left > 8) ? 
8 : count_left; + mmd_helper::write_mmio(m_fpga_handle, + curr_src_ptr, + dev_addr, + size); + + count_left -= size; + next_dev_dest += size; + curr_src_ptr = (const void *)(static_cast<const char *>(curr_src_ptr) + size); + } + assert(count_left == 0); + } + + return FPGA_OK; +} + +// Transfer "count" bytes from FPGA to HOST using Address span expander(ASE)- will internally make +// calls to handle unaligned and aligned MMIO reads. +fpga_result mmd_dma::_ase_fpga_to_host(uint64_t dev_dest, void *host_ptr, uint64_t count) { + MMD_DEBUG("DEBUG LOG : _ase_fpga_to_host is being called\n "); + + assert(count < 64); + + fpga_result res = FPGA_OK; + uint64_t count_left = count; + uint64_t unaligned_size = 0; + + // For ASE window + + uint64_t ase_window; + uint64_t ase_addr; + uint64_t dev_addr; + + if (count == 0) return res; + + void *curr_host_ptr = host_ptr; + + if (dev_dest % 8 == 0) { + while (count > 0) { + ase_window = dev_dest & ~(0xfff); + ase_addr = (dev_dest & 0xfff); // only keep the lower 12 bits. + + MMD_DEBUG("DEBUG LOG : ase_window is "); + MMD_DEBUG("%" PRIu64 "\n", ase_window); + + // Write to ASE control to switch page. + fpgaWriteMMIO64(m_fpga_handle, 0, ASE_MMIO_BASE + ASE_MMIO_CTRL, ase_window); + + // Set final dev_addr + // dev_addr will be 8 byte aligned as long as dev_dest is 8 byte aligned. + dev_addr = ASE_MMIO_BASE + ASE_MMIO_WINDOW + ase_addr; + + assert(dev_addr % 8 == 0); + + size_t size = (count > 8) ? 
8 : count; + + mmd_helper::read_mmio(m_fpga_handle, curr_host_ptr, dev_addr, size); + + count -= size; + dev_dest += size; + curr_host_ptr = (void *)(static_cast<char *>(curr_host_ptr) + size); + } + + } else { + // First we need to handle the non byte aligned transfer + + // Aligns address to 8 byte using dst masking method + unaligned_size = 8 - (dev_dest % 8); + if (unaligned_size > count_left) unaligned_size = count_left; + + // Write to the unaligned address + assert(unaligned_size < 8); + uint64_t shift = dev_dest % 8; + + // Write to ASE control to switch page. + ase_window = dev_dest & ~(0xfff); + + MMD_DEBUG("DEBUG LOG : ase_window is "); + MMD_DEBUG("%" PRIu64 "\n", ase_window); + + fpgaWriteMMIO64(m_fpga_handle, 0, ASE_MMIO_BASE + ASE_MMIO_CTRL, ase_window); + + // Get aligned dest address + uint64_t dev_aligned_addr = dev_dest - shift; + assert(dev_aligned_addr % 8 == 0); + + // read data from device memory with aligned dev dest + ase_addr = (dev_aligned_addr & 0xfff); + dev_addr = ASE_MMIO_BASE + ASE_MMIO_WINDOW + ase_addr; + + uint64_t read_tmp = 0; + fpgaReadMMIO64(m_fpga_handle, 0, dev_addr, &read_tmp); + + // overlay our data + memcpy(host_ptr, (reinterpret_cast<char *>(&read_tmp) + shift), unaligned_size); + + count_left -= unaligned_size; + + // Check if there is any byte left + if (count_left == 0) { + return res; + } + + // Now the dest address should be byte aligned now + // Start the regular ASE transfer + curr_host_ptr = (void *)(static_cast<char *>(host_ptr) + unaligned_size); + uint64_t next_dev_dest = dev_dest + unaligned_size; + + while (count_left > 0) { + ase_window = next_dev_dest & ~(0xfff); + ase_addr = (next_dev_dest & 0xfff); // only keep the lower 12 bits. + + // Write to ASE control to switch page. + fpgaWriteMMIO64(m_fpga_handle, 0, ASE_MMIO_BASE + ASE_MMIO_CTRL, ase_window); + + // Set final dev_addr + dev_addr = ASE_MMIO_BASE + ASE_MMIO_WINDOW + ase_addr; + + assert(dev_addr % 8 == 0); + + size_t size = (count_left > 8) ? 
8 : count_left; + mmd_helper::read_mmio(m_fpga_handle, curr_host_ptr, dev_addr, size); + + count_left -= size; + next_dev_dest += size; + curr_host_ptr = (void *)(static_cast<char *>(curr_host_ptr) + size); + } + + assert(count_left == 0); + } + return FPGA_OK; +} +} // namespace intel_opae_mmd diff --git a/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_dma.h b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_dma.h new file mode 100644 index 0000000..a2841b1 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_dma.h @@ -0,0 +1,89 @@ +// (c) 1992-2024 Intel Corporation. +// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words +// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. +// and/or other countries. Other marks and brands may be claimed as the property +// of others. See Trademarks on intel.com for full list of Intel trademarks or +// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) +// Your use of Intel Corporation's design tools, logic functions and other +// software and tools, and its AMPP partner logic functions, and any output +// files any of the foregoing (including device programming or simulation +// files), and any associated documentation or information are expressly subject +// to the terms and conditions of the Altera Program License Subscription +// Agreement, Intel MegaCore Function License Agreement, or other applicable +// license agreement, including, without limitation, that your use is for the +// sole purpose of programming logic devices manufactured by Intel and sold by +// Intel or its authorized distributors. Please refer to the applicable +// agreement for further details. 
+#ifndef MMD_DMA_H_ +#define MMD_DMA_H_ + +#include <opae/fpga.h> +#include <poll.h> + +#include <atomic> +#include <chrono> +#include <condition_variable> +#include <mutex> +#include <queue> +#include <thread> +#include <unordered_map> + +#include "aocl_mmd.h" +#include "mmd_helper.h" + +#define DMA_CSR_IDX_SRC_ADDR 0x5 +#define DMA_CSR_IDX_STATUS 0x9 +#define MODE_SHIFT 26 +// For now limits to 16K to avoid DMA transfer hang in hw, further testing required to increase the value. +#define DMA_BUFFER_SIZE (1024 * 16) +#define DMA_LINE_SIZE 64 +#define DMA_HOST_MASK 0x2000000000000 + +#define ASE_MMIO_BASE 0x20000 +#define ASE_MMIO_CTRL 0x200 +#define ASE_MMIO_WINDOW 0x1000 + +namespace intel_opae_mmd { + +enum dma_mode { stand_by = 0x0, host_to_ddr = 0x1, ddr_to_host = 0x2, ddr_to_ddr = 0x3 }; + +struct dma_descriptor_t { + uint64_t src_address; + uint64_t dest_address; + uint32_t len; + uint32_t control; +}; + +class mmd_dma final { + public: + mmd_dma(fpga_handle fpga_handle_arg, int mmd_handle); + ~mmd_dma(); + + bool initialized() { return m_initialized; } + + int fpga_to_host(void *host_addr, uint64_t dev_src, size_t size); + int host_to_fpga(const void *host_addr, uint64_t dev_dest, size_t size); + int dma_transfer(uint64_t dev_src, uint64_t dev_dest, int len, dma_mode descriptor_mode); + fpga_result _ase_host_to_fpga(uint64_t dev_dest, const void *src_ptr, uint64_t count); + fpga_result _ase_fpga_to_host(uint64_t dev_dest, void *host_ptr, uint64_t count); + mmd_dma(mmd_dma &other) = delete; + mmd_dma &operator=(const mmd_dma &other) = delete; + + private: + // Helper functions + int send_descriptor(uint64_t mmio_dst, dma_descriptor_t desc); + // Member variables + bool m_initialized; + fpga_handle m_fpga_handle; + + // Shared buffer in host memory + uint64_t *dma_buf_ptr = NULL; + // Workspace ID used by OPAE to identify buffer + uint64_t dma_buf_wsid; + // IO virtual address + uint64_t dma_buf_iova; +}; + +}; // namespace intel_opae_mmd + +#endif // 
MMD_DMA_H_ diff --git a/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_helper.cpp b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_helper.cpp new file mode 100644 index 0000000..4af482a --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_helper.cpp @@ -0,0 +1,163 @@ +// (c) 1992-2024 Intel Corporation. +// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words +// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. +// and/or other countries. Other marks and brands may be claimed as the property +// of others. See Trademarks on intel.com for full list of Intel trademarks or +// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) +// Your use of Intel Corporation's design tools, logic functions and other +// software and tools, and its AMPP partner logic functions, and any output +// files any of the foregoing (including device programming or simulation +// files), and any associated documentation or information are expressly subject +// to the terms and conditions of the Altera Program License Subscription +// Agreement, Intel MegaCore Function License Agreement, or other applicable +// license agreement, including, without limitation, that your use is for the +// sole purpose of programming logic devices manufactured by Intel and sold by +// Intel or its authorized distributors. Please refer to the applicable +// agreement for further details. 
+
+#include "mmd_helper.h"
+#include <inttypes.h>
+
+namespace mmd_helper {
+
+// Read `size` bytes of MMIO starting at `mmio_addr` into `host_addr`.
+// Uses 64-bit reads while possible, then 32-bit reads, then one final 32-bit
+// read that is partially memcpy'd for a 1-3 byte tail.  mmio_addr must be
+// 4-byte aligned (OPAE restriction).  Returns 0 on success, -1 on failure.
+int read_mmio(fpga_handle mmio_handle, void *host_addr, size_t mmio_addr, size_t size) {
+  fpga_result res = FPGA_OK;
+
+  MMD_DEBUG("DEBUG LOG : Device::read_mmio start: host_addr : %p\t mmio_addr : 0x%zx\t size : 0x%zx\n",
+            host_addr,
+            mmio_addr,
+            size);
+
+  if (mmio_addr % 4 != 0) {
+    // Fixed message typo: "ead_mmio" -> "read_mmio"
+    MMD_DEBUG("DEBUG LOG : read_mmio function doesn't support non 4 Byte aligned mmio_addr due to OPAE\n");
+    return -1;
+  }
+
+  uint64_t *host_addr64 = static_cast<uint64_t *>(host_addr);
+
+  // 8-byte chunks first.
+  while (size >= 8) {
+    MMD_DEBUG("DEBUG LOG : Using fpgaReadMMIO64() host_addr : %p\t mmio_addr : 0x%zx\t size : 0x8\n",
+              host_addr64,
+              mmio_addr);
+    res = fpgaReadMMIO64(mmio_handle, 0, mmio_addr, host_addr64);
+    if (res != FPGA_OK) {
+      MMD_DEBUG(
+          "DEBUG LOG : Error in read_mmio() host_addr : %p\t mmio_addr : 0x%zx\t size : 0x8\n", host_addr64, mmio_addr);
+      return -1;
+    }
+    MMD_DEBUG("DEBUG LOG : the host_addr64 value is ");
+    MMD_DEBUG("%" PRIu64 "\n", *host_addr64);
+    host_addr64 += 1;
+    mmio_addr += 8;
+    size -= 8;
+  }
+
+  // Then 4-byte chunks.
+  uint32_t *host_addr32 = reinterpret_cast<uint32_t *>(host_addr64);
+  while (size >= 4) {
+    MMD_DEBUG("DEBUG LOG : Using fpgaReadMMIO32() host_addr : %p\t mmio_addr : 0x%zx\t size : 0x4\n",
+              host_addr32,
+              mmio_addr);
+    res = fpgaReadMMIO32(mmio_handle, 0, mmio_addr, host_addr32);
+    if (res != FPGA_OK) {
+      MMD_DEBUG(
+          "DEBUG LOG : Error in read_mmio() host_addr : %p\t mmio_addr : 0x%zx\t size : 0x4\n", host_addr32, mmio_addr);
+      return -1;
+    }
+    host_addr32 += 1;
+    mmio_addr += 4;
+    size -= 4;
+  }
+
+  // 1-3 byte tail: read a full 32-bit word and copy only `size` bytes out.
+  if (size > 0) {
+    uint32_t read_data;
+    MMD_DEBUG("DEBUG LOG : Using fpgaReadMMIO32() host_addr : %p\t mmio_addr : 0x%zx\t size : 0x%zx\n",
+              host_addr,
+              mmio_addr,
+              size);
+    res = fpgaReadMMIO32(mmio_handle, 0, mmio_addr, &read_data);
+    if (res != FPGA_OK) {
+      MMD_DEBUG("DEBUG LOG : Error in read_mmio() host_addr : %p\t mmio_addr : 0x%zx\t size : 0x%zx\n",
+                host_addr,
+                mmio_addr,
+                size);
+      MMD_DEBUG("result is %d \n", res);
+      return -1;
+    }
+
+    memcpy(host_addr32, &read_data, size);
+  }
+
+  return res;
+}
+
+// Write `size` bytes from `host_addr` to MMIO starting at `mmio_addr`.
+// Mirrors read_mmio: 64-bit writes, then 32-bit writes, then a
+// read-modify-write of the final partial 32-bit word.
+// Returns 0 on success, -1 on failure.
+int write_mmio(fpga_handle mmio_handle, const void *host_addr, size_t mmio_addr, size_t size) {
+  fpga_result res = FPGA_OK;
+
+  MMD_DEBUG("DEBUG LOG : Device::write_mmio start: host_addr : %p\t mmio_addr : 0x%zx\t size : 0x%zx\n",
+            host_addr,
+            mmio_addr,
+            size);
+
+  const uint64_t *host_addr64 = static_cast<const uint64_t *>(host_addr);
+  while (size >= 8) {
+    MMD_DEBUG("DEBUG LOG : Using fpgaWriteMMIO64() host_addr : %p\t mmio_addr : 0x%zx\t \n",
+              host_addr64,
+              mmio_addr);
+    res = fpgaWriteMMIO64(mmio_handle, 0, mmio_addr, *host_addr64);
+    if (res != FPGA_OK) {
+      MMD_DEBUG("DEBUG LOG : Error in write_mmio() host_addr : %p\t mmio_addr : 0x%zx\t \n",
+                host_addr64,
+                mmio_addr);
+      return -1;
+    }
+    host_addr64 += 1;
+    mmio_addr += 8;
+    size -= 8;
+  }
+
+  const uint32_t *host_addr32 = reinterpret_cast<const uint32_t *>(host_addr64);
+
+  while (size >= 4) {
+    MMD_DEBUG("DEBUG LOG : Using fpgaWriteMMIO32() host_addr : %p\t mmio_addr : 0x%zx\t \n",
+              host_addr32,
+              mmio_addr);
+    res = fpgaWriteMMIO32(mmio_handle, 0, mmio_addr, *host_addr32);
+    if (res != FPGA_OK) {
+      MMD_DEBUG("DEBUG LOG : Error in write_mmio() host_addr : %p\t mmio_addr : 0x%zx\t\n",
+                host_addr32,
+                mmio_addr);
+      return -1;
+    }
+    host_addr32 += 1;
+    mmio_addr += 4;
+    size -= 4;
+  }
+
+  while (size > 0) {
+    MMD_DEBUG("DEBUG LOG : Using fpgaWriteMMIO32() host_addr : %p\t mmio_addr : 0x%zx\t size : 0x%zx\n",
+              host_addr32,
+              mmio_addr,
+              size);
+    uint32_t tmp_data32 = 0;
+    // NOTE(review): this read-back result is not checked - confirm intended.
+    fpgaReadMMIO32(mmio_handle, 0, mmio_addr, &tmp_data32);  // First read the data back
+    size_t chunk_size = (size >= 4) ?
4 : size; + + memcpy(&tmp_data32, host_addr32, chunk_size); // Apply our data overlay + + res = fpgaWriteMMIO32(mmio_handle, 0, mmio_addr, tmp_data32); + if (res != FPGA_OK) { + MMD_DEBUG("DEBUG LOG : Error in write_mmio() host_addr : %p\t mmio_addr : 0x%zx\t size : 0x%zx\n", + host_addr32, + mmio_addr, + size); + return -1; + } + host_addr32 += 1; + mmio_addr += chunk_size; + size -= chunk_size; + } + + return 0; +} + +}; // namespace mmd_helper diff --git a/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_helper.h b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_helper.h new file mode 100644 index 0000000..b7e2667 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_helper.h @@ -0,0 +1,41 @@ +// (c) 1992-2024 Intel Corporation. +// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words +// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. +// and/or other countries. Other marks and brands may be claimed as the property +// of others. See Trademarks on intel.com for full list of Intel trademarks or +// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) +// Your use of Intel Corporation's design tools, logic functions and other +// software and tools, and its AMPP partner logic functions, and any output +// files any of the foregoing (including device programming or simulation +// files), and any associated documentation or information are expressly subject +// to the terms and conditions of the Altera Program License Subscription +// Agreement, Intel MegaCore Function License Agreement, or other applicable +// license agreement, including, without limitation, that your use is for the +// sole purpose of programming logic devices manufactured by Intel and sold by +// Intel or its authorized distributors. Please refer to the applicable +// agreement for further details. 
+ +#ifndef MMD_HELPER_H +#define MMD_HELPER_H + +#include <opae/fpga.h> +#include <stdarg.h> + +inline void MMD_DEBUG(const char *format, ...) { + if (std::getenv("MMD_ENABLE_DEBUG")) { + va_list arglist; + va_start(arglist, format); + vprintf(format, arglist); + va_end(arglist); + fflush(stdout); + } +} + +namespace mmd_helper { + +int read_mmio(fpga_handle mmio_handle, void *host_addr, size_t mmio_addr, size_t size); +int write_mmio(fpga_handle mmio_handle, const void *host_addr, size_t mmio_addr, size_t size); + +}; // namespace mmd_helper + +#endif // MMD_HELPER_H diff --git a/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/include/aocl_mmd.h b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/include/aocl_mmd.h new file mode 100644 index 0000000..16992da --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/include/aocl_mmd.h @@ -0,0 +1,377 @@ +// Copyright 2022 Intel Corporation +// SPDX-License-Identifier: MIT + +#ifndef AOCL_MMD_H +#define AOCL_MMD_H + +/* TODO: this file comes from OpenCL SDK and should be formatted there first */ +/* clang-format off */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* Support for memory mapped ACL devices. + * + * Typical API lifecycle, from the perspective of the caller. + * + * 1. aocl_mmd_open must be called first, to provide a handle for further + * operations. + * + * 2. The interrupt and status handlers must be set. + * + * 3. Read and write operations are performed. + * + * 4. aocl_mmd_close may be called to shut down the device. No further + * operations are permitted until a subsequent aocl_mmd_open call. + * + * aocl_mmd_get_offline_info can be called anytime including before + * open. aocl_mmd_get_info can be called anytime between open and close. 
+ */ + +// #ifndef AOCL_MMD_CALL +// #if defined(_WIN32) +// #define AOCL_MMD_CALL __declspec(dllimport) +// #else +// #define AOCL_MMD_CALL +// #endif +// #endif + +#ifndef AOCL_MMD_CALL +#if defined(_WIN32) +#define AOCL_MMD_CALL __declspec(dllimport) +#else +#define AOCL_MMD_CALL __attribute__((visibility ("default"))) +#endif +#endif + +#ifndef WEAK +#if defined(_WIN32) +#define WEAK +#else +#define WEAK __attribute__((weak)) +#endif +#endif + +#ifdef __cplusplus +#include <cstddef> //size_t +#else +#include <stddef.h> //size_t +#endif + +/* The MMD API's version - the runtime expects this string when + * AOCL_MMD_VERSION is queried. This changes only if the API has changed */ +#define AOCL_MMD_VERSION_STRING "20.3" + +/* Memory types that can be supported - bitfield. Other than physical memory + * these types closely align with the OpenCL SVM types. + * + * AOCL_MMD_PHYSICAL_MEMORY - The vendor interface includes IP to communicate + * directly with physical memory such as DDR, QDR, etc. + * + * AOCL_MMD_SVM_COARSE_GRAIN_BUFFER - The vendor interface includes support for + * caching SVM pointer data and requires explicit function calls from the user + * to synchronize the cache between the host processor and the FPGA. This level + * of SVM is not currently supported by Altera except as a subset of + * SVM_FINE_GAIN_SYSTEM support. + * + * AOCL_MMD_SVM_FINE_GRAIN_BUFFER - The vendor interface includes support for + * caching SVM pointer data and requires additional information from the user + * and/or host runtime that can be collected during pointer allocation in order + * to synchronize the cache between the host processor and the FPGA. Once this + * additional data is provided for an SVM pointer, the vendor interface handles + * cache synchronization between the host processor & the FPGA automatically. + * This level of SVM is not currently supported by Altera except as a subset + * of SVM_FINE_GRAIN_SYSTEM support. 
+ * + * AOCL_MMD_SVM_FINE_GRAIN_SYSTEM - The vendor interface includes support for + * caching SVM pointer data and does not require any additional information to + * synchronize the cache between the host processor and the FPGA. The vendor + * interface handles cache synchronization between the host processor & the + * FPGA automatically for all SVM pointers. This level of SVM support is + * currently under development by Altera and some features may not be fully + * supported. + */ +#define AOCL_MMD_PHYSICAL_MEMORY (1 << 0) +#define AOCL_MMD_SVM_COARSE_GRAIN_BUFFER (1 << 1) +#define AOCL_MMD_SVM_FINE_GRAIN_BUFFER (1 << 2) +#define AOCL_MMD_SVM_FINE_GRAIN_SYSTEM (1 << 3) + +/* program modes - bitfield + * + * AOCL_MMD_PROGRAM_PRESERVE_GLOBAL_MEM - preserve contents of global memory + * when this bit is set to 1. If programming can't occur without preserving + * global memory contents, the program function must fail, in which case the + * runtime may re-invoke program with this bit set to 0, allowing programming + * to occur even if doing so destroys global memory contents. + * + * more modes are reserved for stacking on in the future + */ +#define AOCL_MMD_PROGRAM_PRESERVE_GLOBAL_MEM (1 << 0) +typedef int aocl_mmd_program_mode_t; + + +typedef void* aocl_mmd_op_t; + +typedef struct { + unsigned lo; /* 32 least significant bits of time value. */ + unsigned hi; /* 32 most significant bits of time value. */ +} aocl_mmd_timestamp_t; + + +/* Defines the set of characteristics that can be probed about the board before + * opening a device. The type of data returned by each is specified in + * parentheses in the adjacent comment. + * + * AOCL_MMD_NUM_BOARDS and AOCL_MMD_BOARD_NAMES + * These two fields can be used to implement multi-device support. The MMD + * layer may have a list of devices it is capable of interacting with, each + * identified with a unique name. 
The length of the list should be returned + * in AOCL_MMD_NUM_BOARDS, and the names of these devices returned in + * AOCL_MMD_BOARD_NAMES. The OpenCL runtime will try to call aocl_mmd_open + * for each board name returned in AOCL_MMD_BOARD_NAMES. + */ +typedef enum { + AOCL_MMD_VERSION = 0, /* Version of MMD (char*)*/ + AOCL_MMD_NUM_BOARDS = 1, /* Number of candidate boards (int)*/ + AOCL_MMD_BOARD_NAMES = 2, /* Names of boards available delimiter=; (char*)*/ + AOCL_MMD_VENDOR_NAME = 3, /* Name of vendor (char*) */ + AOCL_MMD_VENDOR_ID = 4, /* An integer ID for the vendor (int) */ + AOCL_MMD_USES_YIELD = 5, /* 1 if yield must be called to poll hw (int) */ + /* The following can be combined in a bit field: + * AOCL_MMD_PHYSICAL_MEMORY, AOCL_MMD_SVM_COARSE_GRAIN_BUFFER, AOCL_MMD_SVM_FINE_GRAIN_BUFFER, AOCL_MMD_SVM_FINE_GRAIN_SYSTEM. + * Prior to 14.1, all existing devices supported physical memory and no types of SVM memory, so this + * is the default when this operation returns '0' for board MMDs with a version prior to 14.1 + */ + AOCL_MMD_MEM_TYPES_SUPPORTED = 6, +} aocl_mmd_offline_info_t; + + +/** Possible capabilities to return from AOCL_MMD_*_MEM_CAPABILITIES query */ +/** + * If not set allocation function is not supported, even if other capabilities are set. + */ +#define AOCL_MMD_MEM_CAPABILITY_SUPPORTED (1 << 0) +/** + * Supports atomic access to the memory by either the host or device. + */ +#define AOCL_MMD_MEM_CAPABILITY_ATOMIC (1 << 1) +/** + * Supports concurrent access to the memory either by host or device if the + * accesses are not on the same block. Block granularity is defined by + * AOCL_MMD_*_MEM_CONCURRENT_GRANULARITY., blocks are aligned to this + * granularity + */ +#define AOCL_MMD_MEM_CAPABILITY_CONCURRENT (1 << 2) +/** + * Memory can be accessed by multiple devices at the same time. + */ +#define AOCL_MMD_MEM_CAPABILITY_P2P (1 << 3) + + +/* Defines the set of characteristics that can be probed about the board after + * opening a device. 
This can involve communication to the device + * + * AOCL_MMD_NUM_KERNEL_INTERFACES - The number of kernel interfaces, usually 1 + * + * AOCL_MMD_KERNEL_INTERFACES - the handle for each kernel interface. + * param_value will have size AOCL_MMD_NUM_KERNEL_INTERFACES * sizeof int + * + * AOCL_MMD_PLL_INTERFACES - the handle for each pll associated with each + * kernel interface. If a kernel interface is not clocked by acl_kernel_clk + * then return -1 + * + * */ +typedef enum { + AOCL_MMD_NUM_KERNEL_INTERFACES = 1, /* Number of Kernel interfaces (int) */ + AOCL_MMD_KERNEL_INTERFACES = 2, /* Kernel interface (int*) */ + AOCL_MMD_PLL_INTERFACES = 3, /* Kernel clk handles (int*) */ + AOCL_MMD_MEMORY_INTERFACE = 4, /* Global memory handle (int) */ + AOCL_MMD_TEMPERATURE = 5, /* Temperature measurement (float) */ + AOCL_MMD_PCIE_INFO = 6, /* PCIe information (char*) */ + AOCL_MMD_BOARD_NAME = 7, /* Name of board (char*) */ + AOCL_MMD_BOARD_UNIQUE_ID = 8, /* Unique ID of board (int) */ + AOCL_MMD_CONCURRENT_READS = 9, /* # of parallel reads; 1 is serial*/ + AOCL_MMD_CONCURRENT_WRITES = 10, /* # of parallel writes; 1 is serial*/ + AOCL_MMD_CONCURRENT_READS_OR_WRITES = 11, /* total # of concurrent operations read + writes*/ + AOCL_MMD_MIN_HOST_MEMORY_ALIGNMENT = 12, /* Min alignment that the ASP supports for host allocations (size_t) */ + AOCL_MMD_HOST_MEM_CAPABILITIES = 13, /* Capabilities of aocl_mmd_host_alloc() (unsigned int)*/ + AOCL_MMD_SHARED_MEM_CAPABILITIES = 14, /* Capabilities of aocl_mmd_shared_alloc (unsigned int)*/ + AOCL_MMD_DEVICE_MEM_CAPABILITIES = 15, /* Capabilities of aocl_mmd_device_alloc (unsigned int)*/ + AOCL_MMD_HOST_MEM_CONCURRENT_GRANULARITY = 16, /*(size_t)*/ + AOCL_MMD_SHARED_MEM_CONCURRENT_GRANULARITY = 17, /*(size_t)*/ + AOCL_MMD_DEVICE_MEM_CONCURRENT_GRANULARITY = 18, /*(size_t)*/ +} aocl_mmd_info_t; + +typedef struct { + unsigned long long int exception_type; + void *user_private_info; + size_t user_cb; +}aocl_mmd_interrupt_info; + +typedef 
void (*aocl_mmd_interrupt_handler_fn)( int handle, void* user_data ); +typedef void (*aocl_mmd_device_interrupt_handler_fn)( int handle, aocl_mmd_interrupt_info* data_in, void* user_data ); +typedef void (*aocl_mmd_status_handler_fn)( int handle, void* user_data, aocl_mmd_op_t op, int status ); + + +/* Get information about the board using the enum aocl_mmd_offline_info_t for + * offline info (called without a handle), and the enum aocl_mmd_info_t for + * info specific to a certain board. + * Arguments: + * + * requested_info_id - a value from the aocl_mmd_offline_info_t enum + * + * param_value_size - size of the param_value field in bytes. This should + * match the size of the return type expected as indicated in the enum + * definition. For example, the AOCL_MMD_TEMPERATURE returns a float, so + * the param_value_size should be set to sizeof(float) and you should + * expect the same number of bytes returned in param_size_ret. + * + * param_value - pointer to the variable that will receive the returned info + * + * param_size_ret - receives the number of bytes of data actually returned + * + * Returns: a negative value to indicate error. + */ +AOCL_MMD_CALL int aocl_mmd_get_offline_info( + aocl_mmd_offline_info_t requested_info_id, + size_t param_value_size, + void* param_value, + size_t* param_size_ret ) WEAK; + +AOCL_MMD_CALL int aocl_mmd_get_info( + int handle, + aocl_mmd_info_t requested_info_id, + size_t param_value_size, + void* param_value, + size_t* param_size_ret ) WEAK; + +/* Open and initialize the named device. + * + * The name is typically one specified by the AOCL_MMD_BOARD_NAMES offline + * info. + * + * Arguments: + * name - open the board with this name (provided as a C-style string, + * i.e. NUL terminated ASCII.) + * + * Returns: the non-negative integer handle for the board, otherwise a + * negative value to indicate error. 
Upon receiving the error, the OpenCL + * runtime will proceed to open other known devices, hence the MMD mustn't + * exit the application if an open call fails. + */ +AOCL_MMD_CALL int aocl_mmd_open(const char *name) WEAK; + +/* Close an opened device, by its handle. + * Returns: 0 on success, negative values on error. + */ +AOCL_MMD_CALL int aocl_mmd_close(int handle) WEAK; + +/* Set the interrupt handler for the opened device. + * The interrupt handler is called whenever the client needs to be notified + * of an asynchronous event signaled by the device internals. + * For example, the kernel has completed or is stalled. + * + * Important: Interrupts from the kernel must be ignored until this handler is + * set + * + * Arguments: + * fn - the callback function to invoke when a kernel interrupt occurs + * user_data - the data that should be passed to fn when it is called. + * + * Returns: 0 if successful, negative on error + */ +AOCL_MMD_CALL int aocl_mmd_set_interrupt_handler( int handle, aocl_mmd_interrupt_handler_fn fn, void* user_data ) WEAK; + +/* Set the operation status handler for the opened device. + * The operation status handler is called with + * status 0 when the operation has completed successfully. + * status negative when the operation completed with errors. + * + * Arguments: + * fn - the callback function to invoke when a status update is to be + * performed. + * user_data - the data that should be passed to fn when it is called. + * + * Returns: 0 if successful, negative on error + */ +AOCL_MMD_CALL int aocl_mmd_set_status_handler( int handle, aocl_mmd_status_handler_fn fn, void* user_data ) WEAK; + +/* Read, write and copy operations on a single interface. + * If op is NULL + * - Then these calls must block until the operation is complete. + * - The status handler is not called for this operation. 
+ * + * If op is non-NULL, then: + * - These may be non-blocking calls + * - The status handler must be called upon completion, with status 0 + * for success, and a negative value for failure. + * + * Arguments: + * op - the operation object used to track this operations progress + * + * len - the size in bytes to transfer + * + * src - the host buffer being read from + * + * dst - the host buffer being written to + * + * mmd_interface - the handle to the interface being accessed. E.g. To + * access global memory this handle will be whatever is returned by + * aocl_mmd_get_info when called with AOCL_MMD_MEMORY_INTERFACE. + * + * offset/src_offset/dst_offset - the byte offset within the interface that + * the transfer will begin at. + * + * The return value is 0 if the operation launch was successful, and + * negative otherwise. + */ +AOCL_MMD_CALL int aocl_mmd_read( + int handle, + aocl_mmd_op_t op, + size_t len, + void* dst, + int mmd_interface, size_t offset) WEAK; +AOCL_MMD_CALL int aocl_mmd_write( + int handle, + aocl_mmd_op_t op, + size_t len, + const void* src, + int mmd_interface, size_t offset ) WEAK; + +/** Error values*/ +#define AOCL_MMD_ERROR_SUCCESS 0 +#define AOCL_MMD_ERROR_INVALID_HANDLE -1 +#define AOCL_MMD_ERROR_OUT_OF_MEMORY -2 +#define AOCL_MMD_ERROR_UNSUPPORTED_ALIGNMENT -3 +#define AOCL_MMD_ERROR_UNSUPPORTED_PROPERTY -4 +#define AOCL_MMD_ERROR_INVALID_POINTER -5 +#define AOCL_MMD_ERROR_INVALID_MIGRATION_SIZE -6 + +// CoreDLA modifications +// To support multiple different FPGA boards, anything board specific must be implemented in a +// board-specific MMD instead of the CoreDLA runtime layer. 
+#ifdef DLA_MMD +#include <cstdint> +// Query functions to get board-specific values +AOCL_MMD_CALL int dla_mmd_get_max_num_instances() WEAK; +AOCL_MMD_CALL uint64_t dla_mmd_get_ddr_size_per_instance() WEAK; +AOCL_MMD_CALL double dla_mmd_get_ddr_clock_freq() WEAK; + +// Wrappers around CSR and DDR reads and writes to abstract away board-specific offsets +AOCL_MMD_CALL int dla_mmd_csr_write(int handle, int instance, uint64_t addr, const uint32_t* data) WEAK; +AOCL_MMD_CALL int dla_mmd_csr_read(int handle, int instance, uint64_t addr, uint32_t* data) WEAK; +AOCL_MMD_CALL int dla_mmd_ddr_write(int handle, int instance, uint64_t addr, uint64_t length, const void* data) WEAK; +AOCL_MMD_CALL int dla_mmd_ddr_read(int handle, int instance, uint64_t addr, uint64_t length, void* data) WEAK; + +// Get the clk_dla PLL clock frequency in MHz, returns a negative value if there is an error +AOCL_MMD_CALL double dla_mmd_get_coredla_clock_freq(int handle) WEAK; + +#endif + +#ifdef __cplusplus +} +#endif + +/* clang-format on */ +#endif diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/.gitignore b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/.gitignore new file mode 100644 index 0000000..66e06bf --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/.gitignore @@ -0,0 +1,18 @@ +*~ +*# +*.marks +release_build/ +build/ +example_designs/mem_bandwidth/bin/ +example_designs/mem_bandwidth/simulation.tar.gz +example_designs/mem_bandwidth/temp_simulation/ +linux64/lib/ +linux64/libexec/diagnose +linux64/libexec/program +ase/mpf_src +*.pyc +*.swp +*.kwlp +*.kwps +temp_simulation/ +simulation.tar.gz diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/CMakeLists.txt b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/CMakeLists.txt new file mode 100644 index 0000000..28dcfa4 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/CMakeLists.txt @@ -0,0 +1,63 @@ +# (C) 2017 Intel Corporation. 
All rights reserved. +# Your use of Intel Corporation's design tools, logic functions and other +# software and tools, and its AMPP partner logic functions, and any output +# files any of the foregoing (including device programming or simulation +# files), and any associated documentation or information are expressly subject +# to the terms and conditions of the Intel Program License Subscription +# Agreement, Intel MegaCore Function License Agreement, or other applicable +# license agreement, including, without limitation, that your use is for the +# sole purpose of programming logic devices manufactured by Intel and sold by +# Intel or its authorized distributors. Please refer to the applicable +# agreement for further details. + +cmake_minimum_required(VERSION 2.8.12) +project(mmd) + +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules") + +find_package(OPAE REQUIRED) +find_package(NUMA REQUIRED) + +# DLA specific modifications made to the MMD +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDLA_MMD") + +enable_language(C ASM) + +set(ASM_OPTIONS "-x assembler-with-cpp") +if(${CMAKE_C_COMPILER_ID} STREQUAL "Clang") + set(ASM_OPTIONS "${ASM_OPTIONS} -no-integrated-as") +endif() + +set(CMAKE_ASM_FLAGS "${CFLAGS} ${ASM_OPTIONS}") + +set(MMD_SRC + ./host/ccip_mmd.cpp + ./host/ccip_mmd_device.cpp + ./host/dma_work_thread.cpp + ./host/fpga_dma.c + ./host/kernel_interrupt.cpp + ./host/mmd_dma.cpp + ./host/memcpy_s_fast.c + ./host/x86-sse2.S +) + +# Add a shared library target called intel_opae_mmd +# and build it from the MMD_SRC files +add_library(intel_opae_mmd SHARED ${MMD_SRC}) + +# Specify the include directories to be used when compiling intel_opae_mmd library +target_include_directories(intel_opae_mmd PUBLIC + ${CMAKE_CURRENT_SOURCE_DIR}/include + ) + +# Specify libraries needed when liking the intel_opae_mmd library +target_link_libraries(intel_opae_mmd + libopae-c + libnuma +) + +# Set the installation rules for the project +install(TARGETS 
intel_opae_mmd + LIBRARY DESTINATION lib + COMPONENT intel_opae_mmd +) diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/cmake/modules/FindNUMA.cmake b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/cmake/modules/FindNUMA.cmake new file mode 100644 index 0000000..c981150 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/cmake/modules/FindNUMA.cmake @@ -0,0 +1,34 @@ +# - Try to find libnuma +# Once done will define: +# +# NUMA_FOUND - system has libnuma +# NUMA_INCLUDE_DIRS - include directory with numa.h +# NUMA_LIBRARIES - link with this for libnuma + +find_path(NUMA_INCLUDE_DIRS + NAMES numa.h + PATHS + ${LIBNUMA_ROOT}/include + /usr/include + /p/psg/swip/dla/resources/numactl/2.0.16/include + + ) + +find_library(NUMA_LIBRARIES + NAMES numa + PATHS + ${LIBNUMA_ROOT}/lib + ${LIBNUMA_ROOT}/lib64 + /usr/lib + /usr/lib64 + /p/psg/swip/dla/resources/numactl/2.0.16/lib + + ) + +FIND_PACKAGE_HANDLE_STANDARD_ARGS(NUMA + REQUIRED_VARS NUMA_INCLUDE_DIRS NUMA_LIBRARIES) + +add_library(libnuma IMPORTED SHARED) +set_target_properties(libnuma PROPERTIES + IMPORTED_LOCATION ${NUMA_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES ${NUMA_INCLUDE_DIRS}) diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/cmake/modules/FindOPAE.cmake b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/cmake/modules/FindOPAE.cmake new file mode 100644 index 0000000..6395d7c --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/cmake/modules/FindOPAE.cmake @@ -0,0 +1,44 @@ +# - Try to find libintelfpga +# Once done, this will define +# +# libopae-c_FOUND - system has libopae-c +# libopae-c_INCLUDE_DIRS - the libopae-c include directories +# libopae-c_LIBRARIES - link these to use libopae-c + +find_package(PkgConfig) +pkg_check_modules(PC_OPAE QUIET opae-c) + +# Use pkg-config to get hints about paths +execute_process(COMMAND pkg-config --cflags opae-c --silence-errors + COMMAND cut -d I -f 2 + OUTPUT_VARIABLE 
OPAE-C_PKG_CONFIG_INCLUDE_DIRS) +set(OPAE-C_PKG_CONFIG_INCLUDE_DIRS "${OPAE-C_PKG_CONFIG_INCLUDE_DIRS}" CACHE STRING "Compiler flags for OPAE-C library") + +# Include dir +find_path(libopae-c_INCLUDE_DIRS + NAMES opae/fpga.h + PATHS ${LIBOPAE-C_ROOT}/include + ${OPAE-C_PKG_CONFIG_INCLUDE_DIRS} + /usr/local/include + /usr/include + ${CMAKE_EXTRA_INCLUDES}) + +# The library itself +find_library(libopae-c_LIBRARIES + NAMES opae-c + PATHS ${LIBOPAE-C_ROOT}/lib + ${LIBOPAE-C_ROOT}/lib64 + /usr/local/lib + /usr/lib + /lib + /usr/lib/x86_64-linux-gnu + ${CMAKE_EXTRA_LIBS}) + +FIND_PACKAGE_HANDLE_STANDARD_ARGS(OPAE + REQUIRED_VARS libopae-c_LIBRARIES libopae-c_INCLUDE_DIRS) + +add_library(libopae-c IMPORTED SHARED) +set_target_properties(libopae-c PROPERTIES + IMPORTED_LOCATION ${libopae-c_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES ${libopae-c_INCLUDE_DIRS}) + diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/.gitignore b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/.gitignore new file mode 100644 index 0000000..1530978 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/.gitignore @@ -0,0 +1 @@ +*.o
\ No newline at end of file diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/afu_bbb_util.h b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/afu_bbb_util.h new file mode 100644 index 0000000..6d8f9fa --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/afu_bbb_util.h @@ -0,0 +1,123 @@ +/* (C) 1992-2017 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. */ + +/** + * \fpga_dma.h + * \brief FPGA DMA BBB API Header + * + * Known Limitations + * - Driver does not support Address Span Extender + * - Implementation is not optimized for performance. 
+ * User buffer data is copied into a DMA-able buffer before the transfer + * - Supports only synchronous (blocking) transfers + */ + +#ifndef AFU_BBB_UTIL_H__ +#define AFU_BBB_UTIL_H__ + +#include <assert.h> +#include <opae/fpga.h> +#include <uuid/uuid.h> + +#define DFH_FEATURE_EOL(dfh) (((dfh >> 40) & 1) == 1) +#define DFH_FEATURE(dfh) ((dfh >> 60) & 0xf) +#define DFH_FEATURE_IS_PRIVATE(dfh) (DFH_FEATURE(dfh) == 3) +#define DFH_FEATURE_IS_BBB(dfh) (DFH_FEATURE(dfh) == 2) +#define DFH_FEATURE_IS_AFU(dfh) (DFH_FEATURE(dfh) == 1) +#define DFH_FEATURE_NEXT(dfh) ((dfh >> 16) & 0xffffff) + +static bool find_dfh_by_guid(fpga_handle afc_handle, + uint64_t find_id_l, + uint64_t find_id_h, + uint64_t *result_offset = NULL, + uint64_t *result_next_offset = NULL) { + assert(find_id_l); + assert(find_id_h); + + uint64_t offset = 0; + if (result_offset) { + offset = *result_offset; + } + uint64_t dfh = 0; + + // Limit the maximum number of DFH search iterations to avoid getting stuck + // in an infinte loop in case the DFH_FEATURE_EOL is not found. Limit of + // 5000 is very conservaitve. In practice search should terminate in 3 or + // fewer iterations. 
+ int MAX_DFH_SEARCHES = 5000; + int dfh_search_iterations = 0; + + do { + fpgaReadMMIO64(afc_handle, 0, offset, &dfh); + + int is_bbb = DFH_FEATURE_IS_BBB(dfh); + int is_afu = DFH_FEATURE_IS_AFU(dfh); + + if (is_afu || is_bbb) { + uint64_t id_l = 0; + uint64_t id_h = 0; + fpgaReadMMIO64(afc_handle, 0, offset + 8, &id_l); + fpgaReadMMIO64(afc_handle, 0, offset + 16, &id_h); + + if (find_id_l == id_l && find_id_h == id_h) { + if (result_offset) *result_offset = offset; + if (result_next_offset) *result_next_offset = DFH_FEATURE_NEXT(dfh); + return true; + } + } + offset += DFH_FEATURE_NEXT(dfh); + + dfh_search_iterations++; + if (dfh_search_iterations > MAX_DFH_SEARCHES) { + return false; + } + } while (!DFH_FEATURE_EOL(dfh)); + + return false; +} + +static bool find_dfh_by_guid(fpga_handle afc_handle, + const char *guid_str, + uint64_t *result_offset = NULL, + uint64_t *result_next_offset = NULL) { + fpga_guid guid; + + if (uuid_parse(guid_str, guid) < 0) return 0; + + uint32_t i; + uint32_t s; + + uint64_t find_id_l = 0; + uint64_t find_id_h = 0; + + // The API expects the MSB of the GUID at [0] and the LSB at [15]. + s = 64; + for (i = 0; i < 8; ++i) { + s -= 8; + find_id_h = ((find_id_h << 8) | (0xff & guid[i])); + } + + s = 64; + for (i = 0; i < 8; ++i) { + s -= 8; + find_id_l = ((find_id_l << 8) | (0xff & guid[8 + i])); + } + + return find_dfh_by_guid(afc_handle, find_id_l, find_id_h, result_offset, result_next_offset); +} + +#endif // AFU_BBB_UTIL_H__ diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/ccip_mmd.cpp b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/ccip_mmd.cpp new file mode 100644 index 0000000..b7cd06a --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/ccip_mmd.cpp @@ -0,0 +1,655 @@ +/* (C) 1992-2017 Intel Corporation. 
*/ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. */ + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <zlib.h> + +#include <cassert> +#include <iomanip> +#include <iostream> +#include <map> +#include <sstream> + +#ifdef DLA_MMD +#include <chrono> +#include <thread> +#endif + +#include <safe_string/safe_string.h> +#include "memcpy_s_fast.h" + +#include "aocl_mmd.h" +#include "ccip_mmd_device.h" + +using namespace intel_opae_mmd; + +#define ACL_DCP_ERROR_IF(COND, NEXT, ...) \ + do { \ + if (COND) { \ + printf("\nMMD ERROR: " __VA_ARGS__); \ + fflush(stdout); \ + NEXT; \ + } \ + } while (0) + +#define ACL_PKG_SECTION_DCP_GBS_GZ ".acl.gbs.gz" + +// If the MMD is loaded dynamically, destructors in the MMD will execute before the destructors in the runtime +// upon program termination. 
The DeviceMapManager guards accesses to the device/handle maps to make sure +// the runtime doesn't get to reference them after MMD destructors have been called. +// Destructor makes sure that all devices are closed at program termination regardless of what the runtime does. +// Implemented as a singleton. +class DeviceMapManager final { + public: + typedef std::map<int, CcipDevice*> t_handle_to_dev_map; + typedef std::map<uint64_t, int> t_id_to_handle_map; + + static const int SUCCESS = 0; + static const int FAILURE = -1; + + // Returns handle and device pointer to the device with the specified name + // Creates a new entry for this device if it doesn't already exist + // Return 0 on success, -1 on failure + int get_or_create_device(const char* board_name, int* handle, CcipDevice** device); + + // Return obj id based on BSP name. + uint64_t id_from_name(const char* board_name); + + // Return MMD handle based on obj id. Returned value is negative if board doesn't exist + inline int handle_from_id(uint64_t obj_id); + + // Return pointer to CCIP device based on MMD handle. 
Returned value is null if board doesn't exist + CcipDevice* device_from_handle(int handle); + + // Closes specified device if it exists + void close_device_if_exists(int handle); + + // Returns a reference to the class singleton + static DeviceMapManager& get_instance() { + static DeviceMapManager instance; + return instance; + } + + DeviceMapManager(DeviceMapManager const&) = delete; + void operator=(DeviceMapManager const&) = delete; + ~DeviceMapManager() { + // delete all allocated CcipDevice* entries + while (handle_to_dev_map->size() > 0) { + int handle = handle_to_dev_map->begin()->first; + aocl_mmd_close(handle); + } + delete handle_to_dev_map; + delete id_to_handle_map; + handle_to_dev_map = nullptr; + id_to_handle_map = nullptr; + } + + private: + DeviceMapManager() { + handle_to_dev_map = new t_handle_to_dev_map(); + id_to_handle_map = new t_id_to_handle_map(); + } + t_handle_to_dev_map* handle_to_dev_map = nullptr; + t_id_to_handle_map* id_to_handle_map = nullptr; +}; +static DeviceMapManager& device_manager = DeviceMapManager::get_instance(); + +int DeviceMapManager::get_or_create_device(const char* board_name, int* handle, CcipDevice** device) { + int _handle = CCIP_MMD_INVALID_PARAM; + CcipDevice* _device = nullptr; + + if (id_to_handle_map == nullptr || handle_to_dev_map == nullptr) { + return DeviceMapManager::FAILURE; + } + + uint64_t obj_id = id_from_name(board_name); + if (id_to_handle_map->count(obj_id) == 0) { + try { + _device = new CcipDevice(obj_id); + _handle = _device->get_mmd_handle(); + id_to_handle_map->insert({obj_id, _handle}); + handle_to_dev_map->insert({_handle, _device}); + } catch (std::runtime_error& e) { + LOG_ERR("%s\n", e.what()); + delete _device; + return DeviceMapManager::FAILURE; + } + } else { + _handle = id_to_handle_map->at(obj_id); + _device = handle_to_dev_map->at(_handle); + } + + (*handle) = _handle; + (*device) = _device; + return DeviceMapManager::SUCCESS; +} + +uint64_t DeviceMapManager::id_from_name(const char* 
board_name) { + uint64_t obj_id = 0; + if (CcipDevice::parse_board_name(board_name, obj_id)) { + return obj_id; + } else { + // TODO: add error hanlding for DeviceMapManager (make sure 0 is marked as invalid device) + return 0; + } +} + +inline int DeviceMapManager::handle_from_id(uint64_t obj_id) { + int handle = CCIP_MMD_INVALID_PARAM; + if (id_to_handle_map) { + auto it = id_to_handle_map->find(obj_id); + if (it != id_to_handle_map->end()) { + handle = it->second; + } + } + return handle; +} + +CcipDevice* DeviceMapManager::device_from_handle(int handle) { + CcipDevice* dev = nullptr; + if (handle_to_dev_map) { + auto it = handle_to_dev_map->find(handle); + if (it != handle_to_dev_map->end()) { + return it->second; + } + } + return dev; +} + +void DeviceMapManager::close_device_if_exists(int handle) { + if (handle_to_dev_map) { + if (handle_to_dev_map->count(handle) > 0) { + CcipDevice* dev = handle_to_dev_map->at(handle); + uint64_t obj_id = dev->get_fpga_obj_id(); + delete dev; + handle_to_dev_map->erase(handle); + id_to_handle_map->erase(obj_id); + } + } +} + +// Interface for checking if AFU has BSP loaded +bool ccip_mmd_bsp_loaded(const char* name) { + uint64_t obj_id = device_manager.id_from_name(name); + if (!obj_id) { + return false; + } + + int handle = device_manager.handle_from_id(obj_id); + if (handle > 0) { + CcipDevice* dev = device_manager.device_from_handle(handle); + if (dev) + return dev->bsp_loaded(); + else + return false; + } else { + bool bsp_loaded = false; + try { + CcipDevice dev(obj_id); + bsp_loaded = dev.bsp_loaded(); + } catch (std::runtime_error& e) { + LOG_ERR("%s\n", e.what()); + return false; + } + return bsp_loaded; + } +} + +static int get_offline_num_acl_boards(bool bsp_only = true) { + fpga_guid dcp_guid; + fpga_result res = FPGA_OK; + uint32_t num_matches = 0; + bool ret_err = false; + fpga_properties filter = NULL; + + if (uuid_parse(DCP_OPENCL_BSP_AFU_ID, dcp_guid) < 0) { + LOG_ERR("Error parsing guid '%s'\n", 
DCP_OPENCL_BSP_AFU_ID); + ret_err = true; + goto out; + } + + res = fpgaGetProperties(NULL, &filter); + if (res != FPGA_OK) { + LOG_ERR("Error creating properties object: %s\n", fpgaErrStr(res)); + ret_err = true; + goto out; + } + + if (bsp_only) { + res = fpgaPropertiesSetGUID(filter, dcp_guid); + if (res != FPGA_OK) { + LOG_ERR("Error setting GUID: %s\n", fpgaErrStr(res)); + ret_err = true; + goto out; + } + } + + res = fpgaPropertiesSetObjectType(filter, FPGA_ACCELERATOR); + if (res != FPGA_OK) { + LOG_ERR("Error setting object type: %s\n", fpgaErrStr(res)); + ret_err = true; + goto out; + } + + res = fpgaEnumerate(&filter, 1, NULL, 0, &num_matches); + if (res != FPGA_OK) { + LOG_ERR("Error enumerating AFCs: %s\n", fpgaErrStr(res)); + ret_err = true; + goto out; + } + +out: + if (filter) fpgaDestroyProperties(&filter); + + if (ret_err) { + return CCIP_MMD_AOCL_ERR; + } else { + return num_matches; + } +} + +bool static get_offline_board_names(std::string& boards, bool bsp_only = true) { + fpga_guid dcp_guid; + fpga_result res = FPGA_OK; + uint32_t num_matches = 0; + fpga_properties filter = nullptr; + fpga_properties prop = nullptr; + std::ostringstream board_name; + fpga_token* toks = nullptr; + uint64_t obj_id; + bool success = true; + + if (uuid_parse(DCP_OPENCL_BSP_AFU_ID, dcp_guid) < 0) { + LOG_ERR("Error parsing guid '%s'\n", DCP_OPENCL_BSP_AFU_ID); + success = false; + goto cleanup; + } + + res = fpgaGetProperties(NULL, &filter); + if (res != FPGA_OK) { + LOG_ERR("Error creating properties object: %s\n", fpgaErrStr(res)); + success = false; + goto cleanup; + } + + res = fpgaPropertiesSetObjectType(filter, FPGA_ACCELERATOR); + if (res != FPGA_OK) { + LOG_ERR("Error setting object type: %s\n", fpgaErrStr(res)); + success = false; + goto cleanup; + } + + if (bsp_only) { + res = fpgaPropertiesSetGUID(filter, dcp_guid); + if (res != FPGA_OK) { + LOG_ERR("Error setting GUID: %s\n", fpgaErrStr(res)); + success = false; + goto cleanup; + } + } + res = 
fpgaEnumerate(&filter, 1, NULL, 0, &num_matches); + if (res != FPGA_OK) { + LOG_ERR("Error enumerating AFCs: %s\n", fpgaErrStr(res)); + success = false; + goto cleanup; + } + + toks = static_cast<fpga_token*>(calloc(num_matches, sizeof(fpga_token))); + if (toks == NULL) { + LOG_ERR("Error allocating memory\n"); + success = false; + goto cleanup; + } + + res = fpgaEnumerate(&filter, 1, toks, num_matches, &num_matches); + if (res != FPGA_OK) { + LOG_ERR("Error enumerating AFCs: %s\n", fpgaErrStr(res)); + success = false; + goto cleanup; + } + + for (unsigned int i = 0; i < num_matches; i++) { + if (prop) fpgaDestroyProperties(&prop); + res = fpgaGetProperties(toks[i], &prop); + if (res == FPGA_OK) { + res = fpgaPropertiesGetObjectID(prop, &obj_id); + if (res != FPGA_OK) { + LOG_ERR("Error reading object ID: %s\n", fpgaErrStr(res)); + success = false; + break; + } + boards.append(CcipDevice::get_board_name(BSP_NAME, obj_id)); + if (i < num_matches - 1) boards.append(";"); + } else { + success = false; + LOG_ERR("Error reading properties: %s\n", fpgaErrStr(res)); + } + } + +cleanup: + if (prop) { + fpgaDestroyProperties(&prop); + } + if (filter) { + fpgaDestroyProperties(&filter); + } + if (toks) { + for (unsigned i = 0; i < num_matches; i++) { + if (toks[i]) { + fpgaDestroyToken(&toks[i]); + } + } + free(toks); + } + + return success; +} + +int aocl_mmd_yield(int handle) { + DEBUG_PRINT("* Called: aocl_mmd_yield\n"); + YIELD_DELAY(); + + CcipDevice* dev = device_manager.device_from_handle(handle); + assert(dev); + if (dev) { + return dev->yield(); + } + + return 0; +} + +// Macros used for acol_mmd_get_offline_info and aocl_mmd_get_info +#define RESULT_INT(X) \ + { \ + *((int*)param_value) = X; \ + if (param_size_ret) *param_size_ret = sizeof(int); \ + } +#define RESULT_STR(X) \ + do { \ + unsigned Xlen = strlen(X) + 1; \ + unsigned Xcpylen = (param_value_size <= Xlen) ? 
param_value_size : Xlen; \ + memcpy_s_fast((void*)param_value, param_value_size, X, Xcpylen); \ + if (param_size_ret) *param_size_ret = Xcpylen; \ + } while (0) + +int aocl_mmd_get_offline_info(aocl_mmd_offline_info_t requested_info_id, + size_t param_value_size, + void* param_value, + size_t* param_size_ret) { + // aocl_mmd_get_offline_info can be called many times by the runtime + // and it is expensive to query the system. Only compute values first + // time aocl_mmd_get_offline_info called future iterations use saved results + static bool initialized = false; + static int mem_type_info; + static int num_acl_boards; + static std::string boards; + static bool success; + + if (!initialized) { + mem_type_info = (int)AOCL_MMD_PHYSICAL_MEMORY; + num_acl_boards = get_offline_num_acl_boards(); + success = get_offline_board_names(boards, true); + initialized = true; + } + + switch (requested_info_id) { + case AOCL_MMD_VERSION: + RESULT_STR(AOCL_MMD_VERSION_STRING); + break; + case AOCL_MMD_NUM_BOARDS: { + if (num_acl_boards >= 0) { + RESULT_INT(num_acl_boards); + } else { + return CCIP_MMD_AOCL_ERR; + } + break; + } + case AOCL_MMD_VENDOR_NAME: + RESULT_STR("Intel Corp"); + break; + case AOCL_MMD_BOARD_NAMES: { + if (success) { + RESULT_STR(boards.c_str()); + } else { + return CCIP_MMD_AOCL_ERR; + } + break; + } + case AOCL_MMD_VENDOR_ID: + RESULT_INT(0); + break; + case AOCL_MMD_USES_YIELD: + RESULT_INT(KernelInterrupt::yield_is_enabled()); + break; + case AOCL_MMD_MEM_TYPES_SUPPORTED: + RESULT_INT(mem_type_info); + break; + } + + return 0; +} + +int ccip_mmd_get_offline_board_names(size_t param_value_size, void* param_value, size_t* param_size_ret) { + std::string boards; + bool success = get_offline_board_names(boards, false); + if (success) { + RESULT_STR(boards.c_str()); + } else { + RESULT_INT(-1); + } + + return 0; +} + +int aocl_mmd_get_info( + int handle, aocl_mmd_info_t requested_info_id, size_t param_value_size, void* param_value, size_t* param_size_ret) { + 
DEBUG_PRINT("called aocl_mmd_get_info\n"); + CcipDevice* dev = device_manager.device_from_handle(handle); + if (dev == NULL) return 0; + + assert(param_value); + switch (requested_info_id) { + case AOCL_MMD_BOARD_NAME: { + std::ostringstream board_name; + board_name << "Intel PAC Platform" + << " (" << dev->get_dev_name() << ")"; + RESULT_STR(board_name.str().c_str()); + break; + } + case AOCL_MMD_NUM_KERNEL_INTERFACES: + RESULT_INT(1); + break; + case AOCL_MMD_KERNEL_INTERFACES: + RESULT_INT(AOCL_MMD_KERNEL); + break; +#ifdef SIM + case AOCL_MMD_PLL_INTERFACES: + RESULT_INT(-1); + break; +#else + case AOCL_MMD_PLL_INTERFACES: + RESULT_INT(-1); + break; +#endif + case AOCL_MMD_MEMORY_INTERFACE: + RESULT_INT(AOCL_MMD_MEMORY); + break; + case AOCL_MMD_PCIE_INFO: { + RESULT_STR(dev->get_bdf().c_str()); + break; + } + case AOCL_MMD_BOARD_UNIQUE_ID: + RESULT_INT(0); + break; + case AOCL_MMD_TEMPERATURE: { + if (param_value_size == sizeof(float)) { + float* ptr = static_cast<float*>(param_value); + *ptr = dev->get_temperature(); + if (param_size_ret) *param_size_ret = sizeof(float); + } + break; + } + case AOCL_MMD_CONCURRENT_READS: + RESULT_INT(1); + break; + case AOCL_MMD_CONCURRENT_WRITES: + RESULT_INT(1); + break; + case AOCL_MMD_CONCURRENT_READS_OR_WRITES: + RESULT_INT(2); + break; + } + return 0; +} + +#undef RESULT_INT +#undef RESULT_STR + +int aocl_mmd_set_interrupt_handler(int handle, aocl_mmd_interrupt_handler_fn fn, void* user_data) { + CcipDevice* dev = device_manager.device_from_handle(handle); + if (dev) { + dev->set_kernel_interrupt(fn, user_data); + } else { + return CCIP_MMD_AOCL_ERR; + } + return 0; +} + +int aocl_mmd_set_status_handler(int handle, aocl_mmd_status_handler_fn fn, void* user_data) { + CcipDevice* dev = device_manager.device_from_handle(handle); + if (dev) dev->set_status_handler(fn, user_data); + // TODO: handle error condition if dev null + return 0; +} + +// Host to device-global-memory write +int aocl_mmd_write(int handle, 
aocl_mmd_op_t op, size_t len, const void* src, int mmd_interface, size_t offset) { + DCP_DEBUG_MEM("\n- aocl_mmd_write: %d\t %p\t %lu\t %p\t %d\t %lu\n", handle, op, len, src, mmd_interface, offset); + CcipDevice* dev = device_manager.device_from_handle(handle); + if (dev) + return dev->write_block(op, mmd_interface, src, offset, len); + else + return -1; + // TODO: handle error condition if dev null +} + +int aocl_mmd_read(int handle, aocl_mmd_op_t op, size_t len, void* dst, int mmd_interface, size_t offset) { + DCP_DEBUG_MEM("\n+ aocl_mmd_read: %d\t %p\t %lu\t %p\t %d\t %lu\n", handle, op, len, dst, mmd_interface, offset); + CcipDevice* dev = device_manager.device_from_handle(handle); + if (dev) + return dev->read_block(op, mmd_interface, dst, offset, len); + else + return -1; + // TODO: handle error condition if dev null +} + +int aocl_mmd_open(const char* name) { + DEBUG_PRINT("Opening device: %s\n", name); + + uint64_t obj_id = device_manager.id_from_name(name); + if (!obj_id) { + return CCIP_MMD_INVALID_PARAM; + } + + int handle; + CcipDevice* dev = nullptr; + if (device_manager.get_or_create_device(name, &handle, &dev) != DeviceMapManager::SUCCESS) { + delete dev; + return CCIP_MMD_AOCL_ERR; + } + + assert(dev); + if (dev->bsp_loaded()) { + if (!dev->initialize_bsp()) { + LOG_ERR("Error initializing bsp\n"); + return CCIP_MMD_BSP_INIT_FAILED; + } + } else { + return CCIP_MMD_BSP_NOT_LOADED; + } + + return handle; +} + +int aocl_mmd_close(int handle) { + device_manager.close_device_if_exists(handle); + + return 0; +} + +// CoreDLA modifications +// To support multiple different FPGA boards, anything board specific must be implemented in a +// board-specific MMD instead of the CoreDLA runtime layer. 
+#ifdef DLA_MMD +// Query functions to get board-specific values +AOCL_MMD_CALL int dla_mmd_get_max_num_instances() { return 2; } +AOCL_MMD_CALL uint64_t dla_mmd_get_ddr_size_per_instance() { return 1ULL << 32; } +AOCL_MMD_CALL double dla_mmd_get_ddr_clock_freq() { return 266.666667; } // MHz + +// Helper functions for the wrapper functions around CSR and DDR +uint64_t dla_get_raw_csr_address(int instance, uint64_t addr) { return 0x38000 + (0x1000 * instance) + addr; } +uint64_t dla_get_raw_ddr_address(int instance, uint64_t addr) { return (1ULL << 32) * instance + addr; } + +// Wrappers around CSR and DDR reads and writes to abstract away board-specific offsets +AOCL_MMD_CALL int dla_mmd_csr_write(int handle, int instance, uint64_t addr, const uint32_t* data) { + return aocl_mmd_write(handle, NULL, sizeof(uint32_t), data, AOCL_MMD_KERNEL, dla_get_raw_csr_address(instance, addr)); +} +AOCL_MMD_CALL int dla_mmd_csr_read(int handle, int instance, uint64_t addr, uint32_t* data) { + return aocl_mmd_read(handle, NULL, sizeof(uint32_t), data, AOCL_MMD_KERNEL, dla_get_raw_csr_address(instance, addr)); +} +AOCL_MMD_CALL int dla_mmd_ddr_write(int handle, int instance, uint64_t addr, uint64_t length, const void* data) { + return aocl_mmd_write(handle, NULL, length, data, AOCL_MMD_MEMORY, dla_get_raw_ddr_address(instance, addr)); +} +AOCL_MMD_CALL int dla_mmd_ddr_read(int handle, int instance, uint64_t addr, uint64_t length, void* data) { + return aocl_mmd_read(handle, NULL, length, data, AOCL_MMD_MEMORY, dla_get_raw_ddr_address(instance, addr)); +} + +// Get the PLL clock frequency in MHz, returns a negative value if there is an error +AOCL_MMD_CALL double dla_mmd_get_coredla_clock_freq(int handle) { + constexpr uint64_t hw_timer_address = 0x37000; + const uint32_t start_bit = 1; + const uint32_t stop_bit = 2; + + // Send the start command to the hardware counter + std::chrono::high_resolution_clock::time_point time_before = std::chrono::high_resolution_clock::now(); + int 
status = aocl_mmd_write(handle, NULL, sizeof(uint32_t), &start_bit, AOCL_MMD_KERNEL, hw_timer_address); + assert(status == 0); + + // Unlikely to sleep for exactly 10 milliseconds, but it doesn't matter since we use a high resolution clock to + // determine the amount of time between the start and stop commands for the hardware counter + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + + // Send the stop command to the hardware counter + std::chrono::high_resolution_clock::time_point time_after = std::chrono::high_resolution_clock::now(); + status = aocl_mmd_write(handle, NULL, sizeof(uint32_t), &stop_bit, AOCL_MMD_KERNEL, hw_timer_address); + assert(status == 0); + + // Read back the value of the counter + uint32_t counter = 0; + status = aocl_mmd_read(handle, NULL, sizeof(uint32_t), &counter, AOCL_MMD_KERNEL, hw_timer_address); + assert(status == 0); + + // Calculate the clock frequency of the counter, which is running on clk_dla + double elapsed_seconds = std::chrono::duration_cast<std::chrono::duration<double>>(time_after - time_before).count(); + return 1.0e-6 * counter / elapsed_seconds; // 1.0e-6 is to convert to MHz +} + +#endif diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/ccip_mmd_device.cpp b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/ccip_mmd_device.cpp new file mode 100644 index 0000000..9bc055a --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/ccip_mmd_device.cpp @@ -0,0 +1,579 @@ +/* (C) 1992-2017 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. 
See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. */ + +#include <assert.h> +#include <numa.h> + +#include <unistd.h> +#include <fstream> +#include <iomanip> +#include <iostream> +#include <limits> +#include <sstream> + +#include <safe_string/safe_string.h> +#include "memcpy_s_fast.h" + +#include "ccip_mmd_device.h" + +// TODO: better encapsulation of afu_bbb_util functions +#include "afu_bbb_util.h" + +#define MMD_COPY_BUFFER_SIZE (1024 * 1024) + +#define MEM_WINDOW_BBB_GUID "72347537-7821-4125-442a-472d4b615064" +#define MEM_WINDOW_BBB_SIZE 8192 + +#define MSGDMA_BBB_GUID "ef82def7-f6ec-40fc-a914-9a35bace01ea" +#define MSGDMA_BBB_SIZE 256 + +#define NULL_DFH_BBB_GUID "da1182b1-b344-4e23-90fe-6aab12a0132f" +#define BSP_AFU_GUID "96ef4230-dafa-cb5f-18b7-9ffa2ee54aa0" + +using namespace intel_opae_mmd; + +int CcipDevice::next_mmd_handle{1}; + +std::string CcipDevice::get_board_name(std::string prefix, uint64_t obj_id) { + std::ostringstream stream; + stream << prefix << std::setbase(16) << obj_id; + return stream.str(); +} + +CcipDevice::CcipDevice(uint64_t obj_id) + : fpga_obj_id(obj_id), + 
kernel_interrupt_thread(NULL), + event_update(NULL), + event_update_user_data(NULL), + enable_set_numa(false), + fme_sysfs_temp_initialized(false), + bus(0), + device(0), + function(0), + afu_initialized(false), + bsp_initialized(false), + mmio_is_mapped(false), + afc_handle(NULL), + filter(NULL), + afc_token(NULL), + dma_ch0_dfh_offset(0), + dma_ch1_dfh_offset(0), + dma_ase_dfh_offset(0), + dma_host_to_fpga(NULL), + dma_fpga_to_host(NULL), + mmd_copy_buffer(NULL) { + // Note that this constructor is not thread-safe because next_mmd_handle + // is shared between all class instances + mmd_handle = next_mmd_handle; + if (next_mmd_handle == std::numeric_limits<int>::max()) + next_mmd_handle = 1; + else + next_mmd_handle++; + + mmd_copy_buffer = (char *)malloc(MMD_COPY_BUFFER_SIZE); + if (mmd_copy_buffer == NULL) { + throw std::runtime_error(std::string("malloc failed for mmd_copy_buffer")); + } + + fpga_result res = FPGA_OK; + uint32_t num_matches; + + res = fpgaGetProperties(NULL, &filter); + if (res != FPGA_OK) { + throw std::runtime_error(std::string("Error creating properties object: ") + std::string(fpgaErrStr(res))); + } + + res = fpgaPropertiesSetObjectType(filter, FPGA_ACCELERATOR); + if (res != FPGA_OK) { + throw std::runtime_error(std::string("Error setting object type: ") + std::string(fpgaErrStr(res))); + } + + res = fpgaPropertiesSetObjectID(filter, obj_id); + if (res != FPGA_OK) { + throw std::runtime_error(std::string("Error setting object ID: ") + std::string(fpgaErrStr(res))); + } + + res = fpgaEnumerate(&filter, 1, &afc_token, 1, &num_matches); + if (res != FPGA_OK) { + throw std::runtime_error(std::string("Error enumerating AFCs: ") + std::string(fpgaErrStr(res))); + } + + if (num_matches < 1) { + res = fpgaDestroyProperties(&filter); + throw std::runtime_error("AFC not found"); + } + + res = fpgaOpen(afc_token, &afc_handle, 0); + if (res != FPGA_OK) { + throw std::runtime_error(std::string("Error opening AFC: ") + std::string(fpgaErrStr(res))); + } 
+ + fpga_properties prop = nullptr; + res = fpgaGetProperties(afc_token, &prop); + if (res != FPGA_OK) { + throw std::runtime_error(std::string("Error reading properties: ") + std::string(fpgaErrStr(res))); + } + + if (prop) { + res = fpgaPropertiesGetBus(prop, &bus); + if (res != FPGA_OK) { + throw std::runtime_error(std::string("Error reading bus: ") + std::string(fpgaErrStr(res))); + } + res = fpgaPropertiesGetDevice(prop, &device); + if (res != FPGA_OK) { + throw std::runtime_error(std::string("Error reading device: ") + std::string(fpgaErrStr(res))); + } + res = fpgaPropertiesGetFunction(prop, &function); + if (res != FPGA_OK) { + throw std::runtime_error(std::string("Error reading function: ") + std::string(fpgaErrStr(res))); + } + fpgaDestroyProperties(&prop); + } + + initialize_fme_sysfs(); + + mmd_dev_name = get_board_name(BSP_NAME, obj_id); + afu_initialized = true; +} + +// Return true if board name parses correctly, false if it does not +// Return the parsed object_id in obj_id as an [out] parameter +bool CcipDevice::parse_board_name(const char *board_name_str, uint64_t &obj_id) { + std::string prefix(BSP_NAME); + std::string board_name(board_name_str); + + obj_id = 0; + if (board_name.length() <= prefix.length() && board_name.compare(0, prefix.length(), prefix)) { + LOG_ERR("Error parsing device name '%s'\n", board_name_str); + return false; + } + + std::string device_num_str = board_name.substr(prefix.length()); + obj_id = std::stol(device_num_str, 0, 16); + + // Assume that OPAE does not use 0 as a valid object ID. This is true for now + // but relies somewhat on an implementaion dependent feature. + assert(obj_id > 0); + return true; +} + +// Read information directly from sysfs. This is non-portable and relies on +// paths set in driver (will not interoperate between DFH driver in up-stream +// kernel and Intel driver distributed with PAC cards). 
In the future hopefully +// OPAE can provide SDK to read this information +void CcipDevice::initialize_fme_sysfs() { + const int MAX_LEN = 250; + char temp_fmepath[MAX_LEN]; + char numa_path[MAX_LEN]; + + // HACK: currently ObjectID is constructed using its lower 20 bits + // as the device minor number. The device minor number also matches + // the device ID in sysfs. This is a simple way to construct a path + // to the device FME using information that is already available (object_id). + // Eventually this code should be replaced with a direct call to OPAE C API, + // but API does not currently expose the device temperature. + int dev_num = 0xFFFFF & fpga_obj_id; + + // Path to temperature value + snprintf(temp_fmepath, + MAX_LEN, + "/sys/class/fpga/intel-fpga-dev.%d/intel-fpga-fme.%d/thermal_mgmt/temperature", + dev_num, + dev_num); + // Path to NUMA node + snprintf(numa_path, MAX_LEN, "/sys/class/fpga/intel-fpga-dev.%d/device/numa_node", dev_num); + + // Try to open the sysfs file. If open succeeds then set as initialized + // to be able to read temperature in future. If open fails then not + // initalized and skip attempt to read temperature in future. + FILE *tmp; + tmp = fopen(temp_fmepath, "r"); + if (tmp) { + fme_sysfs_temp_path = std::string(temp_fmepath); + fme_sysfs_temp_initialized = true; + fclose(tmp); + } + + // Read NUMA node and set value for future use. 
If not available set to -1 + // and disable use of NUMA setting + std::ifstream sysfs_numa_node(numa_path, std::ifstream::in); + if (sysfs_numa_node.is_open()) { + sysfs_numa_node >> fpga_numa_node; + sysfs_numa_node.close(); + if (std::stoi(fpga_numa_node) >= 0) { + enable_set_numa = true; + } else { + enable_set_numa = false; + } + } else { + enable_set_numa = false; + fpga_numa_node = "-1"; + } +} + +bool CcipDevice::find_dma_dfh_offsets() { + uint64_t dfh_offset = 0; + uint64_t next_dfh_offset = 0; + if (find_dfh_by_guid(afc_handle, MSGDMA_BBB_GUID, &dfh_offset, &next_dfh_offset)) { + dma_ch0_dfh_offset = dfh_offset; + DEBUG_PRINT("DMA CH1 offset: 0x%lX\t GUID: %s\n", dma_ch0_dfh_offset, MSGDMA_BBB_GUID); + } else { + fprintf(stderr, "Error initalizing DMA: Cannot find DMA channel 0 DFH offset\n"); + return false; + } + + dfh_offset += next_dfh_offset; + if (find_dfh_by_guid(afc_handle, MSGDMA_BBB_GUID, &dfh_offset, &next_dfh_offset)) { + dma_ch1_dfh_offset = dfh_offset; + DEBUG_PRINT("DMA CH2 offset: 0x%lX\t GUID: %s\n", dma_ch1_dfh_offset, MSGDMA_BBB_GUID); + } else { + fprintf(stderr, "Error initalizing DMA. Cannot find DMA channel 2 DFH offset\n"); + return false; + } + + dfh_offset = 0; + if (find_dfh_by_guid(afc_handle, MEM_WINDOW_BBB_GUID, &dfh_offset, &next_dfh_offset)) { + dma_ase_dfh_offset = dfh_offset; + DEBUG_PRINT("DMA ASE offset: 0x%lX\t GUID: %s\n", dma_ase_dfh_offset, MEM_WINDOW_BBB_GUID); + } else { + fprintf(stderr, "Error initalizing DMA. 
Cannot find ASE DFH offset\n"); + return false; + } + + assert(dma_ch0_dfh_offset != 0); + assert(dma_ch1_dfh_offset != 0); + assert(dma_ase_dfh_offset != 0); + assert(dma_ch0_dfh_offset != dma_ch1_dfh_offset); + + return true; +} + +bool CcipDevice::initialize_bsp() { + if (bsp_initialized) { + return true; + } + + fpga_result res = fpgaMapMMIO(afc_handle, 0, NULL); + if (res != FPGA_OK) { + LOG_ERR("Error mapping MMIO space: %s\n", fpgaErrStr(res)); + return false; + } + mmio_is_mapped = true; + + /* Reset AFC */ + res = fpgaReset(afc_handle); + if (res != FPGA_OK) { + LOG_ERR("Error resetting AFC: %s\n", fpgaErrStr(res)); + return false; + } + AFU_RESET_DELAY(); + + // DMA performance is heavily dependent on the memcpy operation that transfers + // data from user allocated buffer to the pinned buffer that is used for + // DMA. On some machines with multiple NUMA nodes it is critical for performance + // that the pinned buffer is located on the NUMA node as the threads that + // performs the DMA operation. + // + // The performance also improves slighlty if the DMA threads are on the same + // NUMA node as the FPGA PCI device. + // + // This code pins memory allocation to occur from FPGA NUMA node prior to + // initializing the DMA buffers. It also pins all threads in the process + // to run on this same node. 
+ struct bitmask *mask = NULL; + if (enable_set_numa) { + mask = numa_parse_nodestring(fpga_numa_node.c_str()); + numa_set_membind(mask); + int ret = numa_run_on_node_mask_all(mask); + if (ret < 0) { + fprintf(stderr, " Error setting NUMA node mask\n"); + } + } + + find_dma_dfh_offsets(); + + const int dma_ch0_interrupt_num = 0; // DMA channel 0 hardcoded to interrupt 0 + dma_host_to_fpga = new mmd_dma(afc_handle, mmd_handle, dma_ch0_dfh_offset, dma_ase_dfh_offset, dma_ch0_interrupt_num); + if (!dma_host_to_fpga->initialized()) { + LOG_ERR("Error initializing mmd dma\n"); + delete dma_host_to_fpga; + return false; + } + + const int dma_ch1_interrupt_num = 2; // DMA channel 1 hardcoded to interrupt 2 + dma_fpga_to_host = new mmd_dma(afc_handle, mmd_handle, dma_ch1_dfh_offset, dma_ase_dfh_offset, dma_ch1_interrupt_num); + if (!dma_fpga_to_host->initialized()) { + fprintf(stderr, "Error initializing mmd dma\n"); + return false; + } + + // Turn off membind restriction in order to allow future allocation to + // occur on different NUMA nodes if needed. Hypothesis is that only + // the pinned buffers are performance critical for the memcpy. Other + // allocations in the process can occur on other NUMA nodes if needed. 
+ if (enable_set_numa) { + numa_set_membind(numa_nodes_ptr); + numa_free_nodemask(mask); + } + + kernel_interrupt_thread = new KernelInterrupt(afc_handle, mmd_handle); + + if (!kernel_interrupt_thread->initialized()) { + LOG_ERR("Error initializing kernel interrupts\n"); + delete kernel_interrupt_thread; + return false; + } + + bsp_initialized = true; + return bsp_initialized; +} + +CcipDevice::~CcipDevice() { + int num_errors = 0; + if (mmd_copy_buffer) { + free(mmd_copy_buffer); + mmd_copy_buffer = NULL; + } + + if (kernel_interrupt_thread) { + delete kernel_interrupt_thread; + kernel_interrupt_thread = NULL; + } + + if (dma_host_to_fpga) { + delete dma_host_to_fpga; + dma_host_to_fpga = NULL; + } + + if (dma_fpga_to_host) { + delete dma_fpga_to_host; + dma_fpga_to_host = NULL; + } + + if (mmio_is_mapped) { + if (fpgaUnmapMMIO(afc_handle, 0)) num_errors++; + } + + if (afc_handle) { + if (fpgaClose(afc_handle) != FPGA_OK) num_errors++; + } + + if (afc_token) { + if (fpgaDestroyToken(&afc_token) != FPGA_OK) num_errors++; + } + + if (filter) { + if (fpgaDestroyProperties(&filter) != FPGA_OK) num_errors++; + } + + if (num_errors > 0) { + DEBUG_PRINT("Error freeing resources in destructor\n"); + } +} + +int CcipDevice::yield() { + if (kernel_interrupt_thread) kernel_interrupt_thread->yield(); + return 0; +} + +bool CcipDevice::bsp_loaded() { + fpga_guid dcp_guid; + fpga_guid afu_guid; + fpga_properties prop; + fpga_result res; + + if (uuid_parse(DCP_OPENCL_BSP_AFU_ID, dcp_guid) < 0) { + LOG_ERR("Error parsing guid '%s'\n", DCP_OPENCL_BSP_AFU_ID); + return false; + } + + res = fpgaGetProperties(afc_token, &prop); + if (res != FPGA_OK) { + LOG_ERR("Error reading properties: %s\n", fpgaErrStr(res)); + fpgaDestroyProperties(&prop); + return false; + } + + res = fpgaPropertiesGetGUID(prop, &afu_guid); + if (res != FPGA_OK) { + LOG_ERR("Error reading GUID\n"); + fpgaDestroyProperties(&prop); + return false; + } + + fpgaDestroyProperties(&prop); + if (uuid_compare(dcp_guid, 
afu_guid) == 0) { + return true; + } else { + return false; + } +} + +std::string CcipDevice::get_bdf() { + std::ostringstream bdf; + bdf << std::setfill('0') << std::setw(2) << unsigned(bus) << ":" << std::setfill('0') << std::setw(2) + << unsigned(device) << "." << unsigned(function); + + return bdf.str(); +} + +float CcipDevice::get_temperature() { + float temp = 0; + if (fme_sysfs_temp_initialized) { + std::ifstream sysfs_temp(fme_sysfs_temp_path, std::ifstream::in); + sysfs_temp >> temp; + sysfs_temp.close(); + } + return temp; +} + +void CcipDevice::set_kernel_interrupt(aocl_mmd_interrupt_handler_fn fn, void *user_data) { + if (kernel_interrupt_thread) { + kernel_interrupt_thread->set_kernel_interrupt(fn, user_data); + } +} + +void CcipDevice::set_status_handler(aocl_mmd_status_handler_fn fn, void *user_data) { + event_update = fn; + event_update_user_data = user_data; + dma_host_to_fpga->set_status_handler(fn, user_data); + dma_fpga_to_host->set_status_handler(fn, user_data); +} + +void CcipDevice::event_update_fn(aocl_mmd_op_t op, int status) { + event_update(mmd_handle, event_update_user_data, op, status); +} + +int CcipDevice::read_block(aocl_mmd_op_t op, int mmd_interface, void *host_addr, size_t offset, size_t size) { + fpga_result res; + + // The mmd_interface is defined as the base address of the MMIO write. Access + // to memory requires special functionality. 
Otherwise do direct MMIO read of + // base address + offset + if (mmd_interface == AOCL_MMD_MEMORY) { + res = dma_fpga_to_host->read_memory(op, static_cast<uint64_t *>(host_addr), offset, size); + } else { + res = read_mmio(host_addr, mmd_interface + offset, size); + + if (op) { + // TODO: check what status value should really be instead of just using 0 + // Also handle case when op is NULL + this->event_update_fn(op, 0); + } + } + + if (res != FPGA_OK) { + LOG_ERR("fpgaReadMMIO error: %s\n", fpgaErrStr(res)); + return -1; + } else { + return 0; + } +} + +int CcipDevice::write_block(aocl_mmd_op_t op, int mmd_interface, const void *host_addr, size_t offset, size_t size) { + fpga_result res; + + // The mmd_interface is defined as the base address of the MMIO write. Access + // to memory requires special functionality. Otherwise do direct MMIO write + if (mmd_interface == AOCL_MMD_MEMORY) { + res = dma_host_to_fpga->write_memory(op, static_cast<const uint64_t *>(host_addr), offset, size); + } else { + res = write_mmio(host_addr, mmd_interface + offset, size); + + if (op) { + // TODO: check what 'status' value should really be. Right now just + // using 0 as was done in previous CCIP MMD. 
Also handle case if op is NULL + this->event_update_fn(op, 0); + } + } + + // TODO: check what status values aocl wants and also parse the result + if (res != FPGA_OK) { + LOG_ERR("fpgaWriteMMIO error: %s\n", fpgaErrStr(res)); + return -1; + } else { + return 0; + } +} + +fpga_result CcipDevice::read_mmio(void *host_addr, size_t mmio_addr, size_t size) { + fpga_result res = FPGA_OK; + + DCP_DEBUG_MEM("read_mmio start: %p\t %lx\t %lu\n", host_addr, mmio_addr, size); + + // HACK: need extra delay for opencl sw reset + if (mmio_addr == KERNEL_SW_RESET_BASE) OPENCL_SW_RESET_DELAY(); + + uint64_t *host_addr64 = static_cast<uint64_t *>(host_addr); + while (size >= 8) { + res = fpgaReadMMIO64(afc_handle, 0, mmio_addr, host_addr64); + if (res != FPGA_OK) return res; + host_addr64 += 1; + mmio_addr += 8; + size -= 8; + } + + uint32_t *host_addr32 = reinterpret_cast<uint32_t *>(host_addr64); + while (size >= 4) { + res = fpgaReadMMIO32(afc_handle, 0, mmio_addr, host_addr32); + if (res != FPGA_OK) return res; + host_addr32 += 1; + mmio_addr += 4; + size -= 4; + } + + if (size > 0) { + uint32_t read_data; + res = fpgaReadMMIO32(afc_handle, 0, mmio_addr, &read_data); + if (res != FPGA_OK) return res; + memcpy_s_fast(host_addr32, size, &read_data, size); + } + + return res; +} + +fpga_result CcipDevice::write_mmio(const void *host_addr, size_t mmio_addr, size_t size) { + fpga_result res = FPGA_OK; + + DEBUG_PRINT("write_mmio\n"); + + // HACK: need extra delay for opencl sw reset + if (mmio_addr == KERNEL_SW_RESET_BASE) OPENCL_SW_RESET_DELAY(); + + const uint64_t *host_addr64 = static_cast<const uint64_t *>(host_addr); + while (size >= 8) { + res = fpgaWriteMMIO64(afc_handle, 0, mmio_addr, *host_addr64); + if (res != FPGA_OK) return res; + host_addr64 += 1; + mmio_addr += 8; + size -= 8; + } + + const uint32_t *host_addr32 = reinterpret_cast<const uint32_t *>(host_addr64); + while (size > 0) { + uint32_t tmp_data32 = 0; + size_t chunk_size = (size >= 4) ? 
4 : size; + memcpy_s_fast(&tmp_data32, sizeof(tmp_data32), host_addr32, chunk_size); + res = fpgaWriteMMIO32(afc_handle, 0, mmio_addr, tmp_data32); + if (res != FPGA_OK) return res; + host_addr32 += 1; + mmio_addr += chunk_size; + size -= chunk_size; + } + + return res; +} diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/ccip_mmd_device.h b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/ccip_mmd_device.h new file mode 100644 index 0000000..f8088ac --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/ccip_mmd_device.h @@ -0,0 +1,187 @@ +/* (C) 1992-2017 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. 
*/ + +#ifndef _CCIP_MMD_DEVICE_H +#define _CCIP_MMD_DEVICE_H + +#include <limits.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include <string> + +#pragma push_macro("_GNU_SOURCE") +#undef _GNU_SOURCE +#define _GNU_SOURCE +#include <sched.h> +#pragma pop_macro("_GNU_SOURCE") + +#include <opae/fpga.h> +#include <uuid/uuid.h> + +#include "aocl_mmd.h" +#include "kernel_interrupt.h" +#include "mmd_dma.h" + +// Tune delay for simulation or HW. Eventually delay +// should be removed for HW, may still be needed for ASE simulation +#ifdef SIM +#define DELAY_MULTIPLIER 100 +#else +#define DELAY_MULTIPLIER 1 +#endif + +// Most AOCL_MMD_CALL functions return negative number in case of error, +// CCIP_MMD_AOCL_ERR is used to indicate an error from the MMD that is being +// returned to the runtime. Simply set to -2 for now since neither interface +// defines a meaning to return codes for errors. +#define CCIP_MMD_AOCL_ERR -1 + +// NOTE: some of the code relies on invalid handle returning -1 +// future TODO eliminate dependency on specific error values +#define CCIP_MMD_INVALID_PARAM -1 + +// Our diagnostic script relies on handle values < -1 to determine when +// a valid device is present but a functioning BSP is not loaded. +#define CCIP_MMD_BSP_NOT_LOADED -2 +#define CCIP_MMD_BSP_INIT_FAILED -3 + +// Delay settings +// TODO: Figure out why these delays are needed and +// have requirement removed (at least for HW) +#define MMIO_DELAY() +#define YIELD_DELAY() usleep(1 * DELAY_MULTIPLIER) +#define OPENCL_SW_RESET_DELAY() usleep(5000 * DELAY_MULTIPLIER) +#define AFU_RESET_DELAY() usleep(20000 * DELAY_MULTIPLIER) + +#define KERNEL_SW_RESET_BASE (AOCL_MMD_KERNEL + 0x30) + +#define DCP_OPENCL_BSP_AFU_ID "63B3779B-8BDD-4F03-9CEB-0301181D6AEF" + +#define BSP_NAME "pac_" + +// LOG ERRORS +#define CCIP_MMD_ERR_LOGGING 1 +#ifdef CCIP_MMD_ERR_LOGGING +#define LOG_ERR(...) 
fprintf(stderr, __VA_ARGS__) +#else +#define LOG_ERR(...) +#endif + +// debugging +#ifdef DEBUG +#define DEBUG_PRINT(...) fprintf(stderr, __VA_ARGS__) +#else +#define DEBUG_PRINT(...) +#endif + +#ifdef DEBUG_MEM +#define DCP_DEBUG_MEM(...) fprintf(stderr, __VA_ARGS__) +#else +#define DCP_DEBUG_MEM(...) +#endif + +enum { +#ifndef DLA_MMD // IRQ offsets no longer exist in DLA hardware (removed from board.qsys) + AOCL_IRQ_POLLING_BASE = 0x0100, // CSR to polling interrupt status + AOCL_IRQ_MASKING_BASE = 0x0108, // CSR to set/unset interrupt mask + AOCL_MMD_KERNEL = 0x4000, /* Control interface into kernel interface */ +#else + AOCL_MMD_KERNEL = 0, // CoreDLA completely removes the Opencl kernel interface, repurposed for CSRs +#endif + AOCL_MMD_MEMORY = 0x100000 /* Data interface to device memory */ +}; + +enum AfuStatu { CCIP_MMD_INVALID_ID = 0, CCIP_MMD_BSP, CCIP_MMD_AFU }; + +class CcipDevice final { + public: + CcipDevice(uint64_t); + CcipDevice(const CcipDevice &) = delete; + CcipDevice &operator=(const CcipDevice &) = delete; + ~CcipDevice(); + + static std::string get_board_name(std::string prefix, uint64_t obj_id); + static bool parse_board_name(const char *board_name, uint64_t &obj_id); + + int get_mmd_handle() { return mmd_handle; } + uint64_t get_fpga_obj_id() { return fpga_obj_id; } + std::string get_dev_name() { return mmd_dev_name; } + std::string get_bdf(); + float get_temperature(); + bool initialize_bsp(); + void set_kernel_interrupt(aocl_mmd_interrupt_handler_fn fn, void *user_data); + void set_status_handler(aocl_mmd_status_handler_fn fn, void *user_data); + int yield(); + void event_update_fn(aocl_mmd_op_t op, int status); + bool bsp_loaded(); + + int read_block(aocl_mmd_op_t op, int mmd_interface, void *host_addr, size_t dev_addr, size_t size); + + int write_block(aocl_mmd_op_t op, int mmd_interface, const void *host_addr, size_t dev_addr, size_t size); + + private: + static int next_mmd_handle; + + int mmd_handle; + uint64_t fpga_obj_id; + 
std::string mmd_dev_name; + intel_opae_mmd::KernelInterrupt *kernel_interrupt_thread; + aocl_mmd_status_handler_fn event_update; + void *event_update_user_data; + + // HACK: use the sysfs path to read temperature value and NUMA node + // this should be replaced with OPAE call once that is + // available + std::string fme_sysfs_temp_path; + std::string fpga_numa_node; + bool enable_set_numa; + bool fme_sysfs_temp_initialized; + void initialize_fme_sysfs(); + + void initialize_local_cpus_sysfs(); + + bool find_dma_dfh_offsets(); + + uint8_t bus; + uint8_t device; + uint8_t function; + + bool afu_initialized; + bool bsp_initialized; + bool mmio_is_mapped; + + fpga_handle afc_handle; + fpga_properties filter; + fpga_token afc_token; + uint64_t dma_ch0_dfh_offset; + uint64_t dma_ch1_dfh_offset; + uint64_t dma_ase_dfh_offset; + intel_opae_mmd::mmd_dma *dma_host_to_fpga; + intel_opae_mmd::mmd_dma *dma_fpga_to_host; + + char *mmd_copy_buffer; + + // Helper functions + fpga_result read_mmio(void *host_addr, size_t dev_addr, size_t size); + fpga_result write_mmio(const void *host_addr, size_t dev_addr, size_t size); +}; + +#endif // _CCIP_MMD_DEVICE_H diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/dma_work_thread.cpp b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/dma_work_thread.cpp new file mode 100644 index 0000000..30113eb --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/dma_work_thread.cpp @@ -0,0 +1,151 @@ +/* (C) 1992-2017 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. 
See Trademarks on intel.com for full list of Intel trademarks or */
+/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */
+/* Your use of Intel Corporation's design tools, logic functions and other */
+/* software and tools, and its AMPP partner logic functions, and any output */
+/* files any of the foregoing (including device programming or simulation */
+/* files), and any associated documentation or information are expressly subject */
+/* to the terms and conditions of the Altera Program License Subscription */
+/* Agreement, Intel MegaCore Function License Agreement, or other applicable */
+/* license agreement, including, without limitation, that your use is for the */
+/* sole purpose of programming logic devices manufactured by Intel and sold by */
+/* Intel or its authorized distributors. Please refer to the applicable */
+/* agreement for further details. */
+
+#include "dma_work_thread.h"
+#include <assert.h>
+#include <poll.h>
+#include <stdlib.h>
+#include <string.h>
+#include <cstdint>
+#include <iostream>
+#include <thread>
+#include "ccip_mmd_device.h"
+#include "eventfd_wrapper.h"
+#include "mmd_dma.h"
+
+using namespace intel_opae_mmd;
+
+// Construct the DMA work-thread object: create the eventfd used to wake the
+// worker, then start the worker thread running work_thread().
+// If eventfd creation fails, the object is left with m_initialized == false
+// and no thread is started (callers must check initialized()).
+dma_work_thread::dma_work_thread(mmd_dma &mmd_dma_arg)
+    : m_initialized(false),
+      m_thread_wake_event(NULL),
+      m_thread(NULL),
+      m_work_queue_mutex(),
+      m_work_queue(),
+      m_mmd_dma(mmd_dma_arg) {
+  m_thread_wake_event = new eventfd_wrapper();
+  if (!m_thread_wake_event->initialized()) return;
+
+  m_thread = new std::thread(work_thread, std::ref(*this));
+
+  m_initialized = true;
+}
+
+// Stop and join the worker thread, then release the eventfd wrapper.
+// UINT64_MAX - 1 is the shutdown sentinel recognized by work_thread().
+dma_work_thread::~dma_work_thread() {
+  // kill the thread
+  if (m_thread) {
+    // send message to thread to end it
+    m_thread_wake_event->notify(UINT64_MAX - 1);
+
+    // join with thread until it ends
+    m_thread->join();
+
+    delete m_thread;
+    m_thread = NULL;
+  }
+
+  if (m_thread_wake_event) {
+    delete m_thread_wake_event;
+    m_thread_wake_event = NULL;
+  }
+
+  m_initialized = false;
+}
+
+void dma_work_thread::work_thread(dma_work_thread &obj) { + int res; + + // get eventfd handle + int thread_signal_fd = obj.m_thread_wake_event->get_fd(); + + struct pollfd pollfd_setup; + while (1) { + pollfd_setup.fd = thread_signal_fd; + pollfd_setup.events = POLLIN; + pollfd_setup.revents = 0; + res = poll(&pollfd_setup, 1, -1); + if (res < 0) { + fprintf(stderr, "Poll error errno = %s\n", strerror(errno)); + } else if (res > 0 && pollfd_setup.revents == POLLIN) { + uint64_t count_work_items = 0; + ssize_t bytes_read = read(thread_signal_fd, &count_work_items, sizeof(count_work_items)); + if (bytes_read > 0) { + DEBUG_PRINT("Poll success. Return=%d count=%lu\n", res, count); + } else { + // TODO: the MMD should not exit. But I have a different branch + // I'm working on that will change synchronization to use + // condition variable instead of eventfd in synchronization + // within the same process. Will remove this exit() call at + // when PR for that change is submitted. + fprintf(stderr, "Error: poll failed: %s\n", bytes_read < 0 ? strerror(errno) : "zero bytes read"); + exit(-1); + } + + // Ensure count is in proper range + const unsigned long MAX_WORK_ITEMS = 1000000000; + if (count_work_items > MAX_WORK_ITEMS && count_work_items != (UINT64_MAX - 1)) { + fprintf(stderr, "Error: poll value is out of range"); + exit(-1); + } + + obj.m_work_queue_mutex.lock(); + if (obj.m_work_queue.empty() && count_work_items == UINT64_MAX - 1) { + // The maximum value of count is set when there is no work left + // The work queue must also be empty + // This thread can break out of the loop + obj.m_work_queue_mutex.unlock(); + break; + } + + std::queue<dma_work_item> items; + for (uint64_t i = 0; i < count_work_items; i++) { + // Check if there are enough jobs in the work queue as requested (count) + if (obj.m_work_queue.empty()) { + fprintf(stderr, "Poll error. 
Not enough tasks in queue."); + exit(-1); + } + dma_work_item item = obj.m_work_queue.front(); + items.push(item); + obj.m_work_queue.pop(); + } + obj.m_work_queue_mutex.unlock(); + + while (!items.empty()) { + dma_work_item item = items.front(); + obj.do_dma(item); + items.pop(); + } + } + } +} + +int dma_work_thread::enqueue_dma(dma_work_item &item) { + if (item.op) { + m_work_queue_mutex.lock(); + m_work_queue.push(item); + m_work_queue_mutex.unlock(); + // send message to thread to wake it + // setting count to 1 as only 1 job is pushed to the work queue + m_thread_wake_event->notify(1); + return 0; + } else { + // if op is not specified, it is a blocking operation and we don't use + // the thread + return do_dma(item); + } +} + +int dma_work_thread::do_dma(dma_work_item &item) { return m_mmd_dma.do_dma(item); } diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/dma_work_thread.h b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/dma_work_thread.h new file mode 100644 index 0000000..0afb036 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/dma_work_thread.h @@ -0,0 +1,73 @@ +/* (C) 1992-2017 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. 
See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. */ + +#ifndef _DMA_WORK_THREAD_H +#define _DMA_WORK_THREAD_H + +#include <opae/fpga.h> + +#include <mutex> +#include <queue> +#include <thread> + +#include "aocl_mmd.h" + +namespace intel_opae_mmd { + +// forward class definitions +class eventfd_wrapper; +class mmd_dma; + +class dma_work_item { + public: + aocl_mmd_op_t op; + uint64_t *rd_host_addr; + const uint64_t *wr_host_addr; + size_t dev_addr; + size_t size; +}; + +class dma_work_thread final { + public: + dma_work_thread(mmd_dma &mmd_dma_arg); + ~dma_work_thread(); + + bool initialized() { return m_initialized; } + + int enqueue_dma(dma_work_item &item); + int do_dma(dma_work_item &item); + + private: + static void work_thread(dma_work_thread &obj); + + bool m_initialized; + + eventfd_wrapper *m_thread_wake_event; + std::thread *m_thread; + std::mutex m_work_queue_mutex; + std::queue<dma_work_item> m_work_queue; + + mmd_dma &m_mmd_dma; + + // not used and not implemented + dma_work_thread(dma_work_thread &other); + dma_work_thread &operator=(const dma_work_thread &other); +}; // class dma_work_thread + 
+}; // namespace intel_opae_mmd + +#endif // _DMA_WORK_THREAD_H diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/eventfd_wrapper.h b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/eventfd_wrapper.h new file mode 100644 index 0000000..2de3f74 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/eventfd_wrapper.h @@ -0,0 +1,74 @@ +/* (C) 1992-2017 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. 
*/ + +#ifndef _EVENTFD_WRAPPER_H +#define _EVENTFD_WRAPPER_H + +#include <sys/eventfd.h> +#include <unistd.h> + +namespace intel_opae_mmd { + +// simple wrapper class for managing eventfd objects +class eventfd_wrapper final { + public: + eventfd_wrapper() { + m_initialized = false; + // Note: EFD_SEMAPHORE and EFD_NONBLOCK are not set + // The implementation of functions using eventfd assumes that + m_fd = eventfd(0, 0); + if (m_fd < 0) { + fprintf(stderr, "eventfd : %s", strerror(errno)); + return; + } + + m_initialized = true; + } + + ~eventfd_wrapper() { + if (m_initialized) { + if (close(m_fd) < 0) { + fprintf(stderr, "eventfd : %s", strerror(errno)); + } + } + } + + bool notify(uint64_t count) { + ssize_t res = write(m_fd, &count, sizeof(count)); + if (res < 0) { + fprintf(stderr, "eventfd : %s", strerror(errno)); + return false; + } + return true; + } + + int get_fd() { return m_fd; } + bool initialized() { return m_initialized; } + + private: + // not used and not implemented + eventfd_wrapper(eventfd_wrapper& other); + eventfd_wrapper& operator=(const eventfd_wrapper& other); + + // member varaibles + int m_fd; + int m_initialized; +}; // class eventfd_wrapper + +}; // namespace intel_opae_mmd + +#endif // _EVENTFD_WRAPPER_H diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/fpga_dma.c b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/fpga_dma.c new file mode 100644 index 0000000..6c8df30 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/fpga_dma.c @@ -0,0 +1,1313 @@ +// Copyright 2018-2020 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). 
Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. + +// This is derived from OPAE + OpenCL PAC BSP + +/** + * \fpga_dma.c + * \brief FPGA DMA User-mode driver + */ + +#include "fpga_dma.h" +#include <assert.h> +#include <errno.h> +#include <opae/fpga.h> +#include <poll.h> +#include <safe_string/safe_string.h> +#include <stdbool.h> +#include <stddef.h> +#include <stdlib.h> +#include <string.h> +#include <sys/mman.h> +#include <unistd.h> +#include "fpga_dma_internal.h" +#include "memcpy_s_fast.h" + +#ifdef SIM +#define USE_ASE +#else +// TODO: Need this until we can adequately sync MMIO R/W with pointer accesses. +// Causes module to use fpgaMMIORead32() instead of foo = *ptr; +#define USE_ASE +#endif + +#ifdef FPGA_DMA_DEBUG +static int err_cnt = 0; +#endif + +#ifdef CHECK_DELAYS +double poll_wait_count = 0; +double buf_full_count = 0; +#endif + +/* + * macro for checking return codes + */ +#define ON_ERR_GOTO(res, label, desc) \ + do { \ + if ((res) != FPGA_OK) { \ + error_print("Error %s: %s\n", (desc), fpgaErrStr(res)); \ + goto label; \ + } \ + } while (0) + +#define ON_ERR_RETURN(res, desc) \ + do { \ + if ((res) != FPGA_OK) { \ + error_print("Error %s: %s\n", (desc), fpgaErrStr(res)); \ + return (res); \ + } \ + } while (0) + +// Internal Functions + +/** + * MMIOWrite64Blk + * + * @brief Writes a block of 64-bit values to FPGA MMIO space + * @param[in] dma Handle to the FPGA DMA object + * @param[in] device FPGA address + * @param[in] host Host buffer address + * @param[in] count Size in bytes + * @return fpga_result FPGA_OK on success, return code otherwise + * + */ +static fpga_result MMIOWrite64Blk(fpga_dma_handle dma_h, uint64_t 
device, uint64_t host, uint64_t bytes) { + assert(IS_ALIGNED_QWORD(device)); + assert(IS_ALIGNED_QWORD(bytes)); + + uint64_t *haddr = (uint64_t *)host; + uint64_t i; + fpga_result res = FPGA_OK; + +#ifndef USE_ASE + volatile uint64_t *dev_addr = HOST_MMIO_64_ADDR(dma_h, device); +#endif + + debug_print("copying %lld bytes from 0x%p to 0x%p\n", (long long int)bytes, haddr, (void *)device); + for (i = 0; i < bytes / sizeof(uint64_t); i++) { +#ifdef USE_ASE + res = fpgaWriteMMIO64(dma_h->fpga_h, dma_h->mmio_num, device, *haddr); + ON_ERR_RETURN(res, "fpgaWriteMMIO64"); + haddr++; + device += sizeof(uint64_t); +#else + *dev_addr++ = *haddr++; +#endif + } + return res; +} + +/** + * MMIOWrite32Blk + * + * @brief Writes a block of 32-bit values to FPGA MMIO space + * @param[in] dma Handle to the FPGA DMA object + * @param[in] device FPGA address + * @param[in] host Host buffer address + * @param[in] count Size in bytes + * @return fpga_result FPGA_OK on success, return code otherwise + * + */ +static fpga_result MMIOWrite32Blk(fpga_dma_handle dma_h, uint64_t device, uint64_t host, uint64_t bytes) { + assert(IS_ALIGNED_DWORD(device)); + assert(IS_ALIGNED_DWORD(bytes)); + + uint32_t *haddr = (uint32_t *)host; + uint64_t i; + fpga_result res = FPGA_OK; + +#ifndef USE_ASE + volatile uint32_t *dev_addr = HOST_MMIO_32_ADDR(dma_h, device); +#endif + + debug_print("copying %lld bytes from 0x%p to 0x%p\n", (long long int)bytes, haddr, (void *)device); + for (i = 0; i < bytes / sizeof(uint32_t); i++) { +#ifdef USE_ASE + res = fpgaWriteMMIO32(dma_h->fpga_h, dma_h->mmio_num, device, *haddr); + ON_ERR_RETURN(res, "fpgaWriteMMIO32"); + haddr++; + device += sizeof(uint32_t); +#else + *dev_addr++ = *haddr++; +#endif + } + return res; +} + +/** + * MMIORead64Blk + * + * @brief Reads a block of 64-bit values from FPGA MMIO space + * @param[in] dma Handle to the FPGA DMA object + * @param[in] device FPGA address + * @param[in] host Host buffer address + * @param[in] count Size in bytes + * 
@return fpga_result FPGA_OK on success, return code otherwise + * + */ +static fpga_result MMIORead64Blk(fpga_dma_handle dma_h, uint64_t device, uint64_t host, uint64_t bytes) { + assert(IS_ALIGNED_QWORD(device)); + assert(IS_ALIGNED_QWORD(bytes)); + + uint64_t *haddr = (uint64_t *)host; + uint64_t i; + fpga_result res = FPGA_OK; + +#ifndef USE_ASE + volatile uint64_t *dev_addr = HOST_MMIO_64_ADDR(dma_h, device); +#endif + + debug_print("copying %lld bytes from 0x%p to 0x%p\n", (long long int)bytes, (void *)device, haddr); + for (i = 0; i < bytes / sizeof(uint64_t); i++) { +#ifdef USE_ASE + res = fpgaReadMMIO64(dma_h->fpga_h, dma_h->mmio_num, device, haddr); + ON_ERR_RETURN(res, "fpgaReadMMIO64"); + haddr++; + device += sizeof(uint64_t); +#else + *haddr++ = *dev_addr++; +#endif + } + return res; +} + +/** + * MMIORead32Blk + * + * @brief Reads a block of 32-bit values from FPGA MMIO space + * @param[in] dma Handle to the FPGA DMA object + * @param[in] device FPGA address + * @param[in] host Host buffer address + * @param[in] count Size in bytes + * @return fpga_result FPGA_OK on success, return code otherwise + * + */ +static fpga_result MMIORead32Blk(fpga_dma_handle dma_h, uint64_t device, uint64_t host, uint64_t bytes) { + assert(IS_ALIGNED_DWORD(device)); + assert(IS_ALIGNED_DWORD(bytes)); + + uint32_t *haddr = (uint32_t *)host; + uint64_t i; + fpga_result res = FPGA_OK; + +#ifndef USE_ASE + volatile uint32_t *dev_addr = HOST_MMIO_32_ADDR(dma_h, device); +#endif + + debug_print("copying %lld bytes from 0x%p to 0x%p\n", (long long int)bytes, (void *)device, haddr); + for (i = 0; i < bytes / sizeof(uint32_t); i++) { +#ifdef USE_ASE + res = fpgaReadMMIO32(dma_h->fpga_h, dma_h->mmio_num, device, haddr); + ON_ERR_RETURN(res, "fpgaReadMMIO32"); + haddr++; + device += sizeof(uint32_t); +#else + *haddr++ = *dev_addr++; +#endif + } + return res; +} + +// Feature type is BBB +static inline bool fpga_dma_feature_is_bbb(uint64_t dfh) { + // BBB is type 2 + return ((dfh >> 
AFU_DFH_TYPE_OFFSET) & 0xf) == FPGA_DMA_BBB; +} + +/** + * _switch_to_ase_page + * + * @brief Updates the current page of ASE to the address given + * @param[in] dma_h Handle to the FPGA DMA object + * @param[in] addr Address to which the ASE page should be switched + * @return Nothing. Side-effect is to update the current page in the DMA handle. + * + */ +static inline void _switch_to_ase_page(fpga_dma_handle dma_h, uint64_t addr) { + uint64_t requested_page = addr & ~DMA_ADDR_SPAN_EXT_WINDOW_MASK; + + if (requested_page != dma_h->cur_ase_page) { + MMIOWrite64Blk(dma_h, ASE_CNTL_BASE(dma_h), (uint64_t)&requested_page, sizeof(requested_page)); + dma_h->cur_ase_page = requested_page; + } +} + +/** + * _send_descriptor + * + * @brief Queues a DMA descriptor to the FPGA + * @param[in] dma_h Handle to the FPGA DMA object + * @param[in] desc Pointer to a descriptor structure to send + * @return fpga_result FPGA_OK on success, return code otherwise + * + */ +static fpga_result _send_descriptor(fpga_dma_handle dma_h, msgdma_ext_desc_t *desc) { + fpga_result res = FPGA_OK; + msgdma_status_t status = {0}; + + debug_print("desc.rd_address = %x\n", desc->rd_address); + debug_print("desc.wr_address = %x\n", desc->wr_address); + debug_print("desc.len = %x\n", desc->len); + debug_print("desc.wr_burst_count = %x\n", desc->wr_burst_count); + debug_print("desc.rd_burst_count = %x\n", desc->rd_burst_count); + debug_print("desc.wr_stride %x\n", desc->wr_stride); + debug_print("desc.rd_stride %x\n", desc->rd_stride); + debug_print("desc.rd_address_ext %x\n", desc->rd_address_ext); + debug_print("desc.wr_address_ext %x\n", desc->wr_address_ext); + + debug_print("SGDMA_CSR_BASE = %lx SGDMA_DESC_BASE=%lx\n", dma_h->dma_csr_base, dma_h->dma_desc_base); + +#ifdef CHECK_DELAYS + bool first = true; +#endif + do { + res = MMIORead32Blk(dma_h, CSR_STATUS(dma_h), (uint64_t)&status.reg, sizeof(status.reg)); + ON_ERR_GOTO(res, out, "MMIORead32Blk"); +#ifdef CHECK_DELAYS + if (first && 
status.st.desc_buf_full) { + buf_full_count++; + first = false; + } +#endif + } while (status.st.desc_buf_full); + + res = MMIOWrite64Blk(dma_h, dma_h->dma_desc_base, (uint64_t)desc, sizeof(*desc)); + ON_ERR_GOTO(res, out, "MMIOWrite64Blk"); + +out: + return res; +} + +/** + * _do_dma + * + * @brief Performs a DMA transaction with the FPGA + * @param[in] dma_h Handle to the FPGA DMA object + * @param[in] dst Pointer to a host or FPGA buffer to send or retrieve + * @param[in] src Pointer to a host or FPGA buffer to send or retrieve + * @param[in] count Number of bytes + * @param[in] is_last_desc True if this is the last buffer of a batch + * @param[in] type Direction of transfer + * @param[in] intr_en True means to ask for an interrupt from the FPGA + * @return fpga_result FPGA_OK on success, return code otherwise + * + */ +static fpga_result _do_dma(fpga_dma_handle dma_h, + uint64_t dst, + uint64_t src, + int count, + int is_last_desc, + fpga_dma_transfer_t type, + bool intr_en) { + msgdma_ext_desc_t desc = {0}; + fpga_result res = FPGA_OK; + int alignment_offset = 0; + int segment_size = 0; + + // src, dst and count must be 64-byte aligned + if (dst % FPGA_DMA_ALIGN_BYTES != 0 || src % FPGA_DMA_ALIGN_BYTES != 0 || count % FPGA_DMA_ALIGN_BYTES != 0) { + return FPGA_INVALID_PARAM; + } + // these fields are fixed for all DMA transfers + desc.seq_num = 0; + desc.wr_stride = 1; + desc.rd_stride = 1; + + desc.control.go = 1; + if (intr_en) + desc.control.transfer_irq_en = 1; + else + desc.control.transfer_irq_en = 0; + + // Enable "earlyreaddone" in the control field of the descriptor except the last. + // Setting early done causes the read logic to move to the next descriptor + // before the previous descriptor completes. + // This elminates a few hundred clock cycles of waiting between transfers. 
+ if (!is_last_desc) + desc.control.early_done_en = 1; + else + desc.control.early_done_en = 0; + + if (type == FPGA_TO_FPGA_MM) { + desc.rd_address = src & FPGA_DMA_MASK_32_BIT; + desc.wr_address = dst & FPGA_DMA_MASK_32_BIT; + desc.len = count; + desc.wr_burst_count = 4; + desc.rd_burst_count = 4; + desc.rd_address_ext = (src >> 32) & FPGA_DMA_MASK_32_BIT; + desc.wr_address_ext = (dst >> 32) & FPGA_DMA_MASK_32_BIT; + + res = _send_descriptor(dma_h, &desc); + ON_ERR_GOTO(res, out, "_send_descriptor"); + } + // either FPGA to Host or Host to FPGA transfer so we need to make sure the DMA transaction is aligned to the burst + // size (CCIP restriction) + else { + // need to determine if the CCIP (host) address is aligned to 4CL (256B). When 0 the CCIP address is aligned. + alignment_offset = + (type == HOST_TO_FPGA_MM) ? (src % (4 * FPGA_DMA_ALIGN_BYTES)) : (dst % (4 * FPGA_DMA_ALIGN_BYTES)); + + // not aligned to 4CL so performing a short transfer to get aligned + if (alignment_offset != 0) { + desc.rd_address = src & FPGA_DMA_MASK_32_BIT; + desc.wr_address = dst & FPGA_DMA_MASK_32_BIT; + desc.wr_burst_count = 1; + desc.rd_burst_count = 1; + desc.rd_address_ext = (src >> 32) & FPGA_DMA_MASK_32_BIT; + desc.wr_address_ext = (dst >> 32) & FPGA_DMA_MASK_32_BIT; + + // count isn't large enough to hit next 4CL boundary + if (((4 * FPGA_DMA_ALIGN_BYTES) - alignment_offset) >= count) { + segment_size = count; + count = 0; // only had to transfer count amount of data to reach the end of the provided buffer + } else { + segment_size = (4 * FPGA_DMA_ALIGN_BYTES) - alignment_offset; + src += segment_size; + dst += segment_size; + count -= segment_size; // subtract the segment size from count since the transfer below will bring us into 4CL + // alignment + desc.control.transfer_irq_en = 0; + } + + // will post short transfer to align to a 4CL (256 byte) boundary + desc.len = segment_size; + + res = _send_descriptor(dma_h, &desc); + ON_ERR_GOTO(res, out, "_send_descriptor"); + } 
+ // at this point we are 4CL (256 byte) aligned + // if there is at least 4CL (256 bytes) of data to transfer, post bursts of 4 + if (count >= (4 * FPGA_DMA_ALIGN_BYTES)) { + desc.rd_address = src & FPGA_DMA_MASK_32_BIT; + desc.wr_address = dst & FPGA_DMA_MASK_32_BIT; + desc.wr_burst_count = 4; + desc.rd_burst_count = 4; + desc.rd_address_ext = (src >> 32) & FPGA_DMA_MASK_32_BIT; + desc.wr_address_ext = (dst >> 32) & FPGA_DMA_MASK_32_BIT; + + // buffer ends on 4CL boundary + if ((count % (4 * FPGA_DMA_ALIGN_BYTES)) == 0) { + segment_size = count; + count = 0; // transfer below will move the remainder of the buffer + } + // buffers do not end on 4CL boundary so transfer only up to the last 4CL boundary leaving a segment at the end to + // finish later + else { + segment_size = count - (count % (4 * FPGA_DMA_ALIGN_BYTES)); // round count down to the nearest multiple of 4CL + src += segment_size; + dst += segment_size; + count -= segment_size; + desc.control.transfer_irq_en = 0; + } + + desc.len = segment_size; + + res = _send_descriptor(dma_h, &desc); + ON_ERR_GOTO(res, out, "_send_descriptor"); + } + // at this point we have posted all the bursts of length 4 we can but there might be 64, 128, or 192 bytes of data + // to transfer still if buffer did not end on 4CL (256 byte) boundary post short transfer to handle the remainder + if (count > 0) { + desc.rd_address = src & FPGA_DMA_MASK_32_BIT; + desc.wr_address = dst & FPGA_DMA_MASK_32_BIT; + desc.len = count; + desc.wr_burst_count = 1; + desc.rd_burst_count = 1; + desc.rd_address_ext = (src >> 32) & FPGA_DMA_MASK_32_BIT; + desc.wr_address_ext = (dst >> 32) & FPGA_DMA_MASK_32_BIT; + if (intr_en) desc.control.transfer_irq_en = 1; + // will post short transfer to move the remainder of the buffer + res = _send_descriptor(dma_h, &desc); + ON_ERR_GOTO(res, out, "_send_descriptor"); + } + + } // end of FPGA --> Host or Host --> FPGA transfer + +out: + return res; +} + +fpga_result fpgaDmaChannelOpen(fpga_handle fpga, 
uint64_t dfh_offset, int interrupt_num, fpga_dma_handle *dma_p) { + fpga_result res = FPGA_OK; + fpga_dma_handle dma_h = NULL; + int i = 0; + if (!fpga) { + return FPGA_INVALID_PARAM; + } + if (!dma_p) { + return FPGA_INVALID_PARAM; + } + // init the dma handle + dma_h = (fpga_dma_handle)malloc(sizeof(struct _dma_handle_t)); + if (!dma_h) { + return FPGA_NO_MEMORY; + } + dma_h->fpga_h = fpga; + for (i = 0; i < FPGA_DMA_MAX_BUF; i++) dma_h->dma_buf_ptr[i] = NULL; + dma_h->mmio_num = 0; + dma_h->cur_ase_page = 0xffffffffffffffffUll; + + // Discover DMA BBB by traversing the device feature list + bool dma_found = false; + +#ifndef USE_ASE + res = fpgaMapMMIO(dma_h->fpga_h, 0, (uint64_t **)&dma_h->mmio_va); + ON_ERR_GOTO(res, out, "fpgaMapMMIO"); +#endif + + dfh_feature_t dfh = {0}; + res = MMIORead64Blk(dma_h, dfh_offset, (uint64_t)&dfh, sizeof(dfh)); + ON_ERR_GOTO(res, out, "MMIORead64Blk"); + + if (fpga_dma_feature_is_bbb(dfh.dfh) && (dfh.feature_uuid_lo == FPGA_DMA_UUID_L) && + (dfh.feature_uuid_hi == FPGA_DMA_UUID_H)) { + dma_h->dma_base = dfh_offset; + dma_h->dma_csr_base = dma_h->dma_base + FPGA_DMA_CSR; + dma_h->dma_desc_base = dma_h->dma_base + FPGA_DMA_DESC; + dma_h->dma_ase_cntl_base = dma_h->dma_base + FPGA_DMA_ADDR_SPAN_EXT_CNTL; + dma_h->dma_ase_data_base = dma_h->dma_base + FPGA_DMA_ADDR_SPAN_EXT_DATA; + dma_found = true; + *dma_p = dma_h; + res = FPGA_OK; + } else { + *dma_p = NULL; + res = FPGA_NOT_FOUND; + goto out; + } + + // Buffer size must be page aligned for prepareBuffer + for (i = 0; i < FPGA_DMA_MAX_BUF; i++) { + res = fpgaPrepareBuffer( + dma_h->fpga_h, FPGA_DMA_BUF_SIZE, (void **)&(dma_h->dma_buf_ptr[i]), &dma_h->dma_buf_wsid[i], 0); + ON_ERR_GOTO(res, out, "fpgaPrepareBuffer"); + + // Make sure it's actually allocated + dma_h->dma_buf_ptr[i][0] = 0xff; + madvise((void *)dma_h->dma_buf_ptr[i], FPGA_DMA_BUF_SIZE, MADV_SEQUENTIAL); + + res = fpgaGetIOAddress(dma_h->fpga_h, dma_h->dma_buf_wsid[i], &dma_h->dma_buf_iova[i]); + ON_ERR_GOTO(res, 
rel_buf, "fpgaGetIOAddress"); + } + + // Allocate magic number buffer + res = fpgaPrepareBuffer(dma_h->fpga_h, FPGA_DMA_ALIGN_BYTES, (void **)&(dma_h->magic_buf), &dma_h->magic_wsid, 0); + ON_ERR_GOTO(res, out, "fpgaPrepareBuffer"); + + dma_h->magic_buf[0] = 0xff; + + res = fpgaGetIOAddress(dma_h->fpga_h, dma_h->magic_wsid, &dma_h->magic_iova); + ON_ERR_GOTO(res, rel_buf, "fpgaGetIOAddress"); + memset((void *)dma_h->magic_buf, 0, FPGA_DMA_ALIGN_BYTES); + + // turn on global interrupts + msgdma_ctrl_t ctrl = {0}; + ctrl.ct.global_intr_en_mask = 1; + res = MMIOWrite32Blk(dma_h, CSR_CONTROL(dma_h), (uint64_t)&ctrl.reg, sizeof(ctrl.reg)); + ON_ERR_GOTO(res, rel_buf, "MMIOWrite32Blk"); + + // register interrupt event handle + res = fpgaCreateEventHandle(&dma_h->eh); + ON_ERR_GOTO(res, rel_buf, "fpgaCreateEventHandle"); + + res = fpgaRegisterEvent(dma_h->fpga_h, FPGA_EVENT_INTERRUPT, dma_h->eh, interrupt_num /*vector id */); + ON_ERR_GOTO(res, destroy_eh, "fpgaRegisterEvent"); + + return FPGA_OK; + +destroy_eh: + res = fpgaDestroyEventHandle(&dma_h->eh); + ON_ERR_GOTO(res, rel_buf, "fpgaDestroyEventHandle"); + +rel_buf: + for (i = 0; i < FPGA_DMA_MAX_BUF; i++) { + res = fpgaReleaseBuffer(dma_h->fpga_h, dma_h->dma_buf_wsid[i]); + ON_ERR_GOTO(res, out, "fpgaReleaseBuffer"); + } +out: + if (!dma_found) { + free(dma_h); + } + return res; +} + +/** + * _read_memory_mmio_unaligned + * + * @brief Performs a unaligned read(address not 4/8/64 byte aligned) from FPGA address(device address). + * @param[in] dma Handle to the FPGA DMA object + * @param[in] dev_addr FPGA address + * @param[in] host_addr Host buffer address + * @param[in] count Size in bytes, always less than 8bytes. 
+ * @return fpga_result FPGA_OK on success, return code otherwise + * + */ +static fpga_result _read_memory_mmio_unaligned(fpga_dma_handle dma_h, + uint64_t dev_addr, + uint64_t host_addr, + uint64_t count) { + fpga_result res = FPGA_OK; + + assert(count < QWORD_BYTES); + + if (0 == count) return res; + + uint64_t shift = dev_addr % QWORD_BYTES; + debug_print("shift = %08lx , count = %08lx \n", shift, count); + + _switch_to_ase_page(dma_h, dev_addr); + uint64_t dev_aligned_addr = (dev_addr - shift) & DMA_ADDR_SPAN_EXT_WINDOW_MASK; + + // read data from device memory + uint64_t read_tmp = 0; + res = MMIORead64Blk(dma_h, ASE_DATA_BASE(dma_h) + dev_aligned_addr, (uint64_t)&read_tmp, sizeof(read_tmp)); + if (res != FPGA_OK) return res; + + // overlay our data + memcpy_s_fast((void *)host_addr, count, ((char *)(&read_tmp)) + shift, count); + + return res; +} + +/** + * _write_memory_mmio_unaligned + * + * @brief Performs an unaligned write(address not 4/8/64 byte aligned) to FPGA address(device address). + * @param[in] dma Handle to the FPGA DMA object + * @param[in] dev_addr FPGA address + * @param[in] host_addr Host buffer address + * @param[in] count Size in bytes, always less than 8bytes. 
+ * @return fpga_result FPGA_OK on success, return code otherwise + * + */ +static fpga_result _write_memory_mmio_unaligned(fpga_dma_handle dma_h, + uint64_t dev_addr, + uint64_t host_addr, + uint64_t count) { + fpga_result res = FPGA_OK; + + assert(count < QWORD_BYTES); + + if (0 == count) return res; + + uint64_t shift = dev_addr % QWORD_BYTES; + debug_print("shift = %08lx , count = %08lx \n", shift, count); + + _switch_to_ase_page(dma_h, dev_addr); + uint64_t dev_aligned_addr = (dev_addr - (dev_addr % QWORD_BYTES)) & DMA_ADDR_SPAN_EXT_WINDOW_MASK; + + // read data from device memory + uint64_t read_tmp = 0; + res = MMIORead64Blk(dma_h, ASE_DATA_BASE(dma_h) + dev_aligned_addr, (uint64_t)&read_tmp, sizeof(read_tmp)); + if (res != FPGA_OK) return res; + + // overlay our data + memcpy_s_fast(((char *)(&read_tmp)) + shift, count, (void *)host_addr, count); + + // write back to device + res = MMIOWrite64Blk(dma_h, ASE_DATA_BASE(dma_h) + dev_aligned_addr, (uint64_t)&read_tmp, sizeof(read_tmp)); + if (res != FPGA_OK) return res; + + return res; +} + +/** + * _write_memory_mmio + * + * @brief Writes to a DWORD/QWORD aligned memory address(FPGA address). + * @param[in] dma Handle to the FPGA DMA object + * @param[in/out] dst_ptr Pointer to the FPGA address + * @param[in/out] src_ptr Pointer to the Host buffer address + * @param[in/out] count Pointer to the Size in bytes + * @return fpga_result FPGA_OK on success, return code otherwise. 
Updates src, dst, and count + * + */ +static fpga_result _write_memory_mmio(fpga_dma_handle dma_h, uint64_t *dst_ptr, uint64_t *src_ptr, uint64_t *count) { + fpga_result res = FPGA_OK; + + if (*count < DWORD_BYTES) return res; + + assert(*count >= DWORD_BYTES); + assert(IS_ALIGNED_DWORD(*dst_ptr)); + if (!IS_ALIGNED_DWORD(*dst_ptr)) // If QWORD aligned, this will be true + return FPGA_EXCEPTION; + + uint64_t src = *src_ptr; + uint64_t dst = *dst_ptr; + uint64_t align_bytes = *count; + uint64_t offset = 0; + + if (!IS_ALIGNED_QWORD(dst)) { + // Write out a single DWORD to get QWORD aligned + _switch_to_ase_page(dma_h, dst); + offset = dst & DMA_ADDR_SPAN_EXT_WINDOW_MASK; + res = MMIOWrite32Blk(dma_h, ASE_DATA_BASE(dma_h) + offset, (uint64_t)src, DWORD_BYTES); + ON_ERR_RETURN(res, "MMIOWrite32Blk"); + src += DWORD_BYTES; + dst += DWORD_BYTES; + align_bytes -= DWORD_BYTES; + } + + if (0 == align_bytes) return res; + + assert(IS_ALIGNED_QWORD(dst)); + + // Write out blocks of 64-bit values + while (align_bytes >= QWORD_BYTES) { + uint64_t left_in_page = DMA_ADDR_SPAN_EXT_WINDOW; + left_in_page -= dst & DMA_ADDR_SPAN_EXT_WINDOW_MASK; + uint64_t size_to_copy = min(left_in_page, (align_bytes & ~(QWORD_BYTES - 1))); + if (size_to_copy < QWORD_BYTES) break; + _switch_to_ase_page(dma_h, dst); + offset = dst & DMA_ADDR_SPAN_EXT_WINDOW_MASK; + res = MMIOWrite64Blk(dma_h, ASE_DATA_BASE(dma_h) + offset, (uint64_t)src, size_to_copy); + ON_ERR_RETURN(res, "MMIOWrite64Blk"); + src += size_to_copy; + dst += size_to_copy; + align_bytes -= size_to_copy; + } + + if (align_bytes >= DWORD_BYTES) { + // Write out remaining DWORD + _switch_to_ase_page(dma_h, dst); + offset = dst & DMA_ADDR_SPAN_EXT_WINDOW_MASK; + res = MMIOWrite32Blk(dma_h, ASE_DATA_BASE(dma_h) + offset, (uint64_t)src, DWORD_BYTES); + ON_ERR_RETURN(res, "MMIOWrite32Blk"); + src += DWORD_BYTES; + dst += DWORD_BYTES; + align_bytes -= DWORD_BYTES; + } + + assert(align_bytes < DWORD_BYTES); + + *src_ptr = src; + *dst_ptr = 
dst; + *count = align_bytes; + return res; +} + +/** + * _ase_host_to_fpga + * + * @brief Tx "count" bytes from HOST to FPGA using Address span expander(ASE)- will internally make + * calls to handle unaligned and aligned MMIO writes. + * @param[in] dma Handle to the FPGA DMA object + * @param[in/out] dst_ptr Pointer to the FPGA address + * @param[in/out] src_ptr Pointer to the Host buffer address + * @param[in] count Size in bytes + * @return fpga_result FPGA_OK on success, return code otherwise. Updates src and dst + * + */ +static fpga_result _ase_host_to_fpga(fpga_dma_handle dma_h, uint64_t *dst_ptr, uint64_t *src_ptr, uint64_t count) { + fpga_result res = FPGA_OK; + uint64_t dst = *dst_ptr; + uint64_t src = *src_ptr; + uint64_t count_left = count; + uint64_t unaligned_size = 0; + + debug_print("dst_ptr = %08lx , count = %08lx, src = %08lx \n", *dst_ptr, count, *src_ptr); + + // Aligns address to 8 byte using dst masking method + if (!IS_ALIGNED_DWORD(dst) && !IS_ALIGNED_QWORD(dst)) { + unaligned_size = QWORD_BYTES - (dst % QWORD_BYTES); + if (unaligned_size > count_left) unaligned_size = count_left; + res = _write_memory_mmio_unaligned(dma_h, dst, src, unaligned_size); + if (res != FPGA_OK) return res; + count_left -= unaligned_size; + src += unaligned_size; + dst += unaligned_size; + } + // Handles 8/4 byte MMIO transfer + res = _write_memory_mmio(dma_h, &dst, &src, &count_left); + if (res != FPGA_OK) return res; + + // Left over unaligned count bytes are transfered using dst masking method + unaligned_size = QWORD_BYTES - (dst % QWORD_BYTES); + if (unaligned_size > count_left) unaligned_size = count_left; + + res = _write_memory_mmio_unaligned(dma_h, dst, src, unaligned_size); + if (res != FPGA_OK) return res; + + count_left -= unaligned_size; + + *dst_ptr = dst + unaligned_size; + *src_ptr = src + unaligned_size; + + return FPGA_OK; +} + +/** + * _read_memory_mmio + * + * @brief Reads a DWORD/QWORD aligned memory address(FPGA address). 
+ * @param[in] dma Handle to the FPGA DMA object + * @param[in/out] dst_ptr Pointer to the Host Buffer Address + * @param[in/out] src_ptr Pointer to the FPGA address + * @param[in/out] count Pointer to the size in bytes + * @return fpga_result FPGA_OK on success, return code otherwise. Updates src, dst, and count + * + */ +static fpga_result _read_memory_mmio(fpga_dma_handle dma_h, uint64_t *src_ptr, uint64_t *dst_ptr, uint64_t *count) { + fpga_result res = FPGA_OK; + + if (*count < DWORD_BYTES) return res; + + assert(*count >= DWORD_BYTES); + assert(IS_ALIGNED_DWORD(*src_ptr)); + if (!IS_ALIGNED_DWORD(*src_ptr)) // If QWORD aligned, this will be true + return FPGA_EXCEPTION; + + uint64_t src = *src_ptr; + uint64_t dst = *dst_ptr; + uint64_t align_bytes = *count; + uint64_t offset = 0; + + if (!IS_ALIGNED_QWORD(src)) { + // Read a single DWORD to get QWORD aligned + _switch_to_ase_page(dma_h, src); + offset = src & DMA_ADDR_SPAN_EXT_WINDOW_MASK; + res = MMIORead32Blk(dma_h, ASE_DATA_BASE(dma_h) + offset, (uint64_t)dst, DWORD_BYTES); + ON_ERR_RETURN(res, "MMIORead32Blk"); + src += DWORD_BYTES; + dst += DWORD_BYTES; + align_bytes -= DWORD_BYTES; + } + + if (0 == align_bytes) return res; + + assert(IS_ALIGNED_QWORD(src)); + + // Read blocks of 64-bit values + while (align_bytes >= QWORD_BYTES) { + uint64_t left_in_page = DMA_ADDR_SPAN_EXT_WINDOW; + left_in_page -= src & DMA_ADDR_SPAN_EXT_WINDOW_MASK; + uint64_t size_to_copy = min(left_in_page, (align_bytes & ~(QWORD_BYTES - 1))); + if (size_to_copy < QWORD_BYTES) break; + _switch_to_ase_page(dma_h, src); + offset = src & DMA_ADDR_SPAN_EXT_WINDOW_MASK; + res = MMIORead64Blk(dma_h, ASE_DATA_BASE(dma_h) + offset, (uint64_t)dst, size_to_copy); + ON_ERR_RETURN(res, "MMIORead64Blk"); + src += size_to_copy; + dst += size_to_copy; + align_bytes -= size_to_copy; + } + + if (align_bytes >= DWORD_BYTES) { + // Read remaining DWORD + _switch_to_ase_page(dma_h, src); + offset = src & DMA_ADDR_SPAN_EXT_WINDOW_MASK; + res = 
MMIORead32Blk(dma_h, ASE_DATA_BASE(dma_h) + offset, (uint64_t)dst, DWORD_BYTES); + ON_ERR_RETURN(res, "MMIORead32Blk"); + src += DWORD_BYTES; + dst += DWORD_BYTES; + align_bytes -= DWORD_BYTES; + } + + assert(align_bytes < DWORD_BYTES); + + *src_ptr = src; + *dst_ptr = dst; + *count = align_bytes; + return res; +} + +/** + * _ase_fpga_to_host + * + * @brief Tx "count" bytes from FPGA to HOST using Address span expander(ASE)- will internally make + * calls to handle unaligned and aligned MMIO writes. + * @param[in] dma Handle to the FPGA DMA object + * @param[in/out] dst_ptr Pointer to the Host Buffer Address + * @param[in/out] src_ptr Pointer to the FPGA address + * @param[in/out] count Size in bytes + * @return fpga_result FPGA_OK on success, return code otherwise. Updates src and dst + * + */ +static fpga_result _ase_fpga_to_host(fpga_dma_handle dma_h, uint64_t *src_ptr, uint64_t *dst_ptr, uint64_t count) { + fpga_result res = FPGA_OK; + uint64_t src = *src_ptr; + uint64_t dst = *dst_ptr; + uint64_t count_left = count; + uint64_t unaligned_size = 0; + + debug_print("dst_ptr = %08lx , count = %08lx, src = %08lx \n", *dst_ptr, count, *src_ptr); + + // Aligns address to 8 byte using src masking method + if (!IS_ALIGNED_DWORD(src) && !IS_ALIGNED_QWORD(src)) { + unaligned_size = QWORD_BYTES - (src % QWORD_BYTES); + if (unaligned_size > count_left) unaligned_size = count_left; + res = _read_memory_mmio_unaligned(dma_h, src, dst, unaligned_size); + if (res != FPGA_OK) return res; + count_left -= unaligned_size; + dst += unaligned_size; + src += unaligned_size; + } + // Handles 8/4 byte MMIO transfer + res = _read_memory_mmio(dma_h, &src, &dst, &count_left); + if (res != FPGA_OK) return res; + + // Left over unaligned count bytes are transfered using src masking method + unaligned_size = QWORD_BYTES - (src % QWORD_BYTES); + if (unaligned_size > count_left) unaligned_size = count_left; + + res = _read_memory_mmio_unaligned(dma_h, src, dst, unaligned_size); + if (res != 
FPGA_OK) return res; + + count_left -= unaligned_size; + + *dst_ptr = dst + unaligned_size; + *src_ptr = src + unaligned_size; + + return FPGA_OK; +} + +static fpga_result clear_interrupt(fpga_dma_handle dma_h) { + // clear interrupt by writing 1 to IRQ bit in status register + msgdma_status_t status = {0}; + status.st.irq = 1; + + return MMIOWrite32Blk(dma_h, CSR_STATUS(dma_h), (uint64_t)&status.reg, sizeof(status.reg)); +} + +static fpga_result poll_interrupt(fpga_dma_handle dma_h) { + struct pollfd pfd = {0}; + msgdma_status_t status = { 0 }; + fpga_result res = FPGA_OK; + int poll_res; + + res = fpgaGetOSObjectFromEventHandle(dma_h->eh, &pfd.fd); + ON_ERR_GOTO(res, out, "fpgaGetOSObjectFromEventHandle failed\n"); + + pfd.events = POLLIN; + +#ifdef CHECK_DELAYS + if (0 == poll(&pfd, 1, 0)) poll_wait_count++; +#endif + poll_res = poll(&pfd, 1, FPGA_DMA_TIMEOUT_MSEC); + MMIORead32Blk(dma_h, CSR_STATUS(dma_h), (uint64_t)& status.reg, sizeof(status.reg)); + if (poll_res < 0) { + fprintf(stderr, "Poll error errno = %s DMA status reg: 0x%x\n", strerror(errno), status.reg); + res = FPGA_EXCEPTION; + goto out; + } else if (poll_res == 0) { + fprintf(stderr, "Poll(interrupt) timeout DMA status reg: 0x%x\n", status.reg); + res = FPGA_EXCEPTION; + } else { + uint64_t count = 0; + ssize_t bytes_read = read(pfd.fd, &count, sizeof(count)); + if (bytes_read > 0) { + debug_print("Poll success. 
Return = %d, count = %d\n", poll_res, (int)count); + res = FPGA_OK; + } else { + fprintf(stderr, "Error: poll failed read: zero bytes read"); + res = FPGA_EXCEPTION; + } + } + +out: + clear_interrupt(dma_h); + return res; +} + +static fpga_result _issue_magic(fpga_dma_handle dma_h) { + fpga_result res = FPGA_OK; + *(dma_h->magic_buf) = 0x0ULL; + + res = _do_dma(dma_h, + dma_h->magic_iova | FPGA_DMA_WF_HOST_MASK, + FPGA_DMA_WF_ROM_MAGIC_NO_MASK, + 64, + 1, + FPGA_TO_HOST_MM, + FPGA2HOST_IRQ_REQ /*intr_en */); + return res; +} + +static void _wait_magic(fpga_dma_handle dma_h) { +#ifndef SKIP_FPGA2HOST_IRQ + poll_interrupt(dma_h); +#endif + while (*(dma_h->magic_buf) != FPGA_DMA_WF_MAGIC_NO) + ; + *(dma_h->magic_buf) = 0x0ULL; +} + +fpga_result transferHostToFpga( + fpga_dma_handle dma_h, uint64_t dst, uint64_t src, size_t count, fpga_dma_transfer_t type) { + fpga_result res = FPGA_OK; + uint64_t i = 0; + uint64_t count_left = count; + uint64_t aligned_addr = 0; + uint64_t align_bytes = 0; + int issued_intr = 0; + debug_print("Host To Fpga ----------- src = %08lx, dst = %08lx \n", src, dst); + if (!IS_DMA_ALIGNED(dst)) { + if (count_left < FPGA_DMA_ALIGN_BYTES) { + res = _ase_host_to_fpga(dma_h, &dst, &src, count_left); + ON_ERR_GOTO(res, out, "HOST_TO_FPGA_MM Transfer failed\n"); + return res; + } else { + aligned_addr = ((dst / FPGA_DMA_ALIGN_BYTES) + 1) * FPGA_DMA_ALIGN_BYTES; + align_bytes = aligned_addr - dst; + res = _ase_host_to_fpga(dma_h, &dst, &src, align_bytes); + ON_ERR_GOTO(res, out, "HOST_TO_FPGA_MM Transfer failed\n"); + count_left = count_left - align_bytes; + } + } + if (count_left) { + uint32_t dma_chunks = count_left / FPGA_DMA_BUF_SIZE; + count_left -= (dma_chunks * FPGA_DMA_BUF_SIZE); + debug_print( + "DMA TX : dma chuncks = %d, count_left = %08lx, dst = %08lx, src = %08lx \n", dma_chunks, count_left, dst, src); + + for (i = 0; i < dma_chunks; i++) { + // constant size transfer, no length check required for memcpy + 
memcpy_s_fast(dma_h->dma_buf_ptr[i % FPGA_DMA_MAX_BUF], + FPGA_DMA_BUF_SIZE, + (void *)(src + i * FPGA_DMA_BUF_SIZE), + FPGA_DMA_BUF_SIZE); + // The value of FPGA_DMA_MAX_BUF is 2. Thus FPGA_DMA_MAX_BUF/2 -- 1, so the comparison + // is always i % 1 == 0, which will always be true. This means that the i == (dma_chunks -1) + // portion of the conditional will never be reached. However, for clarity and in case + // FPGA_DMA_MAX_BUF changes, I will leave the conditional as is and apply a coverity supression + // coverity[deadcode:FALSE] + if ((i % (FPGA_DMA_MAX_BUF / 2) == (FPGA_DMA_MAX_BUF / 2) - 1) || i == (dma_chunks - 1) /*last descriptor */) { + if (i == (FPGA_DMA_MAX_BUF / 2) - 1) { + res = _do_dma(dma_h, + (dst + i * FPGA_DMA_BUF_SIZE), + dma_h->dma_buf_iova[i % FPGA_DMA_MAX_BUF] | FPGA_DMA_HOST_MASK, + FPGA_DMA_BUF_SIZE, + 0, + type, + true); + } else { + if (issued_intr) poll_interrupt(dma_h); + res = _do_dma(dma_h, + (dst + i * FPGA_DMA_BUF_SIZE), + dma_h->dma_buf_iova[i % FPGA_DMA_MAX_BUF] | FPGA_DMA_HOST_MASK, + FPGA_DMA_BUF_SIZE, + 0, + type, + true /*intr_en */); + } + issued_intr = 1; + } else { + res = _do_dma(dma_h, + (dst + i * FPGA_DMA_BUF_SIZE), + dma_h->dma_buf_iova[i % FPGA_DMA_MAX_BUF] | FPGA_DMA_HOST_MASK, + FPGA_DMA_BUF_SIZE, + 0, + type, + false /*intr_en */); + } + } + if (issued_intr) { + poll_interrupt(dma_h); + issued_intr = 0; + } + if (count_left) { + uint64_t dma_tx_bytes = (count_left / FPGA_DMA_ALIGN_BYTES) * FPGA_DMA_ALIGN_BYTES; + if (dma_tx_bytes != 0) { + debug_print("dma_tx_bytes = %08lx was transfered using DMA\n", dma_tx_bytes); + if (dma_tx_bytes > FPGA_DMA_BUF_SIZE) { + res = FPGA_NO_MEMORY; + ON_ERR_GOTO(res, out, "Illegal transfer size\n"); + } + + memcpy_s_fast( + dma_h->dma_buf_ptr[0], dma_tx_bytes, (void *)(src + dma_chunks * FPGA_DMA_BUF_SIZE), dma_tx_bytes); + res = _do_dma(dma_h, + (dst + dma_chunks * FPGA_DMA_BUF_SIZE), + dma_h->dma_buf_iova[0] | FPGA_DMA_HOST_MASK, + dma_tx_bytes, + 1, + type, + true /*intr_en */); 
+ ON_ERR_GOTO(res, out, "HOST_TO_FPGA_MM Transfer failed\n"); + poll_interrupt(dma_h); + } + count_left -= dma_tx_bytes; + if (count_left) { + dst = dst + dma_chunks * FPGA_DMA_BUF_SIZE + dma_tx_bytes; + src = src + dma_chunks * FPGA_DMA_BUF_SIZE + dma_tx_bytes; + res = _ase_host_to_fpga(dma_h, &dst, &src, count_left); + ON_ERR_GOTO(res, out, "HOST_TO_FPGA_MM Transfer failed\n"); + } + } + } +out: + return res; +} + +fpga_result transferFpgaToHost( + fpga_dma_handle dma_h, uint64_t dst, uint64_t src, size_t count, fpga_dma_transfer_t type) { + fpga_result res = FPGA_OK; + uint64_t i = 0; + uint64_t j = 0; + uint64_t count_left = count; + uint64_t aligned_addr = 0; + uint64_t align_bytes = 0; + int wf_issued = 0; + + debug_print("FPGA To Host ----------- src = %08lx, dst = %08lx \n", src, dst); + if (!IS_DMA_ALIGNED(src)) { + if (count_left < FPGA_DMA_ALIGN_BYTES) { + res = _ase_fpga_to_host(dma_h, &src, &dst, count_left); + ON_ERR_GOTO(res, out, "FPGA_TO_HOST_MM Transfer failed"); + return res; + } else { + aligned_addr = ((src / FPGA_DMA_ALIGN_BYTES) + 1) * FPGA_DMA_ALIGN_BYTES; + align_bytes = aligned_addr - src; + res = _ase_fpga_to_host(dma_h, &src, &dst, align_bytes); + ON_ERR_GOTO(res, out, "FPGA_TO_HOST_MM Transfer failed"); + count_left = count_left - align_bytes; + } + } + if (count_left) { + uint32_t dma_chunks = count_left / FPGA_DMA_BUF_SIZE; + count_left -= (dma_chunks * FPGA_DMA_BUF_SIZE); + debug_print( + "DMA TX : dma chunks = %d, count_left = %08lx, dst = %08lx, src = %08lx \n", dma_chunks, count_left, dst, src); + uint64_t pending_buf = 0; + for (i = 0; i < dma_chunks; i++) { + res = _do_dma(dma_h, + dma_h->dma_buf_iova[i % (FPGA_DMA_MAX_BUF)] | FPGA_DMA_HOST_MASK, + (src + i * FPGA_DMA_BUF_SIZE), + FPGA_DMA_BUF_SIZE, + 1, + type, + false /*intr_en */); + ON_ERR_GOTO(res, out, "FPGA_TO_HOST_MM Transfer failed"); + + const int num_pending = i - pending_buf + 1; + if (num_pending == (FPGA_DMA_MAX_BUF / 2)) { // Enters this loop only once,after first 
batch of descriptors. + res = _issue_magic(dma_h); + ON_ERR_GOTO(res, out, "Magic number issue failed"); + wf_issued = 1; + } + if (num_pending > (FPGA_DMA_MAX_BUF - 1) || i == (dma_chunks - 1) /*last descriptor */) { + if (wf_issued) { + _wait_magic(dma_h); + for (j = 0; j < (FPGA_DMA_MAX_BUF / 2); j++) { + // constant size transfer; no length check required + memcpy_s_fast((void *)(dst + pending_buf * FPGA_DMA_BUF_SIZE), + FPGA_DMA_BUF_SIZE, + dma_h->dma_buf_ptr[pending_buf % (FPGA_DMA_MAX_BUF)], + FPGA_DMA_BUF_SIZE); + pending_buf++; + } + wf_issued = 0; + } + res = _issue_magic(dma_h); + ON_ERR_GOTO(res, out, "Magic number issue failed"); + wf_issued = 1; + } + } + + if (wf_issued) _wait_magic(dma_h); + + // clear out final dma memcpy operations + while (pending_buf < dma_chunks) { + // constant size transfer; no length check required + memcpy_s_fast((void *)(dst + pending_buf * FPGA_DMA_BUF_SIZE), + FPGA_DMA_BUF_SIZE, + dma_h->dma_buf_ptr[pending_buf % (FPGA_DMA_MAX_BUF)], + FPGA_DMA_BUF_SIZE); + pending_buf++; + } + if (count_left > 0) { + uint64_t dma_tx_bytes = (count_left / FPGA_DMA_ALIGN_BYTES) * FPGA_DMA_ALIGN_BYTES; + if (dma_tx_bytes != 0) { + debug_print("dma_tx_bytes = %08lx was transfered using DMA\n", dma_tx_bytes); + res = _do_dma(dma_h, + dma_h->dma_buf_iova[0] | FPGA_DMA_HOST_MASK, + (src + dma_chunks * FPGA_DMA_BUF_SIZE), + dma_tx_bytes, + 1, + type, + false /*intr_en */); + ON_ERR_GOTO(res, out, "FPGA_TO_HOST_MM Transfer failed"); + res = _issue_magic(dma_h); + ON_ERR_GOTO(res, out, "Magic number issue failed"); + _wait_magic(dma_h); + if (dma_tx_bytes > FPGA_DMA_BUF_SIZE) { + res = FPGA_NO_MEMORY; + ON_ERR_GOTO(res, out, "Illegal transfer size\n"); + } + memcpy_s_fast( + (void *)(dst + dma_chunks * FPGA_DMA_BUF_SIZE), dma_tx_bytes, dma_h->dma_buf_ptr[0], dma_tx_bytes); + } + count_left -= dma_tx_bytes; + if (count_left) { + dst = dst + dma_chunks * FPGA_DMA_BUF_SIZE + dma_tx_bytes; + src = src + dma_chunks * FPGA_DMA_BUF_SIZE + dma_tx_bytes; 
+ res = _ase_fpga_to_host(dma_h, &src, &dst, count_left); + ON_ERR_GOTO(res, out, "FPGA_TO_HOST_MM Transfer failed"); + } + } + } +out: + return res; +} + +fpga_result transferFpgaToFpga( + fpga_dma_handle dma_h, uint64_t dst, uint64_t src, size_t count, fpga_dma_transfer_t type) { + fpga_result res = FPGA_OK; + uint64_t i = 0; + uint64_t count_left = count; + uint64_t *tmp_buf = NULL; + if (IS_DMA_ALIGNED(dst) && IS_DMA_ALIGNED(src) && IS_DMA_ALIGNED(count_left)) { + uint32_t dma_chunks = count_left / FPGA_DMA_BUF_SIZE; + count_left -= (dma_chunks * FPGA_DMA_BUF_SIZE); + debug_print("!!!FPGA to FPGA!!! TX :dma chunks = %d, count = %08lx, dst = %08lx, src = %08lx \n", + dma_chunks, + count_left, + dst, + src); + + for (i = 0; i < dma_chunks; i++) { + res = _do_dma(dma_h, + (dst + i * FPGA_DMA_BUF_SIZE), + (src + i * FPGA_DMA_BUF_SIZE), + FPGA_DMA_BUF_SIZE, + 0, + type, + false /*intr_en */); + ON_ERR_GOTO(res, out, "FPGA_TO_FPGA_MM Transfer failed"); + if ((i + 1) % FPGA_DMA_MAX_BUF == 0 || i == (dma_chunks - 1) /*last descriptor */) { + res = _issue_magic(dma_h); + ON_ERR_GOTO(res, out, "Magic number issue failed"); + _wait_magic(dma_h); + } + } + if (count_left > 0) { + debug_print("Count_left = %08lx was transfered using DMA\n", count_left); + res = _do_dma(dma_h, + (dst + dma_chunks * FPGA_DMA_BUF_SIZE), + (src + dma_chunks * FPGA_DMA_BUF_SIZE), + count_left, + 1, + type, + false /*intr_en */); + ON_ERR_GOTO(res, out, "FPGA_TO_FPGA_MM Transfer failed"); + res = _issue_magic(dma_h); + ON_ERR_GOTO(res, out, "Magic number issue failed"); + _wait_magic(dma_h); + } + } else { + if ((src < dst) && (src + count_left >= dst)) { + debug_print("Overlapping addresses, Provide correct dst address\n"); + return FPGA_NOT_SUPPORTED; + } + uint32_t tx_chunks = count_left / FPGA_DMA_BUF_ALIGN_SIZE; + count_left -= (tx_chunks * FPGA_DMA_BUF_ALIGN_SIZE); + debug_print("!!!FPGA to FPGA TX!!! 
: tx chunks = %d, count = %08lx, dst = %08lx, src = %08lx \n", + tx_chunks, + count_left, + dst, + src); + tmp_buf = (uint64_t *)malloc(FPGA_DMA_BUF_ALIGN_SIZE); + for (i = 0; i < tx_chunks; i++) { + res = transferFpgaToHost( + dma_h, (uint64_t)tmp_buf, (src + i * FPGA_DMA_BUF_ALIGN_SIZE), FPGA_DMA_BUF_ALIGN_SIZE, FPGA_TO_HOST_MM); + ON_ERR_GOTO(res, out_spl, "FPGA_TO_FPGA_MM Transfer failed"); + res = transferHostToFpga( + dma_h, (dst + i * FPGA_DMA_BUF_ALIGN_SIZE), (uint64_t)tmp_buf, FPGA_DMA_BUF_ALIGN_SIZE, HOST_TO_FPGA_MM); + ON_ERR_GOTO(res, out_spl, "FPGA_TO_FPGA_MM Transfer failed"); + } + if (count_left > 0) { + res = transferFpgaToHost( + dma_h, (uint64_t)tmp_buf, (src + tx_chunks * FPGA_DMA_BUF_ALIGN_SIZE), count_left, FPGA_TO_HOST_MM); + ON_ERR_GOTO(res, out_spl, "FPGA_TO_FPGA_MM Transfer failed"); + res = transferHostToFpga( + dma_h, (dst + tx_chunks * FPGA_DMA_BUF_ALIGN_SIZE), (uint64_t)tmp_buf, count_left, HOST_TO_FPGA_MM); + ON_ERR_GOTO(res, out_spl, "FPGA_TO_FPGA_MM Transfer failed"); + } + free(tmp_buf); + } +out: + return res; +out_spl: + free(tmp_buf); + return res; +} + +fpga_result fpgaDmaTransferSync( + fpga_dma_handle dma_h, uint64_t dst, uint64_t src, size_t count, fpga_dma_transfer_t type) { + fpga_result res = FPGA_OK; + + if (!dma_h) return FPGA_INVALID_PARAM; + + if (type >= FPGA_MAX_TRANSFER_TYPE) return FPGA_INVALID_PARAM; + + if (!dma_h->fpga_h) return FPGA_INVALID_PARAM; + + if (type == HOST_TO_FPGA_MM) { + res = transferHostToFpga(dma_h, dst, src, count, HOST_TO_FPGA_MM); + } else if (type == FPGA_TO_HOST_MM) { + res = transferFpgaToHost(dma_h, dst, src, count, FPGA_TO_HOST_MM); + } else if (type == FPGA_TO_FPGA_MM) { + res = transferFpgaToFpga(dma_h, dst, src, count, FPGA_TO_FPGA_MM); + } else { + // Should not be possible, since we have handled all fpga_dma_transfer_t types + assert(0); + } + + return res; +} + +fpga_result fpgaDmaTransferAsync(fpga_dma_handle dma, + uint64_t dst, + uint64_t src, + size_t count, + 
fpga_dma_transfer_t type, + fpga_dma_transfer_cb cb, + void *context) { + // TODO + return FPGA_NOT_SUPPORTED; +} + +fpga_result fpgaDmaClose(fpga_dma_handle dma_h) { + fpga_result res = FPGA_OK; + int i = 0; + if (!dma_h) { + res = FPGA_INVALID_PARAM; + goto out; + } + + if (!dma_h->fpga_h) { + res = FPGA_INVALID_PARAM; + goto out; + } + + for (i = 0; i < FPGA_DMA_MAX_BUF; i++) { + res = fpgaReleaseBuffer(dma_h->fpga_h, dma_h->dma_buf_wsid[i]); + ON_ERR_GOTO(res, out, "fpgaReleaseBuffer failed"); + } + + res = fpgaReleaseBuffer(dma_h->fpga_h, dma_h->magic_wsid); + ON_ERR_GOTO(res, out, "fpgaReleaseBuffer"); + + fpgaUnregisterEvent(dma_h->fpga_h, FPGA_EVENT_INTERRUPT, dma_h->eh); + fpgaDestroyEventHandle(&dma_h->eh); + + // turn off global interrupts + msgdma_ctrl_t ctrl = {0}; + ctrl.ct.global_intr_en_mask = 0; + res = MMIOWrite32Blk(dma_h, CSR_CONTROL(dma_h), (uint64_t)&ctrl.reg, sizeof(ctrl.reg)); + ON_ERR_GOTO(res, out, "MMIOWrite32Blk"); + +out: + free((void *)dma_h); + return res; +} diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/fpga_dma.h b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/fpga_dma.h new file mode 100644 index 0000000..e382696 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/fpga_dma.h @@ -0,0 +1,141 @@ +// Copyright 2017-2020 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. 
+ +// This is derived from OPAE + OpenCL PAC BSP + +/** + * \fpga_dma.h + * \brief FPGA DMA BBB API Header + * + * Known Limitations + * - Supports only synchronous (blocking) transfers + */ + +#ifndef __FPGA_DMA_H__ +#define __FPGA_DMA_H__ + +#include <opae/fpga.h> + +//#define DEBUG_MEM 1 +//#define FPGA_DMA_DEBUG 1 +#define SKIP_FPGA2HOST_IRQ 1 +#ifdef SKIP_FPGA2HOST_IRQ +#define FPGA2HOST_IRQ_REQ false +#else +#define FPGA2HOST_IRQ_REQ true +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * The DMA driver supports host to FPGA, FPGA to host and FPGA + * to FPGA transfers. The FPGA interface can be streaming + * or memory-mapped. Streaming interfaces are not currently + * supported. + */ +typedef enum { + HOST_TO_FPGA_MM = 0, // Memory mapped FPGA interface + FPGA_TO_HOST_MM, // Memory mapped FPGA interface + FPGA_TO_FPGA_MM, // Memory mapped FPGA interface + FPGA_MAX_TRANSFER_TYPE, +} fpga_dma_transfer_t; + +typedef struct _dma_handle_t *fpga_dma_handle; + +// Callback for asynchronous DMA transfers +typedef void (*fpga_dma_transfer_cb)(void *context); + +/** + * fpgaDmaOpen + * + * @brief Open a handle to DMA BBB. + * Scans the device feature chain looking for a DMA BBB. + * + * @param[in] fpga Handle to the FPGA AFU object obtained via fpgaOpen() + * @param[in] dma_base to DMA channel DFH + * @param[in] interrupt_num interrupt number assigned to DMA channel + * @param[out] dma DMA object handle + * @returns FPGA_OK on success, return code otherwise + */ +fpga_result fpgaDmaChannelOpen(fpga_handle fpga, uint64_t dma_base, int interrupt_num, fpga_dma_handle *dma); + +/** + * fpgaDmaTransferSync + * + * @brief Perform a blocking copy of 'count' bytes from memory area pointed + * by src to memory area pointed by dst where fpga_dma_transfer_t specifies the + * type of memory transfer. 
+ * @param[in] dma Handle to the FPGA DMA object + * @param[in] dst Address of the destination buffer + * @param[in] src Address of the source buffer + * @param[in] count Size in bytes + * @param[in] type Must be one of the following values: + * HOST_TO_FPGA_MM - Copy data from host memory to memory mapped FPGA interface. + * User must specify valid src and dst. + * FPGA_TO_HOST_MM - Copy data from memory mapped FPGA interface to host memory + * User must specify valid src and dst. + * FPGA_TO_FPGA_MM - Copy data between memory mapped FPGA interfaces + * User must specify valid src and dst. + * @return fpga_result FPGA_OK on success, return code otherwise + * + */ +fpga_result fpgaDmaTransferSync( + fpga_dma_handle dma, uint64_t dst, uint64_t src, size_t count, fpga_dma_transfer_t type); + +/** + * fpgaDmaTransferAsync (Not supported) + * + * @brief Perform a non-blocking copy of 'count' bytes from memory area pointed + * by src to memory area pointed by dst where fpga_dma_transfer_t specifies the + * type of memory transfer. + * @param[in] dma Handle to the FPGA DMA object + * @param[in] dst Address of the destination buffer + * @param[in] src Address of the source buffer + * @param[in] count Size in bytes + * @param[in] type Must be one of the following values: + * HOST_TO_FPGA_MM - Copy data from host memory to memory mapped FPGA interface. + * User must specify valid src and dst. + * FPGA_TO_HOST_MM - Copy data from memory mapped FPGA interface to host memory + * User must specify valid src and dst. + * FPGA_TO_FPGA_MM - Copy data between memory mapped FPGA interfaces + * User must specify valid src and dst. 
+ * @param[in] cb Callback to invoke when DMA transfer is complete + * @param[in] context Pointer to define user-defined context + * @return fpga_result FPGA_OK on success, return code otherwise + * + */ +fpga_result fpgaDmaTransferAsync(fpga_dma_handle dma, + uint64_t dst, + uint64_t src, + size_t count, + fpga_dma_transfer_t type, + fpga_dma_transfer_cb cb, + void *context); + +/** + * fpgaDmaClose + * + * @brief Close the DMA BBB handle. + * + * @param[in] dma DMA object handle + * @returns FPGA_OK on success, return code otherwise + */ +fpga_result fpgaDmaClose(fpga_dma_handle dma); + +#ifdef __cplusplus +} +#endif + +#endif // __FPGA_DMA_H__ diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/fpga_dma_internal.h b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/fpga_dma_internal.h new file mode 100644 index 0000000..e4c8373 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/fpga_dma_internal.h @@ -0,0 +1,289 @@ +// Copyright 2018-2020 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. + +// This is derived from OPAE + OpenCL PAC BSP + +/** + * \fpga_dma_internal.h + * \brief FPGA DMA BBB Internal Header + */ + +#ifndef __FPGA_DMA_INT_H__ +#define __FPGA_DMA_INT_H__ + +#include <opae/fpga.h> +#include "x86-sse2.h" + +#ifdef CHECK_DELAYS +#pragma message "Compiled with -DCHECK_DELAYS. 
Not to be used in production" +#endif + +#ifdef FPGA_DMA_DEBUG +#pragma message "Compiled with -DFPGA_DMA_DEBUG. Not to be used in production" +#endif + +#ifndef max +#define max(a, b) \ + ({ \ + __typeof__(a) _a = (a); \ + __typeof__(b) _b = (b); \ + _a > _b ? _a : _b; \ + }) +#endif + +#ifndef min +#define min(a, b) \ + ({ \ + __typeof__(a) _a = (a); \ + __typeof__(b) _b = (b); \ + _a < _b ? _a : _b; \ + }) +#endif + +#define FPGA_DMA_TIMEOUT_MSEC (5000) + +#define QWORD_BYTES 8 +#define DWORD_BYTES 4 +#define IS_ALIGNED_DWORD(addr) (addr % 4 == 0) +#define IS_ALIGNED_QWORD(addr) (addr % 8 == 0) + +#define FPGA_DMA_UUID_H 0xef82def7f6ec40fc +#define FPGA_DMA_UUID_L 0xa9149a35bace01ea +#define FPGA_DMA_WF_MAGIC_NO 0x5772745F53796E63ULL +#define FPGA_DMA_HOST_MASK 0x2000000000000 +#define FPGA_DMA_WF_HOST_MASK 0x3000000000000 +#define FPGA_DMA_WF_ROM_MAGIC_NO_MASK 0x1000000000000 + +#define AFU_DFH_REG 0x0 +#define AFU_DFH_NEXT_OFFSET 16 +#define AFU_DFH_EOL_OFFSET 40 +#define AFU_DFH_TYPE_OFFSET 60 + +// BBB Feature ID (refer CCI-P spec) +#define FPGA_DMA_BBB 0x2 + +// Feature ID for DMA BBB +#define FPGA_DMA_BBB_FEATURE_ID 0x765 + +// DMA Register offsets from base +#define FPGA_DMA_CSR 0x40 +#define FPGA_DMA_DESC 0x60 +#define FPGA_DMA_ADDR_SPAN_EXT_CNTL 0x200 +#define FPGA_DMA_ADDR_SPAN_EXT_DATA 0x1000 + +#define DMA_ADDR_SPAN_EXT_WINDOW (4 * 1024) +#define DMA_ADDR_SPAN_EXT_WINDOW_MASK ((uint64_t)(DMA_ADDR_SPAN_EXT_WINDOW - 1)) + +#define FPGA_DMA_MASK_32_BIT 0xFFFFFFFF + +#define FPGA_DMA_CSR_BUSY (1 << 0) +#define FPGA_DMA_DESC_BUFFER_EMPTY 0x2 +#define FPGA_DMA_DESC_BUFFER_FULL 0x4 + +#define FPGA_DMA_ALIGN_BYTES 64 +#define IS_DMA_ALIGNED(addr) (addr % FPGA_DMA_ALIGN_BYTES == 0) + +#define CSR_BASE(dma_handle) ((uint64_t)dma_handle->dma_csr_base) +#define ASE_DATA_BASE(dma_handle) ((uint64_t)dma_handle->dma_ase_data_base) +#define ASE_CNTL_BASE(dma_handle) ((uint64_t)dma_handle->dma_ase_cntl_base) +#define HOST_MMIO_32_ADDR(dma_handle, offset) \ + 
((volatile uint32_t *)((uint64_t)(dma_handle)->mmio_va + (uint64_t)(offset))) +#define HOST_MMIO_64_ADDR(dma_handle, offset) \ + ((volatile uint64_t *)((uint64_t)(dma_handle)->mmio_va + (uint64_t)(offset))) +#define HOST_MMIO_32(dma_handle, offset) (*HOST_MMIO_32_ADDR(dma_handle, offset)) +#define HOST_MMIO_64(dma_handle, offset) (*HOST_MMIO_64_ADDR(dma_handle, offset)) + +#define CSR_STATUS(dma_h) (CSR_BASE(dma_h) + offsetof(msgdma_csr_t, status)) +#define CSR_CONTROL(dma_h) (CSR_BASE(dma_h) + offsetof(msgdma_csr_t, ctrl)) + +// Granularity of DMA transfer (maximum bytes that can be packed +// in a single descriptor).This value must match configuration of +// the DMA IP. Larger transfers will be broken down into smaller +// transactions. +#define FPGA_DMA_BUF_SIZE (1024 * 1024 * 2UL) +#define FPGA_DMA_BUF_ALIGN_SIZE FPGA_DMA_BUF_SIZE + +// Convenience macros + +#ifdef FPGA_DMA_DEBUG +#define debug_print(fmt, ...) \ + do { \ + if (FPGA_DMA_DEBUG) { \ + fprintf(stderr, "%s (%d) : ", __FUNCTION__, __LINE__); \ + fprintf(stderr, fmt, ##__VA_ARGS__); \ + } \ + } while (0) +#define error_print(fmt, ...) \ + do { \ + fprintf(stderr, "%s (%d) : ", __FUNCTION__, __LINE__); \ + fprintf(stderr, fmt, ##__VA_ARGS__); \ + err_cnt++; \ + } while (0) +#else +#define debug_print(...) +#define error_print(...) 
+#endif + +#define FPGA_DMA_MAX_BUF 2 + +typedef struct __attribute__((__packed__)) { + uint64_t dfh; + uint64_t feature_uuid_lo; + uint64_t feature_uuid_hi; +} dfh_feature_t; + +typedef union { + uint64_t reg; + struct { + uint64_t feature_type : 4; + uint64_t reserved_8 : 8; + uint64_t afu_minor : 4; + uint64_t reserved_7 : 7; + uint64_t end_dfh : 1; + uint64_t next_dfh : 24; + uint64_t afu_major : 4; + uint64_t feature_id : 12; + } bits; +} dfh_reg_t; + +struct _dma_handle_t { + fpga_handle fpga_h; + uint32_t mmio_num; + uint64_t mmio_va; + uint64_t cur_ase_page; + uint64_t dma_base; + uint64_t dma_offset; + uint64_t dma_csr_base; + uint64_t dma_desc_base; + uint64_t dma_ase_cntl_base; + uint64_t dma_ase_data_base; + // Interrupt event handle + fpga_event_handle eh; + // magic number buffer + volatile uint64_t *magic_buf; + uint64_t magic_iova; + uint64_t magic_wsid; + uint64_t *dma_buf_ptr[FPGA_DMA_MAX_BUF]; + uint64_t dma_buf_wsid[FPGA_DMA_MAX_BUF]; + uint64_t dma_buf_iova[FPGA_DMA_MAX_BUF]; +}; + +typedef union { + uint32_t reg; + struct { + uint32_t tx_channel : 8; + uint32_t generate_sop : 1; + uint32_t generate_eop : 1; + uint32_t park_reads : 1; + uint32_t park_writes : 1; + uint32_t end_on_eop : 1; + uint32_t reserved_1 : 1; + uint32_t transfer_irq_en : 1; + uint32_t early_term_irq_en : 1; + uint32_t trans_error_irq_en : 8; + uint32_t early_done_en : 1; + uint32_t reserved_2 : 6; + uint32_t go : 1; + }; +} msgdma_desc_ctrl_t; + +typedef struct __attribute__((__packed__)) { + // 0x0 + uint32_t rd_address; + // 0x4 + uint32_t wr_address; + // 0x8 + uint32_t len; + // 0xC + uint16_t seq_num; + uint8_t rd_burst_count; + uint8_t wr_burst_count; + // 0x10 + uint16_t rd_stride; + uint16_t wr_stride; + // 0x14 + uint32_t rd_address_ext; + // 0x18 + uint32_t wr_address_ext; + // 0x1c + msgdma_desc_ctrl_t control; +} msgdma_ext_desc_t; + +typedef union { + uint32_t reg; + struct { + uint32_t busy : 1; + uint32_t desc_buf_empty : 1; + uint32_t desc_buf_full : 1; + 
uint32_t rsp_buf_empty : 1; + uint32_t rsp_buf_full : 1; + uint32_t stopped : 1; + uint32_t resetting : 1; + uint32_t stopped_on_errror : 1; + uint32_t stopped_on_early_term : 1; + uint32_t irq : 1; + uint32_t reserved : 22; + } st; +} msgdma_status_t; + +typedef union { + uint32_t reg; + struct { + uint32_t stop_dispatcher : 1; + uint32_t reset_dispatcher : 1; + uint32_t stop_on_error : 1; + uint32_t stopped_on_early_term : 1; + uint32_t global_intr_en_mask : 1; + uint32_t stop_descriptors : 1; + uint32_t rsvd : 22; + } ct; +} msgdma_ctrl_t; + +typedef union { + uint32_t reg; + struct { + uint32_t rd_fill_level : 16; + uint32_t wr_fill_level : 16; + } fl; +} msgdma_fill_level_t; + +typedef union { + uint32_t reg; + struct { + uint32_t rsp_fill_level : 16; + uint32_t rsvd : 16; + } rsp; +} msgdma_rsp_level_t; + +typedef union { + uint32_t reg; + struct { + uint32_t rd_seq_num : 16; + uint32_t wr_seq_num : 16; + } seq; +} msgdma_seq_num_t; + +typedef struct __attribute__((__packed__)) { + // 0x0 + msgdma_status_t status; + // 0x4 + msgdma_ctrl_t ctrl; + // 0x8 + msgdma_fill_level_t fill_level; + // 0xc + msgdma_rsp_level_t rsp; + // 0x10 + msgdma_seq_num_t seq_num; +} msgdma_csr_t; + +#endif // __FPGA_DMA_INT_H__ diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/kernel_interrupt.cpp b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/kernel_interrupt.cpp new file mode 100644 index 0000000..206b98a --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/kernel_interrupt.cpp @@ -0,0 +1,278 @@ +/* (C) 1992-2017 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. 
See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. */ + +#include <poll.h> +#include <stdlib.h> + +#include <thread> + +#include "ccip_mmd_device.h" +#include "eventfd_wrapper.h" +#include "kernel_interrupt.h" + +using namespace intel_opae_mmd; + +// if ENABLE_OPENCL_KERNEL_INTERRUPTS is set at compile time, interrupts will +// be enabled. 
+#define ENABLE_OPENCL_KERNEL_INTERRUPTS + +// if ENABLE_OPENCL_KERNEL_POLLING_THREAD is set at compile time, a thread will +// replace yield and the thread will call runtime call back + +// DLA runtime assumes interrupt service routing will run on its own (instead of runtime yielding to MMD) when hardware +// interrupts +#ifdef DLA_MMD +#define ENABLE_OPENCL_KERNEL_POLLING_THREAD +#endif + +// ccip interrupt line that is used for kernel +#define MMD_KERNEL_INTERRUPT_LINE_NUM 1 + +KernelInterrupt::KernelInterrupt(fpga_handle fpga_handle_arg, int mmd_handle) + : m_initialized(false), + m_eventfd_wrapper(NULL), + m_thread(NULL), + m_kernel_interrupt_fn(NULL), + m_kernel_interrupt_user_data(NULL), + m_fpga_handle(fpga_handle_arg), + m_mmd_handle(mmd_handle), + m_event_handle(0) { + enable_interrupts(); +} + +KernelInterrupt::~KernelInterrupt() { disable_interrupts(); } + +void KernelInterrupt::disable_interrupts() { + // kill the thread + if (m_thread) { + // send message to thread to end it + m_eventfd_wrapper->notify(1); + + // join with thread until it ends + m_thread->join(); + + delete m_thread; + m_thread = NULL; + } + + if (m_eventfd_wrapper) { + delete m_eventfd_wrapper; + m_eventfd_wrapper = NULL; + } + + if (m_event_handle) { + fpga_result res; +#ifdef ENABLE_OPENCL_KERNEL_INTERRUPTS + res = fpgaUnregisterEvent(m_fpga_handle, FPGA_EVENT_INTERRUPT, m_event_handle); + if (res != FPGA_OK) { + fprintf(stderr, "error fpgaUnregisterEvent"); + } +#endif + + res = fpgaDestroyEventHandle(&m_event_handle); + if (res != FPGA_OK) { + fprintf(stderr, "error fpgaDestroyEventHandle"); + } + } + + // disable opencl kernel interrupts +#ifndef DLA_MMD // IRQ offsets no longer exist in DLA hardware (removed from board.qsys) + set_interrupt_mask(0x00000000); +#endif + + m_initialized = false; +} + +void KernelInterrupt::enable_interrupts() { + m_eventfd_wrapper = new eventfd_wrapper(); + if (!m_eventfd_wrapper->initialized()) return; + +#ifdef 
ENABLE_OPENCL_KERNEL_POLLING_THREAD + m_thread = new std::thread(interrupt_polling_thread, std::ref(*this)); +#endif + + fpga_result res; + // Create event + res = fpgaCreateEventHandle(&m_event_handle); + if (res != FPGA_OK) { + fprintf(stderr, "error creating event handle"); + return; + } + +#ifdef ENABLE_OPENCL_KERNEL_INTERRUPTS + // Register user interrupt with event handle + res = fpgaRegisterEvent(m_fpga_handle, FPGA_EVENT_INTERRUPT, m_event_handle, MMD_KERNEL_INTERRUPT_LINE_NUM); + if (res != FPGA_OK) { + fprintf(stderr, "error registering event"); + res = fpgaDestroyEventHandle(&m_event_handle); + return; + } + + // enable opencl kernel interrupts +#ifndef DLA_MMD // IRQ offsets no longer exist in DLA hardware (removed from board.qsys) + set_interrupt_mask(0x00000001); +#endif +#endif + + m_initialized = true; +} + +#ifndef DLA_MMD // IRQ offsets no longer exist in DLA hardware (removed from board.qsys) +void KernelInterrupt::set_interrupt_mask(uint32_t intr_mask) { + fpga_result res; + res = fpgaWriteMMIO32(m_fpga_handle, 0, AOCL_IRQ_MASKING_BASE, intr_mask); + if (res != FPGA_OK) { + fprintf(stderr, "Error fpgaWriteMMIO32: %d\n", res); + return; + } +} +#endif + +void KernelInterrupt::interrupt_polling_thread(KernelInterrupt& obj) { + bool thread_is_active = true; + while (thread_is_active) { +#ifdef ENABLE_OPENCL_KERNEL_INTERRUPTS + const int timeout = -1; +#else + const int timeout = 0; + usleep(100); +#endif + thread_is_active = obj.poll_interrupt(timeout); + } +} + +bool KernelInterrupt::poll_interrupt(int poll_timeout_arg) { + fpga_result fpga_res; + + int res; + // get eventfd handles + int intr_fd; + fpga_res = fpgaGetOSObjectFromEventHandle(m_event_handle, &intr_fd); + if (fpga_res != FPGA_OK) { + fprintf(stderr, "error getting event file handle"); + return false; + } + int thread_signal_fd = m_eventfd_wrapper->get_fd(); + + struct pollfd pollfd_arr[2]; + pollfd_arr[0].fd = intr_fd; + pollfd_arr[0].events = POLLIN; + pollfd_arr[0].revents = 0; + 
pollfd_arr[1].fd = thread_signal_fd; + pollfd_arr[1].events = POLLIN; + pollfd_arr[1].revents = 0; + res = poll(pollfd_arr, 2, poll_timeout_arg); + if (res < 0) { + fprintf(stderr, "Poll error errno = %s\n", strerror(errno)); + return false; + } else if (res > 0 && pollfd_arr[0].revents == POLLIN) { + uint64_t count; + ssize_t bytes_read = read(intr_fd, &count, sizeof(count)); + if (bytes_read > 0) { + DEBUG_PRINT("Poll success. Return=%d count=%lu\n", res, count); + } else { + fprintf(stderr, "Error: poll failed: %s\n", bytes_read < 0 ? strerror(errno) : "zero bytes read"); + // TODO: remove exit call. Revist this when fixing kernel interrupts + exit(-1); + } + } else if (res > 0 && pollfd_arr[1].revents == POLLIN) { + uint64_t count; + ssize_t bytes_read = read(thread_signal_fd, &count, sizeof(count)); + if (bytes_read > 0) { + DEBUG_PRINT("Poll success. Return=%d count=%lu\n", res, count); + } else { + fprintf(stderr, "Error: poll failed: %s\n", bytes_read < 0 ? strerror(errno) : "zero bytes read"); + // TODO: remove exit call. 
Revist this when fixing kernel interrupts + exit(-1); + } + return false; + } else { + // no event fd event happened +#ifdef ENABLE_OPENCL_KERNEL_INTERRUPTS + return false; +#endif + } + +#ifdef DLA_MMD + run_kernel_interrupt_fn(); +#else // IRQ offsets no longer exist in DLA hardware (removed from board.qsys) + + // probobly not required for interrupt polling but we poll the interrupt + // csr line to make sure an interrupt was actually triggered + uint32_t irqval = 0; + fpga_res = fpgaReadMMIO32(m_fpga_handle, 0, AOCL_IRQ_POLLING_BASE, &irqval); + if (fpga_res != FPGA_OK) { + fprintf(stderr, "Error fpgaReadMMIO32: %d\n", fpga_res); + return false; + } + + DEBUG_PRINT("irqval: %u\n", irqval); + if (irqval) run_kernel_interrupt_fn(); + +#ifdef ENABLE_OPENCL_KERNEL_INTERRUPTS + // workaround for fb:530016 + // check if irq line is still high and generate another interrupt event + fpga_res = fpgaReadMMIO32(m_fpga_handle, 0, AOCL_IRQ_POLLING_BASE, &irqval); + if (fpga_res != FPGA_OK) { + fprintf(stderr, "Error fpgaReadMMIO32: %d\n", fpga_res); + return false; + } + + // signal intr event fd + if (irqval) { + DEBUG_PRINT("CRITICAL WARNING: irqval has not been cleared by aocl runtime\n"); + uint64_t count = 1; + ssize_t res = write(intr_fd, &count, sizeof(count)); + if (res < 0) { + fprintf(stderr, "eventfd : %s", strerror(errno)); + return false; + } + } +#endif +#endif + + return true; +} + +bool KernelInterrupt::yield_is_enabled() { +#ifdef ENABLE_OPENCL_KERNEL_POLLING_THREAD + return false; +#else + return true; +#endif +} + +void KernelInterrupt::yield() { +#ifdef ENABLE_OPENCL_KERNEL_POLLING_THREAD + usleep(0); +#else + poll_interrupt(0); +#endif +} + +void KernelInterrupt::run_kernel_interrupt_fn() { + if (m_kernel_interrupt_fn) { + m_kernel_interrupt_fn(m_mmd_handle, m_kernel_interrupt_user_data); + } else { + fprintf(stderr, "m_kernel_interrupt_fn is NULL. 
No interrupt handler set!\n"); + } +} + +void KernelInterrupt::set_kernel_interrupt(aocl_mmd_interrupt_handler_fn fn, void* user_data) { + m_kernel_interrupt_fn = fn; + m_kernel_interrupt_user_data = user_data; +} diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/kernel_interrupt.h b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/kernel_interrupt.h new file mode 100644 index 0000000..44e9b50 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/kernel_interrupt.h @@ -0,0 +1,75 @@ +/* (C) 1992-2017 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. 
*/ + +#ifndef _KERNEL_INTERRUPT_H +#define _KERNEL_INTERRUPT_H + +#include <opae/fpga.h> + +#include <atomic> +#include <thread> + +#include "aocl_mmd.h" + +namespace intel_opae_mmd { + +class eventfd_wrapper; + +class KernelInterrupt final { + public: + KernelInterrupt(fpga_handle fpga_handle_arg, int mmd_handle); + ~KernelInterrupt(); + + bool initialized() { return m_initialized; } + + void set_kernel_interrupt(aocl_mmd_interrupt_handler_fn fn, void* user_data); + void yield(); + static bool yield_is_enabled(); + + void enable_interrupts(); + void disable_interrupts(); + + private: +#ifndef DLA_MMD // IRQ offsets no longer exist in DLA hardware (removed from board.qsys) + void set_interrupt_mask(uint32_t intr_mask); +#endif + void run_kernel_interrupt_fn(); + bool poll_interrupt(int poll_timeout_arg); + + static void interrupt_polling_thread(KernelInterrupt& obj); + + bool m_initialized; + eventfd_wrapper* m_eventfd_wrapper; + + std::thread* m_thread; + + aocl_mmd_interrupt_handler_fn m_kernel_interrupt_fn; + void* m_kernel_interrupt_user_data; + + fpga_handle m_fpga_handle; + int m_mmd_handle; + + fpga_event_handle m_event_handle; + + // not used and not implemented + KernelInterrupt(KernelInterrupt& other); + KernelInterrupt& operator=(const KernelInterrupt& other); +}; // class KernelInterrupt + +}; // namespace intel_opae_mmd + +#endif // _KERNEL_INTERRUPT_H diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/memcpy_s_fast.c b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/memcpy_s_fast.c new file mode 100644 index 0000000..65d7f1a --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/memcpy_s_fast.c @@ -0,0 +1,133 @@ +// Copyright 2018-2020 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). 
Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. + +// This is derived from OPAE + OpenCL PAC BSP + +#pragma push_macro("_GNU_SOURCE") +#undef _GNU_SOURCE +#define _GNU_SOURCE + +#include <assert.h> +#include <safe_string/safe_string.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> + +#include "memcpy_s_fast.h" +#include "x86-sse2.h" + +#pragma pop_macro("_GNU_SOURCE") + +static void *memcpy_setup(void *dst, size_t max, const void *src, size_t n); + +memcpy_fn_t p_memcpy = memcpy_setup; // Initial value points to setup routine + +/** + * SSE2_memcpy + * + * @brief memcpy using SSE2 or REP MOVSB + * @param[in] dst Pointer to the destination memory + * @param[in] max Size in bytes of destination + * @param[in] src Pointer to the source memory + * @param[in] n Size in bytes to copy + * @return dst + * + */ +static void *SSE2_memcpy(void *dst, size_t max, const void *src, size_t n) { + assert(n <= max); + + void *ldst = dst; + void *lsrc = (void *)src; + if (IS_CL_ALIGNED(src) && IS_CL_ALIGNED(dst)) // 64-byte aligned + { + if (n >= MIN_SSE2_SIZE) // Arbitrary crossover performance point + { + debug_print("copying 0x%lx bytes with SSE2\n", (uint64_t)ALIGN_TO_CL(n)); + aligned_block_copy_sse2((int64_t * __restrict) dst, (int64_t * __restrict) src, ALIGN_TO_CL(n)); + ldst = (void *)((uint64_t)dst + ALIGN_TO_CL(n)); + lsrc = (void *)((uint64_t)src + ALIGN_TO_CL(n)); + n -= ALIGN_TO_CL(n); + } + } else { + if (n >= MIN_SSE2_SIZE) // Arbitrary crossover performance point + { + debug_print("copying 0x%lx bytes (unaligned) with SSE2\n", (uint64_t)ALIGN_TO_CL(n)); + unaligned_block_copy_sse2((int64_t * __restrict) dst, (int64_t * 
__restrict) src, ALIGN_TO_CL(n)); + ldst = (void *)((uint64_t)dst + ALIGN_TO_CL(n)); + lsrc = (void *)((uint64_t)src + ALIGN_TO_CL(n)); + n -= ALIGN_TO_CL(n); + } + } + + if (n) { + register unsigned long int dummy; + debug_print("copying 0x%lx bytes with REP MOVSB\n", n); + __asm__ __volatile__("rep movsb\n" + : "=&D"(ldst), "=&S"(lsrc), "=&c"(dummy) + : "0"(ldst), "1"(lsrc), "2"(n) + : "memory"); + } + + return dst; +} + +/** + * memcpy_wrap + * + * @brief Trampoline for memcpy + * @param[in] dst Pointer to the destination memory + * @param[in] max Size in bytes of destination + * @param[in] src Pointer to the source memory + * @param[in] n Size in bytes to copy + * @return dst + * + */ + +#ifdef ENABLE_MEMCPY_ENV_VAR_CHECK +static void *memcpy_wrap(void *dst, size_t max, const void *src, size_t n) { return memcpy(dst, src, n); } +#endif // ENABLE_MEMCPY_ENV_VAR_CHECK + +/** + * memcpy_setup + * Will be called on the first memcpy_s_fast invocation only. + * + * @brief Set up which memcpy routine will be used at runtime + * @param[in] dst Pointer to the destination memory + * @param[in] max Size in bytes of destination + * @param[in] src Pointer to the source memory + * @param[in] n Size in bytes to copy + * @return dst + * + */ + +static void *memcpy_setup(void *dst, size_t max, const void *src, size_t n) { + // Default to SSE2_memcpy + p_memcpy = SSE2_memcpy; + +// +#ifdef ENABLE_MEMCPY_ENV_VAR_CHECK + char *pmemcpy = getenv(USE_MEMCPY_ENV); + + if (pmemcpy) { + if (!strcasecmp(pmemcpy, "libc")) { + p_memcpy = memcpy_wrap; + } else if (!strcasecmp(pmemcpy, "sse2")) { + p_memcpy = SSE2_memcpy; + } else if (!strcasecmp(pmemcpy, "memcpy_s")) { + p_memcpy = (memcpy_fn_t)memcpy_s; + } + } +#endif // #ifdef ENABLE_MEMCPY_ENV_VAR_CHECK + + return p_memcpy(dst, max, src, n); +} diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/memcpy_s_fast.h b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/memcpy_s_fast.h new file mode 100644 index 
0000000..08056d3 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/memcpy_s_fast.h @@ -0,0 +1,69 @@ +/* (C) 1992-2017 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. */ + +#ifndef MEMCPY_S_FAST_H_ +#define MEMCPY_S_FAST_H_ + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// Constants needed in memcpy routines +// Arbitrary crossover point for using SSE2 over rep movsb +#define MIN_SSE2_SIZE 4096 + +// TODO: hidden environment variables to experiment with performance +// in production software are not a good idea in my opinion. Commenting out +// for now but hopefully can remove this code completely in the long term. 
+//#define USE_MEMCPY_ENV "PAC_MEMCPY" + +#define CACHE_LINE_SIZE 64 +#define ALIGN_TO_CL(x) ((uint64_t)(x) & ~(CACHE_LINE_SIZE - 1)) +#define IS_CL_ALIGNED(x) (((uint64_t)(x) & (CACHE_LINE_SIZE - 1)) == 0) + +// Convenience macros +#ifdef DEBUG_MEM +#define debug_print(fmt, ...) \ + do { \ + if (FPGA_DMA_DEBUG) { \ + fprintf(stderr, "%s (%d) : ", __FUNCTION__, __LINE__); \ + fprintf(stderr, fmt, ##__VA_ARGS__); \ + } \ + } while (0) + +#define error_print(fmt, ...) \ + do { \ + fprintf(stderr, "%s (%d) : ", __FUNCTION__, __LINE__); \ + fprintf(stderr, fmt, ##__VA_ARGS__); \ + err_cnt++; \ + } while (0) +#else +#define debug_print(...) +#define error_print(...) +#endif + +typedef void *(*memcpy_fn_t)(void *dst, size_t max, const void *src, size_t len); + +extern memcpy_fn_t p_memcpy; + +#define memcpy_s_fast(a, b, c, d) p_memcpy(a, b, c, d) + +#ifdef __cplusplus +} +#endif // __cplusplus + +#endif // MEMCPY_S_FAST_H diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/mmd_dma.cpp b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/mmd_dma.cpp new file mode 100644 index 0000000..92337a3 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/mmd_dma.cpp @@ -0,0 +1,434 @@ +/* (C) 1992-2017 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. 
See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. */ + +#include <assert.h> +#include <stdlib.h> +#include <string.h> +#include <sys/mman.h> + +#include <safe_string/safe_string.h> +#include "memcpy_s_fast.h" + +#include "ccip_mmd_device.h" +#include "mmd_dma.h" + +using namespace intel_opae_mmd; + +// disable dma and only use mmio. this is very slow. +//#define DISABLE_DMA + +// Each MSGDMA_BBB DFH is now 0x100 instead of 0x2_0000 (it needed to be 0x2_0000 previously because +// the ASE component was within the msgdma_bbb.qsys). +// Original addressing: +// board_afu_dfh: 0x0-0x3f. +// msgdma_bbb_csr: 0x2_0000-0x2_1fff. +// Original range at board.ddr_board.msgdma_bbb: 0x2_0000- 0x2_1fff. +// DFH : 0x0-0x3f. +// ASE.cntl : 0x200-0x207. +// ASE.windowed_slave : 0x1000-0x1fff. 
+// Current addressing (with ASE removed from the msgdma_bbb and now living on its own in ddr_board.qsys): +// From top-level board.qsys (base address 0x0): +// board | dfh : 0x0_0000 - 0x0_003f +// board | ddr_board.ase : 0x1_0000 - 0x1_1fff +// board | ddr_board.msgdma_bbb_0 : 0x2_0000 - 0x2_007f +// board | ddr_board.msgdma_bbb_1 : 0x2_0100 - 0x2_017f +// board | ddr_board.null_dfh : 0x2_0200 - 0x2_023f +// From ase.qsys (base address: 0x1_0000): +// board.ddr_board.ase.dfh_csr : 0x0-0x3f +// board.ddr_board.ase.ASE.cntl : 0x200-0x207 +// board.ddr_board.ase.ASE.windowed_slave : 0x1000-0x1fff +// From msgdma_bbb.qsys inst0 (base address: 0x2_0000) +// board.ddr_board.msgdma_bbb_inst_0.dfh_csr : 0x0-0x3f +// board.ddr_board.msgdma_bbb_inst_0.modular_sgdma_dispatcher.CSR : 0x40-0x5f +// board.ddr_board.msgdma_bbb_inst_0.modular_sgdma_dispatcher.Descriptor_slave : 0x60-0x7f +// From msgdma_bbb.qsys inst1 (base address: 0x2_0100) +// board.ddr_board.msgdma_bbb_inst_1.dfh_csr : 0x0-0x3f +// board.ddr_board.msgdma_bbb_inst_1.modular_sgdma_dispatcher.CSR : 0x40-0x5f +// board.ddr_board.msgdma_bbb_inst_1.modular_sgdma_dispatcher.Descriptor_slave : 0x60-0x7f + +#define MEM_WINDOW_CRTL 0x200 +#define MEM_WINDOW_MEM 0x1000 +#define MEM_WINDOW_SPAN (4 * 1024) +#define MEM_WINDOW_SPAN_MASK ((long)(MEM_WINDOW_SPAN - 1)) +#define MINIMUM_DMA_SIZE 256 +#define DMA_ALIGNMENT 256 + +#ifdef DEBUG_MEM +#define DCP_DEBUG_DMA(...) fprintf(stderr, __VA_ARGS__) +#else +#define DCP_DEBUG_DMA(...) 
+#endif + +mmd_dma::mmd_dma(fpga_handle fpga_handle_arg, + int mmd_handle, + uint64_t dfh_offset_arg, + uint64_t ase_bbb_addr_arg, + int interrupt_num_arg) + : m_initialized(false), + m_dma_op_mutex(), + m_status_handler_fn(NULL), + m_status_handler_user_data(NULL), + m_fpga_handle(fpga_handle_arg), + m_mmd_handle(mmd_handle), + dfh_offset(dfh_offset_arg), + interrupt_num(interrupt_num_arg), + dma_h(NULL), + msgdma_bbb_base_addr(0), + ase_bbb_base_addr(ase_bbb_addr_arg) { +#ifndef DISABLE_DMA + + fpga_result res; + res = fpgaDmaChannelOpen(m_fpga_handle, dfh_offset, interrupt_num, &dma_h); + if (res != FPGA_OK) { + m_dma_work_thread = NULL; + fprintf(stderr, "Error initializing DMA: %s\n", fpgaErrStr(res)); + return; + } +#endif // DISABLE_DMA + + m_dma_work_thread = new dma_work_thread(*this); + if (!m_dma_work_thread->initialized()) { + return; + } + + m_initialized = true; +} + +mmd_dma::~mmd_dma() { + // kill the thread + if (m_dma_work_thread) { + delete m_dma_work_thread; + m_dma_work_thread = NULL; + } + + if (dma_h) { + if (fpgaDmaClose(dma_h) != FPGA_OK) fprintf(stderr, "Error closing DMA\n"); + } + m_initialized = false; +} + +void mmd_dma::reinit_dma() { + if (!m_initialized) return; + + if (dma_h) { + m_initialized = false; + + fpga_result res; + res = fpgaDmaClose(dma_h); + dma_h = NULL; + if (res != FPGA_OK) { + fprintf(stderr, "Error closing DMA\n"); + return; + } + + res = fpgaDmaChannelOpen(m_fpga_handle, dfh_offset, interrupt_num, &dma_h); + if (res != FPGA_OK) { + fprintf(stderr, "Error initializing DMA: %s\n", fpgaErrStr(res)); + return; + } + + m_initialized = true; + } +} + +void mmd_dma::set_status_handler(aocl_mmd_status_handler_fn fn, void *user_data) { + m_status_handler_fn = fn; + m_status_handler_user_data = user_data; +} + +void mmd_dma::event_update_fn(aocl_mmd_op_t op, int status) { + m_status_handler_fn(m_mmd_handle, m_status_handler_user_data, op, status); +} + +fpga_result mmd_dma::do_dma(dma_work_item &item) { + // main dma 
function needs to be thread safe because dma csr operations + // are not thread safe + std::lock_guard<std::mutex> lock(m_dma_op_mutex); + + fpga_result res = FPGA_OK; + assert(item.rd_host_addr != NULL || item.wr_host_addr != NULL); + + // Tell the kernel we'll need these and they're sequential + uint64_t addr = item.rd_host_addr ? (uint64_t)item.rd_host_addr : (uint64_t)item.wr_host_addr; + addr = addr & ~((uint64_t)getpagesize() - 1); // Align to page boundary + size_t remainder = ((size_t)getpagesize() - (addr & getpagesize())) & ~(getpagesize() - 1); + madvise((void *)addr, item.size + remainder, MADV_SEQUENTIAL); + + if (item.rd_host_addr) { + res = read_memory(item.rd_host_addr, item.dev_addr, item.size); + } else { + assert(item.wr_host_addr); + res = write_memory(item.wr_host_addr, item.dev_addr, item.size); + } + + if (item.op) { + // TODO: check what 'status' value should really be. Right now just + // using 0 as was done in previous CCIP MMD. Also handle case if op is NULL + event_update_fn(item.op, 0); + } + + return res; +} + +fpga_result mmd_dma::enqueue_dma(dma_work_item &item) { + return static_cast<fpga_result>(m_dma_work_thread->enqueue_dma(item)); +} + +fpga_result mmd_dma::read_memory(aocl_mmd_op_t op, uint64_t *host_addr, size_t dev_addr, size_t size) { + assert(host_addr); + dma_work_item item; + item.op = op; + item.rd_host_addr = host_addr; + item.wr_host_addr = NULL; + item.dev_addr = dev_addr; + item.size = size; + + return enqueue_dma(item); +} + +fpga_result mmd_dma::write_memory(aocl_mmd_op_t op, const uint64_t *host_addr, size_t dev_addr, size_t size) { + assert(host_addr); + dma_work_item item; + item.op = op; + item.rd_host_addr = NULL; + item.wr_host_addr = host_addr; + item.dev_addr = dev_addr; + item.size = size; + + return enqueue_dma(item); +} + +fpga_result mmd_dma::read_memory(uint64_t *host_addr, size_t dev_addr, size_t size) { + DCP_DEBUG_DMA("DCP DEBUG: read_memory %p %lx %ld\n", host_addr, dev_addr, size); + fpga_result 
res = FPGA_OK; + + // check for alignment + if (dev_addr % DMA_ALIGNMENT != 0) { + // check for mmio alignment + uint64_t mmio_shift = dev_addr % 8; + if (mmio_shift != 0) { + size_t unaligned_size = 8 - mmio_shift; + if (unaligned_size > size) unaligned_size = size; + + read_memory_mmio_unaligned(host_addr, dev_addr, unaligned_size); + + if (size > unaligned_size) + res = read_memory( + (uint64_t *)(((char *)host_addr) + unaligned_size), dev_addr + unaligned_size, size - unaligned_size); + return res; + } + + // TODO: need to do a shift here + return read_memory_mmio(host_addr, dev_addr, size); + } + + // check size + if (size < MINIMUM_DMA_SIZE) return read_memory_mmio(host_addr, dev_addr, size); + + size_t remainder = (size % DMA_ALIGNMENT); + size_t dma_size = size - remainder; + +#ifdef DISABLE_DMA + res = read_memory_mmio(host_addr, dev_addr, dma_size); +#else + res = fpgaDmaTransferSync(dma_h, (uint64_t)host_addr /*dst*/, dev_addr /*src*/, dma_size, FPGA_TO_HOST_MM); +#endif + if (res != FPGA_OK) return res; + + if (remainder) res = read_memory_mmio(host_addr + dma_size / 8, dev_addr + dma_size, remainder); + + if (res != FPGA_OK) return res; + + DCP_DEBUG_DMA("DCP DEBUG: host_addr=%p, dev_addr=%lx, size=%ld\n", host_addr, dev_addr, size); + DCP_DEBUG_DMA("DCP DEBUG: remainder=%ld, dma_size=%ld, size=%ld\n", remainder, dma_size, size); + + DCP_DEBUG_DMA("DCP DEBUG: mmd_dma::read_memory done!\n"); + return FPGA_OK; +} + +fpga_result mmd_dma::read_memory_mmio_unaligned(void *host_addr, size_t dev_addr, size_t size) { + DCP_DEBUG_DMA("DCP DEBUG: read_memory_mmio_unaligned %p %lx %ld\n", host_addr, dev_addr, size); + fpga_result res = FPGA_OK; + + uint64_t shift = dev_addr % 8; + + assert(size + shift <= 8); + + uint64_t cur_mem_page = dev_addr & ~MEM_WINDOW_SPAN_MASK; + res = fpgaWriteMMIO64(m_fpga_handle, 0, ase_bbb_base_addr + MEM_WINDOW_CRTL, cur_mem_page); + if (res != FPGA_OK) return res; + + uint64_t dev_aligned_addr = dev_addr - shift; + + // read data 
from device memory + uint64_t read_tmp; + res = fpgaReadMMIO64( + m_fpga_handle, 0, (ase_bbb_base_addr + MEM_WINDOW_MEM) + ((dev_aligned_addr)&MEM_WINDOW_SPAN_MASK), &read_tmp); + if (res != FPGA_OK) return res; + // overlay our data + memcpy_s_fast(host_addr, size, ((char *)(&read_tmp)) + shift, size); + + return FPGA_OK; +} + +fpga_result mmd_dma::read_memory_mmio(uint64_t *host_addr, size_t dev_addr, size_t size) { + DCP_DEBUG_DMA("DCP DEBUG: read_memory_mmio %p %lx %ld\n", host_addr, dev_addr, size); + + fpga_result res = FPGA_OK; + uint64_t cur_mem_page = dev_addr & ~MEM_WINDOW_SPAN_MASK; + res = fpgaWriteMMIO64(m_fpga_handle, 0, ase_bbb_base_addr + MEM_WINDOW_CRTL, cur_mem_page); + if (res != FPGA_OK) return res; + DCP_DEBUG_DMA("DCP DEBUG: set page %08lx\n", cur_mem_page); + for (size_t i = 0; i < size / 8; i++) { + uint64_t mem_page = dev_addr & ~MEM_WINDOW_SPAN_MASK; + if (mem_page != cur_mem_page) { + cur_mem_page = mem_page; + res = fpgaWriteMMIO64(m_fpga_handle, 0, ase_bbb_base_addr + MEM_WINDOW_CRTL, cur_mem_page); + if (res != FPGA_OK) return res; + DCP_DEBUG_DMA("DCP DEBUG: set page %08lx\n", cur_mem_page); + } + DCP_DEBUG_DMA("DCP DEBUG: read data %8p %08lx %16p\n", host_addr, dev_addr, host_addr); + res = fpgaReadMMIO64( + m_fpga_handle, 0, (ase_bbb_base_addr + MEM_WINDOW_MEM) + (dev_addr & MEM_WINDOW_SPAN_MASK), host_addr); + if (res != FPGA_OK) return res; + + host_addr += 1; + dev_addr += 8; + } + + if (size % 8 != 0) { + res = read_memory_mmio_unaligned(host_addr, dev_addr, size % 8); + if (res != FPGA_OK) return res; + } + + DCP_DEBUG_DMA("DCP DEBUG: mmd_dma::read_memory_mmio done!\n"); + return FPGA_OK; +} + +fpga_result mmd_dma::write_memory(const uint64_t *host_addr, size_t dev_addr, size_t size) { + DCP_DEBUG_DMA("DCP DEBUG: write_memory %p %lx %ld\n", host_addr, dev_addr, size); + fpga_result res = FPGA_OK; + + // check for alignment + if (dev_addr % DMA_ALIGNMENT != 0) { + // check for mmio alignment + uint64_t mmio_shift = dev_addr % 8; 
+ if (mmio_shift != 0) { + size_t unaligned_size = 8 - mmio_shift; + if (unaligned_size > size) unaligned_size = size; + + DCP_DEBUG_DMA("DCP DEBUG: write_memory %ld %ld %ld\n", mmio_shift, unaligned_size, size); + write_memory_mmio_unaligned(host_addr, dev_addr, unaligned_size); + + if (size > unaligned_size) + res = write_memory( + (uint64_t *)(((char *)host_addr) + unaligned_size), dev_addr + unaligned_size, size - unaligned_size); + return res; + } + + // TODO: need to do a shift here + return write_memory_mmio(host_addr, dev_addr, size); + } + + // check size + if (size < MINIMUM_DMA_SIZE) return write_memory_mmio(host_addr, dev_addr, size); + + size_t remainder = (size % DMA_ALIGNMENT); + size_t dma_size = size - remainder; + +// TODO: make switch for MMIO +#ifdef DISABLE_DMA + res = write_memory_mmio(host_addr, dev_addr, dma_size); +#else + res = fpgaDmaTransferSync(dma_h, dev_addr /*dst*/, (uint64_t)host_addr /*src*/, dma_size, HOST_TO_FPGA_MM); +#endif + if (res != FPGA_OK) return res; + + if (remainder) res = write_memory(host_addr + dma_size / 8, dev_addr + dma_size, remainder); + + if (res != FPGA_OK) return res; + + DCP_DEBUG_DMA("DCP DEBUG: host_addr=%p, dev_addr=%lx, size=%ld\n", host_addr, dev_addr, size); + DCP_DEBUG_DMA("DCP DEBUG: remainder=%ld, dma_size=%ld, size=%ld\n", remainder, dma_size, size); + + DCP_DEBUG_DMA("DCP DEBUG: mmd_dma::write_memory done!\n"); + return FPGA_OK; +} + +fpga_result mmd_dma::write_memory_mmio_unaligned(const uint64_t *host_addr, size_t dev_addr, size_t size) { + DCP_DEBUG_DMA("DCP DEBUG: write_memory_mmio_unaligned %p %lx %ld\n", host_addr, dev_addr, size); + fpga_result res = FPGA_OK; + + uint64_t shift = dev_addr % 8; + + assert(size + shift <= 8); + + uint64_t cur_mem_page = dev_addr & ~MEM_WINDOW_SPAN_MASK; + res = fpgaWriteMMIO64(m_fpga_handle, 0, ase_bbb_base_addr + MEM_WINDOW_CRTL, cur_mem_page); + if (res != FPGA_OK) return res; + + uint64_t dev_aligned_addr = dev_addr - shift; + + // read data from device 
memory + uint64_t read_tmp; + res = fpgaReadMMIO64( + m_fpga_handle, 0, (ase_bbb_base_addr + MEM_WINDOW_MEM) + ((dev_aligned_addr)&MEM_WINDOW_SPAN_MASK), &read_tmp); + if (res != FPGA_OK) return res; + // overlay our data + memcpy_s_fast(((char *)(&read_tmp)) + shift, size, host_addr, size); + + // write back to device + res = fpgaWriteMMIO64( + m_fpga_handle, 0, (ase_bbb_base_addr + MEM_WINDOW_MEM) + (dev_aligned_addr & MEM_WINDOW_SPAN_MASK), read_tmp); + if (res != FPGA_OK) return res; + + return FPGA_OK; +} + +fpga_result mmd_dma::write_memory_mmio(const uint64_t *host_addr, size_t dev_addr, size_t size) { + DCP_DEBUG_DMA("DCP DEBUG: write_memory_mmio %p %lx %ld\n", host_addr, dev_addr, size); + + fpga_result res = FPGA_OK; + uint64_t cur_mem_page = dev_addr & ~MEM_WINDOW_SPAN_MASK; + res = fpgaWriteMMIO64(m_fpga_handle, 0, ase_bbb_base_addr + MEM_WINDOW_CRTL, cur_mem_page); + if (res != FPGA_OK) return res; + DCP_DEBUG_DMA("DCP DEBUG: set page %08lx\n", cur_mem_page); + for (size_t i = 0; i < size / 8; i++) { + uint64_t mem_page = dev_addr & ~MEM_WINDOW_SPAN_MASK; + if (mem_page != cur_mem_page) { + cur_mem_page = mem_page; + res = fpgaWriteMMIO64(m_fpga_handle, 0, ase_bbb_base_addr + MEM_WINDOW_CRTL, cur_mem_page); + if (res != FPGA_OK) return res; + DCP_DEBUG_DMA("DCP DEBUG: set page %08lx\n", cur_mem_page); + } + DCP_DEBUG_DMA("DCP DEBUG: write data %8p %08lx %016lx\n", host_addr, dev_addr, *host_addr); + res = fpgaWriteMMIO64( + m_fpga_handle, 0, (ase_bbb_base_addr + MEM_WINDOW_MEM) + (dev_addr & MEM_WINDOW_SPAN_MASK), *host_addr); + if (res != FPGA_OK) return res; + + host_addr += 1; + dev_addr += 8; + } + + if (size % 8 != 0) { + res = write_memory_mmio_unaligned(host_addr, dev_addr, size % 8); + if (res != FPGA_OK) return res; + } + + DCP_DEBUG_DMA("DCP DEBUG: aocl_mmd_write done!\n"); + return FPGA_OK; +} diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/mmd_dma.h 
b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/mmd_dma.h new file mode 100644 index 0000000..ff33aed --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/mmd_dma.h @@ -0,0 +1,97 @@ +/* (C) 1992-2017 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. 
*/ + +#ifndef _MMD_DMA_H +#define _MMD_DMA_H + +#pragma push_macro("_GNU_SOURCE") +#undef _GNU_SOURCE +#define _GNU_SOURCE +#include <sched.h> +#pragma pop_macro("_GNU_SOURCE") + +#include <opae/fpga.h> + +#include <mutex> + +#include "aocl_mmd.h" +#include "dma_work_thread.h" +#include "fpga_dma.h" + +namespace intel_opae_mmd { + +class eventfd_wrapper; + +class mmd_dma final { + public: + mmd_dma(fpga_handle fpga_handle_arg, + int mmd_handle, + uint64_t dfh_offset_arg, + uint64_t ase_bbb_addr_arg, + int interrupt_num_arg); + ~mmd_dma(); + + bool initialized() { return m_initialized; } + + fpga_result read_memory(aocl_mmd_op_t op, uint64_t *host_addr, size_t dev_addr, size_t size); + fpga_result write_memory(aocl_mmd_op_t op, const uint64_t *host_addr, size_t dev_addr, size_t size); + fpga_result do_dma(dma_work_item &item); + + void set_status_handler(aocl_mmd_status_handler_fn fn, void *user_data); + + // used after reconfigation + void reinit_dma(); + + void bind_to_node(void); + + private: + // Helper functions + fpga_result enqueue_dma(dma_work_item &item); + fpga_result read_memory(uint64_t *host_addr, size_t dev_addr, size_t size); + fpga_result write_memory(const uint64_t *host_addr, size_t dev_addr, size_t size); + fpga_result read_memory_mmio(uint64_t *host_addr, size_t dev_addr, size_t size); + fpga_result write_memory_mmio(const uint64_t *host_addr, size_t dev_addr, size_t size); + fpga_result write_memory_mmio_unaligned(const uint64_t *host_addr, size_t dev_addr, size_t size); + fpga_result read_memory_mmio_unaligned(void *host_addr, size_t dev_addr, size_t size); + + void event_update_fn(aocl_mmd_op_t op, int status); + + bool m_initialized; + + dma_work_thread *m_dma_work_thread; + std::mutex m_dma_op_mutex; + + aocl_mmd_status_handler_fn m_status_handler_fn; + void *m_status_handler_user_data; + + fpga_handle m_fpga_handle; + int m_mmd_handle; + + uint64_t dfh_offset; + int interrupt_num; + fpga_dma_handle dma_h; + uint64_t msgdma_bbb_base_addr; + 
uint64_t ase_bbb_base_addr; + + // not used and not implemented + mmd_dma(mmd_dma &other); + mmd_dma &operator=(const mmd_dma &other); +}; // class mmd_dma + +}; // namespace intel_opae_mmd + +#endif // _MMD_DMA_H diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/x86-sse2.S b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/x86-sse2.S new file mode 100644 index 0000000..e1fb5d3 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/x86-sse2.S @@ -0,0 +1,269 @@ +// From TinyMembench v0.4, with slight modifications for Windows. +/* + * Copyright © 2011 Siarhei Siamashka <siarhei.siamashka@gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#if defined(__i386__) || defined(__amd64__) + +.intel_syntax noprefix +.text + +#define PREFETCH_DISTANCE 256 + +.macro asm_function_helper function_name + .global \function_name +.func \function_name +\function_name: +#ifdef __amd64__ + #ifdef _WIN64 + .set DST, rcx + .set SRC, rdx + .set SIZE, r8 + #else + .set DST, rdi + .set SRC, rsi + .set SIZE, rdx + #endif +#else + mov eax, [esp + 4] + mov ecx, [esp + 8] + mov edx, [esp + 12] + .set DST, eax + .set SRC, ecx + .set SIZE, edx +#endif +.endm + +.macro asm_function function_name +#if defined(_WIN32) && !defined(_WIN64) + asm_function_helper _\function_name +#else + asm_function_helper \function_name +#endif +.endm + +.macro push3 a, b, c + push \a + push \b + push \c +.endm + +.macro pop3 a, b, c + pop \c + pop \b + pop \a +.endm + +/*****************************************************************************/ + +asm_function aligned_block_copy_movsb +0: +#ifdef __amd64__ + push3 rdi rsi rcx + push3 DST SRC SIZE + pop3 rdi rsi rcx + rep movsb + pop3 rdi rsi rcx +#else + push3 edi esi ecx + push3 DST SRC SIZE + pop3 edi esi ecx + rep movsb + pop3 edi esi ecx +#endif + ret +.endfunc + +asm_function aligned_block_copy_movsd +0: +#ifdef __amd64__ + push3 rdi rsi rcx + push3 DST SRC SIZE + pop3 rdi rsi rcx + sar rcx, 2 + rep movsd + pop3 rdi rsi rcx +#else + push3 edi esi ecx + push3 DST SRC SIZE + pop3 edi esi ecx + sar ecx, 2 + rep movsd + pop3 edi esi ecx +#endif + ret +.endfunc + +asm_function unaligned_block_copy_sse2 +0: + movdqu xmm0, [SRC + 0] + movdqu xmm1, [SRC + 16] + movdqu xmm2, [SRC + 32] + movdqu xmm3, [SRC + 48] + movdqu [DST + 0], xmm0 + movdqu [DST + 16], xmm1 + movdqu [DST + 32], xmm2 + movdqu [DST + 48], xmm3 + add SRC, 64 + add DST, 64 + sub SIZE, 64 + jg 0b + ret +.endfunc + +asm_function aligned_block_copy_sse2 +0: + movdqa xmm0, [SRC + 0] + movdqa xmm1, [SRC + 16] + movdqa xmm2, [SRC + 32] + movdqa xmm3, [SRC + 48] + movdqa [DST + 0], xmm0 + movdqa [DST + 16], xmm1 + movdqa [DST + 
32], xmm2 + movdqa [DST + 48], xmm3 + add SRC, 64 + add DST, 64 + sub SIZE, 64 + jg 0b + ret +.endfunc + +asm_function aligned_block_copy_nt_sse2 +0: + movdqa xmm0, [SRC + 0] + movdqa xmm1, [SRC + 16] + movdqa xmm2, [SRC + 32] + movdqa xmm3, [SRC + 48] + movntdq [DST + 0], xmm0 + movntdq [DST + 16], xmm1 + movntdq [DST + 32], xmm2 + movntdq [DST + 48], xmm3 + add SRC, 64 + add DST, 64 + sub SIZE, 64 + jg 0b + ret +.endfunc + +asm_function aligned_block_copy_pf32_sse2 +0: + prefetchnta [SRC + PREFETCH_DISTANCE] + prefetchnta [SRC + PREFETCH_DISTANCE + 32] + movdqa xmm0, [SRC + 0] + movdqa xmm1, [SRC + 16] + movdqa xmm2, [SRC + 32] + movdqa xmm3, [SRC + 48] + movdqa [DST + 0], xmm0 + movdqa [DST + 16], xmm1 + movdqa [DST + 32], xmm2 + movdqa [DST + 48], xmm3 + add SRC, 64 + add DST, 64 + sub SIZE, 64 + jg 0b + ret +.endfunc + +asm_function aligned_block_copy_nt_pf32_sse2 +0: + prefetchnta [SRC + PREFETCH_DISTANCE] + prefetchnta [SRC + PREFETCH_DISTANCE + 32] + movdqa xmm0, [SRC + 0] + movdqa xmm1, [SRC + 16] + movdqa xmm2, [SRC + 32] + movdqa xmm3, [SRC + 48] + movntdq [DST + 0], xmm0 + movntdq [DST + 16], xmm1 + movntdq [DST + 32], xmm2 + movntdq [DST + 48], xmm3 + add SRC, 64 + add DST, 64 + sub SIZE, 64 + jg 0b + ret +.endfunc + +asm_function aligned_block_copy_pf64_sse2 +0: + prefetchnta [SRC + PREFETCH_DISTANCE] + movdqa xmm0, [SRC + 0] + movdqa xmm1, [SRC + 16] + movdqa xmm2, [SRC + 32] + movdqa xmm3, [SRC + 48] + movdqa [DST + 0], xmm0 + movdqa [DST + 16], xmm1 + movdqa [DST + 32], xmm2 + movdqa [DST + 48], xmm3 + add SRC, 64 + add DST, 64 + sub SIZE, 64 + jg 0b + ret +.endfunc + +asm_function aligned_block_copy_nt_pf64_sse2 +0: + prefetchnta [SRC + PREFETCH_DISTANCE] + movdqa xmm0, [SRC + 0] + movdqa xmm1, [SRC + 16] + movdqa xmm2, [SRC + 32] + movdqa xmm3, [SRC + 48] + movntdq [DST + 0], xmm0 + movntdq [DST + 16], xmm1 + movntdq [DST + 32], xmm2 + movntdq [DST + 48], xmm3 + add SRC, 64 + add DST, 64 + sub SIZE, 64 + jg 0b + ret +.endfunc + +asm_function 
aligned_block_fill_sse2 + movdqa xmm0, [SRC + 0] +0: + movdqa [DST + 0], xmm0 + movdqa [DST + 16], xmm0 + movdqa [DST + 32], xmm0 + movdqa [DST + 48], xmm0 + add DST, 64 + sub SIZE, 64 + jg 0b + ret +.endfunc + +asm_function aligned_block_fill_nt_sse2 + movdqa xmm0, [SRC + 0] +0: + movntdq [DST + 0], xmm0 + movntdq [DST + 16], xmm0 + movntdq [DST + 32], xmm0 + movntdq [DST + 48], xmm0 + add DST, 64 + sub SIZE, 64 + jg 0b + ret +.endfunc + +/*****************************************************************************/ + +#endif diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/x86-sse2.h b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/x86-sse2.h new file mode 100644 index 0000000..6ebe2ef --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/x86-sse2.h @@ -0,0 +1,54 @@ +// From TinyMembench v0.4, with slight modifications for Windows. +/* + * Copyright © 2011 Siarhei Siamashka <siarhei.siamashka@gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#ifndef __X86_SSE2_H__ +#define __X86_SSE2_H__ + +#include <stdint.h> + +#ifdef __cplusplus +extern "C" { +#endif + +void aligned_block_copy_movsb(int64_t* __restrict dst, int64_t* __restrict src, int size); +void aligned_block_copy_movsd(int64_t* __restrict dst, int64_t* __restrict src, int size); + +void aligned_block_copy_sse2(int64_t* __restrict dst, int64_t* __restrict src, int size); +void unaligned_block_copy_sse2(int64_t* __restrict dst, int64_t* __restrict src, int size); +void aligned_block_copy_nt_sse2(int64_t* __restrict dst, int64_t* __restrict src, int size); + +void aligned_block_copy_pf32_sse2(int64_t* __restrict dst, int64_t* __restrict src, int size); +void aligned_block_copy_pf64_sse2(int64_t* __restrict dst, int64_t* __restrict src, int size); + +void aligned_block_copy_nt_pf32_sse2(int64_t* __restrict dst, int64_t* __restrict src, int size); +void aligned_block_copy_nt_pf64_sse2(int64_t* __restrict dst, int64_t* __restrict src, int size); + +void aligned_block_fill_sse2(int64_t* __restrict dst, int64_t* __restrict src, int size); + +void aligned_block_fill_nt_sse2(int64_t* __restrict dst, int64_t* __restrict src, int size); + +#ifdef __cplusplus +} +#endif +#endif diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/include/aocl_mmd.h b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/include/aocl_mmd.h new file mode 100644 index 0000000..edb46c7 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/include/aocl_mmd.h @@ -0,0 +1,489 @@ +#ifndef AOCL_MMD_H +#define AOCL_MMD_H + +/* (C) 1992-2019 Intel Corporation. 
*/ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* Support for memory mapped ACL devices. + * + * Typical API lifecycle, from the perspective of the caller. + * + * 1. aocl_mmd_open must be called first, to provide a handle for further + * operations. + * + * 2. The interrupt and status handlers must be set. + * + * 3. Read and write operations are performed. + * + * 4. aocl_mmd_close may be called to shut down the device. No further + * operations are permitted until a subsequent aocl_mmd_open call. + * + * aocl_mmd_get_offline_info can be called anytime including before + * open. aocl_mmd_get_info can be called anytime between open and close. 
+ */ + +#ifndef AOCL_MMD_CALL +#if defined(_WIN32) +#define AOCL_MMD_CALL __declspec(dllimport) +#else +#define AOCL_MMD_CALL __attribute__((visibility ("default"))) +#endif +#endif + +#ifndef WEAK +#if defined(_WIN32) +#define WEAK +#else +/* This normally comes with "__attribute__((weak))" but for reasons not presently + * understood, the shared library is not properly loaded on Ubuntu18 when the functions + * are weak. + */ +#define WEAK +#endif +#endif + +#include <cstddef> //size_t + +/* The MMD API's version - the runtime expects this string when + * AOCL_MMD_VERSION is queried. This changes only if the API has changed */ +#define AOCL_MMD_VERSION_STRING "18.1" + +/* Memory types that can be supported - bitfield. Other than physical memory + * these types closely align with the OpenCL SVM types. + * + * AOCL_MMD_PHYSICAL_MEMORY - The vendor interface includes IP to communicate + * directly with physical memory such as DDR, QDR, etc. + * + * AOCL_MMD_SVM_COARSE_GRAIN_BUFFER - The vendor interface includes support for + * caching SVM pointer data andy requires explicit function calls from the user + * to sychronize the cache between the host processor and the FPGA. This level + * of SVM is not currently supported by Altera except as a subset of + * SVM_FINE_GAIN_SYSTEM support. + * + * AOCL_MMD_SVM_FINE_GRAIN_BUFFER - The vendor interface includes support for + * caching SVM pointer data and requires additional information from the user + * and/or host runtime that can be collected during pointer allocation in order + * to sychronize the cache between the host processor and the FPGA. Once this + * additional data is provided for an SVM pointer, the vendor interface handles + * cache synchronization between the host processor & the FPGA automatically. + * This level of SVM is not currently supported by Altera except as a subset + * of SVM_FINE_GRAIN_SYSTEM support. 
+ * + * AOCL_MMD_SVM_FINE_GRAIN_SYSTEM - The vendor interface includes support for + * caching SVM pointer data and does not require any additional information to + * sychronize the cache between the host processor and the FPGA. The vendor + * interface handles cache synchronization between the host processor & the + * FPGA automatically for all SVM pointers. This level of SVM support is + * currently under development by Altera and some features may not be fully + * supported. + */ +#define AOCL_MMD_PHYSICAL_MEMORY (1 << 0) +#define AOCL_MMD_SVM_COARSE_GRAIN_BUFFER (1 << 1) +#define AOCL_MMD_SVM_FINE_GRAIN_BUFFER (1 << 2) +#define AOCL_MMD_SVM_FINE_GRAIN_SYSTEM (1 << 3) + +/* program modes - bitfield + * + * AOCL_MMD_PROGRAM_PRESERVE_GLOBAL_MEM - preserve contents of global memory + * when this bit is is set to 1. If programming can't occur without preserving + * global memory contents, the program function must fail, in which case the + * runtime may re-invoke program with this bit set to 0, allowing programming + * to occur even if doing so destroys global memory contents. + * + * more modes are reserved for stacking on in the future + */ +#define AOCL_MMD_PROGRAM_PRESERVE_GLOBAL_MEM (1 << 0) +typedef int aocl_mmd_program_mode_t; + +typedef void* aocl_mmd_op_t; + +typedef struct { + unsigned lo; /* 32 least significant bits of time value. */ + unsigned hi; /* 32 most significant bits of time value. */ +} aocl_mmd_timestamp_t; + +/* Defines the set of characteristics that can be probed about the board before + * opening a device. The type of data returned by each is specified in + * parentheses in the adjacent comment. + * + * AOCL_MMD_NUM_BOARDS and AOCL_MMD_BOARD_NAMES + * These two fields can be used to implement multi-device support. The MMD + * layer may have a list of devices it is capable of interacting with, each + * identified with a unique name. 
The length of the list should be returned + * in AOCL_MMD_NUM_BOARDS, and the names of these devices returned in + * AOCL_MMD_BOARD_NAMES. The OpenCL runtime will try to call aocl_mmd_open + * for each board name returned in AOCL_MMD_BOARD_NAMES. + * + * */ +typedef enum { + AOCL_MMD_VERSION = 0, /* Version of MMD (char*)*/ + AOCL_MMD_NUM_BOARDS = 1, /* Number of candidate boards (int)*/ + AOCL_MMD_BOARD_NAMES = 2, /* Names of boards available delimiter=; (char*)*/ + AOCL_MMD_VENDOR_NAME = 3, /* Name of vendor (char*) */ + AOCL_MMD_VENDOR_ID = 4, /* An integer ID for the vendor (int) */ + AOCL_MMD_USES_YIELD = 5, /* 1 if yield must be called to poll hw (int) */ + /* The following can be combined in a bit field: + * AOCL_MMD_PHYSICAL_MEMORY, AOCL_MMD_SVM_COARSE_GRAIN_BUFFER, AOCL_MMD_SVM_FINE_GRAIN_BUFFER, + * AOCL_MMD_SVM_FINE_GRAIN_SYSTEM. Prior to 14.1, all existing devices supported physical memory and no types of SVM + * memory, so this is the default when this operation returns '0' for board MMDs with a version prior to 14.1 + */ + AOCL_MMD_MEM_TYPES_SUPPORTED = 6, +} aocl_mmd_offline_info_t; + +/* Defines the set of characteristics that can be probed about the board after + * opening a device. This can involve communication to the device + * + * AOCL_MMD_NUM_KERNEL_INTERFACES - The number of kernel interfaces, usually 1 + * + * AOCL_MMD_KERNEL_INTERFACES - the handle for each kernel interface. + * param_value will have size AOCL_MMD_NUM_KERNEL_INTERFACES * sizeof int + * + * AOCL_MMD_PLL_INTERFACES - the handle for each pll associated with each + * kernel interface. 
If a kernel interface is not clocked by acl_kernel_clk + * then return -1 + * + * */ +typedef enum { + AOCL_MMD_NUM_KERNEL_INTERFACES = 1, /* Number of Kernel interfaces (int) */ + AOCL_MMD_KERNEL_INTERFACES = 2, /* Kernel interface (int*) */ + AOCL_MMD_PLL_INTERFACES = 3, /* Kernel clk handles (int*) */ + AOCL_MMD_MEMORY_INTERFACE = 4, /* Global memory handle (int) */ + AOCL_MMD_TEMPERATURE = 5, /* Temperature measurement (float) */ + AOCL_MMD_PCIE_INFO = 6, /* PCIe information (char*) */ + AOCL_MMD_BOARD_NAME = 7, /* Name of board (char*) */ + AOCL_MMD_BOARD_UNIQUE_ID = 8, /* Unique ID of board (int) */ + AOCL_MMD_CONCURRENT_READS = 9, /* # of parallel reads; 1 is serial*/ + AOCL_MMD_CONCURRENT_WRITES = 10, /* # of parallel writes; 1 is serial*/ + AOCL_MMD_CONCURRENT_READS_OR_WRITES = 11 /* total # of concurent operations read + writes*/ +} aocl_mmd_info_t; + +typedef struct { + unsigned long long int exception_type; + void* user_private_info; + size_t user_cb; +} aocl_mmd_interrupt_info; + +typedef void (*aocl_mmd_interrupt_handler_fn)(int handle, void* user_data); +typedef void (*aocl_mmd_device_interrupt_handler_fn)(int handle, aocl_mmd_interrupt_info* data_in, void* user_data); +typedef void (*aocl_mmd_status_handler_fn)(int handle, void* user_data, aocl_mmd_op_t op, int status); + +/* Get information about the board using the enum aocl_mmd_offline_info_t for + * offline info (called without a handle), and the enum aocl_mmd_info_t for + * info specific to a certain board. + * Arguments: + * + * requested_info_id - a value from the aocl_mmd_offline_info_t enum + * + * param_value_size - size of the param_value field in bytes. This should + * match the size of the return type expected as indicated in the enum + * definition. For example, the AOCL_MMD_TEMPERATURE returns a float, so + * the param_value_size should be set to sizeof(float) and you should + * expect the same number of bytes returned in param_size_ret. 
+ * + * param_value - pointer to the variable that will receive the returned info + * + * param_size_ret - receives the number of bytes of data actually returned + * + * Returns: a negative value to indicate error. + */ +AOCL_MMD_CALL int aocl_mmd_get_offline_info(aocl_mmd_offline_info_t requested_info_id, + size_t param_value_size, + void* param_value, + size_t* param_size_ret) WEAK; + +AOCL_MMD_CALL int aocl_mmd_get_info(int handle, + aocl_mmd_info_t requested_info_id, + size_t param_value_size, + void* param_value, + size_t* param_size_ret) WEAK; + +/* Open and initialize the named device. + * + * The name is typically one specified by the AOCL_MMD_BOARD_NAMES offline + * info. + * + * Arguments: + * name - open the board with this name (provided as a C-style string, + * i.e. NUL terminated ASCII.) + * + * Returns: the non-negative integer handle for the board, otherwise a + * negative value to indicate error. Upon receiving the error, the OpenCL + * runtime will proceed to open other known devices, hence the MMD mustn't + * exit the application if an open call fails. + */ +AOCL_MMD_CALL int aocl_mmd_open(const char* name) WEAK; + +/* Close an opened device, by its handle. + * Returns: 0 on success, negative values on error. + */ +AOCL_MMD_CALL int aocl_mmd_close(int handle) WEAK; + +/* Set the interrupt handler for the opened device. + * The interrupt handler is called whenever the client needs to be notified + * of an asynchronous event signalled by the device internals. + * For example, the kernel has completed or is stalled. + * + * Important: Interrupts from the kernel must be ignored until this handler is + * set + * + * Arguments: + * fn - the callback function to invoke when a kernel interrupt occurs + * user_data - the data that should be passed to fn when it is called. 
+ * + * Returns: 0 if successful, negative on error + */ +AOCL_MMD_CALL int aocl_mmd_set_interrupt_handler(int handle, aocl_mmd_interrupt_handler_fn fn, void* user_data) WEAK; + +/* Set the device interrupt handler for the opened device. + * The device interrupt handler is called whenever the client needs to be notified + * of a device event signalled by the device internals. + * For example, an ECC error has been reported. + * + * Important: Interrupts from the device must be ignored until this handler is + * set + * + * Arguments: + * fn - the callback function to invoke when a device interrupt occurs + * user_data - the data that should be passed to fn when it is called. + * + * Returns: 0 if successful, negative on error + */ +AOCL_MMD_CALL int aocl_mmd_set_device_interrupt_handler(int handle, + aocl_mmd_device_interrupt_handler_fn fn, + void* user_data) WEAK; + +/* Set the operation status handler for the opened device. + * The operation status handler is called with + * status 0 when the operation has completed successfully. + * status negative when the operation completed with errors. + * + * Arguments: + * fn - the callback function to invoke when a status update is to be + * performed. + * user_data - the data that should be passed to fn when it is called. + * + * Returns: 0 if successful, negative on error + */ +AOCL_MMD_CALL int aocl_mmd_set_status_handler(int handle, aocl_mmd_status_handler_fn fn, void* user_data) WEAK; + +/* If AOCL_MMD_USES_YIELD is 1, this function is called when the host is idle + * and hence possibly waiting for events to be processed by the device. + * If AOCL_MMD_USES_YIELD is 0, this function is never called and the MMD is + * assumed to provide status/event updates via some other execution thread + * such as through an interrupt handler. 
+ * + * Returns: non-zero if the yield function performed useful work such as + * processing DMA transactions, 0 if there is no useful work to be performed + * + * NOTE: yield may be called continuously as long as it reports that it has useful work + */ +AOCL_MMD_CALL int aocl_mmd_yield(int handle) WEAK; + +/* Read, write and copy operations on a single interface. + * If op is NULL + * - Then these calls must block until the operation is complete. + * - The status handler is not called for this operation. + * + * If op is non-NULL, then: + * - These may be non-blocking calls + * - The status handler must be called upon completion, with status 0 + * for success, and a negative value for failure. + * + * Arguments: + * op - the operation object used to track this operations progress + * + * len - the size in bytes to transfer + * + * src - the host buffer being read from + * + * dst - the host buffer being written to + * + * mmd_interface - the handle to the interface being accessed. E.g. To + * access global memory this handle will be whatever is returned by + * aocl_mmd_get_info when called with AOCL_MMD_MEMORY_INTERFACE. + * + * offset/src_offset/dst_offset - the byte offset within the interface that + * the transfer will begin at. + * + * The return value is 0 if the operation launch was successful, and + * negative otherwise. + */ +AOCL_MMD_CALL int aocl_mmd_read( + int handle, aocl_mmd_op_t op, size_t len, void* dst, int mmd_interface, size_t offset) WEAK; +AOCL_MMD_CALL int aocl_mmd_write( + int handle, aocl_mmd_op_t op, size_t len, const void* src, int mmd_interface, size_t offset) WEAK; +AOCL_MMD_CALL int aocl_mmd_copy( + int handle, aocl_mmd_op_t op, size_t len, int mmd_interface, size_t src_offset, size_t dst_offset) WEAK; + +/* Host Channel create operation + * Opens channel between host and kernel. + * + * Arguments: + * channel_name - name of channel to initialize. 
Same name as used in board_spec.xml + * + * queue_depth - the size in bytes of pinned memory queue in system memory + * + * direction - the direction of the channel + * + * The return value is negative if initialization was unsuccessful, and + * positive otherwise. Positive return value is handle to the channel to be used for + * subsequent calls for the channel. + */ +AOCL_MMD_CALL int aocl_mmd_hostchannel_create(int handle, char* channel_name, size_t queue_depth, int direction) WEAK; + +/* Host Channel destroy operation + * Closes channel between host and kernel. + * + * Arguments: + * channel - the handle to the channel to close, that was obtained with + * create channel + * + * The return value is 0 if the destroy was successful, and negative + * otherwise. + */ +AOCL_MMD_CALL int aocl_mmd_hostchannel_destroy(int handle, int channel) WEAK; + +/* Host Channel get buffer operation + * Provide host with pointer to buffer they can access to to write or + * read from kernel, along with space or data available in the buffer + * in bytes. + * + * Arguments: + * channel - the handle to the channel to get the buffer for + * + * buffer_size - the address that this call will write the amount of + * space or data that's available in the buffer, + * depending on direction of the channel, in bytes + * + * status - the address that this call will write to for result of this + * call. Value will be 0 for success, and negative otherwise + * + * The return value is the pointer to the buffer that host can write + * to or read from. NULL if the status is negative. + */ +AOCL_MMD_CALL void* aocl_mmd_hostchannel_get_buffer(int handle, int channel, size_t* buffer_size, int* status) WEAK; + +/* Host Channel acknowledge buffer operation + * Acknowledge to the channel that the user has written or read data from + * it. This will make the data or additional buffer space available to + * write to or read from kernel. 
+ * + * Arguments: + * channel - the handle to the channel that user is acknowledging + * + * send_size - the size in bytes that the user is acknowledging + * + * status - the address that this call will write to for result of this + * call. Value will be 0 for success, and negative otherwise + * + * The return value is equal to send_size if send_size was less than or + * equal to the buffer_size from get buffer call. If send_size was + * greater, then return value is the amount that was actually sent. + */ +AOCL_MMD_CALL size_t aocl_mmd_hostchannel_ack_buffer(int handle, int channel, size_t send_size, int* status) WEAK; + +/* Program the device + * + * The host will guarantee that no operations are currently executing on the + * device. That means the kernels will be idle and no read/write/copy + * commands are active. Interrupts should be disabled and the FPGA should + * be reprogrammed with the data from user_data which has size size. The host + * will then call aocl_mmd_set_status_handler and aocl_mmd_set_interrupt_handler + * again. At this point interrupts can be enabled. + * + * The new handle to the board after reprogram does not have to be the same as + * the one before. + * + * Arguments: + * user_data - The binary contents of the fpga.bin file created during + * Quartus II compilation. + * size - the size in bytes of user_data + * program_mode - bit field for programming attributes. See + * aocl_mmd_program_mode_t definition + * + * Returns: the new non-negative integer handle for the board, otherwise a + * negative value to indicate error. + */ +AOCL_MMD_CALL int aocl_mmd_program(int handle, void* user_data, size_t size, aocl_mmd_program_mode_t program_mode) WEAK; + +/* Shared memory allocator + * Allocates memory that is shared between the host and the FPGA. The + * host will access this memory using the pointer returned by + * aocl_mmd_shared_mem_alloc, while the FPGA will access the shared memory + * using device_ptr_out. 
If shared memory is not supported this should return + * NULL. + * + * Shared memory survives FPGA reprogramming if the CPU is not rebooted. + * + * Arguments: + * size - the size of the shared memory to allocate + * device_ptr_out - will receive the pointer value used by the FPGA (the device) + * to access the shared memory. Cannot be NULL. The type is + * unsigned long long to handle the case where the host has a + * smaller pointer size than the device. + * + * Returns: The pointer value to be used by the host to access the shared + * memory if successful, otherwise NULL. + */ +AOCL_MMD_CALL void* aocl_mmd_shared_mem_alloc(int handle, size_t size, unsigned long long* device_ptr_out) WEAK; + +/* Shared memory de-allocator + * Frees previously allocated shared memory. If shared memory is not supported, + * this function should do nothing. + * + * Arguments: + * host_ptr - the host pointer that points to the shared memory, as returned by + * aocl_mmd_shared_mem_alloc + * size - the size of the shared memory to free. Must match the size + * originally passed to aocl_mmd_shared_mem_alloc + */ +AOCL_MMD_CALL void aocl_mmd_shared_mem_free(int handle, void* host_ptr, size_t size) WEAK; + +/* DEPRECATED. Use aocl_mmd_program instead + * This reprogram API is only for mmd versions prior to 18.1 + */ +AOCL_MMD_CALL int aocl_mmd_reprogram(int handle, void* user_data, size_t size) WEAK; + +// CoreDLA modifications +// To support multiple different FPGA boards, anything board specific must be implemented in a +// board-specific MMD instead of the CoreDLA runtime layer. 
+#ifdef DLA_MMD +#include <cstdint> +// Query functions to get board-specific values +AOCL_MMD_CALL int dla_mmd_get_max_num_instances() WEAK; +AOCL_MMD_CALL uint64_t dla_mmd_get_ddr_size_per_instance() WEAK; +AOCL_MMD_CALL double dla_mmd_get_ddr_clock_freq() WEAK; + +// Wrappers around CSR and DDR reads and writes to abstract away board-specific offsets +AOCL_MMD_CALL int dla_mmd_csr_write(int handle, int instance, uint64_t addr, const uint32_t* data) WEAK; +AOCL_MMD_CALL int dla_mmd_csr_read(int handle, int instance, uint64_t addr, uint32_t* data) WEAK; +AOCL_MMD_CALL int dla_mmd_ddr_write(int handle, int instance, uint64_t addr, uint64_t length, const void* data) WEAK; +AOCL_MMD_CALL int dla_mmd_ddr_read(int handle, int instance, uint64_t addr, uint64_t length, void* data) WEAK; + +// Get the clk_dla PLL clock frequency in MHz, returns a negative value if there is an error +AOCL_MMD_CALL double dla_mmd_get_coredla_clock_freq(int handle) WEAK; +#endif + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/.gitignore b/python/openvino/runtime/coredla_device/mmd/de10_agilex/.gitignore new file mode 100644 index 0000000..66e06bf --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/.gitignore @@ -0,0 +1,18 @@ +*~ +*# +*.marks +release_build/ +build/ +example_designs/mem_bandwidth/bin/ +example_designs/mem_bandwidth/simulation.tar.gz +example_designs/mem_bandwidth/temp_simulation/ +linux64/lib/ +linux64/libexec/diagnose +linux64/libexec/program +ase/mpf_src +*.pyc +*.swp +*.kwlp +*.kwps +temp_simulation/ +simulation.tar.gz diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/.sync_master b/python/openvino/runtime/coredla_device/mmd/de10_agilex/.sync_master new file mode 100644 index 0000000..835c7e0 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/.sync_master @@ -0,0 +1 @@ +sc diff --git 
a/python/openvino/runtime/coredla_device/mmd/de10_agilex/CMakeLists.txt b/python/openvino/runtime/coredla_device/mmd/de10_agilex/CMakeLists.txt new file mode 100644 index 0000000..e7e4584 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/CMakeLists.txt @@ -0,0 +1,144 @@ +# (C) 2017 Intel Corporation. All rights reserved. +# Your use of Intel Corporation's design tools, logic functions and other +# software and tools, and its AMPP partner logic functions, and any output +# files any of the foregoing (including device programming or simulation +# files), and any associated documentation or information are expressly subject +# to the terms and conditions of the Intel Program License Subscription +# Agreement, Intel MegaCore Function License Agreement, or other applicable +# license agreement, including, without limitation, that your use is for the +# sole purpose of programming logic devices manufactured by Intel and sold by +# Intel or its authorized distributors. Please refer to the applicable +# agreement for further details. 
+ +cmake_minimum_required(VERSION 2.8.12) +project(mmd) + +set(CMAKE_POSITION_INDEPENDENT_CODE ON) + +# DLA specific modifications made to the MMD +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDLA_MMD") + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DACL_MAX_DEVICE=128") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DOPTION3=1 -DACL_USE_DMA=1") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DACL_HAS_STDLIB_STDIO") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DACL_OPENCL_HOST_BIT=64 -DACL_TARGET_BIT=64") + +# Select PCIE Gen3 x16 +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGEN3_x16") + +if (WIN32) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /DAOCL_MMD_CALL=__declspec(dllexport)") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DACL_COMPILER_IS_MSVC=1 -DACL_HOST_RUNTIME_IS_STATIC=1") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DACL_OPENCL_HOST_SYS=windows -DACL_TARGET_SYS=windows -DWINDOWS") +endif() + +# from the opencl makefile +if (NOT WIN32) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DKERNEL_64BIT -O3 -DACL_COMPILER_IS_MSVC=0 -DACL_HOST_RUNTIME_IS_STATIC=0") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wno-unknown-pragmas -fstack-protector -Wformat -Wformat-security -D_GLIBCXX_USE_CXX11_ABI=0 -O2") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DACL_OPENCL_HOST_SYS=linux -DACL_TARGET_SYS=linux -DLINUX") + # Release build only + set (CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -D_FORTIFY_SOURCE=2") +endif() + +enable_language(C ASM) + +set(ASM_OPTIONS "-x assembler-with-cpp") +if(${CMAKE_C_COMPILER_ID} STREQUAL "Clang") + set(ASM_OPTIONS "${ASM_OPTIONS} -no-integrated-as") +endif() + +set(CMAKE_ASM_FLAGS "${CFLAGS} ${ASM_OPTIONS}") + +set(MMD_SRC + ./host/acl_pcie_config.cpp + ./host/acl_pcie.cpp + ./host/acl_pcie_debug.cpp + ./host/acl_pcie_device.cpp + ./host/acl_pcie_dma_linux.cpp + ./host/acl_pcie_dma_windows.cpp + ./host/acl_pcie_hostch.cpp + ./host/acl_pcie_mm_io.cpp + ./host/acl_pcie_timer.cpp +) + +add_library(de10_agilex_mmd SHARED ${MMD_SRC}) + 
+target_include_directories(de10_agilex_mmd PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include) +if (WIN32) + # Terrasic production BSP Linux kernel space driver header files + set(TERASIC_KERNEL_HEADER_DIR $ENV{AOCL_BOARD_PACKAGE_ROOT}/linux64/driver) + set(TERASIC_KERNEL_HEADER_FILES + fpga_cmd_guids.h + hw_host_channel.h + hw_pcie_constants.h + hw_pcie_dma.h + ) + if (EXISTS ${TERASIC_KERNEL_HEADER_DIR}) + foreach(header ${TERASIC_KERNEL_HEADER_FILES}) + if (EXISTS ${TERASIC_KERNEL_HEADER_DIR}/${header}) + file(COPY ${TERASIC_KERNEL_HEADER_DIR}/${header} DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/include) + else() + message(WARNING "Header file ${header} does not exist in ${TERASIC_KERNEL_HEADER_DIR}") + endif() + endforeach() + else() + message(FATAL_ERROR "Source directory ${TERASIC_KERNEL_HEADER_DIR} does not exist.") + endif() + + set(HW_PCI_DMA_H ${CMAKE_CURRENT_SOURCE_DIR}/include/hw_pcie_dma.h) + file(READ ${HW_PCI_DMA_H} HW_PCI_DMA_H_CONTENT) + # Remove any end-of-line whitespace from the file content (spaces and tabs) + string(REGEX REPLACE "[ \t]+(\r?\n)" "\\1" HW_PCI_DMA_H_CONTENT "${HW_PCI_DMA_H_CONTENT}") + set(OLD_CODE_BLOCK +"PACK( +struct DMA_DESC_ENTRY { + UINT32 src_addr_ldw; + UINT32 src_addr_udw; + UINT32 dest_addr_ldw; + UINT32 dest_addr_udw; + UINT32 ctl_dma_len; + UINT32 reserved[3]; +});") + set(NEW_CODE_BLOCK +"#if defined(GEN3_x8) +PACK( +struct DMA_DESC_ENTRY { + UINT32 src_addr_ldw; + UINT32 src_addr_udw; + UINT32 dest_addr_ldw; + UINT32 dest_addr_udw; + UINT32 ctl_dma_len; + UINT32 reserved[3]; +}); +#elif defined(GEN3_x16) +PACK( +struct DMA_DESC_ENTRY { + UINT64 src_addr; + UINT64 dst_addr; + UINT32 ctrl; + UINT32 reserved[3]; +}); +#endif") + string(REPLACE "${OLD_CODE_BLOCK}" "${NEW_CODE_BLOCK}" HW_PCI_DMA_H_CONTENT "${HW_PCI_DMA_H_CONTENT}") + file(WRITE ${HW_PCI_DMA_H} "${HW_PCI_DMA_H_CONTENT}") + + set_target_properties(de10_agilex_mmd PROPERTIES LINK_FLAGS "-subsystem:console -nologo -fixed:no -incremental:no -opt:noref -ignore:4089 
/NXCOMPAT /DYNAMICBASE") + + find_library(ACL_CHECK_SYS_CMD_LIB + acl_check_sys_cmd + PATHS ${CMAKE_CURRENT_SOURCE_DIR}/lib/win64) + find_library(FPGA_LIB + FpgaLib + PATHS ${CMAKE_CURRENT_SOURCE_DIR}/lib/win64) + + target_link_libraries(de10_agilex_mmd ${ACL_CHECK_SYS_CMD_LIB} ${FPGA_LIB}) +else() + target_link_libraries(de10_agilex_mmd) +endif() + +install(TARGETS de10_agilex_mmd + RUNTIME DESTINATION "dla/runtime/bin" COMPONENT de10_agilex_mmd + LIBRARY DESTINATION "dla/runtime/lib" COMPONENT de10_agilex_mmd + ARCHIVE DESTINATION "dla/runtime/lib" COMPONENT de10_agilex_mmd +) diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie.cpp b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie.cpp new file mode 100644 index 0000000..527d8bf --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie.cpp @@ -0,0 +1,951 @@ +// (c) 1992-2021 Intel Corporation. +// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words +// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. +// and/or other countries. Other marks and brands may be claimed as the property +// of others. 
See Trademarks on intel.com for full list of Intel trademarks or +// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) +// Your use of Intel Corporation's design tools, logic functions and other +// software and tools, and its AMPP partner logic functions, and any output +// files any of the foregoing (including device programming or simulation +// files), and any associated documentation or information are expressly subject +// to the terms and conditions of the Altera Program License Subscription +// Agreement, Intel MegaCore Function License Agreement, or other applicable +// license agreement, including, without limitation, that your use is for the +// sole purpose of programming logic devices manufactured by Intel and sold by +// Intel or its authorized distributors. Please refer to the applicable +// agreement for further details. + +/* ===- acl_pcie.cpp ------------------------------------------------- C++ -*-=== */ +/* */ +/* Intel(R) OpenCL MMD Driver */ +/* */ +/* ===-------------------------------------------------------------------------=== */ +/* */ +/* This file implements the functions that are defined in aocl_mmd.h */ +/* */ +/* ===-------------------------------------------------------------------------=== */ + +// common and its own header files +#include "acl_pcie.h" + +// other header files inside MMD driver +#include "acl_pcie_debug.h" +#include "acl_pcie_device.h" +#include "hw_pcie_constants.h" +#ifndef DLA_MMD +#include "acl_check_sys_cmd.h" +#endif + +// other standard header files +#include <stdarg.h> +#include <stdlib.h> +#include <string.h> + +#include <map> +#include <sstream> +#include <string> +#include <utility> + +#ifdef DLA_MMD +#include <chrono> +#include <thread> +#endif + +#if defined(LINUX) +#include <fcntl.h> +#include <semaphore.h> +#include <signal.h> +#include <unistd.h> +#endif // LINUX + +// MAX size of line read from pipe-ing the output of system call to MMD +#define BUF_SIZE 1024 
+// MAX size of command passed to system for invoking system call from MMD +#define SYSTEM_CMD_SIZE 4 * 1024 + +#ifndef DLA_MMD +// static helper functions +static bool blob_has_elf_signature(void *data, size_t data_size); +#endif + +// global variables used for handling multi-devices and its helper functions +// Use a DeviceMapManager to manage a heap-allocated map for storing device information +// instead of using a static global map because of a segmentation fault which occurs in +// the following situation: +// 1) Host program contains a global variable which calls clReleaseContext in its destructor. +// When the program ends the global goes out of scope and the destructor is called. +// 2) clReleaseContext calls a function in the MMD library which modifies the static global map in +// the MMD library. +// In this situation it was discovered that the destructor of the static global map is called before +// the destructor of the global in the host program, thus resulting in a segmentation fault when +// clReleaseContext calls a function that modifies the internal map after it has been destroyed. +// Using a heap-allocated map avoids this issue as the lifetime of the map persists until it is +// deleted or the process is completely terminated. +class DeviceMapManager { + public: + typedef std::pair<const std::string, ACL_PCIE_DEVICE *> DeviceInfo; + typedef std::map<int, DeviceInfo> DeviceMap; + + static inline bool empty() { return !s_device_map; } + + // Returns the underlying device map. The map must not be empty when this is called. + static inline const DeviceMap &get_device_map() { + ACL_PCIE_ASSERT(s_device_map, "no devices are open -- aborting\n"); + return *s_device_map; + } + + // Returns the device info associated with the given handle. The handle must exist. + static inline const DeviceInfo &get_pcie_device_info(int handle) { return get_device_it_for_handle(handle)->second; } + + // Returns the device associated with the given handle. 
The handle must exist. + static inline ACL_PCIE_DEVICE *get_pcie_device(int handle) { return get_pcie_device_info(handle).second; } + + // Adds a device with the specified name for the given handle. If a device with the same handle already exists + // it is discarded first. The caller must ensure they don't associate the same device with multiple handles. + static inline void add_pcie_device_handle(int handle, const std::string &name, ACL_PCIE_DEVICE *dev) { + // To avoid memory leaks ensure that only this function ever allocates a new device map because + // we only ever delete the map when the size of the map goes from non-empty to empty. + if (!s_device_map) s_device_map = new DeviceMap(); + + if (s_device_map->count(handle)) discard_pcie_device_handle(handle); + s_device_map->insert(std::pair<int, DeviceInfo>(handle, DeviceInfo(name, dev))); + } + + // Removes the device associated with the given handle. The handle must exist. + static inline void discard_pcie_device_handle(int handle) { + DeviceMap::iterator it = get_device_it_for_handle(handle); + + delete it->second.second; + s_device_map->erase(it); + if (s_device_map->empty()) { + // From a functional perspective the map can remain allocated for + // the entire lifetime the MMD is loaded but there + // is no other good place to clean it up except here. + delete s_device_map; + s_device_map = NULL; + } + } + + // Removes all devices. + static inline void discard_all_pcie_device_handles() { + if (!s_device_map) return; + + for (DeviceMapManager::DeviceMap::iterator it = s_device_map->begin(); it != s_device_map->end(); ++it) { + delete it->second.second; + } + + delete s_device_map; + s_device_map = NULL; + } + + // Returns true if any device is currently being programmed. 
+ static inline bool is_any_device_being_programmed() { + if (!s_device_map) return false; + + for (DeviceMap::iterator it = s_device_map->begin(); it != s_device_map->end(); ++it) { + if (it->second.second->is_being_programmed()) { + return true; + } + } + return false; + } + + private: + static inline DeviceMap::iterator get_device_it_for_handle(int handle) { + ACL_PCIE_ASSERT(s_device_map, "can't find handle %d -- aborting\n", handle); + DeviceMap::iterator it = s_device_map->find(handle); + ACL_PCIE_ASSERT(it != s_device_map->end(), "can't find handle %d -- aborting\n", handle); + return it; + } + + static DeviceMap *s_device_map; +}; +DeviceMapManager::DeviceMap *DeviceMapManager::s_device_map = NULL; + +static int test_device_exception_signal_number = 63; + +// Functions for handling interrupts or signals for multiple devices +// These functions are used inside the ACL_PCIE_DEVICE class +#if defined(WINDOWS) +void pcie_interrupt_handler(void *data) { + ACL_PCIE_DEVICE *device = static_cast<ACL_PCIE_DEVICE *>(data); + device->service_interrupt(); +} + +BOOL ctrl_c_handler(DWORD fdwCtrlType) { + if (fdwCtrlType != CTRL_C_EVENT) return FALSE; + + if (DeviceMapManager::is_any_device_being_programmed()) { + ACL_PCIE_INFO("The device is still being programmed, cannot terminate at this point.\n"); + return TRUE; + } + + // On Windows, the signal handler function is executed by another thread, + // so we cannot simply free all the open devices. + // Just exit when a ctrl-c event is received; the OS will take care of the clean-up. + exit(1); +} +#endif // WINDOWS +#if defined(LINUX) +// On Linux, driver will send a SIG_INT_NOTIFY *signal* to notify about an interrupt. 
+void pcie_linux_signal_handler(int sig, siginfo_t *info, void *unused) { + // test_device_exception_signal_number is reserved for device exception testing + if (sig == test_device_exception_signal_number) { + ACL_PCIE_ERROR_IF(DeviceMapManager::get_device_map().empty(), + return, + "No devices available to trigger test_device_exception_signal_number on.\n"); + // Pick the last (most recent) handle for device exception testing + unsigned int handle = DeviceMapManager::get_device_map().rbegin()->first; + DeviceMapManager::get_pcie_device(handle)->test_trigger_device_interrupt(); + } else { + // the last bit indicates the DMA completion + unsigned int irq_type_flag = info->si_int & 0x1; + // other bits shows the handle value of the device that sent the interrupt + unsigned int handle = info->si_int >> 1; + if (DeviceMapManager::empty() || !DeviceMapManager::get_device_map().count(handle)) { + ACL_PCIE_DEBUG_MSG(":: received an unknown handle %d in signal handler, ignore this.\n", handle); + return; + } + + DeviceMapManager::get_pcie_device(handle)->service_interrupt(irq_type_flag); + } +} + +void ctrl_c_handler(int sig_num) { + if (DeviceMapManager::is_any_device_being_programmed()) { + ACL_PCIE_INFO("The device is still being programmed, cannot terminate at this point.\n"); + return; + } + + // Free all the resource allocated for open devices before exiting the program. + // It also notifies the kernel driver about the termination of the program, + // so that the kernel driver won't try to talk to any user-allocated memory + // space (mainly for the DMA) after the program exit. 
+ DeviceMapManager::discard_all_pcie_device_handles(); + exit(1); +} + +void abort_signal_handler(int sig_num) { + DeviceMapManager::discard_all_pcie_device_handles(); + exit(1); +} + +int allocate_and_register_linux_signal_number_helper(int pid) { + char buffer[4096], *locOfSigCgt; + FILE *fp; + int bytes_read, status, ret = -1; + unsigned long long sigmask = 0; + struct sigaction sigusr {}, sigabrt {}; + + snprintf(buffer, sizeof(buffer), "/proc/%d/status", pid); + fp = fopen(buffer, "rb"); + ACL_PCIE_ERROR_IF(fp == NULL, return -1, "Unable to open file %s\n", buffer); + bytes_read = fread(buffer, sizeof(buffer[0]), sizeof(buffer) - 1, fp); + fclose(fp); + buffer[bytes_read] = 0; // null terminate the string + locOfSigCgt = strstr(buffer, "SigCgt:"); // returns null if can't find, shouldn't happen + ACL_PCIE_ERROR_IF(locOfSigCgt == NULL, return -1, "Did not find SigCgt: for PID %d\n", pid); + sscanf(locOfSigCgt + 7, "%llx", &sigmask); + + // Find an unused signal number + for (int i = SIGRTMAX; i >= SIGRTMIN; i--) { + if (!((sigmask >> (i - 1)) & 1)) { + ret = i; + break; + } + } + ACL_PCIE_ERROR_IF(ret == -1, return -1, "Unable to find an unused signal number\n"); + + // Enable if driver is using signals to communicate with the host. 
+  // Route the allocated signal number to the PCIe event handler (SA_SIGINFO form).
+  sigusr.sa_sigaction = pcie_linux_signal_handler;
+  sigusr.sa_flags = SA_SIGINFO;
+  status = sigaction(ret, &sigusr, NULL);
+  if (getenv("ACL_MMD_TEST_INTELFPGA")) {
+    // Test mode additionally claims test_device_exception_signal_number; fail if already taken.
+    ACL_PCIE_ERROR_IF(((sigmask >> (test_device_exception_signal_number - 1)) & 1),
+                      return -1,
+                      "Signal number %i cannot be occupied\n",
+                      test_device_exception_signal_number);
+    status = sigaction(test_device_exception_signal_number, &sigusr, NULL);
+  }
+  ACL_PCIE_ERROR_IF(status != 0, return -1, "sigaction failed with status %d, signal number %d\n", status, ret);
+
+  // Install signal handler for SIGABRT from assertions in the upper layers
+  sigabrt.sa_handler = abort_signal_handler;
+  sigemptyset(&sigabrt.sa_mask);
+  sigabrt.sa_flags = 0;
+  status = sigaction(SIGABRT, &sigabrt, NULL);
+  ACL_PCIE_ERROR_IF(status != 0, return -1, "sigaction failed with status %d, signal number %d\n", status, SIGABRT);
+
+  // if it makes it here, the user got an unused signal number and we installed all signal handlers
+  return ret;
+}
+
+// returns an unused signal number, -1 means ran into some error
+// Mutex-guarded wrapper: all early-return paths live in the helper so this
+// function always releases the mutex it took.
+int allocate_and_register_linux_signal_number(pthread_mutex_t *mutex) {
+  int pid = getpid();
+  int err = pthread_mutex_lock(mutex);
+  ACL_PCIE_ERROR_IF(err != 0, return -1, "pthread_mutex_lock error %d\n", err);
+
+  // this has multiple return points, put in separate function so that we don't bypass releasing the mutex
+  int ret = allocate_and_register_linux_signal_number_helper(pid);
+
+  err = pthread_mutex_unlock(mutex);
+  ACL_PCIE_ERROR_IF(err != 0, return -1, "pthread_mutex_unlock error %d\n", err);
+
+  return ret;
+}
+#endif // LINUX
+
+// Function to install the signal handler for Ctrl-C
+// If ignore_sig != 0, the ctrl-c signal will be ignored by the program
+// If ignore_sig = 0, the custom signal handler (ctrl_c_handler) will be used
+// NOTE(review): the parameter name "ingore_sig" is a typo for "ignore_sig";
+// kept as-is because the same spelling is used at the extern declaration site.
+int install_ctrl_c_handler(int ingore_sig) {
+#if defined(WINDOWS)
+  SetConsoleCtrlHandler((ingore_sig ?
NULL : (PHANDLER_ROUTINE)ctrl_c_handler), TRUE);
+#endif // WINDOWS
+#if defined(LINUX)
+  struct sigaction sig;
+  sig.sa_handler = (ingore_sig ? SIG_IGN : ctrl_c_handler);
+  sigemptyset(&sig.sa_mask);
+  sig.sa_flags = 0;
+  sigaction(SIGINT, &sig, NULL);
+#endif // LINUX
+
+  return 0;
+}
+
+// Function to return the number of boards installed in the system
+unsigned int get_offline_num_boards() {
+  unsigned int num_boards = 0;
+
+  // These are for reading/parsing the environment variable
+  const char *override_count_string = 0;
+  long parsed_count;
+  char *endptr;
+
+// Windows MMD will try to open all the devices
+#if defined(WINDOWS)
+  fpga_result result;
+  fpga_properties filter = NULL;
+
+  // Any OPAE enumeration failure falls back to reporting ACL_MAX_DEVICE boards.
+  result = fpgaGetProperties(NULL, &filter);
+  if (result != FPGA_OK) {
+    num_boards = ACL_MAX_DEVICE;
+    ACL_PCIE_ERROR_IF(1, goto End, "failed to get properties.\n");
+  }
+
+  result = fpgaPropertiesSetObjectType(filter, FPGA_DEVICE);
+  if (result != FPGA_OK) {
+    num_boards = ACL_MAX_DEVICE;
+
+    if (filter != NULL) fpgaDestroyProperties(&filter);
+
+    ACL_PCIE_ERROR_IF(1, goto End, "failed to set object type.\n");
+  }
+
+  result = fpgaPropertiesSetVendorID(filter, ACL_PCI_INTELFPGA_VENDOR_ID);
+  if (result != FPGA_OK) {
+    num_boards = ACL_MAX_DEVICE;
+
+    if (filter != NULL) fpgaDestroyProperties(&filter);
+
+    ACL_PCIE_ERROR_IF(1, goto End, "failed to set vendor ID.\n");
+  }
+
+  result = fpgaEnumerate(&filter, 1, NULL, 1, &num_boards);
+  if (result != FPGA_OK) {
+    num_boards = ACL_MAX_DEVICE;
+
+    if (filter != NULL) fpgaDestroyProperties(&filter);
+
+    ACL_PCIE_ERROR_IF(1, goto End, "failed to scan for the PCI device.\n");
+  }
+
+  if (filter != NULL) fpgaDestroyProperties(&filter);
+
+  if (num_boards == 0) {
+    num_boards = ACL_MAX_DEVICE;
+  }
+
+End:
+#endif // WINDOWS
+
+// Linux MMD will look into the number of devices
+#if defined(LINUX)
+  FILE *fp;
+  char str_line_in[BUF_SIZE];
+  char str_board_pkg_name[BUF_SIZE];
+  char str_cmd[SYSTEM_CMD_SIZE];
+
+  snprintf(str_board_pkg_name, sizeof(str_board_pkg_name), "acl%s", ACL_BOARD_PKG_NAME);
+  snprintf(str_cmd, sizeof(str_cmd), "ls /sys/class/aclpci_%s 2>/dev/null", ACL_BOARD_PKG_NAME);
+
+#ifndef DLA_MMD
+  ACL_PCIE_ASSERT(system_cmd_is_valid(str_cmd), "Invalid popen() function parameter: %s\n", str_cmd);
+#endif
+  fp = popen(str_cmd, "r");
+
+  if (fp == NULL) {
+    ACL_PCIE_INFO("Couldn't open pipe stream\n");
+    // NOTE(review): "false" converts to 0u here (function returns unsigned int),
+    // i.e. "no boards found"; "return 0;" would state that more clearly.
+    return false;
+  }
+  // Read every line from output
+  while (fgets(str_line_in, BUF_SIZE, fp) != NULL) {
+    // Count sysfs entries whose name starts with "acl<ACL_BOARD_PKG_NAME>".
+    if (strncmp(str_board_pkg_name, str_line_in, strnlen(str_board_pkg_name, MAX_NAME_SIZE)) == 0) {
+      num_boards++;
+    }
+  }
+
+  pclose(fp);
+
+#endif // LINUX
+
+  // CL_OVERRIDE_NUM_DEVICES_INTELFPGA overrides the detected count when it
+  // parses cleanly as an integer in [0, ACL_MAX_DEVICE).
+  override_count_string = getenv("CL_OVERRIDE_NUM_DEVICES_INTELFPGA");
+  if (override_count_string) {
+    endptr = 0;
+    parsed_count = strtol(override_count_string, &endptr, 10);
+    if (endptr == override_count_string  // no valid characters
+        || *endptr                       // an invalid character
+        || (parsed_count < 0 || parsed_count >= (long)ACL_MAX_DEVICE)) {
+      // malformed override string, do nothing
+    } else {
+      // Was ok.
+      num_boards = (unsigned int)parsed_count;
+    }
+  }
+
+  return num_boards;
+}
+
+// Get information about the board using the enum aocl_mmd_offline_info_t for
+// offline info (called without a handle), and the enum aocl_mmd_info_t for
+// info specific to a certain board.
+// These macros expand inside functions that declare param_value / param_value_size /
+// param_size_ret; each writes the result and optionally reports its size.
+#define RESULT_INT(X) \
+  { \
+    *((int *)param_value) = X; \
+    if (param_size_ret) *param_size_ret = sizeof(int); \
+  }
+#define RESULT_UNSIGNED(X) \
+  { \
+    *((unsigned *)param_value) = X; \
+    if (param_size_ret) *param_size_ret = sizeof(unsigned); \
+  }
+#define RESULT_SIZE_T(X) \
+  { \
+    *((size_t *)param_value) = X; \
+    if (param_size_ret) *param_size_ret = sizeof(size_t); \
+  }
+#if defined(WINDOWS)
+#define RESULT_STR(X) \
+  do { \
+    size_t Xlen = strnlen(X, MAX_NAME_SIZE) + 1; \
+    memcpy_s((void *)param_value, param_value_size, X, (param_value_size <= Xlen) ?
param_value_size : Xlen); \
+    if (param_size_ret) *param_size_ret = Xlen; \
+  } while (0)
+#else
+// Linux variant: plain memcpy, clamped to min(param_value_size, Xlen);
+// *param_size_ret still reports the full string length including NUL.
+#define RESULT_STR(X) \
+  do { \
+    size_t Xlen = strnlen(X, MAX_NAME_SIZE) + 1; \
+    memcpy((void *)param_value, X, (param_value_size <= Xlen) ? param_value_size : Xlen); \
+    if (param_size_ret) *param_size_ret = Xlen; \
+  } while (0)
+#endif
+// Handle-less queries (version, board count/names, vendor).
+// NOTE(review): no default case — an unknown requested_info_id leaves
+// param_value untouched and still returns 0 (success).
+int aocl_mmd_get_offline_info(aocl_mmd_offline_info_t requested_info_id,
+                              size_t param_value_size,
+                              void *param_value,
+                              size_t *param_size_ret) {
+  // It might be helpful to cache the info if function aocl_mmd_get_offline_info is called frequently.
+  unsigned int num_boards;
+  switch (requested_info_id) {
+    case AOCL_MMD_VERSION:
+      RESULT_STR(MMD_VERSION);
+      break;
+    case AOCL_MMD_NUM_BOARDS: {
+      num_boards = get_offline_num_boards();
+      RESULT_INT((int)num_boards);
+      break;
+    }
+    case AOCL_MMD_BOARD_NAMES: {
+      // Construct a list of all possible devices supported by this MMD layer
+      std::ostringstream boards;
+      num_boards = get_offline_num_boards();
+      for (unsigned i = 0; i < num_boards; i++) {
+        boards << "acl" << ACL_BOARD_PKG_NAME << i;
+        if (i < num_boards - 1) boards << ";";
+      }
+      RESULT_STR(boards.str().c_str());
+      break;
+    }
+    case AOCL_MMD_VENDOR_NAME: {
+      RESULT_STR(ACL_VENDOR_NAME);
+      break;
+    }
+    case AOCL_MMD_VENDOR_ID:
+      RESULT_INT(ACL_PCI_INTELFPGA_VENDOR_ID);
+      break;
+    case AOCL_MMD_USES_YIELD:
+      RESULT_INT(0);
+      break;
+    case AOCL_MMD_MEM_TYPES_SUPPORTED:
+      RESULT_INT(AOCL_MMD_PHYSICAL_MEMORY);
+      break;
+  }
+  return 0;
+}
+
+// Per-device queries; requires the handle's device to be initialized.
+int aocl_mmd_get_info(
+    int handle, aocl_mmd_info_t requested_info_id, size_t param_value_size, void *param_value, size_t *param_size_ret) {
+  ACL_PCIE_DEVICE *pcie_dev = DeviceMapManager::get_pcie_device(handle);
+  ACL_PCIE_ERROR_IF(!pcie_dev->is_initialized(),
+                    return -1,
+                    "aocl_mmd_get_info failed due to the target device (handle %d) is not properly initialized.\n",
+                    handle);
+
+  switch (requested_info_id) {
+    case AOCL_MMD_BOARD_NAME: {
+      std::ostringstream board_name;
+      board_name << ACL_BOARD_NAME << " (" << DeviceMapManager::get_pcie_device_info(handle).first << ")";
+      RESULT_STR(board_name.str().c_str());
+      break;
+    }
+    case AOCL_MMD_NUM_KERNEL_INTERFACES:
+      RESULT_INT(1);
+      break;
+    case AOCL_MMD_KERNEL_INTERFACES:
+      RESULT_INT(AOCL_MMD_KERNEL);
+      break;
+    case AOCL_MMD_PLL_INTERFACES:
+      RESULT_INT(AOCL_MMD_PLL);
+      break;
+    case AOCL_MMD_MEMORY_INTERFACE:
+      RESULT_INT(AOCL_MMD_MEMORY);
+      break;
+    case AOCL_MMD_PCIE_INFO:
+      RESULT_STR(pcie_dev->get_dev_pcie_info());
+      break;
+    case AOCL_MMD_CONCURRENT_READS:
+      RESULT_INT(1);
+      break;
+    case AOCL_MMD_CONCURRENT_WRITES:
+      RESULT_INT(1);
+      break;
+    case AOCL_MMD_CONCURRENT_READS_OR_WRITES:
+      RESULT_INT(1);
+      break;
+    case AOCL_MMD_MIN_HOST_MEMORY_ALIGNMENT:
+      RESULT_SIZE_T(0);
+      break;
+    case AOCL_MMD_HOST_MEM_CAPABILITIES:
+      RESULT_UNSIGNED(0);
+      break;
+    case AOCL_MMD_SHARED_MEM_CAPABILITIES:
+      RESULT_UNSIGNED(0);
+      break;
+    case AOCL_MMD_DEVICE_MEM_CAPABILITIES:
+      RESULT_UNSIGNED(0);
+      break;
+    case AOCL_MMD_HOST_MEM_CONCURRENT_GRANULARITY:
+      RESULT_SIZE_T(0);
+      break;
+    case AOCL_MMD_SHARED_MEM_CONCURRENT_GRANULARITY:
+      RESULT_SIZE_T(0);
+      break;
+    case AOCL_MMD_DEVICE_MEM_CONCURRENT_GRANULARITY:
+      RESULT_SIZE_T(0);
+      break;
+
+    case AOCL_MMD_TEMPERATURE: {
+      // NOTE(review): writes a float through param_value without checking
+      // param_value_size, unlike the RESULT_* macros above.
+      float *r;
+      int temp;
+      pcie_dev->get_ondie_temp_slow_call(&temp);
+      r = (float *)param_value;
+      *r = ACL_PCIE_TEMP_FORMULA;
+      if (param_size_ret) *param_size_ret = sizeof(float);
+      break;
+    }
+
+    // currently not supported
+    case AOCL_MMD_BOARD_UNIQUE_ID:
+      return -1;
+  }
+  return 0;
+}
+
+// NOTE(review): only two of the four helper macros are undefined here;
+// RESULT_UNSIGNED and RESULT_SIZE_T stay visible for the rest of the file.
+#undef RESULT_INT
+#undef RESULT_STR
+
+// Open and initialize the named device.
+// Parses "acl<ACL_BOARD_PKG_NAME><N>" out of name, allocates a fresh positive
+// unique_id as the MMD handle, installs signal handlers once per process, and
+// constructs the ACL_PCIE_DEVICE. Returns the handle on success, ~handle
+// (negative) when the device opened but failed its initial test, -1 on error.
+int AOCL_MMD_CALL aocl_mmd_open(const char *name) {
+  static int signal_handler_installed = 0;
+  static int unique_id = 0;
+  int dev_num = -1;
+  static int user_signal_number = -1;
+#if defined(LINUX)
+  static pthread_mutex_t linux_signal_arb_mutex =
+      PTHREAD_MUTEX_INITIALIZER;  // initializes as unlocked, static = no cleanup needed
+
+  if (sscanf(name, "acl" ACL_BOARD_PKG_NAME "%d", &dev_num) != 1) {
+    return -1;
+  }
+#endif // LINUX
+
+#if defined(WINDOWS)
+  if (sscanf_s(name, "acl" ACL_BOARD_PKG_NAME "%d", &dev_num) != 1) {
+    return -1;
+  }
+#endif
+  if (dev_num < 0 || dev_num >= ACL_MAX_DEVICE) {
+    return -1;
+  }
+  // Handles are strictly positive; wrap back to 1 on signed overflow.
+  if (++unique_id <= 0) {
+    unique_id = 1;
+  }
+
+  ACL_PCIE_ASSERT(DeviceMapManager::empty() || DeviceMapManager::get_device_map().count(unique_id) == 0,
+                  "unique_id %d is used before.\n",
+                  unique_id);
+
+  // One-time, process-wide signal handler installation.
+  if (signal_handler_installed == 0) {
+#if defined(LINUX)
+    user_signal_number = allocate_and_register_linux_signal_number(&linux_signal_arb_mutex);
+    if (user_signal_number == -1) return -1;
+#endif // LINUX
+
+    install_ctrl_c_handler(0 /* use the custom signal handler */);
+    signal_handler_installed = 1;
+  }
+
+  ACL_PCIE_DEVICE *pcie_dev = NULL;
+
+  try {
+    pcie_dev = new ACL_PCIE_DEVICE(dev_num, name, unique_id, user_signal_number);
+  }
+
+  // Catch any memory allocation failures
+  // (pcie_dev is still NULL if new itself threw, so this delete is a no-op then)
+  catch (std::bad_alloc &) {
+    delete pcie_dev;
+    return -1;
+  }
+
+  if (!pcie_dev->is_valid()) {
+    delete pcie_dev;
+    return -1;
+  }
+
+  DeviceMapManager::add_pcie_device_handle(unique_id, name, pcie_dev);
+  if (pcie_dev->is_initialized()) {
+    return unique_id;
+  } else {
+    // Perform a bitwise-not operation to the unique_id if the device
+    // do not pass the initial test. This negative unique_id indicates
+    // a fail to open the device, but still provide actual the unique_id
+    // to allow reprogram executable to get access to the device and
+    // reprogram the board when the board is not usable.
+    return ~unique_id;
+  }
+}
+
+// Close an opened device, by its handle.
+int AOCL_MMD_CALL aocl_mmd_close(int handle) {
+  DeviceMapManager::discard_pcie_device_handle(handle);
+
+  return 0;
+}
+
+// Set the interrupt handler for the opened device.
+int AOCL_MMD_CALL aocl_mmd_set_interrupt_handler(int handle, aocl_mmd_interrupt_handler_fn fn, void *user_data) {
+  ACL_PCIE_DEVICE *pcie_dev = DeviceMapManager::get_pcie_device(handle);
+  ACL_PCIE_ERROR_IF(
+      !pcie_dev->is_initialized(),
+      return -1,
+      "aocl_mmd_set_interrupt_handler failed due to the target device (handle %d) is not properly initialized.\n",
+      handle);
+
+  return pcie_dev->set_kernel_interrupt(fn, user_data);
+}
+
+// Set the device interrupt handler for the opened device.
+// NOTE(review): the error message below says "aocl_mmd_set_interrupt_handler"
+// (copy-paste from the function above); left unchanged to avoid altering a
+// runtime string, but the text is misleading in logs.
+int AOCL_MMD_CALL aocl_mmd_set_device_interrupt_handler(int handle,
+                                                        aocl_mmd_device_interrupt_handler_fn fn,
+                                                        void *user_data) {
+  ACL_PCIE_DEVICE *pcie_dev = DeviceMapManager::get_pcie_device(handle);
+  ACL_PCIE_ERROR_IF(
+      !pcie_dev->is_initialized(),
+      return -1,
+      "aocl_mmd_set_interrupt_handler failed due to the target device (handle %d) is not properly initialized.\n",
+      handle);
+
+  return pcie_dev->set_device_interrupt(fn, user_data);
+}
+
+// Set the operation status handler for the opened device.
+int AOCL_MMD_CALL aocl_mmd_set_status_handler(int handle, aocl_mmd_status_handler_fn fn, void *user_data) {
+  ACL_PCIE_DEVICE *pcie_dev = DeviceMapManager::get_pcie_device(handle);
+  ACL_PCIE_ERROR_IF(
+      !pcie_dev->is_initialized(),
+      return -1,
+      "aocl_mmd_set_status_handler failed due to the target device (handle %d) is not properly initialized.\n",
+      handle);
+
+  return pcie_dev->set_status_handler(fn, user_data);
+}
+
+// Called when the host is idle and hence possibly waiting for events to be
+// processed by the device
+int AOCL_MMD_CALL aocl_mmd_yield(int handle) { return DeviceMapManager::get_pcie_device(handle)->yield(); }
+
+// Read, write and copy operations on a single interface.
+// Read len bytes from device offset into dst on the given interface.
+int AOCL_MMD_CALL aocl_mmd_read(int handle, aocl_mmd_op_t op, size_t len, void *dst, int mmd_interface, size_t offset) {
+  void *host_addr = dst;
+  size_t dev_addr = offset;
+
+  ACL_PCIE_DEVICE *pcie_dev = DeviceMapManager::get_pcie_device(handle);
+  ACL_PCIE_ERROR_IF(!pcie_dev->is_initialized(),
+                    return -1,
+                    "aocl_mmd_read failed due to the target device (handle %d) is not properly initialized.\n",
+                    handle);
+
+  return pcie_dev->read_block(op, (aocl_mmd_interface_t)mmd_interface, host_addr, dev_addr, len);
+}
+
+// Write len bytes from src to device offset on the given interface.
+int AOCL_MMD_CALL
+aocl_mmd_write(int handle, aocl_mmd_op_t op, size_t len, const void *src, int mmd_interface, size_t offset) {
+  // const_cast only to satisfy write_block's non-const signature; src is not modified here.
+  void *host_addr = const_cast<void *>(src);
+  size_t dev_addr = offset;
+
+  ACL_PCIE_DEVICE *pcie_dev = DeviceMapManager::get_pcie_device(handle);
+  ACL_PCIE_ERROR_IF(!pcie_dev->is_initialized(),
+                    return -1,
+                    "aocl_mmd_write failed due to the target device (handle %d) is not properly initialized.\n",
+                    handle);
+
+  return pcie_dev->write_block(op, (aocl_mmd_interface_t)mmd_interface, host_addr, dev_addr, len);
+}
+
+// Device-to-device copy of len bytes within one interface.
+int AOCL_MMD_CALL
+aocl_mmd_copy(int handle, aocl_mmd_op_t op, size_t len, int mmd_interface, size_t src_offset, size_t dst_offset) {
+  ACL_PCIE_DEVICE *pcie_dev = DeviceMapManager::get_pcie_device(handle);
+  ACL_PCIE_ERROR_IF(!pcie_dev->is_initialized(),
+                    return -1,
+                    "aocl_mmd_copy failed due to the target device (handle %d) is not properly initialized.\n",
+                    handle);
+
+  return pcie_dev->copy_block(op, (aocl_mmd_interface_t)mmd_interface, src_offset, dst_offset, len);
+}
+
+// Initialize host channel specified in channel_name
+int AOCL_MMD_CALL aocl_mmd_hostchannel_create(int handle, char *channel_name, size_t queue_depth, int direction) {
+  ACL_PCIE_DEVICE *pcie_dev = DeviceMapManager::get_pcie_device(handle);
+  ACL_PCIE_ERROR_IF(
+      !pcie_dev->is_initialized(),
+      return -1,
+      "aocl_mmd_create_hostchannel failed due to the target device (handle %d) is not properly initialized.\n",
+      handle);
+
+  return pcie_dev->create_hostchannel(channel_name, queue_depth, direction);
+}
+
+// reset the host channel specified with channel handle
+int AOCL_MMD_CALL aocl_mmd_hostchannel_destroy(int handle, int channel) {
+  ACL_PCIE_DEVICE *pcie_dev = DeviceMapManager::get_pcie_device(handle);
+  ACL_PCIE_ERROR_IF(
+      !pcie_dev->is_initialized(),
+      return -1,
+      "aocl_mmd_create_hostchannel failed due to the target device (handle %d) is not properly initialized.\n",
+      handle);
+
+  return pcie_dev->destroy_channel(channel);
+}
+
+// Get the pointer to buffer the user can write/read from the kernel with
+// NOTE(review): the error message below says "aocl_mmd_read" — copy-paste;
+// left unchanged to avoid altering a runtime string.
+AOCL_MMD_CALL void *aocl_mmd_hostchannel_get_buffer(int handle, int channel, size_t *buffer_size, int *status) {
+  ACL_PCIE_DEVICE *pcie_dev = DeviceMapManager::get_pcie_device(handle);
+  ACL_PCIE_ERROR_IF(!pcie_dev->is_initialized(),
+                    return NULL,
+                    "aocl_mmd_read failed due to the target device (handle %d) is not properly initialized.\n",
+                    handle);
+
+  return pcie_dev->hostchannel_get_buffer(buffer_size, channel, status);
+}
+
+// Acknowledge from the user that they have written/read send_size amount of buffer obtained from get_buffer
+// On an uninitialized device: sets *status to -1 and returns 0 bytes acknowledged.
+size_t AOCL_MMD_CALL aocl_mmd_hostchannel_ack_buffer(int handle, int channel, size_t send_size, int *status) {
+  ACL_PCIE_DEVICE *pcie_dev = DeviceMapManager::get_pcie_device(handle);
+  ACL_PCIE_ERROR_IF(
+      !pcie_dev->is_initialized(), *status = -1;
+      return 0, "aocl_mmd_read failed due to the target device (handle %d) is not properly initialized.\n", handle);
+
+  return pcie_dev->hostchannel_ack_buffer(send_size, channel, status);
+}
+
+#ifdef DLA_MMD
+
+// Pause the device and save PCIe state; 0 on success, -1 on failure.
+AOCL_MMD_CALL int aocl_mmd_save_pcie(int handle)
+{
+  auto ret = DeviceMapManager::get_pcie_device(handle)->pause_and_save_pcie();
+  if (ret) {
+    return -1;
+  }
+  return 0;
+}
+// Restore previously saved PCIe state and resume; 0 on success, -1 on failure.
+AOCL_MMD_CALL int aocl_mmd_restore_pcie(int handle)
+{
+  auto ret = DeviceMapManager::get_pcie_device(handle)->restore_and_resume_pcie();
+  if (ret) {
+    return -1;
+  }
+  return 0;
+}
+// Reprogram the device given the sof file
name
+int AOCL_MMD_CALL aocl_mmd_program_sof(int handle, const char *sof_filename, const bool skipSaveRestore) {
+  if (DeviceMapManager::get_pcie_device(handle)->reprogram_sof(sof_filename, skipSaveRestore))
+  {
+    return -1;
+  }
+  return 0;
+}
+#else
+// Reprogram the device based on the program mode
+int AOCL_MMD_CALL aocl_mmd_program(int handle, void *data, size_t data_size, aocl_mmd_program_mode_t program_mode) {
+  // assuming an ELF-formatted blob.
+  if (!blob_has_elf_signature(data, data_size)) {
+    ACL_PCIE_DEBUG_MSG("ad hoc fpga bin\n");
+    return -1;
+  }
+
+  // program the device based on the certain mode
+  if (program_mode & AOCL_MMD_PROGRAM_PRESERVE_GLOBAL_MEM) {
+    // Partial reconfiguration path: device memory survives, same handle is returned.
+    if (DeviceMapManager::get_pcie_device(handle)->reprogram(data, data_size, ACL_PCIE_PROGRAM_PR)) return -1;
+    return handle;
+  } else {
+    // Full JTAG reprogram: the old handle is discarded and a new one is
+    // returned by re-opening the device.
+    if (DeviceMapManager::get_pcie_device(handle)->reprogram(data, data_size, ACL_PCIE_PROGRAM_JTAG)) return -1;
+    // Re-open the device to reinitialize hardware
+    const std::string device_name = DeviceMapManager::get_pcie_device_info(handle).first;
+    DeviceMapManager::discard_pcie_device_handle(handle);
+
+    return aocl_mmd_open(device_name.c_str());
+  }
+}
+#endif
+// Shared memory allocator
+AOCL_MMD_CALL void *aocl_mmd_shared_mem_alloc(int handle, size_t size, unsigned long long *device_ptr_out) {
+  return DeviceMapManager::get_pcie_device(handle)->shared_mem_alloc(size, device_ptr_out);
+}
+
+// Shared memory de-allocator
+AOCL_MMD_CALL void aocl_mmd_shared_mem_free(int handle, void *host_ptr, size_t size) {
+  DeviceMapManager::get_pcie_device(handle)->shared_mem_free(host_ptr, size);
+}
+
+#ifndef DLA_MMD
+// This function checks if the input data has an ELF-formatted blob.
+// Return true when it does.
+// NOTE(review): requires data_size > 4 (at least 5 bytes); a buffer holding
+// exactly the 4-byte magic is rejected — presumably fine since a real ELF
+// blob is always larger, but confirm if exact-size inputs are possible.
+static bool blob_has_elf_signature(void *data, size_t data_size) {
+  bool result = false;
+  if (data && data_size > 4) {
+    unsigned char *cdata = (unsigned char *)data;
+    const unsigned char elf_signature[4] = {0177, 'E', 'L', 'F'};  // Little endian
+    result = (cdata[0] == elf_signature[0]) && (cdata[1] == elf_signature[1]) && (cdata[2] == elf_signature[2]) &&
+             (cdata[3] == elf_signature[3]);
+  }
+  return result;
+}
+#endif
+
+// Return a positive number when single device open. Otherwise, return -1
+AOCL_MMD_CALL int get_open_handle() {
+  if (DeviceMapManager::empty() || DeviceMapManager::get_device_map().size() != 1) {
+    return -1;
+  }
+  return DeviceMapManager::get_device_map().begin()->first;
+}
+
+AOCL_MMD_CALL void *aocl_mmd_host_alloc(int *handles,
+                                        size_t num_devices,
+                                        size_t size,
+                                        size_t alignment,
+                                        aocl_mmd_mem_properties_t *properties,
+                                        int *error) {
+  // Not supported on this BSP
+  return NULL;
+}
+
+AOCL_MMD_CALL int aocl_mmd_free(void *mem) {
+  // Not supported on this BSP
+  return 0;
+}
+
+AOCL_MMD_CALL void *aocl_mmd_device_alloc(
+    int handle, size_t size, size_t alignment, aocl_mmd_mem_properties_t *properties, int *error) {
+  // Not supported on this BSP
+  return NULL;
+}
+
+AOCL_MMD_CALL void *aocl_mmd_shared_alloc(
+    int handle, size_t size, size_t alignment, aocl_mmd_mem_properties_t *properties, int *error) {
+  // Not supported on this BSP
+  return NULL;
+}
+
+AOCL_MMD_CALL int aocl_mmd_shared_migrate(int handle, void *shared_ptr, size_t size, aocl_mmd_migrate_t destination) {
+  // Not supported on this BSP
+  return 0;
+}
+
+#ifdef DLA_MMD
+// Query functions to get board-specific values
+AOCL_MMD_CALL int dla_mmd_get_max_num_instances() { return 4; }
+AOCL_MMD_CALL uint64_t dla_mmd_get_ddr_size_per_instance() { return 1ULL << 32; }
+AOCL_MMD_CALL double dla_mmd_get_ddr_clock_freq() { return 333.333333; }  // MHz
+
+// Helper functions for the wrapper functions around CSR and DDR
+uint64_t dla_get_raw_csr_address(int instance, uint64_t
addr) { return 0x38000 + (0x1000 * instance) + addr; }
+// NOTE(review): DDR instances are spaced (1ULL << 33) = 8 GB apart while
+// dla_mmd_get_ddr_size_per_instance() reports 1ULL << 32 = 4 GB — presumably
+// an intentional sparse mapping; confirm against the board's address map.
+uint64_t dla_get_raw_ddr_address(int instance, uint64_t addr) { return (1ULL << 33) * instance + addr; }
+
+// Wrappers around CSR and DDR reads and writes to abstract away board-specific offsets
+AOCL_MMD_CALL int dla_mmd_csr_write(int handle, int instance, uint64_t addr, const uint32_t *data) {
+  return aocl_mmd_write(
+      handle, NULL, sizeof(uint32_t), data, ACL_MMD_KERNEL_HANDLE, dla_get_raw_csr_address(instance, addr));
+}
+AOCL_MMD_CALL int dla_mmd_csr_read(int handle, int instance, uint64_t addr, uint32_t *data) {
+  return aocl_mmd_read(
+      handle, NULL, sizeof(uint32_t), data, ACL_MMD_KERNEL_HANDLE, dla_get_raw_csr_address(instance, addr));
+}
+AOCL_MMD_CALL int dla_mmd_ddr_write(int handle, int instance, uint64_t addr, uint64_t length, const void *data) {
+  return aocl_mmd_write(handle, NULL, length, data, ACL_MMD_MEMORY_HANDLE, dla_get_raw_ddr_address(instance, addr));
+}
+AOCL_MMD_CALL int dla_mmd_ddr_read(int handle, int instance, uint64_t addr, uint64_t length, void *data) {
+  return aocl_mmd_read(handle, NULL, length, data, ACL_MMD_MEMORY_HANDLE, dla_get_raw_ddr_address(instance, addr));
+}
+
+// Get the PLL clock frequency in MHz, returns a negative value if there is an error
+// Method: start a hardware counter on clk_dla, sleep ~10 ms, stop it, then
+// divide the counted ticks by the wall time measured on the host's
+// high-resolution clock.
+AOCL_MMD_CALL double dla_mmd_get_coredla_clock_freq(int handle) {
+  constexpr uint64_t hw_timer_address = 0x37000;
+  const uint32_t start_bit = 1;
+  const uint32_t stop_bit = 2;
+
+  // Send the start command to the hardware counter
+  std::chrono::high_resolution_clock::time_point time_before = std::chrono::high_resolution_clock::now();
+  int status = aocl_mmd_write(handle, NULL, sizeof(uint32_t), &start_bit, ACL_MMD_KERNEL_HANDLE, hw_timer_address);
+  assert(status == 0);
+
+  // Unlikely to sleep for exactly 10 milliseconds, but it doesn't matter since we use a high resolution clock to
+  // determine the amount of time between the start and stop commands for the hardware counter
+  std::this_thread::sleep_for(std::chrono::milliseconds(10));
+
+  // Send the stop command to the hardware counter
+  std::chrono::high_resolution_clock::time_point time_after = std::chrono::high_resolution_clock::now();
+  status = aocl_mmd_write(handle, NULL, sizeof(uint32_t), &stop_bit, ACL_MMD_KERNEL_HANDLE, hw_timer_address);
+  assert(status == 0);
+
+  // Read back the value of the counter
+  uint32_t counter = 0;
+  status = aocl_mmd_read(handle, NULL, sizeof(uint32_t), &counter, ACL_MMD_KERNEL_HANDLE, hw_timer_address);
+  assert(status == 0);
+
+  // Calculate the clock frequency of the counter, which is running on clk_dla
+  double elapsed_seconds = std::chrono::duration_cast<std::chrono::duration<double>>(time_after - time_before).count();
+  return 1.0e-6 * counter / elapsed_seconds;  // 1.0e-6 is to convert to MHz
+}
+
+#endif
diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie.h
new file mode 100644
index 0000000..cfba6a3
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie.h
@@ -0,0 +1,177 @@
+#ifndef ACL_PCIE_H
+#define ACL_PCIE_H
+
+/* (c) 1992-2021 Intel Corporation. */
+/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */
+/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */
+/* and/or other countries. Other marks and brands may be claimed as the property */
+/* of others.
See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. */ + +/* ===- acl_pcie.h --------------------------------------------------- C++ -*-=== */ +/* */ +/* Intel(R) OpenCL MMD Driver */ +/* */ +/* ===-------------------------------------------------------------------------=== */ +/* */ +/* This file defines macros and types that are used inside the MMD driver */ +/* */ +/* ===-------------------------------------------------------------------------=== */ + +#ifndef ACL_PCIE_EXPORT +#define ACL_PCIE_EXPORT __declspec(dllimport) +#endif + +#include <assert.h> +#include <stddef.h> +#include <stdio.h> +#ifdef DLA_MMD +#include <cstdint> +#else +#include <CL/cl_platform.h> +#endif +#include "aocl_mmd.h" +#include "hw_pcie_constants.h" + +#define MMD_VERSION AOCL_MMD_VERSION_STRING + +#ifdef DLA_MMD +#include "version.h" +#else +#include <version.h> +#endif + +#define KERNEL_DRIVER_VERSION_EXPECTED ACL_DRIVER_VERSION + +#if defined(_WIN32) || defined(_WIN64) +// Need DWORD, UINT32, etc. +// But windows.h spits out a lot of spurious warnings. 
+#pragma warning(push) +#pragma warning(disable : 4668) +#include <windows.h> +#pragma warning(pop) + +// OPAE header files +#include <initguid.h> +#include <opae/fpga.h> +#include "fpga_cmd_guids.h" + +#define INVALID_DEVICE (NULL) + +// define for the format string for DWORD type +#define DWORD_FMT_U "%lu" +#define DWORD_FMT_X "%lx" +#define DWORD_FMT_4X "%04lX" + +// define for the format string for size_t type +#ifdef _WIN64 +#define SIZE_FMT_U "%zu" +#define SIZE_FMT_X "%zx" +#else +#define SIZE_FMT_U "%Iu" +#define SIZE_FMT_X "%Ix" +#endif + +typedef ULONG64 KPTR; +typedef UINT64 DMA_ADDR; +#endif // WINDOWS + +#if defined(LINUX) +typedef uintptr_t KPTR; +typedef int fpga_handle; +typedef unsigned int fpga_result; +#define FPGA_OK 0 + +typedef unsigned int DWORD; +typedef unsigned long long QWORD; +typedef char INT8; +typedef unsigned char UINT8; +typedef int16_t INT16; +typedef uint16_t UINT16; +typedef int INT32; +typedef unsigned int UINT32; +typedef long long INT64; +typedef unsigned long long UINT64; + +#define INVALID_HANDLE_VALUE ((int)(-1)) + +// Linux driver-specific exports +#include "pcie_linux_driver_exports.h" + +#define INVALID_DEVICE (-1) +#define WD_STATUS_SUCCESS 0 + +// define for the format string for DWORD type +#define DWORD_FMT_U "%u" +#define DWORD_FMT_X "%x" +#define DWORD_FMT_4X "%04X" + +// define for the format string for size_t type +#define SIZE_FMT_U "%zu" +#define SIZE_FMT_X "%zx" + +#endif // LINUX + +#define MAX_NAME_SIZE (1204) + +typedef enum { + AOCL_MMD_KERNEL = ACL_MMD_KERNEL_HANDLE, // Control interface into kernel interface + AOCL_MMD_MEMORY = ACL_MMD_MEMORY_HANDLE, // Data interface to device memory + AOCL_MMD_PLL = ACL_MMD_PLL_HANDLE, // Interface for reconfigurable PLL + AOCL_MMD_HOSTCH = ACL_MMD_HOSTCH_HANDLE +} aocl_mmd_interface_t; + +// Describes the properties of key components in a standard ACL device +#define PCIE_INFO_STR_LEN 1024 +#define PCIE_SLOT_INFO_STR_LEN 128 + +struct ACL_PCIE_DEVICE_DESCRIPTION { + 
DWORD vendor_id; + DWORD device_id; + char pcie_slot_info_str[PCIE_SLOT_INFO_STR_LEN]; + char pcie_info_str[PCIE_INFO_STR_LEN]; + bool interrupt_valid; + UINT32 interrupt_data; + UINT64 interrupt_addr; +}; + +#define ACL_PCIE_ASSERT(COND, ...) \ + do { \ + if (!(COND)) { \ + printf("\nMMD FATAL: %s:%d: ", __FILE__, __LINE__); \ + printf(__VA_ARGS__); \ + fflush(stdout); \ + assert(0); \ + } \ + } while (0) + +#define ACL_PCIE_ERROR_IF(COND, NEXT, ...) \ + do { \ + if (COND) { \ + printf("\nMMD ERROR: " __VA_ARGS__); \ + fflush(stdout); \ + NEXT; \ + } \ + } while (0) + +#define ACL_PCIE_INFO(...) \ + do { \ + printf("MMD INFO : " __VA_ARGS__); \ + fflush(stdout); \ + } while (0) + +// Define the flag of program +#define ACL_PCIE_PROGRAM_PR 1 +#define ACL_PCIE_PROGRAM_JTAG 0 + +#endif // ACL_PCIE_H diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_config.cpp b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_config.cpp new file mode 100644 index 0000000..03c76dd --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_config.cpp @@ -0,0 +1,1049 @@ +// (c) 1992-2021 Intel Corporation. +// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words +// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. +// and/or other countries. Other marks and brands may be claimed as the property +// of others. 
See Trademarks on intel.com for full list of Intel trademarks or +// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) +// Your use of Intel Corporation's design tools, logic functions and other +// software and tools, and its AMPP partner logic functions, and any output +// files any of the foregoing (including device programming or simulation +// files), and any associated documentation or information are expressly subject +// to the terms and conditions of the Altera Program License Subscription +// Agreement, Intel MegaCore Function License Agreement, or other applicable +// license agreement, including, without limitation, that your use is for the +// sole purpose of programming logic devices manufactured by Intel and sold by +// Intel or its authorized distributors. Please refer to the applicable +// agreement for further details. + +/* ===- acl_pcie_config.cpp ------------------------------------------ C++ -*-=== */ +/* */ +/* Intel(R) OpenCL MMD Driver */ +/* */ +/* ===-------------------------------------------------------------------------=== */ +/* */ +/* This file implements the class to handle functions that program the FPGA. */ +/* The declaration of the class lives in the acl_pcie_config.h. 
*/ +/* */ +/* ===-------------------------------------------------------------------------=== */ + +// common and its own header files +#include "acl_pcie_config.h" +#include "acl_pcie.h" + +// other header files inside MMD driver +#include "acl_pcie_debug.h" +#if defined(WINDOWS) +#include "acl_pcie_dma_windows.h" +#endif // WINDOWS + +// other standard header files +#include <stdlib.h> +#include <string.h> +#include <iostream> +#include <sstream> +#if defined(WINDOWS) +#include <process.h> +#endif // WINDOWS + +#if defined(LINUX) +#include <unistd.h> +#endif // LINUX + +#if defined(WINDOWS) +#define FREEZE_STATUS_OFFSET 0 +#define FREEZE_CTRL_OFFSET 4 +#define FREEZE_VERSION_OFFSET 12 +#define FREEZE_BRIDGE_SUPPORTED_VERSION 0xad000003 + +#define FREEZE_REQ 1 +#define RESET_REQ 2 +#define UNFREEZE_REQ 4 + +#define FREEZE_REQ_DONE 1 +#define UNFREEZE_REQ_DONE 2 + +#define ALT_PR_DATA_OFST 0x00 +#define ALT_PR_CSR_OFST 0x04 +#define ALT_PR_VER_OFST 0x08 + +#define ALT_PR_CSR_PR_START 1 +#define ALT_PR_CSR_STATUS_SFT 1 +#define ALT_PR_CSR_STATUS_MSK (7 << ALT_PR_CSR_STATUS_SFT) +#define ALT_PR_CSR_STATUS_NRESET (0 << ALT_PR_CSR_STATUS_SFT) +#define ALT_PR_CSR_STATUS_BUSY (1 << ALT_PR_CSR_STATUS_SFT) +#define ALT_PR_CSR_STATUS_PR_IN_PROG (2 << ALT_PR_CSR_STATUS_SFT) +#define ALT_PR_CSR_STATUS_PR_SUCCESS (3 << ALT_PR_CSR_STATUS_SFT) +#define ALT_PR_CSR_STATUS_PR_ERR (4 << ALT_PR_CSR_STATUS_SFT) + +#define ACL_DMA_PR_ALIGNMENT_BYTES 4096 + +#define PLL_OFFSET_VERSION_ID 0x000 +#define PLL_OFFSET_ROM 0x400 +#define PLL_OFFSET_RECONFIG_CTRL_S10 0x800 +#define PLL_OFFSET_COUNTER 0x100 +#define PLL_OFFSET_RESET 0x110 +#define PLL_OFFSET_LOCK 0x120 + +#define PLL_M_HIGH_REG_S10 0x104 +#define PLL_M_LOW_REG_S10 0x107 +#define PLL_M_BYPASS_ENABLE_REG_S10 0x105 +#define PLL_M_EVEN_DUTY_ENABLE_REG_S10 0x106 + +#define PLL_N_HIGH_REG_S10 0x100 +#define PLL_N_LOW_REG_S10 0x102 +#define PLL_N_BYPASS_ENABLE_REG_S10 0x101 +#define PLL_N_EVEN_DUTY_ENABLE_REG_S10 0x101 + +#define 
PLL_C0_HIGH_REG_S10 0x11B
+#define PLL_C0_LOW_REG_S10 0x11E
+#define PLL_C0_BYPASS_ENABLE_REG_S10 0x11C
+#define PLL_C0_EVEN_DUTY_ENABLE_REG_S10 0x11D
+
+#define PLL_C1_HIGH_REG_S10 0x11F
+#define PLL_C1_LOW_REG_S10 0x122
+#define PLL_C1_BYPASS_ENABLE_REG_S10 0x120
+#define PLL_C1_EVEN_DUTY_ENABLE_REG_S10 0x121
+
+#define PLL_LF_REG_S10 0x10A
+
+#define PLL_CP1_REG_S10 0x101
+#define PLL_CP2_REG_S10 0x10D
+
+#define PLL_REQUEST_CAL_REG_S10 0x149
+#define PLL_ENABLE_CAL_REG_S10 0x14A
+#endif // WINDOWS
+
+#ifndef DLA_MMD
+#include "acl_check_sys_cmd.h"
+#include "pkg_editor.h"
+#endif
+
+// MAX size of line read from pipe-ing the output of find_jtag_cable.tcl to MMD
+#define READ_SIZE 1024
+// MAX size of command passed to system for invoking find_jtag_cable.tcl from MMD
+#define SYSTEM_CMD_SIZE 4 * 1024
+
+// Function to install the signal handler for Ctrl-C
+// Implemented inside acl_pcie.cpp
+extern int install_ctrl_c_handler(int ingore_sig);
+
+// Stores the device handle plus the IO/DMA collaborators; on Windows it also
+// queries and caches the device's supported OPAE command GUIDs.
+ACL_PCIE_CONFIG::ACL_PCIE_CONFIG(fpga_handle Handle, ACL_PCIE_MM_IO_MGR *io, ACL_PCIE_DEVICE *pcie, ACL_PCIE_DMA *dma) {
+  m_handle = Handle;
+  m_io = io;
+  m_pcie = pcie;
+  m_dma = dma;
+
+#if defined(WINDOWS)
+  fpga_result result = FPGA_OK;
+  UINT32 NumCmds = 0;
+  FpgaCmd = NULL;
+
+  // Get the number of supported commands
+  result = fpgaGetSupportedCommands(Handle, NULL, &NumCmds);
+  ACL_PCIE_ERROR_IF(result != FPGA_OK, return, "fpgaGetSupportedCommands failed in ACL_PCIE_CONFIG().\n");
+
+  // Allocate memory for the guid array based on NumCmds
+  FpgaCmd = (fpga_guid *)malloc(NumCmds * sizeof(fpga_guid));
+
+  if (FpgaCmd == NULL) {
+    throw std::bad_alloc();
+  }
+
+  // NOTE(review): unreachable — the throw above already covers FpgaCmd == NULL.
+  ACL_PCIE_ERROR_IF(FpgaCmd == NULL, return, "malloc failed in ACL_PCIE_CONFIG().\n");
+
+  // Populate the guid array
+  result = fpgaGetSupportedCommands(Handle, FpgaCmd, &NumCmds);
+  ACL_PCIE_ERROR_IF(result != FPGA_OK, return, "fpgaGetSupportedCommands failed in ACL_PCIE_CONFIG().\n");
+#endif // WINDOWS
+
+  return;
+}
+
+ACL_PCIE_CONFIG::~ACL_PCIE_CONFIG() {
+#if defined(WINDOWS)
+  // Free the guid array
+  if (FpgaCmd) {
+    free(FpgaCmd);
+    FpgaCmd = NULL;
+  }
+#endif
+}
+
+// Change the kernel region using PR only via PCIe, using an in-memory image of the core.rbf
+// For Linux, the actual implementation of PR is inside the kernel mode driver.
+// Return 0 on success.
+int ACL_PCIE_CONFIG::program_core_with_PR_file_a10(char *core_bitstream, size_t core_rbf_len) {
+  int pr_result = 1;  // set to default - failure
+
+  ACL_PCIE_ERROR_IF(core_bitstream == NULL, return 1, "core_bitstream is an NULL pointer.\n");
+  ACL_PCIE_ERROR_IF(core_rbf_len < 1000000, return 1, "size of core rbf file is suspiciously small.\n");
+
+#if defined(WINDOWS)
+  int i;
+  uint32_t version;
+  UINT32 to_send, status;
+  UINT32 *data;
+  fpga_result result;
+
+  /* Get version ID */
+  result = fpgaReadMMIO32(m_handle, ACL_VERSIONID_BAR, ACL_VERSIONID_OFFSET, &version);
+  ACL_PCIE_DEBUG_MSG(":: VERSION_ID is 0x%08X\n", (int)version);
+
+  /* Check if PR is supported */
+  if (version < (unsigned int)ACL_PR_PIO_VERSIONID) {
+    ACL_PCIE_DEBUG_MSG(":: Currently programmed image does not support PR\n");
+    pr_result = 1;
+    return pr_result;
+  }
+
+  ACL_PCIE_DEBUG_MSG(":: OK to proceed with PR!\n");
+
+  // Check the PR IP status register (offset +4) before starting.
+  MemoryBarrier();
+  result = fpgaReadMMIO32(m_handle, ACL_PRCONTROLLER_BAR, ACL_PRCONTROLLER_OFFSET + 4, &status);
+  ACL_PCIE_DEBUG_MSG(":: Reading 0x%08X from PR IP status register\n", (int)status);
+  ACL_PCIE_ASSERT(result == FPGA_OK, "fpgaReadMMIO32 failed.\n");
+
+  to_send = 0x00000001;
+  ACL_PCIE_DEBUG_MSG(":: Writing 0x%08X to PR IP status register\n", (int)to_send);
+  result = fpgaWriteMMIO32(m_handle, ACL_PRCONTROLLER_BAR, ACL_PRCONTROLLER_OFFSET + 4, to_send);
+  ACL_PCIE_ASSERT(result == FPGA_OK, "fpgaWriteMMIO32 failed.\n");
+
+  MemoryBarrier();
+  result = fpgaReadMMIO32(m_handle, ACL_PRCONTROLLER_BAR, ACL_PRCONTROLLER_OFFSET + 4, &status);
+  ACL_PCIE_ASSERT(result == FPGA_OK, "fpgaReadMMIO32 failed.\n");
+  ACL_PCIE_DEBUG_MSG(":: Reading 0x%08X from PR IP status register\n", (int)status);
+
+  if ((status != 0x10) && (status != 0x0)) {
+    ACL_PCIE_ERROR_IF(1, return 1, ":: PR IP not in an usable state.\n");
+  }
+
+  // Stream the bitstream into the PR IP data register one 32-bit word at a time.
+  data = (UINT32 *)core_bitstream;
+  ACL_PCIE_DEBUG_MSG(":: Writing %d bytes of bitstream file to PR IP at BAR %d, OFFSET 0x%08X\n",
+                     (int)core_rbf_len,
+                     (int)ACL_PRCONTROLLER_BAR,
+                     (int)ACL_PRCONTROLLER_OFFSET);
+  for (i = 0; i < (int)core_rbf_len / 4; i++) {
+    result = fpgaWriteMMIO32(m_handle, ACL_PRCONTROLLER_BAR, ACL_PRCONTROLLER_OFFSET, data[i]);
+    ACL_PCIE_ASSERT(result == FPGA_OK, "fpgaWriteMMIO32 failed.\n");
+  }
+
+  result = fpgaReadMMIO32(m_handle, ACL_PRCONTROLLER_BAR, ACL_PRCONTROLLER_OFFSET, &status);
+  ACL_PCIE_DEBUG_MSG(":: Reading 0x%08X from PR IP data register\n", (int)status);
+  ACL_PCIE_ASSERT(result == FPGA_OK, "fpgaReadMMIO32 failed.\n");
+
+  // Final status read: 0x14 indicates PR success.
+  MemoryBarrier();
+  result = fpgaReadMMIO32(m_handle, ACL_PRCONTROLLER_BAR, ACL_PRCONTROLLER_OFFSET + 4, &status);
+  ACL_PCIE_DEBUG_MSG(":: Reading 0x%08X from PR IP status register\n", (int)status);
+  ACL_PCIE_ASSERT(result == FPGA_OK, "fpgaReadMMIO32 failed.\n");
+
+  if (status == 0x14) {
+    ACL_PCIE_DEBUG_MSG(":: PR done!: 0x%08X\n", (int)status);
+    pr_result = 0;
+  } else {
+    ACL_PCIE_DEBUG_MSG(":: PR error!: 0x%08X\n", (int)status);
+    pr_result = 1;
+  }
+
+  ACL_PCIE_DEBUG_MSG(":: PR completed!\n");
+
+#endif // WINDOWS
+#if defined(LINUX)
+  // Linux delegates PR to the kernel driver via a read() with an acl_cmd payload.
+  struct acl_cmd cmd_pr = {ACLPCI_CMD_BAR, ACLPCI_CMD_DO_PR, NULL, NULL};
+
+  cmd_pr.user_addr = core_bitstream;
+  cmd_pr.size = core_rbf_len;
+
+  pr_result = read(m_handle, &cmd_pr, sizeof(cmd_pr));
+
+#endif // LINUX
+
+  return pr_result;
+}
+
+// Change the kernel region using PR only via PCIe, using an in-memory image of the core.rbf
+// For Linux, the actual implementation of PR is inside the kernel mode driver.
+// Return 0 on success.
// Stratix 10 flavor of PR: freeze the PR region, push the bitstream to the
// Altera PR IP (legacy PIO for older images, DMA for newer ones), then
// dynamically reconfigure the kernel-clock IOPLL from the 8-element
// pll_config_str and unfreeze/reset the region.
// @param core_bitstream  in-memory core.rbf image (non-NULL, >= 1 MB)
// @param core_rbf_len    image size in bytes
// @param pll_config_str  whitespace-separated list of exactly 8 integers:
//                        freq_khz, M, N, C0, C1, LF, CP, RC
int ACL_PCIE_CONFIG::program_core_with_PR_file_s10(char *core_bitstream, size_t core_rbf_len, char *pll_config_str) {
  int pr_result = 1;  // set to default - failure
#if defined(WINDOWS)
  uint32_t pll_config_array[8] = {0};
#else
  int pll_config_array[8] = {0};
#endif // WINDOWS
  std::stringstream converter(pll_config_str);

  ACL_PCIE_ERROR_IF(core_bitstream == NULL, return 1, "core_bitstream is an NULL pointer.\n");
  ACL_PCIE_ERROR_IF(core_rbf_len < 1000000, return 1, "size of core rbf file is suspiciously small.\n");

  /* parse PLL string */
  converter >> pll_config_array[0] >> pll_config_array[1] >> pll_config_array[2] >> pll_config_array[3] >>
      pll_config_array[4] >> pll_config_array[5] >> pll_config_array[6] >> pll_config_array[7];
  if (converter.fail() == true) {
    ACL_PCIE_ERROR_IF(1, return 1, "PLL configuration string requires 8 integer elements\n");
  };

#if defined(WINDOWS)
  int i, j, k, result, count, chunk_num, frames;
  size_t offset;
  uint32_t to_send, status;
  uint32_t version;
  uint32_t *data;
  uint32_t pll_freq_khz, pll_m, pll_n, pll_c0, pll_c1, pll_lf, pll_cp, pll_rc;
  uint32_t pll_m_high, pll_m_low, pll_m_bypass_enable, pll_m_even_duty_enable;
  uint32_t pll_n_high, pll_n_low, pll_n_bypass_enable, pll_n_even_duty_enable;
  uint32_t pll_c0_high, pll_c0_low, pll_c0_bypass_enable, pll_c0_even_duty_enable;
  uint32_t pll_c1_high, pll_c1_low, pll_c1_bypass_enable, pll_c1_even_duty_enable;
  uint32_t pll_cp1, pll_cp2;
  uint32_t pll_byte;

  /* Get version ID */
  result = fpgaReadMMIO32(m_handle, ACL_VERSIONID_BAR, ACL_VERSIONID_OFFSET, &version);
  ACL_PCIE_DEBUG_MSG(":: VERSION_ID is 0x%08X\n", (int)version);

  /* Check if PR is supported */
  if (version < (unsigned int)ACL_PR_PIO_VERSIONID) {
    ACL_PCIE_DEBUG_MSG(":: Currently programmed image does not support PR\n");
    pr_result = 1;
    return pr_result;
  }

  ACL_PCIE_DEBUG_MSG(":: OK to proceed with PR!\n");

  /* freeze bridge */
  // Isolate the PR region from the static region before reprogramming it.
  MemoryBarrier();
  result =
      fpgaReadMMIO32(m_handle, ACL_PRREGIONFREEZE_BAR, ACL_PRREGIONFREEZE_OFFSET + FREEZE_VERSION_OFFSET, &status);
  ACL_PCIE_DEBUG_MSG(":: Freeze bridge version is 0x%08X\n", (int)status);

  result = fpgaReadMMIO32(m_handle, ACL_PRREGIONFREEZE_BAR, ACL_PRREGIONFREEZE_OFFSET + FREEZE_STATUS_OFFSET, &status);
  ACL_PCIE_DEBUG_MSG(":: Freeze bridge status is 0x%08X\n", (int)status);

  ACL_PCIE_DEBUG_MSG(":: Asserting region freeze\n");
  fpgaWriteMMIO32(m_handle, ACL_PRREGIONFREEZE_BAR, ACL_PRREGIONFREEZE_OFFSET + FREEZE_CTRL_OFFSET, FREEZE_REQ);
  Sleep(1);

  result = fpgaReadMMIO32(m_handle, ACL_PRREGIONFREEZE_BAR, ACL_PRREGIONFREEZE_OFFSET + FREEZE_STATUS_OFFSET, &status);
  ACL_PCIE_DEBUG_MSG(":: Freeze bridge status is 0x%08X\n", (int)status);

  ACL_PCIE_DEBUG_MSG(":: PR Beginning\n");

  /* PR IP write initialisation */
  MemoryBarrier();
  result = fpgaReadMMIO32(m_handle, ACL_PRCONTROLLER_BAR, ACL_PRCONTROLLER_OFFSET + ALT_PR_VER_OFST, &status);
  ACL_PCIE_DEBUG_MSG(":: ALT_PR_VER_OFST version is 0x%08X\n", (int)status);

  result = fpgaReadMMIO32(m_handle, ACL_PRCONTROLLER_BAR, ACL_PRCONTROLLER_OFFSET + ALT_PR_CSR_OFST, &status);
  ACL_PCIE_DEBUG_MSG(":: ALT_PR_CSR_OFST status is 0x%08X\n", (int)status);

  to_send = ALT_PR_CSR_PR_START;
  ACL_PCIE_DEBUG_MSG(":: Starting PR by writing 0x%08X to ALT_PR_CSR_OFST\n", (int)to_send);
  fpgaWriteMMIO32(m_handle, ACL_PRCONTROLLER_BAR, ACL_PRCONTROLLER_OFFSET + ALT_PR_CSR_OFST, to_send);

  /* Wait for PR to be in progress */
  // NOTE(review): this poll loop has no timeout — if the IP never reports
  // PR_IN_PROG this spins forever; consider bounding it like the loop below.
  MemoryBarrier();
  result = fpgaReadMMIO32(m_handle, ACL_PRCONTROLLER_BAR, ACL_PRCONTROLLER_OFFSET + ALT_PR_CSR_OFST, &status);
  i = 0;
  while (status != ALT_PR_CSR_STATUS_PR_IN_PROG) {
    Sleep(1);
    i++;
    result = fpgaReadMMIO32(m_handle, ACL_PRCONTROLLER_BAR, ACL_PRCONTROLLER_OFFSET + ALT_PR_CSR_OFST, &status);
  };
  ACL_PCIE_DEBUG_MSG(":: PR IP initialization took %d ms, ALT_PR_CSR_OFST status is 0x%08X\n", i, (int)status);

  // ---------------------------------------------------------------
  // Legacy PR using PIO
  // ---------------------------------------------------------------
  // Images older than the DMA-capable version take the bitstream one
  // 32-bit word at a time, throttled with a 1 ms pause every 4 KB.
  if ((version >= (unsigned int)ACL_PR_PIO_VERSIONID) && (version < (unsigned int)ACL_PR_DMA_VERSIONID)) {
    /* PR IP write bitstream */
    MemoryBarrier();
    data = (UINT32 *)core_bitstream;
    count = (int)core_rbf_len;
    ACL_PCIE_DEBUG_MSG(":: Size of PR RBF is 0x%08X\n", (int)count);

    /* Write out the complete 32-bit chunks */
    /* Wait for a designated amount of time between 4K chunks */
    i = 0;
    j = 0;
    chunk_num = 0;
    while (count >= 4) {
      fpgaWriteMMIO32(m_handle, ACL_PRCONTROLLER_BAR, ACL_PRCONTROLLER_OFFSET + ALT_PR_DATA_OFST, data[i]);
      i++;
      j++;
      count = count - 4;
      if (j >= 1024) {
        chunk_num++;
        j = 0;
        Sleep(1);
      }
    }
    ACL_PCIE_DEBUG_MSG(":: Number of 4K chunks written: %d\n", (int)chunk_num);
    ACL_PCIE_DEBUG_MSG(":: Number of bytes in PR bitstream remaining: %d\n", (int)count);

    /* Write out remaining non 32-bit chunks */
    // NOTE(review): data[i] is read before the switch even when count == 0,
    // i.e. when core_rbf_len is a multiple of 4 this reads one word past the
    // end of the buffer (the value is then discarded by `case 0`). The read
    // should be guarded by `if (count > 0)`.
    to_send = data[i];
    switch (count) {
      case 3:
        to_send = to_send & 0x00ffffff;
        fpgaWriteMMIO32(m_handle, ACL_PRCONTROLLER_BAR, ACL_PRCONTROLLER_OFFSET + ALT_PR_DATA_OFST, to_send);
        break;
      case 2:
        to_send = to_send & 0x0000ffff;
        fpgaWriteMMIO32(m_handle, ACL_PRCONTROLLER_BAR, ACL_PRCONTROLLER_OFFSET + ALT_PR_DATA_OFST, to_send);
        break;
      case 1:
        to_send = to_send & 0x000000ff;
        fpgaWriteMMIO32(m_handle, ACL_PRCONTROLLER_BAR, ACL_PRCONTROLLER_OFFSET + ALT_PR_DATA_OFST, to_send);
        break;
      case 0:
        break;
      default:
        /* This will never happen */
        return 1;
    }
  }

  // ---------------------------------------------------------------
  // PR using DMA
  // ---------------------------------------------------------------
  if (version >= (unsigned int)ACL_PR_DMA_VERSIONID) {
    /* PR IP write bitstream */
    MemoryBarrier();
    ACL_PCIE_DEBUG_MSG(":: Size of PR RBF is 0x%08X, initiating DMA transfer to PR IP\n", (int)core_rbf_len);

    /* Write PR bitstream using DMA */
    // NOTE(review): integer division — any tail of core_rbf_len that is not
    // a whole ACL_DMA_PR_ALIGNMENT_BYTES frame is silently dropped;
    // presumably the rbf is frame-aligned, TODO confirm.
    frames = (int)core_rbf_len / ACL_DMA_PR_ALIGNMENT_BYTES;
    ACL_PCIE_DEBUG_MSG(
        ":: PR bitstream will be sent in %d Byte frames, a total of %d frames\n", ACL_DMA_PR_ALIGNMENT_BYTES, frames);

    // sending in 4kB frames
    for (k = 0; k < frames; k++) {
      offset = (size_t)k * ACL_DMA_PR_ALIGNMENT_BYTES;
      void *host_addr_new = reinterpret_cast<void *>(core_bitstream + offset);
      size_t dev_addr_new = ACL_PCIE_PR_DMA_OFFSET;

      status = (uint32_t)m_dma->read_write(host_addr_new, dev_addr_new, ACL_DMA_PR_ALIGNMENT_BYTES, NULL, false);

      // Busy-wait for the DMA engine to drain before queueing the next frame.
      while (!m_dma->is_idle()) {
        ACL_PCIE_DEBUG_MSG(":: DMA still in progress...\n");
      }
    }
  }

  // Wait for PR complete
  MemoryBarrier();
  result = fpgaReadMMIO32(m_handle, ACL_PRCONTROLLER_BAR, ACL_PRCONTROLLER_OFFSET + ALT_PR_CSR_OFST, &status);
  ACL_PCIE_DEBUG_MSG(":: ALT_PR_CSR_OFST status is 0x%08X\n", (int)status);
  i = 0;
  // wait till we get a PR_SUCCESS, or PR_ERROR, or a 1 second timeout
  // NOTE(review): the comment and the code disagree — 100000 iterations of
  // Sleep(100) is ~2.8 hours, not 1 second; one of the two should change.
  while (status != ALT_PR_CSR_STATUS_PR_SUCCESS && status != ALT_PR_CSR_STATUS_PR_ERR && i < 100000) {
    Sleep(100);
    i++;
    result = fpgaReadMMIO32(m_handle, ACL_PRCONTROLLER_BAR, ACL_PRCONTROLLER_OFFSET + ALT_PR_CSR_OFST, &status);
    ACL_PCIE_DEBUG_MSG(":: ALT_PR_CSR_OFST status is 0x%08X\n", (int)status);
  };

  if (status == ALT_PR_CSR_STATUS_PR_SUCCESS) {
    /* dynamically reconfigure IOPLL for kernel clock */
    /* read kernel clock generation version ID */
    result = fpgaReadMMIO32(
        m_handle, ACL_PCIE_KERNELPLL_RECONFIG_BAR, ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_VERSION_ID, &status);
    ACL_PCIE_DEBUG_MSG(":: Kernel clock generator version ID is 0x%08X\n", (int)status);

    /* extract PLL settings from PLL configuration array */
    pll_freq_khz = pll_config_array[0];
    pll_m = pll_config_array[1];
    pll_n = pll_config_array[2];
    pll_c0 = pll_config_array[3];
    pll_c1 = pll_config_array[4];
    pll_lf = pll_config_array[5];
    pll_cp = pll_config_array[6];
    pll_rc = pll_config_array[7];

    ACL_PCIE_DEBUG_MSG(":: PLL settings are %d %d %d %d %d %d %d %d\n",
                       pll_freq_khz,
                       pll_m,
                       pll_n,
                       pll_c0,
                       pll_c1,
                       pll_lf,
                       pll_cp,
                       pll_rc);

    // Measure kernel clock frequency
    // Writing 0 to the counter register starts a measurement; the value read
    // back after the 1 s sleep is the measured frequency in Hz.
    fpgaWriteMMIO32(
        m_handle, ACL_PCIE_KERNELPLL_RECONFIG_BAR, ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_COUNTER, 0);
    Sleep(1000);
    result = fpgaReadMMIO32(
        m_handle, ACL_PCIE_KERNELPLL_RECONFIG_BAR, ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_COUNTER, &status);
    ACL_PCIE_DEBUG_MSG(":: Before reconfig, kernel clock set to %d Hz\n", (int)status);

    // extract all PLL parameters
    // Each packed config word holds: [17] even-duty enable, [16] bypass
    // enable, [15:8] high count, [7:0] low count.
    pll_m_high = (pll_m >> 8) & 0xFF;
    pll_m_low = pll_m & 0xFF;
    pll_m_bypass_enable = (pll_m >> 16) & 0x01;
    pll_m_even_duty_enable = (pll_m >> 17) & 0x01;

    pll_n_high = (pll_n >> 8) & 0xFF;
    pll_n_low = pll_n & 0xFF;
    pll_n_bypass_enable = (pll_n >> 16) & 0x01;
    pll_n_even_duty_enable = (pll_n >> 17) & 0x01;

    pll_c0_high = (pll_c0 >> 8) & 0xFF;
    pll_c0_low = pll_c0 & 0xFF;
    pll_c0_bypass_enable = (pll_c0 >> 16) & 0x01;
    pll_c0_even_duty_enable = (pll_c0 >> 17) & 0x01;

    pll_c1_high = (pll_c1 >> 8) & 0xFF;
    pll_c1_low = pll_c1 & 0xFF;
    pll_c1_bypass_enable = (pll_c1 >> 16) & 0x01;
    pll_c1_even_duty_enable = (pll_c1 >> 17) & 0x01;

    pll_lf = (pll_lf >> 6) & 0xFF;

    pll_cp = pll_cp & 0xFF;
    pll_cp1 = pll_cp & 0x07;
    pll_cp2 = (pll_cp >> 3) & 0x07;

    pll_rc = pll_rc & 0x03;

    /* read and write PLL settings */
    // Byte-wide writes (length 1) into the IOPLL reconfig address space;
    // register layout per the S10 IOPLL reconfiguration map.
    to_send = pll_m_high;
    fpgaWriteMmio(m_handle,
                  ACL_PCIE_KERNELPLL_RECONFIG_BAR,
                  ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_RECONFIG_CTRL_S10 + PLL_M_HIGH_REG_S10,
                  &to_send,
                  1);
    to_send = pll_m_low;
    fpgaWriteMmio(m_handle,
                  ACL_PCIE_KERNELPLL_RECONFIG_BAR,
                  ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_RECONFIG_CTRL_S10 + PLL_M_LOW_REG_S10,
                  &to_send,
                  1);
    to_send = pll_m_bypass_enable;
    fpgaWriteMmio(m_handle,
                  ACL_PCIE_KERNELPLL_RECONFIG_BAR,
                  ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_RECONFIG_CTRL_S10 + PLL_M_BYPASS_ENABLE_REG_S10,
                  &to_send,
                  1);
    to_send = (pll_m_even_duty_enable << 7);
    fpgaWriteMmio(m_handle,
                  ACL_PCIE_KERNELPLL_RECONFIG_BAR,
                  ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_RECONFIG_CTRL_S10 + PLL_M_EVEN_DUTY_ENABLE_REG_S10,
                  &to_send,
                  1);

    to_send = pll_n_high;
    fpgaWriteMmio(m_handle,
                  ACL_PCIE_KERNELPLL_RECONFIG_BAR,
                  ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_RECONFIG_CTRL_S10 + PLL_N_HIGH_REG_S10,
                  &to_send,
                  1);
    to_send = pll_n_low;
    fpgaWriteMmio(m_handle,
                  ACL_PCIE_KERNELPLL_RECONFIG_BAR,
                  ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_RECONFIG_CTRL_S10 + PLL_N_LOW_REG_S10,
                  &to_send,
                  1);
    // N bypass register also carries CP[2:0] and the N even-duty bit.
    to_send = (pll_n_even_duty_enable << 7) | (pll_cp1 << 4) | pll_n_bypass_enable;
    fpgaWriteMmio(m_handle,
                  ACL_PCIE_KERNELPLL_RECONFIG_BAR,
                  ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_RECONFIG_CTRL_S10 + PLL_N_BYPASS_ENABLE_REG_S10,
                  &to_send,
                  1);

    to_send = pll_c0_high;
    fpgaWriteMmio(m_handle,
                  ACL_PCIE_KERNELPLL_RECONFIG_BAR,
                  ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_RECONFIG_CTRL_S10 + PLL_C0_HIGH_REG_S10,
                  &to_send,
                  1);
    to_send = pll_c0_low;
    fpgaWriteMmio(m_handle,
                  ACL_PCIE_KERNELPLL_RECONFIG_BAR,
                  ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_RECONFIG_CTRL_S10 + PLL_C0_LOW_REG_S10,
                  &to_send,
                  1);
    to_send = pll_c0_bypass_enable;
    fpgaWriteMmio(m_handle,
                  ACL_PCIE_KERNELPLL_RECONFIG_BAR,
                  ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_RECONFIG_CTRL_S10 + PLL_C0_BYPASS_ENABLE_REG_S10,
                  &to_send,
                  1);
    to_send = (pll_c0_even_duty_enable << 7);
    fpgaWriteMmio(m_handle,
                  ACL_PCIE_KERNELPLL_RECONFIG_BAR,
                  ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_RECONFIG_CTRL_S10 + PLL_C0_EVEN_DUTY_ENABLE_REG_S10,
                  &to_send,
                  1);

    to_send = pll_c1_high;
    fpgaWriteMmio(m_handle,
                  ACL_PCIE_KERNELPLL_RECONFIG_BAR,
                  ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_RECONFIG_CTRL_S10 + PLL_C1_HIGH_REG_S10,
                  &to_send,
                  1);
    to_send = pll_c1_low;
    fpgaWriteMmio(m_handle,
                  ACL_PCIE_KERNELPLL_RECONFIG_BAR,
                  ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_RECONFIG_CTRL_S10 + PLL_C1_LOW_REG_S10,
                  &to_send,
                  1);
    to_send = pll_c1_bypass_enable;
    fpgaWriteMmio(m_handle,
                  ACL_PCIE_KERNELPLL_RECONFIG_BAR,
                  ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_RECONFIG_CTRL_S10 + PLL_C1_BYPASS_ENABLE_REG_S10,
                  &to_send,
                  1);
    to_send = (pll_c1_even_duty_enable << 7);
    fpgaWriteMmio(m_handle,
                  ACL_PCIE_KERNELPLL_RECONFIG_BAR,
                  ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_RECONFIG_CTRL_S10 + PLL_C1_EVEN_DUTY_ENABLE_REG_S10,
                  &to_send,
                  1);

    to_send = (pll_cp2 << 5);
    fpgaWriteMmio(m_handle,
                  ACL_PCIE_KERNELPLL_RECONFIG_BAR,
                  ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_RECONFIG_CTRL_S10 + PLL_CP2_REG_S10,
                  &to_send,
                  1);

    to_send = (pll_lf << 3) | (pll_rc << 1);
    fpgaWriteMmio(m_handle,
                  ACL_PCIE_KERNELPLL_RECONFIG_BAR,
                  ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_RECONFIG_CTRL_S10 + PLL_LF_REG_S10,
                  &to_send,
                  1);

    // start PLL calibration
    /* read/modify/write the request calibration */
    ACL_PCIE_DEBUG_MSG(":: Requesting PLL calibration\n");
    result = fpgaReadMmio(m_handle,
                          ACL_PCIE_KERNELPLL_RECONFIG_BAR,
                          ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_RECONFIG_CTRL_S10 + PLL_REQUEST_CAL_REG_S10,
                          &pll_byte,
                          1);
    to_send = pll_byte | 0x40;  // set the "request calibration" bit
    fpgaWriteMmio(m_handle,
                  ACL_PCIE_KERNELPLL_RECONFIG_BAR,
                  ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_RECONFIG_CTRL_S10 + PLL_REQUEST_CAL_REG_S10,
                  &to_send,
                  1);
    /* write 0x03 to enable calibration interface */
    to_send = 0x03;
    fpgaWriteMmio(m_handle,
                  ACL_PCIE_KERNELPLL_RECONFIG_BAR,
                  ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_RECONFIG_CTRL_S10 + PLL_ENABLE_CAL_REG_S10,
                  &to_send,
                  1);
    ACL_PCIE_DEBUG_MSG(":: PLL calibration done\n");

    // Measure kernel clock frequency
    fpgaWriteMMIO32(
        m_handle, ACL_PCIE_KERNELPLL_RECONFIG_BAR, ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_COUNTER, 0);
    Sleep(1000);
    result = fpgaReadMMIO32(
        m_handle, ACL_PCIE_KERNELPLL_RECONFIG_BAR, ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_COUNTER, &status);
    ACL_PCIE_DEBUG_MSG(":: After reconfig, kernel clock set to %d Hz\n", (int)status);

    /* assert reset */
    // Hold the freshly-programmed region in reset while the bridge unfreezes.
    MemoryBarrier();
    ACL_PCIE_DEBUG_MSG(":: Asserting region reset\n");
    fpgaWriteMMIO32(m_handle, ACL_PRREGIONFREEZE_BAR, ACL_PRREGIONFREEZE_OFFSET + FREEZE_CTRL_OFFSET, RESET_REQ);
    Sleep(10);

    /* unfreeze bridge */
    MemoryBarrier();
    result =
        fpgaReadMMIO32(m_handle, ACL_PRREGIONFREEZE_BAR, ACL_PRREGIONFREEZE_OFFSET + FREEZE_VERSION_OFFSET, &status);
    ACL_PCIE_DEBUG_MSG(":: Freeze bridge version is 0x%08X\n", (int)status);

    result =
        fpgaReadMMIO32(m_handle, ACL_PRREGIONFREEZE_BAR, ACL_PRREGIONFREEZE_OFFSET + FREEZE_STATUS_OFFSET, &status);
    ACL_PCIE_DEBUG_MSG(":: Freeze bridge status is 0x%08X\n", (int)status);

    ACL_PCIE_DEBUG_MSG(":: Removing region freeze\n");
    fpgaWriteMMIO32(m_handle, ACL_PRREGIONFREEZE_BAR, ACL_PRREGIONFREEZE_OFFSET + FREEZE_CTRL_OFFSET, UNFREEZE_REQ);
    Sleep(1);

    ACL_PCIE_DEBUG_MSG(":: Checking freeze bridge status\n");
    result =
        fpgaReadMMIO32(m_handle, ACL_PRREGIONFREEZE_BAR, ACL_PRREGIONFREEZE_OFFSET + FREEZE_STATUS_OFFSET, &status);
    ACL_PCIE_DEBUG_MSG(":: Freeze bridge status is 0x%08X\n", (int)status);

    /* deassert reset */
    MemoryBarrier();
    ACL_PCIE_DEBUG_MSG(":: Deasserting region reset\n");
    fpgaWriteMMIO32(m_handle, ACL_PRREGIONFREEZE_BAR, ACL_PRREGIONFREEZE_OFFSET + FREEZE_CTRL_OFFSET, 0);

    MemoryBarrier();
    result = fpgaReadMMIO32(m_handle, ACL_PRCONTROLLER_BAR, ACL_PRCONTROLLER_OFFSET + ALT_PR_CSR_OFST, &status);
    ACL_PCIE_DEBUG_MSG(":: Reading 0x%08X from PR IP status register\n", (int)status);
    // 0x6 is the final "PR success" CSR state after the region is released.
    if (status == 0x6) {
      ACL_PCIE_DEBUG_MSG(":: PR done! Status is 0x%08X\n", (int)status);
      pr_result = 0;
    } else {
      ACL_PCIE_DEBUG_MSG(":: PR error! Status is 0x%08X\n", (int)status);
      pr_result = 1;
    }
  } else {
    ACL_PCIE_DEBUG_MSG(":: PR error! Status is 0x%08X\n", (int)status);
    pr_result = 1;
  }

  ACL_PCIE_DEBUG_MSG(":: PR completed!\n");

#endif // WINDOWS
#if defined(LINUX)
  // Linux: kernel driver performs PR and PLL reconfig; pass the parsed PLL
  // array through device_addr (repurposed as an extra argument pointer).
  struct acl_cmd cmd_pr = {ACLPCI_CMD_BAR, ACLPCI_CMD_DO_PR, NULL, NULL};

  cmd_pr.user_addr = core_bitstream;
  cmd_pr.size = core_rbf_len;
  cmd_pr.device_addr = pll_config_array;

  pr_result = read(m_handle, &cmd_pr, sizeof(cmd_pr));

#endif // LINUX

  return pr_result;
}

// Windows specific code to disable PCIe advanced error reporting on the
// upstream port.
// No-op in Linux because save_pcie_control_regs() has already disabled
// AER on the upstream port.
// Returns 0 on success
int ACL_PCIE_CONFIG::disable_AER_windows(void) {
  fpga_result result = FPGA_OK;

#if defined(WINDOWS)
  // IOCTL call to disable AER in kernel mode
  result = fpgaProcessDeviceCmd(m_handle, GUID_TO_FPGA_GUID(GUID_PCI_OPENCL_DISABLE_AER), NULL, NULL, 0);
  ACL_PCIE_ERROR_IF(result != FPGA_OK, return -1, "fpgaProcessDeviceCmd failed when disabling AER.\n");
#endif // WINDOWS
  return result;
}

// Windows specific code to enable PCIe advanced error reporting on the
// upstream port.
// No-op in Linux because load_pcie_control_regs() has already enabled
// AER on the upstream port.
// Returns 0 on success
int ACL_PCIE_CONFIG::enable_AER_and_retrain_link_windows(void) {
  fpga_result result = FPGA_OK;

#if defined(WINDOWS)
  // IOCTL call to enable AER and retrain link in kernel mode
  result = fpgaProcessDeviceCmd(m_handle, GUID_TO_FPGA_GUID(GUID_PCI_OPENCL_ENABLE_AER_RETRAIN_LINK), NULL, NULL, 0);
  ACL_PCIE_ERROR_IF(result != FPGA_OK, return -1, "fpgaProcessDeviceCmd failed when enabling AER.\n");
#endif // WINDOWS
  return result;
}

// Program the FPGA using a given SOF file
// Quartus is needed for this, because,
// quartus_pgm is used to program the board through USB blaster
// For Linux, when the kernel driver is asked to save/load_pcie_control_regs(),
// it will also disable/enable the aer on the upstream, so no need to
// implement those here.
// NOTE: This function only works with single device machines - if there
// are multiple cards (and multiple USB-blasters) in the system, it doesn't
// properly determine which card is which. Only the first device will be
// programmed.
// Return 0 on success.
// Full-chip reprogram via JTAG: builds a quartus_pgm command line (cable and
// device index from env vars, JTAG autodetect, or defaults), disables AER so
// the link drop during programming doesn't kill the host, runs the command
// with up to MAX_ATTEMPTS retries, then re-enables AER and retrains the link.
// @return 0 on success (the last system() exit status otherwise)
int ACL_PCIE_CONFIG::program_with_SOF_file(const char *filename, const char *ad_cable, const char *ad_device_index) {
  const int MAX_ATTEMPTS = 3;
  int program_failed = 1;
  int status;
  bool use_cable_autodetect = true;

  // If ad_cable value is "0", either JTAG cable autodetect failed or not
  // supported, then use the default value
  if (strcmp(ad_cable, "0") == 0) use_cable_autodetect = false;

  // Cable selection priority: env override > autodetected value > "1".
  const char *cable = getenv("ACL_PCIE_JTAG_CABLE");
  if (!cable) {
    if (use_cable_autodetect) {
      cable = ad_cable;
      ACL_PCIE_DEBUG_MSG("setting Cable to autodetect value %s\n", cable);
    } else {
      cable = "1";
      ACL_PCIE_DEBUG_MSG("setting Cable to default value %s\n", cable);
    }
  }

  const char *device_index = getenv("ACL_PCIE_JTAG_DEVICE_INDEX");
  if (!device_index) {
    if (use_cable_autodetect) {
      device_index = ad_device_index;
      ACL_PCIE_DEBUG_MSG("setting Device Index to autodetect value %s\n", device_index);
    } else {
      device_index = "1";
      ACL_PCIE_DEBUG_MSG("setting Device Index to default value %s\n", device_index);
    }
  }

  // Build the quartus_pgm invocation. DLA_MMD builds call quartus_pgm
  // directly; non-DLA builds go through "aocl do". Output is suppressed
  // unless verbose debug is enabled (Windows DLA_MMD path only).
  char cmd[4 * 1024];
#ifdef DLA_MMD
#if defined(WINDOWS)
  if ((ACL_PCIE_DEBUG | 0) >= VERBOSITY_DEFAULT) {
    snprintf(cmd, sizeof(cmd), "quartus_pgm -c %s -m jtag -o \"P;%s@%s\"", cable, filename, device_index);
  } else {
    snprintf(cmd, sizeof(cmd), "quartus_pgm -c %s -m jtag -o \"P;%s@%s\" > nul 2>&1", cable, filename, device_index);
  }
#else
  snprintf(cmd, sizeof(cmd), "quartus_pgm -c %s -m jtag -o \"P;%s@%s\" 2>&1 >/dev/null", cable, filename, device_index);
#endif
  ACL_PCIE_INFO("Executing \"%s\"\n", cmd);
#else
#if defined(WINDOWS)
  snprintf(
      cmd, sizeof(cmd), "aocl do quartus_pgm -c %s -m jtag -o \"P;%s@%s\" > nul 2>&1", cable, filename, device_index);
#endif
#if defined(LINUX)
  snprintf(cmd,
           sizeof(cmd),
           "aocl do quartus_pgm -c %s -m jtag -o \"P;%s@%s\" 2>&1 >/dev/null",
           cable,
           filename,
           device_index);
#endif
  ACL_PCIE_DEBUG_MSG("Executing \"%s\"\n", cmd);
#endif

  // Disable AER
  status = disable_AER_windows();
  ACL_PCIE_ERROR_IF(status, return -1, "Failed to disable AER on Windows before programming SOF.\n");

  // Set the program to ignore the ctrl-c signal
  // This setting will be inherited by the system() function call below,
  // so that the quartus_pgm call won't be interrupt by the ctrl-c signal.
  install_ctrl_c_handler(1 /* ignore the signal */);

  // Program FPGA by executing the command
#ifndef DLA_MMD
  ACL_PCIE_ASSERT(system_cmd_is_valid(cmd), "Invalid system() function parameter: %s\n", cmd);
#endif
  // Retry up to MAX_ATTEMPTS times; JTAG programming failures are often
  // transient. The post-attempt sleep gives the link time to settle.
  for (int attempts = 0; attempts < MAX_ATTEMPTS && program_failed; attempts++) {
    if (attempts > 0) {
      ACL_PCIE_INFO("Execution failed. Will try again in case the error was transient.\n");
    }
    program_failed = system(cmd);
#if defined(WINDOWS)
    Sleep(2000);
#endif // WINDOWS
#if defined(LINUX)
    sleep(2);
#endif // LINUX
  }

  // Restore the original custom ctrl-c signal handler
  install_ctrl_c_handler(0 /* use the custom signal handler */);

  // Enable AER
  status = enable_AER_and_retrain_link_windows();
  ACL_PCIE_ERROR_IF(status, return -1, "Failed to enable AER and retrain link on Windows after programming SOF.\n");

  return program_failed;
}

// Locate the JTAG cable/device index for the board whose in-system source
// probe reports `cade_id`, by running the find_jtag_cable.tcl script under
// quartus_stp and scraping its "Matched Cable:...Device Name:@...:" output.
// On success writes NUL-terminated strings into ad_cable / ad_device_index
// (each at least AD_CABLE_SIZE bytes, provided by the caller).
bool ACL_PCIE_CONFIG::find_cable_with_ISSP(unsigned int cade_id, char *ad_cable, char *ad_device_index) {
  FILE *fp;
  int status;
  char line_in[READ_SIZE];
  bool found_cable = false;

  char cmd[SYSTEM_CMD_SIZE];
  const char *aocl_boardpkg_root = getenv("AOCL_BOARD_PACKAGE_ROOT");
  if (!aocl_boardpkg_root) {
    ACL_PCIE_INFO("AOCL_BOARD_PACKAGE_ROOT not set!!!");
    return false;
  }

  snprintf(cmd, sizeof(cmd), "aocl do quartus_stp -t %s/scripts/find_jtag_cable.tcl %X", aocl_boardpkg_root, cade_id);
  ACL_PCIE_DEBUG_MSG("executing \"%s\"\n", cmd);

  // Open PIPE to tcl script
#ifndef DLA_MMD
  ACL_PCIE_ASSERT(system_cmd_is_valid(cmd), "Invalid popen() function parameter: %s\n", cmd);
#endif
#if defined(WINDOWS)
  fp = _popen(cmd, "r");
#endif // WINDOWS
#if defined(LINUX)
  fp = popen(cmd, "r");
#endif // LINUX

  if (fp == NULL) {
    ACL_PCIE_INFO("Couldn't open fp file\n");
  } else {
    // Read everyline and look for matching string from tcl script
    while (fgets(line_in, READ_SIZE, fp) != NULL) {
      ACL_PCIE_DEBUG_MSG("%s", line_in);
      // Expected line shape: "...Matched Cable:<cable>Device Name:@<idx>:..."
      const char *str_match_cable = "Matched Cable:";
      const char *str_match_dev_name = "Device Name:@";
      const char *str_match_end = ":";
      // parsing the string and extracting the cable/index value
      // from the output of find_jtag_cable.tcl script
      char *pos_cable = strstr(line_in, str_match_cable);
      if (pos_cable) {
        found_cable = true;
        // find the sub-string locations in the line
        char *pos_dev_name = strstr(line_in, str_match_dev_name);
        if (pos_dev_name) {
          char *pos_end =
              strstr(pos_dev_name + strnlen(str_match_dev_name, MAX_NAME_SIZE), str_match_end);  // Find the last ":"
          if (pos_end) {
            // calculate the cable/index string size
            size_t i_cable_str_len = pos_dev_name - pos_cable - strnlen(str_match_cable, MAX_NAME_SIZE);
            size_t i_dev_index_str_len = pos_end - pos_dev_name - strnlen(str_match_dev_name, MAX_NAME_SIZE);
            // extract the cable/index value from the line
            snprintf(ad_cable,
                     AD_CABLE_SIZE,
                     "%.*s",
                     (int)i_cable_str_len,
                     pos_cable + strnlen(str_match_cable, MAX_NAME_SIZE));
            snprintf(ad_device_index,
                     AD_CABLE_SIZE,
                     "%.*s",
                     (int)i_dev_index_str_len,
                     pos_dev_name + strnlen(str_match_dev_name, MAX_NAME_SIZE));
            ACL_PCIE_DEBUG_MSG("JTAG Autodetect device found Cable:%s, Device Index:%s\n", ad_cable, ad_device_index);
            break;
          }
        }
      }
    }

#if defined(WINDOWS)
    status = _pclose(fp);
#endif // WINDOWS
#if defined(LINUX)
    status = pclose(fp);
#endif // LINUX

    if (status == -1) {
      /* Error reported by pclose() */
      ACL_PCIE_INFO("Couldn't close find_cable_with_ISSP file\n");
    } else {
      /* Use macros described under wait() to inspect `status' in order
       * to determine success/failure of command executed by popen()
       * */
    }
  }

  if (!found_cable) {
    ACL_PCIE_INFO("Autodetect Cable not found!!\n");
  }

  return found_cable;
}

// Functions to save/load control registers form PCI Configuration Space
// This saved registers are used to restore the PCIe link after reprogramming
// through methods other than PR
// For Windows, the register values are stored in this class, and do
// nothing else
// For Linux, the register values are stored inside the kernel driver,
// And, it will disable the interrupt and the aer on the upstream,
// when the save_pci_control_regs() function is called. They will
// be enable when load_pci_control_regs() is called.
// Return 0 on success
int ACL_PCIE_CONFIG::save_pci_control_regs() {
  int save_failed = 1;

#if defined(WINDOWS)
  fpga_result result = FPGA_OK;

  // IOCTL call to save PCI control register
  result = fpgaProcessDeviceCmd(m_handle, GUID_TO_FPGA_GUID(GUID_PCI_OPENCL_SAVE_PCI_CTRL_REG), NULL, NULL, 0);
  ACL_PCIE_ERROR_IF(result != FPGA_OK, return -1, "fpgaProcessDeviceCmd failed when saving PCI Control registers.\n");

  save_failed = (result == FPGA_OK) ? (0) : (-1);
#endif // WINDOWS
#if defined(LINUX)
  // Kernel driver saves the registers (and disables upstream AER/interrupts).
  struct acl_cmd cmd_save = {ACLPCI_CMD_BAR, ACLPCI_CMD_SAVE_PCI_CONTROL_REGS, NULL, NULL};
  save_failed = read(m_handle, &cmd_save, 0);
#endif // LINUX

  return save_failed;
}

// Counterpart of save_pci_control_regs(): restore the saved PCI config-space
// registers (Linux driver also re-enables upstream AER/interrupts).
// Return 0 on success
int ACL_PCIE_CONFIG::load_pci_control_regs() {
  int load_failed = 1;
#if defined(WINDOWS)

  fpga_result result = FPGA_OK;
  // IOCTL call to load PCI control register
  result = fpgaProcessDeviceCmd(m_handle, GUID_TO_FPGA_GUID(GUID_PCI_OPENCL_LOAD_PCI_CTRL_REG), NULL, NULL, 0);
  ACL_PCIE_ERROR_IF(result != FPGA_OK, return -1, "fpgaProcessDeviceCmd failed when loading PCI Control registers.\n");

  load_failed = (result == FPGA_OK) ? (0) : (-1);
#endif // WINDOWS
#if defined(LINUX)
  struct acl_cmd cmd_load = {ACLPCI_CMD_BAR, ACLPCI_CMD_LOAD_PCI_CONTROL_REGS, NULL, NULL};
  load_failed = read(m_handle, &cmd_load, 0);
#endif // LINUX

  return load_failed;
}

// Functions to query the PCI related information
// Use NULL as input for the info that you don't care about
// Return 0 on success.
int ACL_PCIE_CONFIG::query_pcie_info(unsigned int *pcie_gen, unsigned int *pcie_num_lanes, char *pcie_slot_info_str) {
  int status = 0;
#if defined(WINDOWS)
  fpga_result result = FPGA_OK;
  // IOCTL call to obtain PCIe gen information
  result = fpgaProcessDeviceCmd(
      m_handle, GUID_TO_FPGA_GUID(GUID_PCI_OPENCL_GET_PCI_GEN), NULL, pcie_gen, sizeof(unsigned int));
  ACL_PCIE_ERROR_IF(result != FPGA_OK, return -1, "fpgaProcessDeviceCmd failed when finding PCI device gen info.\n");

  result = fpgaProcessDeviceCmd(
      m_handle, GUID_TO_FPGA_GUID(GUID_PCI_OPENCL_GET_PCI_LANES), NULL, pcie_num_lanes, sizeof(unsigned int));
  ACL_PCIE_ERROR_IF(result != FPGA_OK, return -1, "fpgaProcessDeviceCmd failed when finding PCI device lanes info.\n");

  status = (result == FPGA_OK) ? (0) : (-1);
#endif // WINDOWS
#if defined(LINUX)
  struct acl_cmd driver_cmd;

  if (pcie_gen != NULL) {
    driver_cmd.bar_id = ACLPCI_CMD_BAR;
    driver_cmd.command = ACLPCI_CMD_GET_PCI_GEN;
    driver_cmd.device_addr = NULL;
    driver_cmd.user_addr = pcie_gen;
    driver_cmd.size = sizeof(*pcie_gen);
    status |= read(m_handle, &driver_cmd, sizeof(driver_cmd));
  }

  if (pcie_num_lanes != NULL) {
    driver_cmd.bar_id = ACLPCI_CMD_BAR;
    driver_cmd.command = ACLPCI_CMD_GET_PCI_NUM_LANES;
    driver_cmd.device_addr = NULL;
    driver_cmd.user_addr = pcie_num_lanes;
    driver_cmd.size = sizeof(*pcie_num_lanes);
    status |= read(m_handle, &driver_cmd, sizeof(driver_cmd));
  }

  if (pcie_slot_info_str != NULL) {
    driver_cmd.bar_id = ACLPCI_CMD_BAR;
    driver_cmd.command = ACLPCI_CMD_GET_PCI_SLOT_INFO;
    driver_cmd.device_addr = NULL;
    driver_cmd.user_addr = pcie_slot_info_str;
    // NOTE(review): sizeof(pcie_slot_info_str) is the size of the POINTER
    // (8 bytes on 64-bit), not the caller's buffer — the slot string will be
    // truncated. The caller's buffer length should be passed in instead.
    driver_cmd.size = sizeof(pcie_slot_info_str);
    status |= read(m_handle, &driver_cmd, sizeof(driver_cmd));
  }
#endif // LINUX
  return status;
}

// Platform-agnostic sleep helper (Sleep() takes ms, sleep() takes seconds).
void ACL_PCIE_CONFIG::wait_seconds(unsigned seconds) {
#if defined(WINDOWS)
  Sleep(seconds * 1000);
#endif // WINDOWS

#if defined(LINUX)
  sleep(seconds);
#endif // LINUX
}
diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_config.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_config.h
new file mode 100644
index 0000000..3f07634
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_config.h
@@ -0,0 +1,109 @@
#ifndef ACL_PCIE_CONFIG_H
#define ACL_PCIE_CONFIG_H

/* (c) 1992-2021 Intel Corporation. */
/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */
/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */
/* and/or other countries. Other marks and brands may be claimed as the property */
/* of others.
See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. */ + +/* ===- acl_pcie_config.h -------------------------------------------- C++ -*-=== */ +/* */ +/* Intel(R) OpenCL MMD Driver */ +/* */ +/* ===-------------------------------------------------------------------------=== */ +/* */ +/* This file declares the class to handle functions that program the FPGA. */ +/* The actual implementation of the class lives in the acl_pcie_config.cpp, */ +/* so look there for full documentation. 
*/ +/* */ +/* ===-------------------------------------------------------------------------=== */ + +#ifdef DLA_MMD +#include <cstddef> //size_t +#endif + +// Forward declaration for classes used by ACL_PCIE_DEVICE +class ACL_PCIE_DMA; +class ACL_PCIE_DEVICE; +class ACL_PCIE_MM_IO_MGR; + +#define PCIE_AER_CAPABILITY_ID ((DWORD)0x0001) +#define PCIE_AER_UNCORRECTABLE_STATUS_OFFSET ((DWORD)0x4) +#define PCIE_AER_UNCORRECTABLE_MASK_OFFSET ((DWORD)0x8) +#define PCIE_AER_CORRECTABLE_STATUS_OFFSET ((DWORD)0x10) +#define PCIE_AER_SURPRISE_DOWN_BIT ((DWORD)(1 << 5)) + +// The size of the char array that holds the name of autodetect JTAG cable and device index +#define AD_CABLE_SIZE 10 + +#if defined(LINUX) +typedef int fpga_handle; +#else +#include <opae/fpga.h> +#endif // LINUX + +class ACL_PCIE_CONFIG { + public: + ACL_PCIE_CONFIG(fpga_handle handle, ACL_PCIE_MM_IO_MGR *io, ACL_PCIE_DEVICE *pcie, ACL_PCIE_DMA *dma); + ~ACL_PCIE_CONFIG(); + + // Change the core only via PCIe, using an in-memory image of the core.rbf + // This is supported only for Stratix V and newer devices. + // Return 0 on success. + int program_core_with_PR_file_a10(char *core_bitstream, size_t core_rbf_len); + int program_core_with_PR_file_s10(char *core_bitstream, size_t core_rbf_len, char *pll_config_str); + + // Program the FPGA using a given SOF file + // Input filename, autodetect cable, autodetect device index + // Return 0 on success. + int program_with_SOF_file(const char *filename, const char *ad_cable, const char *ad_device_index); + + // Look up CADEID using ISSP + // Return TRUE with cable value in ad_cable, ad_device_index if cable found + // Otherwise return FALSE + bool find_cable_with_ISSP(unsigned int cade_id, char *ad_cable, char *ad_device_index); + + // Functions to save/load control registers from PCI Configuration Space + // Return 0 on success. 
+ int save_pci_control_regs(); + int load_pci_control_regs(); + + // Functions to query the PCI related information + // Use NULL as input for the info that you don't care about + // Return 0 on success. + int query_pcie_info(unsigned int *pcie_gen, unsigned int *pcie_num_lanes, char *pcie_slot_info_str); + + // Windows-specific code to control AER, and retrain the link + int enable_AER_and_retrain_link_windows(void); + int disable_AER_windows(void); + + // Platform agnostic sleep (in seconds) + void wait_seconds(unsigned seconds); + + private: + ACL_PCIE_CONFIG &operator=(const ACL_PCIE_CONFIG &) { return *this; } + + ACL_PCIE_CONFIG(const ACL_PCIE_CONFIG &src) {} + + fpga_handle m_handle; + ACL_PCIE_MM_IO_MGR *m_io; + ACL_PCIE_DEVICE *m_pcie; + ACL_PCIE_DMA *m_dma; +#if defined(WINDOWS) + fpga_guid *FpgaCmd; +#endif // WINDOWS +}; + +#endif // ACL_PCIE_CONFIG_H diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_debug.cpp b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_debug.cpp new file mode 100644 index 0000000..8afc1c7 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_debug.cpp @@ -0,0 +1,61 @@ +// (c) 1992-2021 Intel Corporation. +// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words +// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. +// and/or other countries. Other marks and brands may be claimed as the property +// of others. 
See Trademarks on intel.com for full list of Intel trademarks or +// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) +// Your use of Intel Corporation's design tools, logic functions and other +// software and tools, and its AMPP partner logic functions, and any output +// files any of the foregoing (including device programming or simulation +// files), and any associated documentation or information are expressly subject +// to the terms and conditions of the Altera Program License Subscription +// Agreement, Intel MegaCore Function License Agreement, or other applicable +// license agreement, including, without limitation, that your use is for the +// sole purpose of programming logic devices manufactured by Intel and sold by +// Intel or its authorized distributors. Please refer to the applicable +// agreement for further details. + +/* ===- acl_pcie_debug.cpp ------------------------------------------- C++ -*-=== */ +/* */ +/* Intel(R) OpenCL MMD Driver */ +/* */ +/* ===-------------------------------------------------------------------------=== */ +/* */ +/* */ +/* ===-------------------------------------------------------------------------=== */ + +#include "acl_pcie_debug.h" +#include <stdio.h> +#include <stdlib.h> + +int ACL_PCIE_DEBUG = 0; +int ACL_PCIE_WARNING = 1; // turn on the warning message by default + +int ACL_PCIE_DEBUG_FLASH_DUMP_BOOT_SECTOR = 0; + +void set_mmd_debug() { + char* mmd_debug_var = getenv("ACL_PCIE_DEBUG"); + if (mmd_debug_var) { + char* endptr = NULL; + long parsed_count; + parsed_count = strtol(mmd_debug_var, &endptr, 10); + if (endptr == mmd_debug_var // no valid characters + || *endptr // an invalid character + || (parsed_count < 0 || parsed_count >= (long)VERBOSITY_EVERYTHING)) { + // malformed string, do nothing + } else { + ACL_PCIE_DEBUG = (int)parsed_count; + printf("\n:: MMD DEBUG LEVEL set to %d\n", ACL_PCIE_DEBUG); + } + } + + char* hal_debug_dump_flash_bootsect = 
getenv("ACL_PCIE_DEBUG_FLASH_DUMP_BOOT_SECTOR"); + if (hal_debug_dump_flash_bootsect) ACL_PCIE_DEBUG_FLASH_DUMP_BOOT_SECTOR = atoi(hal_debug_dump_flash_bootsect); +} + +void set_mmd_warn_msg() { + char* mmd_warn_var = getenv("ACL_PCIE_WARNING"); + if (mmd_warn_var) { + ACL_PCIE_WARNING = atoi(mmd_warn_var); + } +} diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_debug.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_debug.h new file mode 100644 index 0000000..072eabc --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_debug.h @@ -0,0 +1,64 @@ +#ifndef ACL_PCIE_DEBUG_H +#define ACL_PCIE_DEBUG_H + +/* (c) 1992-2021 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. 
*/ + +/* ===- acl_pcie_debug.h --------------------------------------------- C++ -*-=== */ +/* */ +/* Intel(R) OpenCL MMD Driver */ +/* */ +/* ===-------------------------------------------------------------------------=== */ +/* */ +/* */ +/* ===-------------------------------------------------------------------------=== */ + +enum ACL_VERBOSITY { + VERBOSITY_DEFAULT = 1, + VERBOSITY_INVOCATION = 2, // Dump kernel invocation details + VERBOSITY_OP = 3, // Dump operation invocation details + VERBOSITY_IRQ = 5, + VERBOSITY_BLOCKTX = 9, // Dump PCIe block transfers + VERBOSITY_PCIE = 10, // Dump all PCIe transactions + VERBOSITY_EVERYTHING = 100 +}; + +extern int ACL_PCIE_DEBUG; +extern int ACL_PCIE_WARNING; +extern int ACL_PCIE_DEBUG_FLASH_DUMP_BOOT_SECTOR; + +// This function gets the value of ACL_PCIE_DEBUG from the environment variable +void set_mmd_debug(); +void set_mmd_warn_msg(); + +#include <stdio.h> + +#define ACL_PCIE_DEBUG_MSG(m, ...) ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_DEFAULT, m, ##__VA_ARGS__) +#define ACL_PCIE_DEBUG_MSG_VERBOSE(verbosity, m, ...) \ + if ((ACL_PCIE_DEBUG | 0) >= verbosity) do { \ + printf((m), ##__VA_ARGS__), fflush(stdout); \ + } while (0) + +#define ACL_PCIE_WARN_MSG(...) \ + do { \ + if (ACL_PCIE_WARNING) { \ + printf("** WARNING: " __VA_ARGS__); \ + fflush(stdout); \ + } \ + } while (0) + +#endif // ACL_PCIE_DEBUG_H diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_device.cpp b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_device.cpp new file mode 100644 index 0000000..8489c32 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_device.cpp @@ -0,0 +1,2029 @@ +// (c) 1992-2021 Intel Corporation. +// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words +// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. +// and/or other countries. 
Other marks and brands may be claimed as the property +// of others. See Trademarks on intel.com for full list of Intel trademarks or +// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) +// Your use of Intel Corporation's design tools, logic functions and other +// software and tools, and its AMPP partner logic functions, and any output +// files any of the foregoing (including device programming or simulation +// files), and any associated documentation or information are expressly subject +// to the terms and conditions of the Altera Program License Subscription +// Agreement, Intel MegaCore Function License Agreement, or other applicable +// license agreement, including, without limitation, that your use is for the +// sole purpose of programming logic devices manufactured by Intel and sold by +// Intel or its authorized distributors. Please refer to the applicable +// agreement for further details. + +/* ===- acl_pcie_device.cpp ------------------------------------------ C++ -*-=== */ +/* */ +/* Intel(R) OpenCL MMD Driver */ +/* */ +/* ===-------------------------------------------------------------------------=== */ +/* */ +/* This file implements the class to handle operations on a single device. 
*/ +/* The declaration of the class lives in the acl_pcie_device.h */ +/* */ +/* ===-------------------------------------------------------------------------=== */ + +#if defined(WINDOWS) +#define NOMINMAX +#include <time.h> +#endif // WINDOWS + +// common and its own header files +#include "acl_pcie.h" +#include "acl_pcie_device.h" + +// other header files inside MMD driver +#include "acl_pcie_config.h" +#include "acl_pcie_debug.h" +#include "acl_pcie_dma.h" +#include "acl_pcie_mm_io.h" +#if !defined(DLA_MMD) || defined(WINDOWS) +#include "pkg_editor.h" +#endif + +// other standard header files +#include <stdlib.h> +#include <string.h> +#include <fstream> +#include <limits> +#include <random> +#include <sstream> +#include <stdexcept> +#include "acl_pcie_hostch.h" + +#if defined(LINUX) +#include <fcntl.h> +#include <signal.h> +#include <sys/mman.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <unistd.h> +#endif // LINUX + +#define MAX_LEN 1024 + +#define FREEZE_CTRL_OFFSET 4 +#ifndef PAGE_SIZE +#define PAGE_SIZE 4096 +#endif +#define ACL_VERSIONID_MIN 0xA0C7C1E0 + +static int num_open_devices = 0; + +#if defined(WINDOWS) +fpga_handle open_device_windows(ACL_PCIE_DEVICE_DESCRIPTION *info, int dev_num); + +// Interrupt service routine for all interrupts on the PCIe interrupt line +// PCIe interrupts in Windows XP are level-based. The KMD is responsible for +// masking off the interrupt until this routine can service the request at +// user-mode priority. 
+extern void pcie_interrupt_handler(void *data); +#endif // WINDOWS +#if defined(LINUX) +fpga_handle open_device_linux(ACL_PCIE_DEVICE_DESCRIPTION *info, int dev_num); +#endif // LINUX + +ACL_PCIE_DEVICE::ACL_PCIE_DEVICE(int dev_num, const char *name, int handle, int user_signal_number) + : kernel_interrupt(NULL), + kernel_interrupt_user_data(NULL), + device_interrupt(NULL), + device_interrupt_user_data(NULL), + event_update(NULL), + event_update_user_data(NULL), + m_user_signal_number(0), + m_io(NULL), + m_dma(NULL), + m_hostch(NULL), + m_config(NULL), + m_handle(-1), + m_device(INVALID_HANDLE_VALUE), +#if ACL_USE_DMA == 1 + m_use_dma_for_big_transfers(true), +#else + m_use_dma_for_big_transfers(false), +#endif + m_mmd_irq_handler_enable(false), + m_initialized(false), + m_being_programmed(false), + m_skip_quartus_version_check(false), + m_segment(0) { + if (NULL == name) { + // Throw an error and bail out + throw std::runtime_error("Invalid argument, passed in an empty name pointer when creating device object!"); + } + + int status = 0; + + // Set debug level from the environment variable ACL_PCIE_DEBUG + // Determine if warning messages should be disabled depends on ACL_PCIE_WARNING + if (num_open_devices == 0) { + set_mmd_debug(); + set_mmd_warn_msg(); + } + +#if defined(WINDOWS) + strncpy_s(m_name, MAX_NAME_LENGTH, name, (MAX_NAME_LENGTH - 1)); +#else + strncpy(m_name, name, (MAX_NAME_LENGTH - 1)); +#endif + m_name[(MAX_NAME_LENGTH - 1)] = '\0'; + + m_handle = handle; + m_info.vendor_id = ACL_PCI_INTELFPGA_VENDOR_ID; + m_info.device_id = 0; // search for all device id + m_info.interrupt_valid = false; + m_info.interrupt_data = 0x00; + m_info.interrupt_addr = 0x00; + +#if defined(WINDOWS) + m_device = open_device_windows(&m_info, dev_num); +#endif // WINDOWS +#if defined(LINUX) + m_device = open_device_linux(&m_info, dev_num); +#endif // LINUX + + // Return to caller if this is simply an invalid device. 
+ if (m_device == INVALID_HANDLE_VALUE) { + return; + } + + // Initialize device IO and CONFIG objects + m_io = new ACL_PCIE_MM_IO_MGR(m_device); + + // Initialize the DMA object and enable interrupts on the DMA controller + try { + m_dma = new ACL_PCIE_DMA(m_device, m_io, this); + } + + // Catch any memory allocation failures + catch (std::bad_alloc &) { + throw std::bad_alloc(); + } + + try { + m_config = new ACL_PCIE_CONFIG(m_device, m_io, this, m_dma); + } + + catch (std::bad_alloc &) { + throw std::bad_alloc(); + } + + // Set the segment ID to 0 first forcing cached "segment" to all 1s + m_segment = ~m_segment; + if (this->set_segment(0x0)) { + return; + } + + // performance basic I/O tests + if (this->version_id_test()) { + return; + } + if (this->wait_for_uniphy()) { + return; + } + + // Get PCIE information + unsigned int pcie_gen, pcie_num_lanes; + char pcie_slot_info_str[PCIE_SLOT_INFO_STR_LEN] = {0}; + + status = m_config->query_pcie_info(&pcie_gen, &pcie_num_lanes, pcie_slot_info_str); + ACL_PCIE_ERROR_IF(status, return, "[%s] fail to query PCIe related information.\n", m_name); + snprintf(m_info.pcie_info_str, + PCIE_INFO_STR_LEN, + "dev_id = " DWORD_FMT_4X ", bus:slot.func = %s, Gen%u x%u", + m_info.device_id, + pcie_slot_info_str, + pcie_gen, + pcie_num_lanes); + + m_user_signal_number = user_signal_number; + + // Initialize the Host Channel object + m_hostch = new ACL_PCIE_HOSTCH(m_device, m_io, this, m_dma); + + if (this->enable_interrupts(m_user_signal_number)) { + return; + } + + char *str_test_quartus_ver = getenv("ACL_SKIP_QUARTUS_VERSION_CHECK"); + if (str_test_quartus_ver) m_skip_quartus_version_check = 1; + +#if defined(WINDOWS) + enable_msi(true); +#endif + +#ifdef DLA_MMD + // software reset + uint32_t software_reset_data = 0; // value doesn't matter, any write to software reset will cause it to trigger + constexpr int software_reset_offset = 0x8000; + status = m_io->kernel_if->write_block(software_reset_offset, sizeof(uint32_t), 
&software_reset_data); + ACL_PCIE_ERROR_IF(status, return, "[%s] failed to write block.\n", m_name); + // software reset applies backpressure to the avalon interface while the reset counter is running + // issue a read request, which will not return until the reset counter is done + status = m_io->kernel_if->read_block(software_reset_offset, sizeof(uint32_t), &software_reset_data); + ACL_PCIE_ERROR_IF(status, return, "[%s] failed to read block.\n", m_name); +#endif + // Done! + m_initialized = true; + ACL_PCIE_DEBUG_MSG(":: [%s] successfully initialized (device id: " DWORD_FMT_X ").\n", m_name, m_info.device_id); + ACL_PCIE_DEBUG_MSG(":: Using DMA for big transfers? %s\n", (m_use_dma_for_big_transfers ? "yes" : "no")); +} + +ACL_PCIE_DEVICE::~ACL_PCIE_DEVICE() { +#if defined(WINDOWS) + enable_msi(false); +#endif + + int status = this->disable_interrupts(); + ACL_PCIE_ERROR_IF(status, /* do nothing */, "[%s] fail disable interrupt in device destructor.\n", m_name); + + if (m_hostch) { + delete m_hostch; + m_hostch = NULL; + } + if (m_config) { + delete m_config; + m_config = NULL; + } + if (m_dma) { + delete m_dma; + m_dma = NULL; + } + if (m_io) { + delete m_io; + m_io = NULL; + } + + if (is_valid()) { + --num_open_devices; +#if defined(WINDOWS) + fpga_result result = fpgaClose(m_device); + ACL_PCIE_ERROR_IF(result != FPGA_OK, return, "[%s] failed to close the device handle.\n", m_name); + +#endif // WINDOWS +#if defined(LINUX) + close(m_device); +#endif // LINUX + } +} + +#if defined(WINDOWS) +// Enable/Disable MSI +void ACL_PCIE_DEVICE::enable_msi(bool enable) { + int status; + + if (!m_info.interrupt_valid) { + return; + } + + if (!enable) { + // disable MSI DATA + m_io->pcie_cra->write32(PCIE_CRA_MSI_DATA, 0x00); + } else { + status = m_io->pcie_cra->write32(PCIE_CRA_MSI_ADDR_L, m_info.interrupt_addr & 0xffffffff); + status = m_io->pcie_cra->write32(PCIE_CRA_MSI_ADDR_H, (m_info.interrupt_addr >> 0x20) & 0xffffffff); + MemoryBarrier(); + // enable MSI DATA + 
status = m_io->pcie_cra->write32(PCIE_CRA_MSI_DATA, PCIE_CRA_MSI_ENABLE | m_info.interrupt_data ); + } + MemoryBarrier(); +} + +fpga_handle open_device_windows(ACL_PCIE_DEVICE_DESCRIPTION *info, int dev_num) { + fpga_result result; + fpga_handle device = INVALID_HANDLE_VALUE; + DWORD pci_class_code_rev = 0; + DWORD pci_subsystem_ids = 0; + DWORD pci_link_info = 0; + + // Variables for fpga enumerate + fpga_properties filter = NULL; + UINT32 numMatches; + fpga_token afcToken; + volatile PUINT64 mmioPtr = NULL; + + // Variables for fpga properties + fpga_properties prop = nullptr; + UINT8 bus; + UINT8 l_device; + UINT8 function; + + const UINT8 CAP_PTR_ADDRESS = 0x34; + const UINT8 MSI_CAP_ID = 0x05; + UINT8 nextCapPtr; + UINT8 msiCapPtr; + UINT8 capID; + bool hasFound = false; + UINT8 capArray[2]; + UINT16 msi_control; + UINT16 data16 = 0x00; + UINT32 data32 = 0x00; + UINT64 data64 = 0x00; + + // Initialize filter structure + result = fpgaGetProperties(NULL, &filter); + if (result != FPGA_OK) { + device = INVALID_HANDLE_VALUE; + ACL_PCIE_ERROR_IF(1, goto End, "failed to get properties.\n"); + } + + // Set object type in filter structure + result = fpgaPropertiesSetObjectType(filter, FPGA_DEVICE); + if (result != FPGA_OK) { + device = INVALID_HANDLE_VALUE; + ACL_PCIE_ERROR_IF(1, goto DestroyProp, "failed to set object type.\n"); + } + + // Set vendor ID in the filter structure + result = fpgaPropertiesSetVendorID(filter, (uint16_t)info->vendor_id); + if (result != FPGA_OK) { + device = INVALID_HANDLE_VALUE; + ACL_PCIE_ERROR_IF(1, goto DestroyProp, "failed to set vendor ID.\n"); + } + + // Enumerate all PCI devices and find devices matching the filters + result = fpgaEnumerate(&filter, 1, &afcToken, 1, &numMatches); + if (result != FPGA_OK) { + device = INVALID_HANDLE_VALUE; + ACL_PCIE_ERROR_IF(1, goto DestroyProp, "failed to scan for the PCI device.\n"); + } + + if (numMatches < 1) { + ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME "%d] Device not found\n", 
dev_num); + device = INVALID_HANDLE_VALUE; + goto DestroyTok; + } + + // Open the device handle + result = fpgaOpen(afcToken, &device, 0); + if (result != FPGA_OK) { + device = INVALID_HANDLE_VALUE; + ACL_PCIE_ERROR_IF(1, goto DestroyTok, "[acl" ACL_BOARD_PKG_NAME "%d] failed to open the device.\n", dev_num); + } + + // Map MMIO number 0 + result = fpgaMapMMIO(device, 0, (PUINT64 *)&mmioPtr); + if (result != FPGA_OK) { + ACL_PCIE_ERROR_IF(1, goto Close, "[acl" ACL_BOARD_PKG_NAME "%d] failed to map MMIO.\n", dev_num); + } + + // Read SubSystem IDs out of PCI config space + result = fpgaReadPciConfigSpace(device, 0x2C, (PVOID)&pci_subsystem_ids, sizeof(pci_subsystem_ids)); + ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME "%d] PCI SubSystem IDs found: 0x%lx\n", dev_num, pci_subsystem_ids); + if ((ACL_PCIE_READ_BIT_RANGE(pci_subsystem_ids, 31, 16) != ACL_PCI_SUBSYSTEM_DEVICE_ID) || + (ACL_PCIE_READ_BIT_RANGE(pci_subsystem_ids, 15, 0) != ACL_PCI_SUBSYSTEM_VENDOR_ID)) { + ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME + "%d] PCI SubSystem IDs do not match, found %08lx but expected %04x%04x\n", + dev_num, + pci_subsystem_ids, + ACL_PCI_SUBSYSTEM_DEVICE_ID, + ACL_PCI_SUBSYSTEM_VENDOR_ID); + goto Close; + } + // Save device id + info->device_id = ACL_PCI_SUBSYSTEM_DEVICE_ID; + + // Read Class code out of PCI config space + result = fpgaReadPciConfigSpace(device, 8, (PVOID)&pci_class_code_rev, sizeof(pci_class_code_rev)); + ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME "%d] PCI Class Code and Rev is: %lx\n", dev_num, pci_class_code_rev); + if (((pci_class_code_rev & (0xff00ff00)) >> 8) != ACL_PCI_CLASSCODE) { + ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME "%d] PCI Class Code does not match, expected %x, read %ld\n", + dev_num, + ACL_PCI_CLASSCODE, + (pci_class_code_rev & 0xff00ff00) >> 8); + goto Close; + } + + // Check PCI Revision + if ((pci_class_code_rev & 0x0ff) != ACL_PCI_REVISION) { + ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME "%d] PCI Revision does not 
match\n", dev_num); + goto Close; + } + + // Read MSI data and address + info->interrupt_valid = false; + result = fpgaReadPciConfigSpace(device, CAP_PTR_ADDRESS, (PVOID)&nextCapPtr, sizeof(nextCapPtr)); + while (!hasFound && nextCapPtr > CAP_PTR_ADDRESS && FPGA_OK == result) { + result = fpgaReadPciConfigSpace(device, nextCapPtr, (PVOID)&capArray, sizeof(capArray)); + if (FPGA_OK == result) { + capID = capArray[0]; + if (capID == MSI_CAP_ID) { + hasFound = true; + info->interrupt_valid = true; + info->interrupt_addr = 0x00; + info->interrupt_data = 0x00; + msiCapPtr = nextCapPtr; + result = fpgaReadPciConfigSpace(device, msiCapPtr + 0x02, (PVOID)&msi_control, sizeof(msi_control)); + if (FPGA_OK == result) { + ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME "%d] %d-bit address, %d-bit data\n", + dev_num, + (msi_control & 0x0080) ? 64 : 32, + (msi_control & 0x0200) ? 32 : 16); + if (msi_control & 0x0080) { // 64-bit address + result = fpgaReadPciConfigSpace(device, msiCapPtr + 0x04, (PVOID)&data64, sizeof(data64)); + if (FPGA_OK == result) { + info->interrupt_addr = data64; + if (msi_control & 0x0200) { // Extended message enable + result = fpgaReadPciConfigSpace(device, msiCapPtr + 0x0C, (PVOID)&data32, sizeof(data32)); + if (FPGA_OK == result) { + info->interrupt_data = data32; + } + } else { + result = fpgaReadPciConfigSpace(device, msiCapPtr + 0x0C, (PVOID)&data16, sizeof(data16)); + if (FPGA_OK == result) { + info->interrupt_data = data16; + } + } + } + } else { // 32-bit address + result = fpgaReadPciConfigSpace(device, msiCapPtr + 0x04, (PVOID)&data32, sizeof(data32)); + if (FPGA_OK == result) { + info->interrupt_addr = data32; + if (msi_control & 0x0200) { // Extended message enable + result = fpgaReadPciConfigSpace(device, msiCapPtr + 0x08, (PVOID)&data32, sizeof(data32)); + if (FPGA_OK == result) { + info->interrupt_data = data32; + } + } else { + result = fpgaReadPciConfigSpace(device, msiCapPtr + 0x08, (PVOID)&data16, sizeof(data16)); + if (FPGA_OK == 
result) { + info->interrupt_data = data16; + } + } + } + } + } + ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME + "%d] MSI Control = 0x%04x, MSI Address = 0x%llx, MSI Data = 0x%x\n", + dev_num, + msi_control, + info->interrupt_addr, + info->interrupt_data); + } else { + nextCapPtr = capArray[1]; + } + } + } + + if (result != FPGA_OK || !info->interrupt_valid) + { + ACL_PCIE_ERROR_IF(1, goto Close, "[acl" ACL_BOARD_PKG_NAME "%d] failed to read MSI interrupt address/data.\n", dev_num); + } + + result = fpgaGetProperties(afcToken, &prop); + if (prop) { + result = fpgaPropertiesGetBus(prop, &bus); + if (result != FPGA_OK) { + ACL_PCIE_ERROR_IF(1, goto Close, "failed to get bus.\n"); + } + ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME "%d] bus is: %d\n", dev_num, bus); + result = fpgaPropertiesGetDevice(prop, &l_device); + if (result != FPGA_OK) { + ACL_PCIE_ERROR_IF(1, goto Close, "failed to get device.\n"); + } + ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME "%d] device is: %d\n", dev_num, l_device); + result = fpgaPropertiesGetFunction(prop, &function); + if (result != FPGA_OK) { + ACL_PCIE_ERROR_IF(1, goto Close, "failed to get function.\n"); + } + ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME "%d] function is: %d\n", dev_num, function); + snprintf(info->pcie_slot_info_str, + PCIE_SLOT_INFO_STR_LEN, + "%u:%u.%u", + bus, l_device, function); + fpgaDestroyProperties(&prop); + } + // Read Link status out of PCI config space + result = fpgaReadPciConfigSpace(device, 0x80, (PVOID)&pci_link_info, sizeof(pci_link_info)); + ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME "%d] PCI Link Status is: 0x%lx\n", dev_num, pci_link_info); + ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME "%d] PCI Link Speed is: %d\n", + dev_num, + ((pci_link_info >> 16) & 0x0F)); + ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME "%d] PCI Negotiated Link Width is: %d\n", + dev_num, + ((pci_link_info >> 20) & 0x3F)); + + // Read Maximum Payload Size out of PCI config space + result = 
fpgaReadPciConfigSpace(device, 0x78, (PVOID)&pci_link_info, sizeof(pci_link_info)); + ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME "%d] PCI Maximum Payload Size raw data is: 0x%lx\n", dev_num, pci_link_info); + ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME "%d] PCI Maximum Payload Size is: %d\n", dev_num, ((pci_link_info >> 5) & 0x0007)); + switch ((pci_link_info >> 5) & 0x0007) { + case 0: + ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME "%d] PCI Maximum Payload Size is 128-byte\n", dev_num); + break; + case 1: + ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME "%d] PCI Maximum Payload Size is 256-byte\n", dev_num); + break; + case 2: + ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME "%d] PCI Maximum Payload Size is 512-byte\n", dev_num); + break; + case 3: + ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME "%d] PCI Maximum Payload Size is 1024-byte\n", dev_num); + break; + case 4: + ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME "%d] PCI Maximum Payload Size is 2048-byte\n", dev_num); + break; + default: + ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME "%d] PCI Maximum Payload Size is Unknown\n", dev_num); + break; + } + + ++num_open_devices; + goto DestroyTok; + + // Resource cleanup + +Close: + fpgaClose(device); + device = INVALID_HANDLE_VALUE; + +DestroyTok: + + if (afcToken != NULL) fpgaDestroyToken(&afcToken); + +DestroyProp: + + if (filter != NULL) fpgaDestroyProperties(&filter); + +End: + return device; +} +#endif // WINDOWS + +#if defined(LINUX) +fpga_handle open_device_linux(ACL_PCIE_DEVICE_DESCRIPTION *info, int dev_num) { + char buf[128] = {0}; + char expected_ver_string[128] = {0}; + int descriptor; + int oldflags; + int bytes_read; + struct acl_cmd driver_cmd; + + snprintf(buf, sizeof(buf), "/dev/acl" ACL_BOARD_PKG_NAME "%d", dev_num); + ssize_t device = open(buf, O_RDWR); + + // Return INVALID_DEVICE when the device is not available + if (device == -1) { + goto Close; + } + + // Make sure the Linux kernel driver is recent + driver_cmd = 
{ACLPCI_CMD_BAR, ACLPCI_CMD_GET_DRIVER_VERSION, NULL, buf, 0}; + bytes_read = read(device, &driver_cmd, 0); + ACL_PCIE_ERROR_IF(bytes_read == -1, goto Close, "Failed to read driver command"); + + snprintf( + expected_ver_string, sizeof(expected_ver_string), "%s.%s", ACL_BOARD_PKG_NAME, KERNEL_DRIVER_VERSION_EXPECTED); + ACL_PCIE_ERROR_IF(strstr(buf, expected_ver_string) != buf, + goto Close, + "Kernel driver mismatch: The board kernel driver version is %s, but\nthis host program expects " + "%s.\n Please reinstall the driver using aocl install.\n", + buf, + expected_ver_string); + + // Save the device id for the selected board + driver_cmd.bar_id = ACLPCI_CMD_BAR; + driver_cmd.command = ACLPCI_CMD_GET_PCI_DEV_ID; + driver_cmd.device_addr = NULL; + driver_cmd.user_addr = &info->device_id; + driver_cmd.size = sizeof(info->device_id); + bytes_read = read(device, &driver_cmd, sizeof(driver_cmd)); + ACL_PCIE_ERROR_IF(bytes_read == -1, goto Close, "Failed to read driver command"); + + // Set the FD_CLOEXEC flag for the file handle to disable the child to + // inherit this file handle. So the jtagd will not hold the file handle + // of the device and keep sending bogus interrupts after we call quartus_pgm. 
+ oldflags = fcntl(device, F_GETFD, 0); + descriptor = fcntl(device, F_SETFD, oldflags | FD_CLOEXEC); + if (descriptor < 0) { + goto Close; + } + + ++num_open_devices; + goto End; + +// I really don't want to use goto but it's for consistency with windows version, and convenience with macros +Close: + if (device >= 0) { + close(device); + } + device = INVALID_HANDLE_VALUE; + +End: + return device; +} + +#endif // LINUX + +// This function can be used for triggering a fake device exception for testing +void ACL_PCIE_DEVICE::test_trigger_device_interrupt() { + // Example: + // Raising ECC NON CORRECTABLE exception (exception code 2) + // Providing integer-type private_info (say, equals to 5) + unsigned long long int exception_type = 2; + int test_private_info = 5; + aocl_mmd_interrupt_info interrupt_data = {exception_type, &test_private_info, sizeof(test_private_info)}; + this->device_interrupt(m_handle, &interrupt_data, this->device_interrupt_user_data); +} + +// Perform operations required when an interrupt is received for this device +void ACL_PCIE_DEVICE::service_interrupt(unsigned int irq_type_flag) { + unsigned int kernel_update = 0; + unsigned int dma_update = 0; + + int status = this->get_interrupt_type(&kernel_update, &dma_update, irq_type_flag); + ACL_PCIE_ERROR_IF(status, return, "[%s] fail to service the interrupt.\n", m_name); + + ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_IRQ, + ":: [%s] Irq service routine called, kernel_update=%d, dma_update=%d \n", + m_name, + kernel_update, + dma_update); + + if (kernel_update && kernel_interrupt != NULL) { +#if defined(WINDOWS) + status = this->mask_irqs(); + ACL_PCIE_ERROR_IF(status, return, "[%s] failed to mask kernel interrupt.\n", m_name); +#endif + // A kernel-status interrupt - update the status of running kernels + ACL_PCIE_ASSERT(kernel_interrupt, "[%s] received kernel interrupt before the handler is installed.\n", m_name); + kernel_interrupt(m_handle, kernel_interrupt_user_data); + } else if (dma_update) { + // 
A DMA-status interrupt - let the DMA object handle this + m_dma->service_interrupt(); + } + + // Unmask the kernel_irq to enable the interrupt again. + if (m_mmd_irq_handler_enable) { + status = this->unmask_irqs(); + } else if (kernel_update) { + status = this->unmask_kernel_irq(); + } + ACL_PCIE_ERROR_IF(status, return, "[%s] fail to service the interrupt.\n", m_name); + + return; +} + +// Enable all interrupts (DMA and Kernel) +// Won't enable kernel irq unless kernel interrupt callback has been initialized +// Return 0 on success +int ACL_PCIE_DEVICE::unmask_irqs() { + int status = 0; + if (kernel_interrupt == NULL) { + // No masking for DMA interrupt. + + } else { + status = m_io->pcie_cra->write32(PCIE_CRA_IRQ_ENABLE, ACL_PCIE_GET_BIT(ACL_PCIE_KERNEL_IRQ_VEC)); + } + ACL_PCIE_ERROR_IF(status, return -1, "[%s] fail to unmask all interrupts.\n", m_name); + + return 0; // success +} + +// Disable all interrupts to service kernel that triggered interrupt +// If other kernels finish while the interrupt is masked, MSI will trigger again when +// interrupts are re-enabled. 
+int ACL_PCIE_DEVICE::mask_irqs() { + int status = 0; + UINT32 val = 0; + status = m_io->pcie_cra->write32(PCIE_CRA_IRQ_ENABLE, val); + ACL_PCIE_ERROR_IF(status, return -1, "[%s] fail to mask the kernel interrupts.\n", m_name); + + return 0; // success +} + +// Enable the kernel interrupt only +// Return 0 on success +int ACL_PCIE_DEVICE::unmask_kernel_irq() { + int status = 0; + UINT32 val = 0; + + status |= (int)(m_io->pcie_cra->read32(PCIE_CRA_IRQ_ENABLE, &val)); + val |= ACL_PCIE_GET_BIT(ACL_PCIE_KERNEL_IRQ_VEC); + status |= (int)(m_io->pcie_cra->write32(PCIE_CRA_IRQ_ENABLE, val)); + + ACL_PCIE_ERROR_IF(status, return -1, "[%s] fail to unmask the kernel interrupts.\n", m_name); + + return 0; // success +} + +// Disable the interrupt +// Return 0 on success +int ACL_PCIE_DEVICE::disable_interrupts() { + int status; + + if (m_mmd_irq_handler_enable) { + ACL_PCIE_DEBUG_MSG(":: [%s] Disabling interrupts.\n", m_name); + + status = m_io->pcie_cra->write32(PCIE_CRA_IRQ_ENABLE, 0); + ACL_PCIE_ERROR_IF(status, return -1, "[%s] failed to disable pcie interrupt.\n", m_name); + +#if defined(WINDOWS) + // Disable KMD interrupt handling for Windows + fpga_properties prop = {0}; + fpga_result result = FPGA_OK; + uint32_t num_interrupts = 0; + uint32_t i = 0; + + // Get number of interrupts in the device from the properties structure + result = fpgaGetPropertiesFromHandle(m_device, &prop); + ACL_PCIE_ERROR_IF(result != FPGA_OK, return -1, "[%s] fpgaGetPropertiesFromHandle Failed\n", m_name); + + result = fpgaPropertiesGetNumInterrupts(prop, &num_interrupts); + if (result != FPGA_OK) { + fpgaDestroyProperties(&prop); + ACL_PCIE_ERROR_IF(1, return -1, "[%s] fpgaPropertiesGetNumInterrupts Failed\n", m_name); + } + + if (dev_event_handle != NULL) { + // Loop through all the interrupts and unregister the event and + // destroy event handle associated with the interrupt + for (i = 0; i < num_interrupts; i++) { + result = fpgaUnregisterEvent(m_device, FPGA_EVENT_INTERRUPT, 
dev_event_handle[i]); + + if (result != FPGA_OK) { + fpgaDestroyProperties(&prop); + ACL_PCIE_ERROR_IF(1, return -1, "[%s] fpgaRegisterEvent Failed\n", m_name); + } + + result = fpgaDestroyEventHandle(&dev_event_handle[i]); + if (result != FPGA_OK) { + fpgaDestroyProperties(&prop); + ACL_PCIE_ERROR_IF(1, return -1, "[%s] fpgaCreateEventHandle Failed\n", m_name); + } + } + free(dev_event_handle); + dev_event_handle = NULL; + } + fpgaDestroyProperties(&prop); +#endif // WINDOWS + m_mmd_irq_handler_enable = false; + } + + return 0; // success +} + +#if defined(WINDOWS) + +// Enable PCI express interrupts. Set up the KMD to mask the interrupt enable bit when +// an interrupt is received to prevent the level-sensitive interrupt from immediately +// firing again. +// Return 0 on success +int ACL_PCIE_DEVICE::enable_interrupts(int user_signal_number) { + int status; + fpga_properties prop = NULL; + fpga_result result = FPGA_OK; + uint32_t num_interrupts = 0; + uint32_t i = 0; + HANDLE deviceStopWaitObj = NULL; + BOOLEAN flag; + int ret_value = 0; // return 0 on success + + ACL_PCIE_DEBUG_MSG(":: [%s] Enabling PCIe interrupts.\n", m_name); + + // Mask off hardware interrupts before enabling them + status = m_io->pcie_cra->write32(PCIE_CRA_IRQ_ENABLE, 0); + ACL_PCIE_ERROR_IF(status, return -1, "[%s] failed to mask off all interrupts before enabling them.\n", m_name); + + // Enable interrupts in the KMD + + // Get number of interrupts in the device from the properties structure + result = fpgaGetPropertiesFromHandle(m_device, &prop); + ACL_PCIE_ERROR_IF(result != FPGA_OK, return -1, "[%s] fpgaGetPropertiesFromHandle Failed\n", m_name); + + result = fpgaPropertiesGetNumInterrupts(prop, &num_interrupts); + if (result != FPGA_OK) { + ret_value = -1; + ACL_PCIE_ERROR_IF(1, goto End, "[%s] fpgaPropertiesGetNumInterrupts Failed\n", m_name); + } + + dev_event_handle = NULL; + dev_event_handle = (fpga_event_handle *)malloc(sizeof(fpga_event_handle) * num_interrupts); + if 
(dev_event_handle == NULL) { + ret_value = -1; + ACL_PCIE_ERROR_IF(1, goto End, "[%s] malloc for event handle array Failed\n", m_name); + } + + // Loop through all the interrupts and register an event and + // create event handle associated with the interrupt + + for (i = 0; i < num_interrupts; i++) { + result = fpgaCreateEventHandle(&dev_event_handle[i]); + if (result != FPGA_OK) { + ret_value = -1; + ACL_PCIE_ERROR_IF(1, goto End, "[%s] fpgaCreateEventHandle Failed\n", m_name); + } + + result = fpgaRegisterEvent(m_device, FPGA_EVENT_INTERRUPT, dev_event_handle[i], i); + if (result != FPGA_OK) { + ret_value = -1; + ACL_PCIE_ERROR_IF(1, goto End, "[%s] fpgaRegisterEvent Failed\n", m_name); + } + + // Register the user-mode interrupt handler + // Executed after interrupt is recieved and processed in kernel + flag = (BOOLEAN)RegisterWaitForSingleObject(&deviceStopWaitObj, + dev_event_handle[i], + (WAITORTIMERCALLBACK)pcie_interrupt_handler, + static_cast<void *>(this), + INFINITE, + WT_EXECUTEINWAITTHREAD); + + if (flag == 0) { + ret_value = -1; + ACL_PCIE_ERROR_IF(1, goto End, "[%s] fpgaRegisterEvent Failed\n", m_name); + } + } + status = this->unmask_irqs(); + if (status) { + ret_value = -1; + ACL_PCIE_ERROR_IF(1, goto End, "[%s] failed to enable interrupts.\n", m_name); + } + + m_mmd_irq_handler_enable = true; + + // Resource cleanup +End: + fpgaDestroyProperties(&prop); + return ret_value; +} + +// Use irq status to determine type of interrupt +// Result is returned in kernel_update/dma_update arguments. 
+// Return 0 on success +int ACL_PCIE_DEVICE::get_interrupt_type(unsigned int *kernel_update, + unsigned int *dma_update, + unsigned int irq_type_flag) { + UINT32 irq_status; + unsigned int dma_status; + int status; + + status = m_io->pcie_cra->read32(PCIE_CRA_IRQ_STATUS, &irq_status); + ACL_PCIE_ERROR_IF(status, return -1, "[%s] fail to interrupt type.\n", m_name); + + *kernel_update = ACL_PCIE_READ_BIT(irq_status, ACL_PCIE_KERNEL_IRQ_VEC); + + status = m_dma->check_dma_interrupt(&dma_status); + if (status != 1) { + *dma_update = dma_status; + } + + return 0; // success +} + +#endif // WINDOWS +#if defined(LINUX) + +// For Linux, it will set-up a signal handler for signals for kernel driver +// Return 0 on success +int ACL_PCIE_DEVICE::enable_interrupts(int user_signal_number) { + int status; + ACL_PCIE_DEBUG_MSG(":: [%s] Enabling PCIe interrupts on Linux (via signals).\n", m_name); + + // All interrupt controls are in the kernel driver. + m_mmd_irq_handler_enable = false; + + // Send the globally allocated signal number to the driver + struct acl_cmd signal_number_cmd {}; + signal_number_cmd.bar_id = ACLPCI_CMD_BAR; + signal_number_cmd.command = ACLPCI_CMD_SET_SIGNAL_NUMBER; + signal_number_cmd.device_addr = NULL; + signal_number_cmd.user_addr = &user_signal_number; + signal_number_cmd.size = sizeof(user_signal_number); + status = write(m_device, &signal_number_cmd, sizeof(signal_number_cmd)); + ACL_PCIE_ERROR_IF(status, return -1, "[%s] failed to set signal number for interrupts.\n", m_name); + + // Sanity check, did the driver get it + int readback_signal_number; + signal_number_cmd.user_addr = &readback_signal_number; + signal_number_cmd.command = ACLPCI_CMD_GET_SIGNAL_NUMBER; + signal_number_cmd.size = sizeof(readback_signal_number); + status = read(m_device, &signal_number_cmd, sizeof(signal_number_cmd)); + ACL_PCIE_ERROR_IF(status, return -1, "[%s] failed to get signal number for interrupts.\n", m_name); + ACL_PCIE_ERROR_IF(readback_signal_number != 
user_signal_number, + return -1, + "[%s] got wrong signal number %d, expected %d\n", + m_name, + readback_signal_number, + user_signal_number); + + // Set "our" device id (the handle id received from acl_pcie.cpp) to correspond to + // the device managed by the driver. Will get back this id + // with signal from the driver. Will allow us to differentiate + // the source of kernel-done signals with multiple boards. + + // the last bit is reserved as a flag for DMA completion + int result = m_handle << 1; + struct acl_cmd read_cmd = {ACLPCI_CMD_BAR, ACLPCI_CMD_SET_SIGNAL_PAYLOAD, NULL, &result}; + status = write(m_device, &read_cmd, sizeof(result)); + ACL_PCIE_ERROR_IF(status, return -1, "[%s] failed to enable interrupts.\n", m_name); + + return 0; // success +} + +// Determine the interrupt type using the irq_type_flag +// Return 0 on success +int ACL_PCIE_DEVICE::get_interrupt_type(unsigned int *kernel_update, + unsigned int *dma_update, + unsigned int irq_type_flag) { + // For Linux, the interrupt type is mutually exclusive + *kernel_update = irq_type_flag ? 
0 : 1; + *dma_update = 1 - *kernel_update; + + return 0; // success +} + +#endif // LINUX + +// Called by the host program when there are spare cycles +int ACL_PCIE_DEVICE::yield() { + // Give the DMA object a chance to crunch any pending data + return m_dma->yield(); +} + +// Set kernel interrupt and event update callbacks +// return 0 on success +int ACL_PCIE_DEVICE::set_kernel_interrupt(aocl_mmd_interrupt_handler_fn fn, void *user_data) { + int status; + + kernel_interrupt = fn; + kernel_interrupt_user_data = user_data; + + if (m_device != INVALID_HANDLE_VALUE) { + status = this->unmask_kernel_irq(); + ACL_PCIE_ERROR_IF(status, return -1, "[%s] failed to set kernel interrupt callback funciton.\n", m_name); + } + + return 0; // success +} + +int ACL_PCIE_DEVICE::set_device_interrupt(aocl_mmd_device_interrupt_handler_fn fn, void *user_data) { + int status; + + device_interrupt = fn; + device_interrupt_user_data = user_data; + + if (m_device != INVALID_HANDLE_VALUE) { + status = this->unmask_kernel_irq(); + ACL_PCIE_ERROR_IF(status, return -1, "[%s] failed to set device interrupt callback funciton.\n", m_name); + } + + return 0; // success +} + +int ACL_PCIE_DEVICE::set_status_handler(aocl_mmd_status_handler_fn fn, void *user_data) { + event_update = fn; + event_update_user_data = user_data; + + return 0; // success +} + +// The callback function set by "set_status_handler" +// It's used to notify/update the host whenever an event is finished +void ACL_PCIE_DEVICE::event_update_fn(aocl_mmd_op_t op, int status) { + ACL_PCIE_ASSERT(event_update, "[%s] event_update is called with a empty update function pointer.\n", m_name); + + ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_OP, ":: [%s] Update for event e=%p.\n", m_name, op); + event_update(m_handle, event_update_user_data, op, status); +} + +// Forward get buffer call to host channel +void *ACL_PCIE_DEVICE::hostchannel_get_buffer(size_t *buffer_size, int channel, int *status) { + return m_hostch->get_buffer(buffer_size, 
channel, status); +} +// Forward ack call to host channel +size_t ACL_PCIE_DEVICE::hostchannel_ack_buffer(size_t send_size, int channel, int *status) { + return m_hostch->ack_buffer(send_size, channel, status); +} + +// Memory I/O +// return 0 on success +int ACL_PCIE_DEVICE::write_block( + aocl_mmd_op_t e, aocl_mmd_interface_t mmd_interface, void *host_addr, size_t dev_addr, size_t size) { +#ifdef DLA_MMD + ACL_PCIE_ASSERT(e == nullptr, "DLA_MMD does not support callback events in ACL_PCIE_DEVICE::write_block"); +#else + ACL_PCIE_ASSERT(event_update, "[%s] event_update callback function is not provided.\n", m_name); +#endif + int status = -1; // assume failure + + switch (mmd_interface) { + case AOCL_MMD_KERNEL: + status = m_io->kernel_if->write_block(dev_addr, size, host_addr); + break; + case AOCL_MMD_MEMORY: + status = read_write_block(e, host_addr, dev_addr, size, false /*writing*/); + break; + case AOCL_MMD_PLL: + status = m_io->pll->write_block(dev_addr, size, host_addr); + break; + case AOCL_MMD_HOSTCH: + default: + ACL_PCIE_ASSERT(0, "[%s] unknown MMD interface.\n", m_name); + } + + ACL_PCIE_ERROR_IF(status, return -1, "[%s] failed to write block.\n", m_name); + + return 0; // success +} + +int ACL_PCIE_DEVICE::read_block( + aocl_mmd_op_t e, aocl_mmd_interface_t mmd_interface, void *host_addr, size_t dev_addr, size_t size) { +#ifdef DLA_MMD + ACL_PCIE_ASSERT(e == nullptr, "DLA_MMD does not support callback events in ACL_PCIE_DEVICE::read_block"); +#else + ACL_PCIE_ASSERT(event_update, "[%s] event_update callback function is not provided.\n", m_name); +#endif + int status = -1; // assume failure + + switch (mmd_interface) { + case AOCL_MMD_KERNEL: + status = m_io->kernel_if->read_block(dev_addr, size, host_addr); + break; + case AOCL_MMD_MEMORY: + status = read_write_block(e, host_addr, dev_addr, size, true /*reading*/); + break; + case AOCL_MMD_PLL: + status = m_io->pll->read_block(dev_addr, size, host_addr); + break; + case AOCL_MMD_HOSTCH: + default: + 
ACL_PCIE_ASSERT(0, "[%s] unknown MMD interface.\n", m_name); + } + + ACL_PCIE_ERROR_IF(status, return -1, "[%s] failed to read block.\n", m_name); + + return 0; // success +} + +// Copy a block between two locations in device memory +// return 0 on success +int ACL_PCIE_DEVICE::copy_block( + aocl_mmd_op_t e, aocl_mmd_interface_t mmd_interface, size_t src, size_t dst, size_t size) { + ACL_PCIE_ASSERT(event_update, "[%s] event_update callback function is not provided.\n", m_name); + ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_OP, + ":: [%s] Copying " SIZE_FMT_U " bytes data from 0x" SIZE_FMT_X " (device) to 0x" SIZE_FMT_X + " (device), with e=%p\n", + m_name, + size, + src, + dst, + e); + +#define BLOCK_SIZE (8 * 1024 * 1024) +#if defined(WINDOWS) + __declspec(align(128)) static unsigned char data[BLOCK_SIZE]; +#endif // WINDOWS +#if defined(LINUX) + static unsigned char data[BLOCK_SIZE] __attribute__((aligned(128))); +#endif // LINUX + + do { + size_t transfer_size = (size > BLOCK_SIZE) ? BLOCK_SIZE : size; + read_block(NULL /* blocking read */, mmd_interface, data, src, transfer_size); + write_block(NULL /* blocking write */, mmd_interface, data, dst, transfer_size); + + src += transfer_size; + dst += transfer_size; + size -= transfer_size; + } while (size > 0); + + if (e) { + this->event_update_fn(e, 0); + } + + return 0; // success +} + +// Forward create hostchannel call to host channel +int ACL_PCIE_DEVICE::create_hostchannel(char *name, size_t queue_depth, int direction) { + return m_hostch->create_hostchannel(name, queue_depth, direction); +} + +// Forward destroy hostchannel call to host channel +int ACL_PCIE_DEVICE::destroy_channel(int channel) { return m_hostch->destroy_hostchannel(channel); } + +// Read or Write a block of data to device memory. 
// Use either DMA or directly read/write through BAR
// Return 0 on success
int ACL_PCIE_DEVICE::read_write_block(aocl_mmd_op_t e, void *host_addr, size_t dev_addr, size_t size, bool reading) {
  const uintptr_t uintptr_host = reinterpret_cast<uintptr_t>(host_addr);

  int status = 0;
  size_t dma_size = 0;

#ifdef DLA_MMD
  // CoreDLA runtime assumes host/device transfers are thread safe, enforce that here
  // mutex will unlock when its lock goes out of scope
  std::unique_lock<std::mutex> dma_mutex_lock(m_dma_mutex);
#endif

  if (reading) {
    ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_OP,
                               ":: [%s] Reading " SIZE_FMT_U " bytes data from 0x" SIZE_FMT_X
                               " (device) to %p (host), with e=%p\n",
                               m_name,
                               size,
                               dev_addr,
                               host_addr,
                               e);
  } else {
    ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_OP,
                               ":: [%s] Writing " SIZE_FMT_U " bytes data from %p (host) to 0x" SIZE_FMT_X
                               " (device), with e=%p\n",
                               m_name,
                               size,
                               host_addr,
                               dev_addr,
                               e);
  }

  // Return immediately if size is zero
  if (size == 0) {
    if (e) {
      this->event_update_fn(e, 0);
    }
    return 0;
  }

  // DMA is used only when both endpoints are alignment-compatible and the
  // transfer is large enough (>= 1024 bytes) to be worth the setup cost.
  bool aligned = ((uintptr_host & DMA_ALIGNMENT_BYTE_MASK) | (dev_addr & DMA_ALIGNMENT_BYTE_MASK)) == 0;
  if (m_use_dma_for_big_transfers && aligned && (size >= 1024)) {
    // DMA transfers must END at aligned boundary.
    // If that's not the case, use DMA up to such boundary, and regular
    // read/write for the remaining part.
    dma_size = size - (size & DMA_ALIGNMENT_BYTE_MASK);
  } else if (m_use_dma_for_big_transfers && (size >= 1024)) {
    ACL_PCIE_WARN_MSG("[%s] NOT using DMA to transfer " SIZE_FMT_U
                      " bytes from %s to %s because of lack of alignment\n"
                      "**                 host ptr (%p) and/or dev offset (0x" SIZE_FMT_X
                      ") is not aligned to %u bytes\n",
                      m_name,
                      size,
                      (reading ? "device" : "host"),
                      (reading ? "host" : "device"),
                      host_addr,
                      dev_addr,
                      DMA_ALIGNMENT_BYTES);
  }

  // Perform read/write through BAR if the data is not fit for DMA or if there is remaining part from DMA
  if (dma_size < size) {
    void *host_addr_new = reinterpret_cast<void *>(uintptr_host + dma_size);
    size_t dev_addr_new = dev_addr + dma_size;
    size_t remain_size = size - dma_size;

    ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_OP,
                               ":: [%s] Perform read/write through BAR for remaining " SIZE_FMT_U
                               " bytes (out of " SIZE_FMT_U " bytes)\n",
                               m_name,
                               remain_size,
                               size);

    status = read_write_block_bar(host_addr_new, dev_addr_new, remain_size, reading);
    ACL_PCIE_ERROR_IF(status, return -1, "[%s] failed to perform read/write through BAR.\n", m_name);
  }

  if (dma_size != 0) {
    // Hand the aligned prefix to the DMA engine; completion is reported
    // through the event e (or by blocking below when e is NULL).
    m_dma->read_write(host_addr, dev_addr, dma_size, e, reading);

    // Block if event is NULL
    if (e == NULL) {
      m_dma->stall_until_idle();
    }
  } else {
    if (e != NULL) {
      this->event_update_fn(e, 0);
    }
  }

  return 0;  // success
}

// Read or Write a block of data to device memory through BAR
// Walks the transfer one memory-window segment at a time, re-pointing the
// window with set_segment() as the device address advances.
// Return 0 on success
int ACL_PCIE_DEVICE::read_write_block_bar(void *host_addr, size_t dev_addr, size_t size, bool reading) {
  void *cur_host_addr = host_addr;
  size_t cur_dev_addr = dev_addr;
  size_t bytes_transfered = 0;

  for (bytes_transfered = 0; bytes_transfered < size;) {
    // decide the size to transfer for current iteration
    size_t cur_size = ACL_PCIE_MEMWINDOW_SIZE - (cur_dev_addr % ACL_PCIE_MEMWINDOW_SIZE);
    if (bytes_transfered + cur_size >= size) {
      cur_size = size - bytes_transfered;
    }

    // set the proper window segment
    set_segment(cur_dev_addr);
    size_t window_rel_ptr_start = cur_dev_addr % ACL_PCIE_MEMWINDOW_SIZE;
    size_t window_rel_ptr = window_rel_ptr_start;

    // A simple blocking read
    // The address should be in the global memory range, we assume
    // any offsets are already accounted for in the offset
    ACL_PCIE_ASSERT(window_rel_ptr + cur_size <= ACL_PCIE_MEMWINDOW_SIZE,
                    "[%s] trying to access out of the range of the memory window.\n",
                    m_name);

    // Workaround a bug in Jungo driver.
    // First, transfer the non 8 bytes data at the front, one byte at a time
    // Then, transfer multiple of 8 bytes (size of size_t) using read/write_block
    // At the end, transfer the remaining bytes, one byte at a time
    size_t dev_odd_start = std::min(sizeof(size_t) - window_rel_ptr % sizeof(size_t), cur_size);
    if (dev_odd_start != sizeof(size_t)) {
      read_write_small_size(cur_host_addr, window_rel_ptr, dev_odd_start, reading);
      incr_ptrs(&cur_host_addr, &window_rel_ptr, &bytes_transfered, dev_odd_start);
      cur_size -= dev_odd_start;
    }

    size_t tail_size = cur_size % sizeof(size_t);
    size_t size_mul_8 = cur_size - tail_size;

    if (size_mul_8 != 0) {
      if (reading) {
        m_io->mem->read_block(window_rel_ptr, size_mul_8, cur_host_addr);
      } else {
        m_io->mem->write_block(window_rel_ptr, size_mul_8, cur_host_addr);
      }
      incr_ptrs(&cur_host_addr, &window_rel_ptr, &bytes_transfered, size_mul_8);
    }

    if (tail_size != 0) {
      read_write_small_size(cur_host_addr, window_rel_ptr, tail_size, reading);
      incr_ptrs(&cur_host_addr, &window_rel_ptr, &bytes_transfered, tail_size);
      cur_size -= tail_size;
    }

    // increase the current device address to be transferred
    cur_dev_addr += (window_rel_ptr - window_rel_ptr_start);
  }

  return 0;  // success
}

// Read or Write a small size of data to device memory, one byte at a time
// Return 0 on success
int ACL_PCIE_DEVICE::read_write_small_size(void *host_addr, size_t dev_addr, size_t size, bool reading) {
  UINT8 *ucharptr_host = static_cast<UINT8 *>(host_addr);
  int status;

  for (size_t i = 0; i < size; ++i) {
    if (reading) {
      status = m_io->mem->read8(dev_addr + i, ucharptr_host + i);
    } else {
      status = m_io->mem->write8(dev_addr + i, ucharptr_host[i]);
    }
    ACL_PCIE_ERROR_IF(status, return -1, "[%s] failed to read write with odd size.\n", m_name);
  }

  return 0;  // success
}

// Set the segment that the memory windows is accessing to
// Caches the current segment in m_segment and only touches the hardware when
// the segment actually changes.
// Return 0 on success
int ACL_PCIE_DEVICE::set_segment(size_t addr) {
  UINT64 segment_readback;
  UINT64 cur_segment = addr & ~(ACL_PCIE_MEMWINDOW_SIZE - 1);
  int status = 0;

  // Only execute the PCI write if we need to *change* segments
  if (cur_segment != m_segment) {
    // PCIe reordering rules could cause the segment change to get reordered,
    // so read before and after!
    status |= (int)(m_io->window->read64(0, &segment_readback));

    status |= (int)(m_io->window->write64(0, cur_segment));
    m_segment = cur_segment;
    ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_BLOCKTX, ":::::: [%s] Changed segment id to %llu.\n", m_name, m_segment);

    status |= (int)(m_io->window->read64(0, &segment_readback));
  }

  ACL_PCIE_ERROR_IF(status, return -1, "[%s] failed to set segment for memory access windows.\n", m_name);

  return 0;  // success
}

// Advance the host pointer, device offset and transfer counter by incr bytes.
void ACL_PCIE_DEVICE::incr_ptrs(void **host, size_t *dev, size_t *counter, size_t incr) {
  const uintptr_t uintptr_host = reinterpret_cast<uintptr_t>(*host);

  *host = reinterpret_cast<void *>(uintptr_host + incr);
  *dev += incr;
  *counter += incr;
}

// Query the on-chip temperature sensor
// Returns false when the board has no sensor; otherwise stores the raw CSR
// reading in *temp and returns true.
bool ACL_PCIE_DEVICE::get_ondie_temp_slow_call(cl_int *temp) {
  cl_int read_data;

  // We assume this during read later
  ACL_PCIE_ASSERT(sizeof(cl_int) == sizeof(INT32), "sizeof(cl_int) != sizeof(INT32)");

#ifndef ACL_PCIE_HAS_TEMP_SENSOR
  ACL_PCIE_DEBUG_MSG(":: [%s] On-chip temperature sensor not supported by this board.\n", m_name);
  return false;
#endif

  ACL_PCIE_DEBUG_MSG(":: [%s] Querying on-chip temperature sensor...\n", m_name);

  // read temperature sensor
  m_io->temp_sensor->read32(0, (UINT32 *)&read_data);

  ACL_PCIE_DEBUG_MSG(":: [%s] Read temp sensor data. Value is: %i\n", m_name, read_data);
  *temp = read_data;
  return true;
}

// mmap the device into the process (Linux + ACL_HOST_MEMORY_SHARED only) and
// translate the host virtual address to the FPGA-usable physical address via
// the kernel driver.  Returns NULL on unsupported platforms or mmap failure.
void *ACL_PCIE_DEVICE::shared_mem_alloc(size_t size, unsigned long long *device_ptr_out) {
#if defined(WINDOWS)
  return NULL;
#endif  // WINDOWS
#if defined(LINUX)
#ifdef ACL_HOST_MEMORY_SHARED
  void *host_ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, m_device, 0);

  if (device_ptr_out != NULL && host_ptr == (void *)-1) {
    // when mmap fails, it returns (void*)-1, not NULL
    host_ptr = NULL;
    *device_ptr_out = (unsigned long long)0;

  } else if (device_ptr_out != NULL) {
    /* map received host_ptr to FPGA-usable address. */
    void *dev_ptr = NULL;
    struct acl_cmd read_cmd = {ACLPCI_CMD_BAR, ACLPCI_CMD_GET_PHYS_PTR_FROM_VIRT, &dev_ptr, &host_ptr, sizeof(dev_ptr)};

    bool failed_flag = (read(m_device, &read_cmd, sizeof(dev_ptr)) != 0);
    ACL_PCIE_DEBUG_MSG(
        "   Mapped vaddr %p to phys addr %p. %s\n", host_ptr, dev_ptr, failed_flag == 0 ? "OK" : "FAILED");
    if (failed_flag) {
      *device_ptr_out = (unsigned long long)NULL;
    } else {
      /* When change to 64-bit pointers on the device, update driver code
       * to deal with larger-than-void* ptrs. */
      *device_ptr_out = (unsigned long long)dev_ptr;

      /* Now need to add offset of the shared system. */
    }
  }

  return host_ptr;
#else
  return NULL;
#endif
#endif  // LINUX
}

// Release a mapping obtained from shared_mem_alloc (no-op on Windows).
void ACL_PCIE_DEVICE::shared_mem_free(void *vptr, size_t size) {
#if defined(WINDOWS)
  return;
#endif  // WINDOWS
#if defined(LINUX)
  if (vptr != NULL) {
    munmap(vptr, size);
  }
#endif  // LINUX
}

#ifdef DLA_MMD

// Quiesce the device before reprogramming: set the being_programmed flag,
// disable interrupts and save the PCIe control registers.
// Returns 0 on success, 1 on failure (flag is cleared again on failure).
int ACL_PCIE_DEVICE::pause_and_save_pcie()
{
  int failed_cont_reg_save;

  // set the being_programmed flag
  m_being_programmed = true;

  // disable interrupt and save control registers
  const int failed_int_disable = this->disable_interrupts();
  ACL_PCIE_ERROR_IF(failed_int_disable, goto cleanup_save, "could not disable interrupt.\n");

  // Do this last before programming
  failed_cont_reg_save = m_config->save_pci_control_regs();
  ACL_PCIE_ERROR_IF(failed_cont_reg_save, goto cleanup_save, "could not save control regs\n");

  return 0;

  cleanup_save:

  m_being_programmed = false;
  return 1;
}

// Counterpart of pause_and_save_pcie(): restore the saved PCIe control
// registers (Linux) and wait for external memory calibration.
// Returns 0 on success, 1 if UniPhy calibration fails.
int ACL_PCIE_DEVICE::restore_and_resume_pcie()
{
#if defined(LINUX)
  m_config->load_pci_control_regs();
#endif

  if (wait_for_uniphy()) {
    ACL_PCIE_DEBUG_MSG(":: [%s] Uniphy failed to calibrate.\n", m_name);

    m_being_programmed = false;

    return 1;
  }

  m_being_programmed = false;
  return 0;
}

// JTAG full-chip programming (using quartus_pgm via USB-Blaster) to replace periphery + core
// Return 0 on success
int ACL_PCIE_DEVICE::reprogram_sof(const char *sof_filename, const bool skipSaveRestore) {
  int saveRetCode = 0;

  if (!skipSaveRestore)
  {
    saveRetCode = pause_and_save_pcie();
    if (saveRetCode)
    {
      return saveRetCode;
    }
  }

  int reprogram_failed = 1;  // assume failure

  // JTAG programming the device
  ACL_PCIE_DEBUG_MSG(":: [%s] Starting JTAG programming of the device...\n", m_name);
  reprogram_failed = m_config->program_with_SOF_file(sof_filename, "0" /*ad_cable*/, "0" /*ad_device_index*/);

  int restoreRetCode = 0;

  if (!skipSaveRestore)
  {
    restoreRetCode = restore_and_resume_pcie();
    if
(restoreRetCode)
    {
      return restoreRetCode;
    }
  }

  if (!(reprogram_failed)) {
    ACL_PCIE_DEBUG_MSG(":: [%s] JTAG programming passed.\n", m_name);
  }

  return reprogram_failed;
}
#else
// perform PR reprogram by attempting to program the board using an RBF. If this is not possible due to
// 1) Envoking the user of JTAG_PROGRAMMING via ACL_PCIE_USE_JTAG_PROGRAMMING
// 2) RBF or HASH are not present
// 3) PR Base ID does not match that with which the RBF was compiled
// 4) UniPhy fails to calibrate
// Then returns 1. Returns 0 on success. Always returns flag from arguments indicating source of failure
int ACL_PCIE_DEVICE::pr_reprogram(struct acl_pkg_file *pkg,
                                  const char *SOFNAME,
                                  int *rbf_or_hash_not_provided,
                                  int *hash_mismatch,
                                  unsigned *use_jtag_programming,
                                  int *quartus_compile_version_mismatch) {
  // Environment variable to control when to use JTAG instead of PR (overriding the default programming method: PR)
  int reprogram_failed = 1;
  size_t core_rbf_len = 0, pr_import_version_len = 0, quartus_version_len = 0, pll_config_len = 0;
  *use_jtag_programming = 0;
  char *str_use_jtag_programming = getenv("ACL_PCIE_USE_JTAG_PROGRAMMING");
  if (str_use_jtag_programming) *use_jtag_programming = 1;

  // 1. Default programming method: PR
  if (!*use_jtag_programming) {
    // checking that rbf and hash sections exist in fpga.bin
    if (acl_pkg_section_exists(pkg, ACL_PKG_SECTION_CORE_RBF, &core_rbf_len) &&
        acl_pkg_section_exists(pkg, ACL_PKG_SECTION_HASH, &pr_import_version_len) &&
        (acl_pkg_section_exists(pkg, ACL_PKG_SECTION_QVERSION, &quartus_version_len) || m_skip_quartus_version_check)) {
      *rbf_or_hash_not_provided = 0;
      ACL_PCIE_DEBUG_MSG(
          ":: [%s] Programming kernel region using PR with rbf file size %i\n", m_name, (UINT32)core_rbf_len);

      // read rbf and hash from fpga.bin
      char *core_rbf;
      acl_aligned_malloc((void **)&core_rbf, core_rbf_len + 1);
      int read_core_rbf_ok = acl_pkg_read_section(pkg, ACL_PKG_SECTION_CORE_RBF, core_rbf, core_rbf_len + 1);

      // Gate PR on the Quartus version recorded in the .acl.qversion section,
      // unless the check is explicitly skipped.
      if (!m_skip_quartus_version_check) {
        char *quartus_compile_version_str = (char *)malloc(quartus_version_len + 1);
        if (quartus_compile_version_str) {
          int quartus_compile_version_ok =
              acl_pkg_read_section(pkg, ACL_PKG_SECTION_QVERSION, quartus_compile_version_str, quartus_version_len + 1);

          if (quartus_compile_version_ok) {
            // Remove Linux and Windows new-line ending in .acl.qversion
            if ((quartus_version_len > 0) && (quartus_compile_version_str[quartus_version_len - 1] == '\n' ||
                                             quartus_compile_version_str[quartus_version_len - 1] == '\r')) {
              quartus_compile_version_str[quartus_version_len - 1] = '\0';
            }
            if ((quartus_version_len > 1) && (quartus_compile_version_str[quartus_version_len - 2] == '\r')) {
              quartus_compile_version_str[quartus_version_len - 2] = '\0';
            }

            *quartus_compile_version_mismatch = quartus_ver_test(quartus_compile_version_str);
          } else {
            *quartus_compile_version_mismatch = 1;
          }
          free(quartus_compile_version_str);
          quartus_compile_version_str = NULL;
        } else {
          *quartus_compile_version_mismatch = 1;
        }
      } else {
        *quartus_compile_version_mismatch = 0;
      }

      if (*quartus_compile_version_mismatch == 0) {
        char *pr_import_version_str = (char *)malloc(pr_import_version_len + 1);
        if (pr_import_version_str) {
          int pr_import_version_ok =
              acl_pkg_read_section(pkg, ACL_PKG_SECTION_HASH, pr_import_version_str, pr_import_version_len + 1);

          // checking that hash was successfully read from section .acl.hash within fpga.bin
          if (pr_import_version_ok) {
            unsigned int pr_import_version = (unsigned int)strtol(pr_import_version_str, NULL, 10);

            // checking that base revision hash matches import revision hash and aocx and programmed sof is from same
            // Quartus version
            if (pr_base_id_test(pr_import_version) == 0) {
              *hash_mismatch = 0;

              // Kernel driver wants it aligned to 4 bytes.
              int aligned_to_4_bytes(0 == (3 & (uintptr_t)(core_rbf)));
              reprogram_failed = 1;  // Default to fail before PRing

              // checking that rbf was successfully read from section .acl.core.rbf within fpga.bin
              if (read_core_rbf_ok && !(core_rbf_len % 4) && aligned_to_4_bytes && !version_id_test()) {
                // reprogram Arria 10 devices
                if (strcmp(ACL_BSP_TYPE, "Arria10") == 0) {
                  ACL_PCIE_DEBUG_MSG(":: [%s] Starting PR programming of the device...\n", m_name);
                  reprogram_failed = m_config->program_core_with_PR_file_a10((char *)core_rbf, core_rbf_len);
                  ACL_PCIE_DEBUG_MSG(":: [%s] Finished PR programming of the device.\n", m_name);
                };

                // reprogram Stratix 10 devices
                // S10 additionally needs the PLL configuration from .acl.pll_config.
                if (strcmp(ACL_BSP_TYPE, "Stratix10") == 0) {
                  acl_pkg_section_exists(pkg, ACL_PKG_SECTION_PLL_CONFIG, &pll_config_len);
                  char *pll_config_str = (char *)malloc(pll_config_len + 1);
                  if (pll_config_str) {
                    int pll_config_ok =
                        acl_pkg_read_section(pkg, ACL_PKG_SECTION_PLL_CONFIG, pll_config_str, pll_config_len + 1);
                    if (pll_config_ok) {
                      ACL_PCIE_DEBUG_MSG(":: [%s] Starting PR programming of the device...\n", m_name);
                      reprogram_failed = m_config->program_core_with_PR_file_s10(
                          (char *)core_rbf, core_rbf_len, (char *)pll_config_str);
                      ACL_PCIE_DEBUG_MSG(":: [%s] Finished PR programming of the device.\n", m_name);
                    };
                  };
                  free(pll_config_str);
                  pll_config_str = NULL;
                };

                if (reprogram_failed) {
                  ACL_PCIE_DEBUG_MSG(":: [%s] PR programming failed.\n", m_name);
                  // PR failed. Check if device I/O is blocked.
                  if (check_kernel_region_status() == -1) {
                    ACL_PCIE_INFO("[%s] Partial Reconfiguration of FPGA has failed.\n", m_name);
                    ACL_PCIE_INFO("[%s] FPGA device will not be available until host has been powercycled.\n", m_name);
                    exit(1);
                  }
                } else if (version_id_test()) {
                  ACL_PCIE_DEBUG_MSG(":: [%s] version_id_test() failed.\n", m_name);
                  reprogram_failed = 1;
                } else if (wait_for_uniphy()) {
                  ACL_PCIE_DEBUG_MSG(":: [%s] Uniphy failed to calibrate.\n", m_name);
                  reprogram_failed = 1;
                } else {
                  ACL_PCIE_DEBUG_MSG(":: [%s] PR programming passed.\n", m_name);
                }
              }
            }
          }
          free(pr_import_version_str);
          pr_import_version_str = NULL;
        }
      }
      acl_aligned_free(core_rbf);
    }
  }

  return reprogram_failed;
}

// Reprogram the device with given binary file.
// There are two ways to program:
// 1.
JTAG full-chip programming (using quartus_pgm via USB-Blaster) to replace periphery + core
// Return 0 on success
//
// Flow:
//  * program_mode == ACL_PCIE_PROGRAM_PR: attempt Partial Reconfiguration only,
//    with no JTAG fallback; returns whether the PR attempt failed.
//  * otherwise: attempt PR first, then fall back to full-chip JTAG programming
//    when PR was not applicable (no rbf/hash in fpga.bin, hash mismatch,
//    explicit JTAG request, or an unskipped Quartus version mismatch).
int ACL_PCIE_DEVICE::reprogram(void *data, size_t data_size, int program_mode) {
  int reprogram_failed = 1;          // assume failure
  int rbf_or_hash_not_provided = 1;  // assume no rbf or hash are provided in fpga.bin
  int hash_mismatch = 1;             // assume base revision and import revision hashes do not match
  unsigned use_jtag_programming = 0; // assume no need for jtag programming
  int quartus_compile_version_mismatch = 1;
  size_t quartus_version_len;

  const char *SOFNAME = "reprogram_temp.sof";
  size_t sof_len = 0;

  ACL_PCIE_DEBUG_MSG(":: [%s] Starting to program device...\n", m_name);

  // fpga.bin is an acl_pkg archive; open it directly from the caller's buffer
  struct acl_pkg_file *pkg = acl_pkg_open_file_from_memory((char *)data, data_size, ACL_PKG_SHOW_ERROR);
  ACL_PCIE_ERROR_IF(pkg == NULL, return reprogram_failed, "cannot open file from memory using pkg editor.\n");

  // set the being_programmed flag
  m_being_programmed = true;

  // the new reprogram flow: first try PR, if failed falls back to the old reprogram flow
  int try_pr_failed = 0;
  // if choose to try reprogram with preserving memory
  if (program_mode == ACL_PCIE_PROGRAM_PR) {
    // only try PR, no fall back to JTAG
    ACL_PCIE_DEBUG_MSG("[%s] Trying Partial Reconfiguration\n", m_name);
    reprogram_failed = pr_reprogram(pkg,
                                    SOFNAME,
                                    &rbf_or_hash_not_provided,
                                    &hash_mismatch,
                                    &use_jtag_programming,
                                    &quartus_compile_version_mismatch);
    // clean up
    // any of these statuses means PR could not (or should not) be used
    if (reprogram_failed || use_jtag_programming || rbf_or_hash_not_provided || hash_mismatch ||
        (quartus_compile_version_mismatch && !m_skip_quartus_version_check)) {
      // try PR failed
      try_pr_failed = 1;
    }
    if (pkg) acl_pkg_close_file(pkg);
    m_being_programmed = false;
    return try_pr_failed;
  }

  // the old reprogram flow. Try PR and then Try JTAG
  // 1. Default to PR reprogramming
  ACL_PCIE_DEBUG_MSG("[%s] Reprogram the device with data saving and restoring\n", m_name);
  ACL_PCIE_DEBUG_MSG("[%s] Trying Partial Reconfiguration\n", m_name);
  reprogram_failed = pr_reprogram(pkg,
                                  SOFNAME,
                                  &rbf_or_hash_not_provided,
                                  &hash_mismatch,
                                  &use_jtag_programming,
                                  &quartus_compile_version_mismatch);

  // Autodetect JTAG cable & device index
  // Cable and Index value shouldn't overflow
  char ad_cable[AD_CABLE_SIZE];
  char ad_device_index[AD_CABLE_SIZE];

  // 2. Fallback programming method: JTAG full-chip programming
  if (use_jtag_programming || rbf_or_hash_not_provided || hash_mismatch ||
      (quartus_compile_version_mismatch && !m_skip_quartus_version_check)) {
    ACL_PCIE_DEBUG_MSG("[%s] Trying Full-Chip Reconfiguration (JTAG)\n", m_name);

    // checking that sof section exists in fpga.bin
    if (acl_pkg_section_exists(pkg, ACL_PKG_SECTION_SOF, &sof_len)) {
      // check if aocx is fast-compiled or not - if so, then sof is a base revision,
      // and does not necessarily contain the desired kernel. Requires sof with
      // matching pr_base.id to be programmed (base.sof) followed by PR programming
      // with the given .rbf
      size_t fast_compile_len = 0;
      char *fast_compile_contents = NULL;
      int fast_compile = 0;
      if (acl_pkg_section_exists(pkg, ACL_PKG_SECTION_FAST_COMPILE, &fast_compile_len) &&
          acl_pkg_read_section_transient(pkg, ACL_PKG_SECTION_FAST_COMPILE, &fast_compile_contents)) {
        fast_compile = 1;
        ACL_PCIE_DEBUG_MSG(":: [%s] Fast-compile fpga.bin detected.\n", m_name);
      }
      // Find jtag cable for the board
      // Returns 0 for both ad_cable,ad_device_index if not found
      // or if Autodetect is disabled
      this->find_jtag_cable(ad_cable, ad_device_index);

      // write out a SOF file
      const int wrote_sof = acl_pkg_read_section_into_file(pkg, ACL_PKG_SECTION_SOF, SOFNAME);
      ACL_PCIE_ERROR_IF(!wrote_sof, goto cleanup, "could not write %s.\n", SOFNAME);

      // disable interrupt and save control registers
      const int failed_int_disable = this->disable_interrupts();
      ACL_PCIE_ERROR_IF(failed_int_disable, goto cleanup, "could not disable interrupt.\n");

      // Do this last before programming
      const int failed_cont_reg_save = m_config->save_pci_control_regs();
      ACL_PCIE_ERROR_IF(failed_cont_reg_save, goto cleanup, "could not save control regs\n");

      // JTAG programming the device
      ACL_PCIE_DEBUG_MSG(":: [%s] Starting JTAG programming of the device...\n", m_name);
      reprogram_failed = m_config->program_with_SOF_file(SOFNAME, ad_cable, ad_device_index);

#if defined(LINUX)
      // restore the PCI config space that full-chip programming clobbered
      m_config->load_pci_control_regs();
#endif

      ACL_PCIE_ERROR_IF(reprogram_failed, goto cleanup, "Failed to JTAG program\n");

      // Mirror the Quartus compile version string from fpga.bin into the
      // on-chip version RAM so later PR attempts can compare against it
      if (!m_skip_quartus_version_check &&
          acl_pkg_section_exists(pkg, ACL_PKG_SECTION_QVERSION, &quartus_version_len)) {
        char *quartus_compile_version_str = (char *)malloc(quartus_version_len + 1);
        if (quartus_compile_version_str) {
          int quartus_compile_version_ok =
              acl_pkg_read_section(pkg, ACL_PKG_SECTION_QVERSION, quartus_compile_version_str, quartus_version_len + 1);
          if (quartus_compile_version_ok) {
            // Remove Linux and Windows new-line ending in .acl.qversion
            if ((quartus_version_len > 0) && (quartus_compile_version_str[quartus_version_len - 1] == '\n' ||
                                              quartus_compile_version_str[quartus_version_len - 1] == '\r')) {
              quartus_compile_version_str[quartus_version_len - 1] = '\0';
            }
            if ((quartus_version_len > 1) && (quartus_compile_version_str[quartus_version_len - 2] == '\r')) {
              quartus_compile_version_str[quartus_version_len - 2] = '\0';
            }
            // Last character is NULL added by acl_pkg_read_section
            m_io->quartus_ver->write_block(0, quartus_version_len + 1, quartus_compile_version_str);
          }
          free(quartus_compile_version_str);
          quartus_compile_version_str = NULL;
        }
      }

      // sanity-check the freshly programmed image before declaring success
      if (version_id_test()) {
        ACL_PCIE_DEBUG_MSG(":: [%s] version_id_test() failed.\n", m_name);
        reprogram_failed = 1;
      } else if (wait_for_uniphy()) {
        ACL_PCIE_DEBUG_MSG(":: [%s] Uniphy failed to calibrate.\n", m_name);
        reprogram_failed = 1;
      }
      if (strcmp(ACL_BSP_TYPE, "Stratix10") == 0) {
        // S10 PR
        if (deassert_pr_reset()) {
          ACL_PCIE_DEBUG_MSG(":: [%s] PR region controller reset source deasserted.\n", m_name);
        }
      };
      if (fast_compile) {
        // need to rerun pr_reprogram because design should be loaded now
        hash_mismatch = 0;
        rbf_or_hash_not_provided = 0;
        reprogram_failed = pr_reprogram(pkg,
                                        SOFNAME,
                                        &rbf_or_hash_not_provided,
                                        &hash_mismatch,
                                        &use_jtag_programming,
                                        &quartus_compile_version_mismatch);
      }
      if (!(reprogram_failed)) {
        ACL_PCIE_DEBUG_MSG(":: [%s] JTAG programming passed.\n", m_name);
      }

    } else {
      ACL_PCIE_DEBUG_MSG(":: [%s] Could not read SOF file from fpga.bin.\n", m_name);
      reprogram_failed = 1;
    }
  }

cleanup:
  // Clean up
  if (pkg) acl_pkg_close_file(pkg);
  m_being_programmed = false;

  return reprogram_failed;
}
#endif

// Perform a simple version id read to test the basic PCIe read functionality
// Return 0 on success
int
ACL_PCIE_DEVICE::version_id_test() { + unsigned int version = ACL_VERSIONID ^ 1; // make sure it's not what we hope to find. + unsigned int iattempt; + unsigned int max_attempts = 1; + unsigned int usleep_per_attempt = 20; // 20 ms per. + + ACL_PCIE_DEBUG_MSG(":: [%s] Doing PCIe-to-fabric read test ...\n", m_name); + for (iattempt = 0; iattempt < max_attempts; iattempt++) { + m_io->version->read32(0, &version); + if ((version >= (unsigned int)ACL_VERSIONID_MIN) && (version <= (unsigned int)ACL_VERSIONID)) { + ACL_PCIE_DEBUG_MSG(":: [%s] PCIe-to-fabric read test passed\n", m_name); + return 0; + } +#if defined(WINDOWS) + Sleep(usleep_per_attempt); +#endif // WINDOWS +#if defined(LINUX) + usleep(usleep_per_attempt * 1000); +#endif // LINUX + } + + // Kernel read command succeed, but got bad data. (version id doesn't match) + ACL_PCIE_INFO("[%s] PCIe-to-fabric read test failed, read 0x%0x after %u attempts\n", m_name, version, iattempt); + return -1; +} + +// Perform a read of the kernel region status IP +// Return 0 on success (PR region is unfrozen and ready to use) +int ACL_PCIE_DEVICE::check_kernel_region_status() { +#if defined(LINUX) + unsigned int value; + struct acl_cmd driver_cmd = {ACLPCI_CMD_BAR, ACLPCI_CMD_GET_PR_REGION_STATUS, NULL, &value, sizeof(value)}; + if (read(m_device, &driver_cmd, sizeof(driver_cmd)) == -1) { + return -1; + } else { + return value; + } +#endif // Linux + return 0; +} + +// Performs a write to PR region controller to deassert reset to PR region +// Return 0 on success +int ACL_PCIE_DEVICE::deassert_pr_reset() { + ACL_PCIE_DEBUG_MSG(":: [%s] Deasserting PR region controller reset ...\n", m_name); + m_io->pr_region_ctrl->write32(FREEZE_CTRL_OFFSET, 0); + + return 0; +} + +// Quartus Compile Version check +// Return 0 on success +int ACL_PCIE_DEVICE::quartus_ver_test(char *pkg_qversion_str) { + char *fpga_qversion_str; + unsigned int version; + + // Check version ID to ensure feature supported in HW + m_io->version->read32(0, 
&version); + if (version < (unsigned int)ACL_QUARTUSVER_VERSIONID) { + ACL_PCIE_DEBUG_MSG(":: [%s] Programming on board without Quartus Version RAM\n", m_name); + return 1; + } + + // Allocate buffer for Quartus version read from FPGA with + // largest expected size + 1 for NULL + fpga_qversion_str = reinterpret_cast<char*>(malloc(ACL_QUARTUSVER_ROM_SIZE + 1)); + if (NULL == fpga_qversion_str) { + ACL_PCIE_DEBUG_MSG(":: Memory allocation failed, allocating %d bytes\n", ACL_QUARTUSVER_ROM_SIZE + 1); + free(fpga_qversion_str); + return 1; + } + // Make sure it's not what we hope to find + memset(fpga_qversion_str, 0, ACL_QUARTUSVER_ROM_SIZE + 1); + + m_io->quartus_ver->read_block(0, ACL_QUARTUSVER_ROM_SIZE, fpga_qversion_str); + + size_t fpga_qversion_len = 0; + fpga_qversion_len = strnlen(fpga_qversion_str, MAX_LEN); + + size_t pkg_qversion_len = 0; + if (pkg_qversion_str) { + pkg_qversion_len = strnlen(pkg_qversion_str, MAX_LEN); + + if (fpga_qversion_len != pkg_qversion_len) { + // Kernel read command succeed, but got bad data. (Quartus Version doesn't match) + ACL_PCIE_DEBUG_MSG("[%s] Quartus versions for base and import compile do not match\n", m_name); + ACL_PCIE_DEBUG_MSG("[%s] Board is currently programmed with sof from Quartus %s\n", m_name, fpga_qversion_str); + ACL_PCIE_DEBUG_MSG("[%s] PR import was compiled with Quartus %s\n", m_name, pkg_qversion_str); + free(fpga_qversion_str); + return 1; + } + + if (strncmp(pkg_qversion_str, fpga_qversion_str, fpga_qversion_len) == 0) { + ACL_PCIE_DEBUG_MSG(":: [%s] Quartus versions for base and import compile match\n", m_name); + ACL_PCIE_DEBUG_MSG(":: [%s] Board is currently programmed with sof from Quartus %s\n", m_name, fpga_qversion_str); + ACL_PCIE_DEBUG_MSG(":: [%s] PR import was compiled with Quartus %s\n", m_name, pkg_qversion_str); + free(fpga_qversion_str); + return 0; + } + + // Kernel read command succeed, but got bad data. 
(Quartus Version doesn't match) + ACL_PCIE_DEBUG_MSG("[%s] Quartus versions for base and import compile do not match\n", m_name); + ACL_PCIE_DEBUG_MSG("[%s] Board is currently programmed with sof from Quartus %s\n", m_name, fpga_qversion_str); + ACL_PCIE_DEBUG_MSG("[%s] PR import was compiled with Quartus %s\n", m_name, pkg_qversion_str); + } + free(fpga_qversion_str); + return 1; +} + +// Perform a simple read to the PR base ID in the static region and compare it with the given ID +// Return 0 on success +int ACL_PCIE_DEVICE::pr_base_id_test(unsigned int pr_import_version) { + unsigned int pr_base_version = 0; // make sure it's not what we hope to find. + + ACL_PCIE_DEBUG_MSG(":: [%s] Reading PR base ID from fabric ...\n", m_name); + m_io->pr_base_id->read32(0, &pr_base_version); + if (pr_base_version == pr_import_version) { + ACL_PCIE_DEBUG_MSG(":: [%s] PR base and import compile IDs match\n", m_name); + ACL_PCIE_DEBUG_MSG(":: [%s] PR base ID currently configured is 0x%0x\n", m_name, pr_base_version); + ACL_PCIE_DEBUG_MSG(":: [%s] PR import compile ID is 0x%0x\n", m_name, pr_import_version); + return 0; + }; + + // Kernel read command succeed, but got bad data. (version id doesn't match) + ACL_PCIE_DEBUG_MSG("[%s] PR base and import compile IDs do not match\n", m_name); + ACL_PCIE_DEBUG_MSG("[%s] PR base ID currently configured is 0x%0x\n", m_name, pr_base_version); + ACL_PCIE_DEBUG_MSG("[%s] PR import compile expects ID to be 0x%0x\n", m_name, pr_import_version); + return -1; +} + +// 1. Write a random value to cade_id register, do a read to confirm the write +// 2. Use the random value to find the JTAG cable for that board +// 3. 
Return "0" on ad_cable,ad_device_index if cable not found
// Autodetection is skipped entirely (outputs forced to "0") when the
// ACL_PCIE_JTAG_CABLE / ACL_PCIE_JTAG_DEVICE_INDEX env vars are set, or when
// the fabric version predates the CADE ID register.
void ACL_PCIE_DEVICE::find_jtag_cable(char *ad_cable, char *ad_device_index) {
  bool jtag_ad_disabled = false;
  bool jtag_ad_cable_found = false;
  unsigned int version = 0;

  // Check if Autodetect is disabled
  const char *cable = getenv("ACL_PCIE_JTAG_CABLE");
  const char *device_index = getenv("ACL_PCIE_JTAG_DEVICE_INDEX");
  if (cable || device_index) {
    jtag_ad_disabled = true;
    ACL_PCIE_DEBUG_MSG(":: [%s] JTAG cable autodetect disabled!!!\n", m_name);
  }

  // Check version ID to ensure feature supported in HW
  m_io->version->read32(0, &version);
  if (version < (unsigned int)ACL_CADEID_VERSIONID) {
    jtag_ad_disabled = true;
    ACL_PCIE_DEBUG_MSG(":: [%s] JTAG cable autodetect disabled due to old HW version!!!\n", m_name);
  }

  // If JTAG autodetect is enabled, program the CADEID register
  // and look for the value using in system sources and probes
  if (!jtag_ad_disabled) {
    // Only use random device here because we only want one value. Normally use mersenne twister for more values
    std::random_device rd;
    std::uniform_int_distribution<unsigned int> dist(0u, 0xFFFFFFFFu);
    unsigned int cade_id_write = dist(rd) & 0xFFFFFFFF;
    cade_id_write = cade_id_write | 0x80000000;  // Write a full 32 bit value
    unsigned int cade_id_read = 0x0;

    ACL_PCIE_DEBUG_MSG(":: [%s] Writing Cade ID to fabric ...\n", m_name);
    m_io->cade_id->write32(0, cade_id_write);

    // read back to confirm the register (and the PCIe link) is alive
    ACL_PCIE_DEBUG_MSG(":: [%s] Reading Cade ID from fabric ...\n", m_name);
    m_io->cade_id->read32(0, &cade_id_read);

    if (cade_id_write == cade_id_read) {
      ACL_PCIE_DEBUG_MSG(":: [%s] Cade ID write/read success ...\n", m_name);
      ACL_PCIE_DEBUG_MSG(
          ":: [%s] Cade ID cade_id_write 0x%0x, cade_id_read 0x%0x\n", m_name, cade_id_write, cade_id_read);

      // find_cable_with_ISSP fills ad_cable,ad_device_index; returns whether a cable was found
      jtag_ad_cable_found = m_config->find_cable_with_ISSP(cade_id_write, ad_cable, ad_device_index);

      if (!jtag_ad_cable_found) {
        ACL_PCIE_DEBUG_MSG(":: [%s] Using default cable 1 ...\n", m_name);
      } else {
        ACL_PCIE_DEBUG_MSG(":: [%s] Found Cable ...\n", m_name);
      }
    } else {
      ACL_PCIE_DEBUG_MSG(":: [%s] Cade ID write/read failed. Check BSP version or PCIE link...\n", m_name);
      ACL_PCIE_DEBUG_MSG(
          ":: [%s] Cade ID cade_id_write 0x%0x, cade_id_read 0x%0x\n", m_name, cade_id_write, cade_id_read);
    }
  }

  // fall back to "0"/"0" when autodetect was disabled or found nothing
  if (jtag_ad_disabled || !jtag_ad_cable_found) {
    snprintf(ad_cable, AD_CABLE_SIZE, "%s", "0");
    snprintf(ad_device_index, AD_CABLE_SIZE, "%s", "0");
  }
}

// Wait until the uniphy calibrated
// Return 0 on success
int ACL_PCIE_DEVICE::wait_for_uniphy() {
  const unsigned int ACL_UNIPHYSTATUS = 0;
  unsigned int status = 1, retries = 0;

  // up to 8 attempts: check status, and if not calibrated pulse the reset and wait 400 ms
  while (retries++ < 8) {
    m_io->uniphy_status->read32(0, &status);

    if (status == ACL_UNIPHYSTATUS) {
      ACL_PCIE_DEBUG_MSG(":: [%s] Uniphys are calibrated\n", m_name);
      return 0;  // success
    }

    ACL_PCIE_DEBUG_MSG(":: [%s] Uniphy status read was %x\n", m_name, status);
    ACL_PCIE_DEBUG_MSG(":: [%s] Resetting Uniphy try %d\n", m_name, retries);
    m_io->uniphy_reset->write32(0, 1);

#if defined(WINDOWS)
    Sleep(400);
#endif  // WINDOWS
#if defined(LINUX)
    usleep(400 * 1000);
#endif  // LINUX
  }

  ACL_PCIE_INFO("[%s] uniphy(s) did not calibrate. Expected 0 but read %x\n", m_name, status);

  // Failure! Was it communication error or actual calibration failure?
  if (ACL_PCIE_READ_BIT(status, 3))  // This bit is hardcoded to 0
    ACL_PCIE_INFO(
        " Uniphy calibration status is corrupt. This is likely a communication error with the board "
        "and/or uniphy_status module.\n");
  else {
    // This is a 32-bit interface with the first 4 bits aggregating the
    // various calibration signals. The remaining 28-bits would indicate
    // failure for their respective memory core.
Tell users which ones + // failed + for (int i = 0; i < 32 - 4; i++) { + if (ACL_PCIE_READ_BIT(status, 4 + i)) ACL_PCIE_INFO(" Uniphy core %d failed to calibrate\n", i); + } + ACL_PCIE_INFO(" If there are more failures than Uniphy controllers connected, \n"); + ACL_PCIE_INFO(" ensure the uniphy_status core is correctly parameterized.\n"); + } + + return -1; // failure +} diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_device.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_device.h new file mode 100644 index 0000000..29f5128 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_device.h @@ -0,0 +1,209 @@ +#ifndef ACL_PCIE_DEVICE_H +#define ACL_PCIE_DEVICE_H + +/* (c) 1992-2021 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. 
*/ + +/* ===- acl_pcie_device.h -------------------------------------------- C++ -*-=== */ +/* */ +/* Intel(R) OpenCL MMD Driver */ +/* */ +/* ===-------------------------------------------------------------------------=== */ +/* */ +/* This file declares the class to handle operations on a single device. */ +/* The actual implementation of the class lives in the acl_pcie_device.cpp */ +/* */ +/* ===-------------------------------------------------------------------------=== */ + +// Forward declaration for classes used by ACL_PCIE_DEVICE +class ACL_PCIE_DMA; +class ACL_PCIE_CONFIG; +class ACL_PCIE_MM_IO_MGR; +class ACL_PCIE_HOSTCH; + +#if defined(LINUX) +typedef int fpga_handle; +#else +#include <opae/fpga.h> +#endif // LINUX + +#ifdef DLA_MMD +// CoreDLA runtime assumes host/device transfers are thread safe +#include <mutex> +// don't assume opencl has been installed +typedef int cl_int; +#endif + +// Encapsulates the functionality of an ACL device connected to the host +// through a PCI express bus. 
class ACL_PCIE_DEVICE {
 public:
  ACL_PCIE_DEVICE(int dev_num, const char *name, int handle, int user_signal_number);
  ~ACL_PCIE_DEVICE();
  // non-copyable: owns a device handle and interrupt registrations
  ACL_PCIE_DEVICE(const ACL_PCIE_DEVICE&) = delete;
  ACL_PCIE_DEVICE& operator= (const ACL_PCIE_DEVICE&) = delete;

  bool is_valid() { return m_device != INVALID_HANDLE_VALUE; };
  bool is_initialized() { return m_initialized; };
  bool is_being_programmed() { return m_being_programmed; };

  // Perform operations required when an interrupt is received for this device
  void service_interrupt(unsigned int irq_type_flag = 0);
  // This function can be used for triggering a fake device exception for
  // testing purposes (NOTE(review): original comment was truncated — confirm)
  void test_trigger_device_interrupt();

  // The callback function set by "set_status_handler"
  // It's used to notify/update the host whenever an event is finished
  void event_update_fn(aocl_mmd_op_t op, int status);

  // Called by the host program when there are spare cycles
  int yield();

  // Memory I/O
  // return 0 on success
  int write_block(aocl_mmd_op_t e, aocl_mmd_interface_t mmd_interface, void *host_addr, size_t dev_addr, size_t size);
  int read_block(aocl_mmd_op_t e, aocl_mmd_interface_t mmd_interface, void *host_addr, size_t dev_addr, size_t size);
  int copy_block(aocl_mmd_op_t e, aocl_mmd_interface_t mmd_interface, size_t src, size_t dst, size_t size);

  // Create channel. return handle to channel on success, negative otherwise
  int create_hostchannel(char *name, size_t queue_depth, int direction);

  // return 0 on success
  int destroy_channel(int channel);

  // return pointer that user can write to for write channel, and read from for read channel
  void *hostchannel_get_buffer(size_t *buffer_size, int channel, int *status);

  // return the size in bytes of the amount of buffer that was acknowledged to channel
  size_t hostchannel_ack_buffer(size_t send_size, int channel, int *status);

  // Set kernel, device interrupts and event update callbacks
  // return 0 on success
  int set_kernel_interrupt(aocl_mmd_interrupt_handler_fn fn, void *user_data);
  int set_device_interrupt(aocl_mmd_device_interrupt_handler_fn fn, void *user_data);
  int set_status_handler(aocl_mmd_status_handler_fn fn, void *user_data);

  // Query PCIe information of the device
  char *get_dev_pcie_info() { return m_info.pcie_info_str; };

  // Query on-die temperature sensor, if available
  bool get_ondie_temp_slow_call(cl_int *temp);

  // Shared memory manipulation functions
  void *shared_mem_alloc(size_t size, unsigned long long *device_ptr_out);
  void shared_mem_free(void *host_ptr, size_t size);

  // Reprogram the device with given binary file
  // return 0 on success
#ifdef DLA_MMD
  int pause_and_save_pcie();
  int restore_and_resume_pcie();
  int reprogram_sof(const char *sof_filename, const bool skipSaveRestore = false);
#else
  int reprogram(void *data, size_t data_size, int program_mode);
#endif

 private:
  // Helper routines for interrupts
  // return 0 on success, negative on error
  int mask_irqs();
  int unmask_irqs();
  int unmask_kernel_irq();
  int disable_interrupts();
  int enable_interrupts(int user_signal_number);
  int get_interrupt_type(unsigned int *kernel_update, unsigned int *dma_update, unsigned int irq_type_flag);
#if defined(WINDOWS)
  void enable_msi(bool enable);
#endif  // WINDOWS

  // Helper routines for read or write operations
  // return 0 on success, negative on error (except for the "incr_ptrs" routine)
  int read_write_block(aocl_mmd_op_t e, void *host_addr, size_t dev_addr, size_t size, bool reading);
  int read_write_block_bar(void *host_addr, size_t dev_addr, size_t size, bool reading);
  int read_write_small_size(void *host_addr, size_t dev_addr, size_t size, bool reading);
  int set_segment(size_t addr);
  void incr_ptrs(void **host, size_t *dev, size_t *counter, size_t incr);
  int does_base_periph_match_new_periph(struct acl_pkg_file *pkg, const char *dev_name);

  // Helper routines for simple functionality test
  // return 0 on success, negative on error
  int version_id_test();
  int wait_for_uniphy();
  int pr_base_id_test(unsigned int pr_import_version);
  int deassert_pr_reset();
  int quartus_ver_test(char *pkg_qversion_str);
  int check_kernel_region_status();

  // Write a random value to cade_id register, do a read to confirm the write
  // Use the random value to find the JTAG cable for that board
  // Return 0 on ad_cable,ad_device_index if cable not found
  void find_jtag_cable(char *ad_cable, char *ad_device_index);

#ifndef DLA_MMD
  // Performs PR reprogramming if possible, and returns different statuses on
  // PR Hash, JTAG programming, RBF or Hash Presence
  // Returns 0 on success, 1 on reprogram fail
  int pr_reprogram(struct acl_pkg_file *pkg,
                   const char *SOFNAME,
                   int *rbf_or_hash_not_provided,
                   int *hash_mismatch,
                   unsigned *use_jtag_programming,
                   int *quartus_compile_version_mismatch);
#endif

  // Kernel interrupt handler and event update callbacks
  aocl_mmd_interrupt_handler_fn kernel_interrupt;
  void *kernel_interrupt_user_data;
  aocl_mmd_device_interrupt_handler_fn device_interrupt;
  void *device_interrupt_user_data;
  aocl_mmd_status_handler_fn event_update;
  void *event_update_user_data;
  int m_user_signal_number;

  // Owned helper objects (BAR register access, DMA engine, host channels)
  ACL_PCIE_MM_IO_MGR *m_io;
  ACL_PCIE_DMA *m_dma;
  ACL_PCIE_HOSTCH *m_hostch;
ACL_PCIE_CONFIG *m_config; + + static const int MAX_NAME_LENGTH = 32; + int m_handle; + char m_name[MAX_NAME_LENGTH]; + fpga_handle m_device; + ACL_PCIE_DEVICE_DESCRIPTION m_info; + + bool m_use_dma_for_big_transfers; + bool m_mmd_irq_handler_enable; + bool m_initialized; + bool m_being_programmed; + bool m_skip_quartus_version_check; + + // IRQ acknowledgement commands in the KMD + static const unsigned int NUM_ACK_CMDS = 3; +#if defined(WINDOWS) + fpga_event_handle *dev_event_handle; +#endif // WINDOWS + + // For the host, memory is segmented. This stores the last used segment + // ID so we don't needlessly update it in hardware + UINT64 m_segment; + +#ifdef DLA_MMD + std::mutex m_dma_mutex; +#endif +}; + +#endif // ACL_PCIE_DEVICE_H diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_dma.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_dma.h new file mode 100644 index 0000000..ec9fdb1 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_dma.h @@ -0,0 +1,37 @@ +#ifndef ACL_PCIE_DMA_H +#define ACL_PCIE_DMA_H + +/* (c) 1992-2021 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. 
See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. */ + +/* ===- acl_pcie_dma.h ----------------------------------------------- C++ -*-=== */ +/* */ +/* Intel(R) OpenCL MMD Driver */ +/* */ +/* ===-------------------------------------------------------------------------=== */ +/* */ +/* */ +/* ===-------------------------------------------------------------------------=== */ + +#if defined(WINDOWS) +#include "acl_pcie_dma_windows.h" +#endif // WINDOWS +#if defined(LINUX) +#include "acl_pcie_dma_linux.h" +#endif // LINUX + +#endif // ACL_PCIE_DMA_H diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_dma_linux.cpp b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_dma_linux.cpp new file mode 100644 index 0000000..a83b0dd --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_dma_linux.cpp @@ -0,0 +1,141 @@ +// (c) 1992-2021 Intel Corporation. +// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words +// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. +// and/or other countries. 
Other marks and brands may be claimed as the property +// of others. See Trademarks on intel.com for full list of Intel trademarks or +// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) +// Your use of Intel Corporation's design tools, logic functions and other +// software and tools, and its AMPP partner logic functions, and any output +// files any of the foregoing (including device programming or simulation +// files), and any associated documentation or information are expressly subject +// to the terms and conditions of the Altera Program License Subscription +// Agreement, Intel MegaCore Function License Agreement, or other applicable +// license agreement, including, without limitation, that your use is for the +// sole purpose of programming logic devices manufactured by Intel and sold by +// Intel or its authorized distributors. Please refer to the applicable +// agreement for further details. + +/* ===- acl_pcie_dma_linux.cpp --------------------------------------- C++ -*-=== */ +/* */ +/* Intel(R) OpenCL MMD Driver */ +/* */ +/* ===-------------------------------------------------------------------------=== */ +/* */ +/* This file implements the class to handle Linux-specific DMA operations. */ +/* The declaration of the class lives in the acl_pcie_dma_linux.h */ +/* The actual implementation of DMA operation is inside the Linux kernel driver. 
*/ +/* */ +/* ===-------------------------------------------------------------------------=== */ + +#if defined(LINUX) + +// common and its own header files +#include "acl_pcie_dma_linux.h" +#include "acl_pcie.h" + +// other header files inside MMD driver +#include "acl_pcie_device.h" +#include "acl_pcie_mm_io.h" + +// other standard header files +#include <stdio.h> +#include <sys/time.h> +#include <unistd.h> + +ACL_PCIE_DMA::ACL_PCIE_DMA(fpga_handle dev, ACL_PCIE_MM_IO_MGR *io, ACL_PCIE_DEVICE *pcie) { + ACL_PCIE_ASSERT(dev != INVALID_DEVICE, "passed in an invalid device when creating dma object.\n"); + ACL_PCIE_ASSERT(io != NULL, "passed in an empty pointer for io when creating dma object.\n"); + ACL_PCIE_ASSERT(pcie != NULL, "passed in an empty pointer for pcie when creating dma object.\n"); + + m_handle = dev; + m_pcie = pcie; + m_io = io; + m_event = NULL; +} + +ACL_PCIE_DMA::~ACL_PCIE_DMA() { + struct acl_cmd driver_cmd = {ACLPCI_CMD_BAR, ACLPCI_CMD_DMA_STOP, NULL, NULL}; + int bytes_read = read(m_handle, &driver_cmd, sizeof(driver_cmd)); + ACL_PCIE_ASSERT(bytes_read != -1, "failed to read driver command \n"); +} + +bool ACL_PCIE_DMA::is_idle() { + unsigned int result = 0; + int bytes_read; + struct acl_cmd driver_cmd; + driver_cmd.bar_id = ACLPCI_CMD_BAR; + driver_cmd.command = ACLPCI_CMD_GET_DMA_IDLE_STATUS; + driver_cmd.device_addr = NULL; + driver_cmd.user_addr = &result; + driver_cmd.size = sizeof(result); + bytes_read = read(m_handle, &driver_cmd, sizeof(driver_cmd)); + + return (bytes_read != -1 && result != 0); +} + +// Perform operations required when a DMA interrupt comes +// For Linux, +// All of the DMA related interrupts are handled inside the kernel driver, +// so when MMD gets a signal from the kernel driver indicating DMA is finished, +// it only needs to call the event_update_fn when it's needed. 
+void ACL_PCIE_DMA::service_interrupt() { + if (m_event) { + // Use a temporary variable to save the event data and reset m_event + // before calling event_update_fn to avoid race condition that the main + // thread may start a new DMA transfer before this work-thread is able to + // reset the m_event. + // therefore, an assertion is implemented here, as defensively preventing + // sending interrupt signals incorrectly. + ACL_PCIE_ASSERT( + this->is_idle(), + "The dma is still in running, cannot service an interrupt to invoke another read/write operation\n"); + aocl_mmd_op_t temp_event = m_event; + m_event = NULL; + + m_pcie->event_update_fn(temp_event, 0); + } +} + +// relinquish the CPU to let any other thread to run +// return 0 since there is no useful work to be performed here +int ACL_PCIE_DMA::yield() { + usleep(0); + return 0; +} + +// Transfer data between host and device +// This function returns right after the transfer is scheduled +// Return 0 on success +int ACL_PCIE_DMA::read_write(void *host_addr, size_t dev_addr, size_t bytes, aocl_mmd_op_t e, bool reading) { + // Currently dma cannot operate multiple read/write the same time. + // This means the read/write should be executed if and only if the dma is idle. + // Otherwise, it would cause assertion failure in the kernel space of the OS, + // which result in hanging, and even kernel panic and machine frozen as worst case. + // An assertion is implemented here, as defensively preventing race condition or incorrect sending of signal. + ACL_PCIE_ASSERT(this->is_idle(), + "The dma is still in running, cannot perform another %s operation concurrently.\n", + reading ? "read" : "write"); + + m_event = e; + + // There are two scenarios of the read/write operation + // 1. the referred event is NULL, MMD would be stalled and keep polling the DMA until it is idle. + // 2. 
the referred event is valid, MMD would return immediately, runtime will wait for + // the DMA service interrupt signal to update the status of the read/write operation. + // + // Therefore, the dma service interrupt is expected only when the event is valid. + struct acl_cmd driver_cmd {}; + driver_cmd.bar_id = ACLPCI_DMA_BAR; + driver_cmd.command = m_event ? ACLPCI_CMD_DMA_SERVICE_SIGNAL : ACLPCI_CMD_DMA_NO_SIGNAL; + driver_cmd.device_addr = reinterpret_cast<void *>(dev_addr); + driver_cmd.user_addr = host_addr; + driver_cmd.size = bytes; + if (reading) { + if (read(m_handle, &driver_cmd, sizeof(driver_cmd)) == -1) return -1; // reading failed + } else { + if (write(m_handle, &driver_cmd, sizeof(driver_cmd)) == -1) return -1; + } + return 0; // success +} + +#endif // LINUX diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_dma_linux.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_dma_linux.h new file mode 100644 index 0000000..2ad1762 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_dma_linux.h @@ -0,0 +1,75 @@ +#ifndef ACL_PCIE_DMA_LINUX_H +#define ACL_PCIE_DMA_LINUX_H + +/* (c) 1992-2021 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. 
See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. */ + +/* ===- acl_pcie_dma_linux.h ----------------------------------------- C++ -*-=== */ +/* */ +/* Intel(R) OpenCL MMD Driver */ +/* */ +/* ===-------------------------------------------------------------------------=== */ +/* */ +/* This file declares the class to handle Linux-specific DMA operations. 
*/
+/* The actual implementation of the class lives in the acl_pcie_dma_linux.cpp */
+/* */
+/* ===-------------------------------------------------------------------------=== */
+
+#if defined(LINUX)
+
+#ifdef DLA_MMD
+#include <cstddef> //size_t
+#include "aocl_mmd.h"
+typedef int fpga_handle;
+#endif
+
+class ACL_PCIE_DEVICE;
+class ACL_PCIE_MM_IO_MGR;
+
+// Linux-side DMA engine front-end. One instance drives one device handle;
+// a single transfer may be in flight at a time (see read_write / is_idle).
+class ACL_PCIE_DMA {
+ public:
+  ACL_PCIE_DMA(fpga_handle dev, ACL_PCIE_MM_IO_MGR *io, ACL_PCIE_DEVICE *pcie);
+  ~ACL_PCIE_DMA();
+
+  // True when no DMA transfer is in flight.
+  bool is_idle();
+  // Busy-wait (yielding the CPU each iteration) until the DMA is idle.
+  void stall_until_idle() {
+    while (!is_idle()) yield();
+  };
+
+  // Perform operations required when a DMA interrupt comes
+  void service_interrupt();
+
+  // Relinquish the CPU to let any other thread to run
+  // Return 0 since there is no useful work to be performed here
+  int yield();
+
+  // Transfer data between host and device
+  // This function returns right after the transfer is scheduled
+  // Return 0 on success
+  // @param e  completion event; when NULL the MMD polls the DMA to
+  //           completion, otherwise the runtime is notified via interrupt.
+  int read_write(void *host_addr, size_t dev_addr, size_t bytes, aocl_mmd_op_t e, bool reading);
+
+ private:
+  // Completion event for the in-flight transfer; NULL when no interrupt
+  // notification was requested (polling mode) or nothing is pending.
+  aocl_mmd_op_t m_event;
+
+  // Device handle plus non-owning back-pointers to the owning device and
+  // its MMIO manager — presumably outlive this object; TODO confirm.
+  fpga_handle m_handle;
+  ACL_PCIE_DEVICE *m_pcie;
+  ACL_PCIE_MM_IO_MGR *m_io;
+};
+
+#endif // LINUX
+
+#endif // ACL_PCIE_DMA_LINUX_H
diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_dma_windows.cpp b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_dma_windows.cpp
new file mode 100644
index 0000000..ab5e7b2
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_dma_windows.cpp
@@ -0,0 +1,1381 @@
+// (c) 1992-2021 Intel Corporation.
+// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words
+// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S.
+// and/or other countries. Other marks and brands may be claimed as the property
+// of others.
See Trademarks on intel.com for full list of Intel trademarks or +// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) +// Your use of Intel Corporation's design tools, logic functions and other +// software and tools, and its AMPP partner logic functions, and any output +// files any of the foregoing (including device programming or simulation +// files), and any associated documentation or information are expressly subject +// to the terms and conditions of the Altera Program License Subscription +// Agreement, Intel MegaCore Function License Agreement, or other applicable +// license agreement, including, without limitation, that your use is for the +// sole purpose of programming logic devices manufactured by Intel and sold by +// Intel or its authorized distributors. Please refer to the applicable +// agreement for further details. + +/* ===- acl_pcie_dma_windows.cpp ------------------------------------- C++ -*-=== */ +/* */ +/* Intel(R) OpenCL MMD Driver */ +/* */ +/* ===-------------------------------------------------------------------------=== */ +/* */ +/* This file implements the class to handle Windows-specific DMA operations. */ +/* The declaration of the class lives in the acl_pcie_dma_windows.h */ +/* */ +/* ===-------------------------------------------------------------------------=== */ + +#if defined(WINDOWS) + +// common and its own header files +#include "acl_pcie.h" +#include "acl_pcie_dma_windows.h" +#include "hw_pcie_constants.h" + +// other header files inside MMD driver +#include "acl_pcie_device.h" +#include "acl_pcie_mm_io.h" +#include "acl_pcie_timer.h" +#include "acl_pcie_debug.h" +#include <iostream> +#include <stdlib.h> + +#define ACL_PCIE_DMA_DEBUG(m, ...) 
ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_BLOCKTX, m, __VA_ARGS__) + +// The callback function to be scheduled inside the interrupt handler +// It will release the semaphore to allow new work to be scheduled and +// perform the dma update function +void CALLBACK myWorkCallback(PTP_CALLBACK_INSTANCE instance, void *context, PTP_WORK work) { + ACL_PCIE_DMA *m_dma = (ACL_PCIE_DMA *)context; + + ReleaseSemaphore(m_dma->m_workqueue_semaphore, 1, NULL); + + m_dma->update(true); +} + +void CALLBACK myWorkUnpinCallback(PTP_CALLBACK_INSTANCE instance, void *context, PTP_WORK work) { + ACL_PCIE_DMA *m_dma = (ACL_PCIE_DMA *)context; + + m_dma->unpin_from_queue(); +} + +void CALLBACK myWorkPinCallback(PTP_CALLBACK_INSTANCE instance, void *context, PTP_WORK work) { + ACL_PCIE_DMA *m_dma = (ACL_PCIE_DMA *)context; + + m_dma->prepin_memory(); +} + +ACL_PCIE_DMA::ACL_PCIE_DMA(fpga_handle handle, ACL_PCIE_MM_IO_MGR *io, ACL_PCIE_DEVICE *pcie) + : hostch_data(), + m_table_virt_addr(NULL), + m_table_dma_addr(), + m_table_dma_phys_addr(0), + m_active_descriptor(NULL), + m_last_pinned_size(0), + m_last_pinned_addr(NULL), + m_prepinned(0), + m_last_id(0), + m_event(NULL), + m_dev_addr(0), + m_host_addr(NULL), + m_bytes(0), + m_bytes_sent(0), + m_bytes_rem(0), + m_read(0), + m_idle(0), + m_interrupt_disabled(0), + m_pcie(NULL), + m_io(NULL), + m_timer(NULL), + m_callback_env(), + m_work(NULL), + m_workqueue_semaphore(NULL), + m_dma_unpin_pending(), + m_unpin_callback_env(), + m_unpin_threadpool(NULL), + m_unpin_work(NULL), + m_pin_callback_env(), + m_pin_threadpool(NULL), + m_pin_work(NULL) { + ACL_PCIE_ASSERT(handle != INVALID_HANDLE_VALUE, "passed in an invalid device when creating dma object.\n"); + ACL_PCIE_ASSERT(io != NULL, "passed in an empty pointer for io when creating dma object.\n"); + ACL_PCIE_ASSERT(pcie != NULL, "passed in an empty pointer for pcie when creating dma object.\n"); + + m_handle = handle; + m_io = io; + m_pcie = pcie; + + HOSTCH_DESC *h = &hostch_data; + + const 
char *use_msi = getenv("ACL_PCIE_DMA_USE_MSI"); + if (use_msi) + m_use_polling = 0; + else + m_use_polling = 1; + + SecureZeroMemory(&m_active_mem, sizeof(PINNED_MEM)); + SecureZeroMemory(&m_pre_pinned_mem, sizeof(PINNED_MEM)); + SecureZeroMemory(&m_done_mem, sizeof(PINNED_MEM)); + + // Initialize Host Channel + SecureZeroMemory(&h->m_hostch_rd_mem, sizeof(PINNED_MEM)); + SecureZeroMemory(&h->m_hostch_wr_mem, sizeof(PINNED_MEM)); + SecureZeroMemory(&h->m_hostch_rd_pointer, sizeof(PINNED_MEM)); + SecureZeroMemory(&h->m_hostch_wr_pointer, sizeof(PINNED_MEM)); + SecureZeroMemory(&h->m_sync_thread_pointer, sizeof(PINNED_MEM)); + h->push_valid = 0; + h->pull_valid = 0; + + m_timer = new ACL_PCIE_TIMER(); + + // create the threadpool to perform work the interrupt + m_threadpool = CreateThreadpool(NULL); + ACL_PCIE_ERROR_IF(m_threadpool == NULL, return, "failed to create threadpool.\n"); + + // set the number of work threads to 1 + // so that no scheduled work will be running in parallel between them + SetThreadpoolThreadMaximum(m_threadpool, 1); + bool status = SetThreadpoolThreadMinimum(m_threadpool, 1); + ACL_PCIE_ERROR_IF(status == false, return, "failed to set # of work thread to 1.\n"); + + // create the work for threadpool and its semaphore + InitializeThreadpoolEnvironment(&m_callback_env); + SetThreadpoolCallbackPool(&m_callback_env, m_threadpool); + + m_work = CreateThreadpoolWork(myWorkCallback, (void *)this, &m_callback_env); + ACL_PCIE_ERROR_IF(m_work == NULL, return, "failed to create work for threadpool.\n"); + + m_workqueue_semaphore = CreateSemaphore(NULL, 1, 1, NULL); + ACL_PCIE_ERROR_IF(m_workqueue_semaphore == NULL, return, "failed to create semaphore.\n"); + + /////////////////////////////////////////////////////////////////////////////////////////// + // Unpin thread + m_unpin_threadpool = CreateThreadpool(NULL); + ACL_PCIE_ERROR_IF(m_unpin_threadpool == NULL, return, "failed to create threadpool.\n"); + + // set the number of work threads to 1 + // 
so that no scheduled work will be running in parallel between them + SetThreadpoolThreadMaximum(m_unpin_threadpool, 1); + status = SetThreadpoolThreadMinimum(m_unpin_threadpool, 1); + ACL_PCIE_ERROR_IF(status == false, return, "failed to set # of work thread to 1.\n"); + + // create the work for threadpool and its semaphore + InitializeThreadpoolEnvironment(&m_unpin_callback_env); + SetThreadpoolCallbackPool(&m_unpin_callback_env, m_unpin_threadpool); + + m_unpin_work = CreateThreadpoolWork(myWorkUnpinCallback, (void *)this, &m_unpin_callback_env); + ACL_PCIE_ERROR_IF(m_unpin_work == NULL, return, "failed to create work for unpin threadpool.\n"); + + /////////////////////////////////////////////////////////////////////////////////////////// + // pin thread + m_pin_threadpool = CreateThreadpool(NULL); + ACL_PCIE_ERROR_IF(m_pin_threadpool == NULL, return, "failed to create threadpool.\n"); + + // set the number of work threads to 1 + // so that no scheduled work will be running in parallel between them + SetThreadpoolThreadMaximum(m_pin_threadpool, 1); + status = SetThreadpoolThreadMinimum(m_pin_threadpool, 1); + ACL_PCIE_ERROR_IF(status == false, return, "failed to set # of work thread to 1.\n"); + + // create the work for threadpool and its semaphore + InitializeThreadpoolEnvironment(&m_pin_callback_env); + SetThreadpoolCallbackPool(&m_pin_callback_env, m_pin_threadpool); + + m_pin_work = CreateThreadpoolWork(myWorkPinCallback, (void *)this, &m_pin_callback_env); + ACL_PCIE_ERROR_IF(m_pin_work == NULL, return, "failed to create work for unpin threadpool.\n"); + + /////////////////////////////////////////////////////////////////////////////////////////// + // Contiguous DMA'able memory allocation for descriptor table + + fpga_result FPGA_status; + size_t desc_table_size = sizeof(struct DMA_DESC_TABLE); + size_t page_table_size = sizeof(struct HOSTCH_TABLE); + + // Lock DMA_DESC_TABLE using WsId + FPGA_status = fpgaPrepareBuffer( + m_handle, (UINT64)desc_table_size, 
(PVOID *)&m_table_virt_addr, &m_table_dma_addr.WsId, FPGA_BUF_QUIET); + ACL_PCIE_ASSERT(FPGA_status == FPGA_OK, "fpgaPrepareBuffer function failed.\n"); + + // IOCTL call to flush CPU buffers + FPGA_status = fpgaProcessDeviceCmd( + m_handle, GUID_TO_FPGA_GUID(GUID_PCI_OPENCL_SYNC_CPU_BUFFERS), &m_table_dma_addr.WsId, NULL, 0); + ACL_PCIE_ASSERT(FPGA_status == FPGA_OK, "fpgaProcessDeviceCmd function failed.\n"); + + // Obtain Physical address for the Page associated with the buffer + FPGA_status = fpgaGetPhysicalAddress(m_handle, m_table_dma_addr.WsId, (uint64_t *)&m_table_dma_addr.dwPages, NULL); + ACL_PCIE_ASSERT(FPGA_status == FPGA_OK, "fpgaGetPhysicalAddress function failed.\n"); + + // Allocate memory for SG List + m_table_dma_addr.Page = (sg_element *)malloc(m_table_dma_addr.dwPages * sizeof(sg_element)); + + // Throw an exception in case of malloc failure + if (m_table_dma_addr.Page == NULL) throw std::bad_alloc(); + + FPGA_status = fpgaGetPhysicalAddress( + m_handle, m_table_dma_addr.WsId, (uint64_t *)&m_table_dma_addr.dwPages, (void *)m_table_dma_addr.Page); + ACL_PCIE_ASSERT(FPGA_status == FPGA_OK, "fpgaGetPhysicalAddress function failed.\n"); + + ACL_PCIE_DMA_DEBUG(":::: [DMA] Successfully locked DMA descriptor table memory.\n"); + ACL_PCIE_ASSERT(m_table_dma_addr.dwPages == 1, "fpgaPrepareBuffer function allocated more than 1 page.\n"); + + if (m_table_dma_addr.Page != NULL) m_table_dma_phys_addr = m_table_dma_addr.Page[0].phys_addr; + + // Lock HOSTCH_TABLE push channel using WsId + FPGA_status = fpgaPrepareBuffer(m_handle, + (UINT64)page_table_size, + (PVOID *)&h->push_page_table, + &hostch_data.push_page_table_addr.WsId, + FPGA_BUF_QUIET); + ACL_PCIE_ASSERT(FPGA_status == FPGA_OK, "fpgaPrepareBuffer function failed.\n"); + + // IOCTL call to flush CPU buffers + FPGA_status = fpgaProcessDeviceCmd(m_handle, + GUID_TO_FPGA_GUID(GUID_PCI_OPENCL_SYNC_CPU_BUFFERS), + (PVOID)&hostch_data.push_page_table_addr.WsId, + NULL, + 0); + ACL_PCIE_ASSERT(FPGA_status 
== FPGA_OK, "fpgaProcessDeviceCmd function failed.\n"); + + // Obtain Physical address for the Page associated with the buffer + FPGA_status = fpgaGetPhysicalAddress( + m_handle, hostch_data.push_page_table_addr.WsId, (uint64_t *)&hostch_data.push_page_table_addr.dwPages, NULL); + ACL_PCIE_ASSERT(FPGA_status == FPGA_OK, "fpgaGetPhysicalAddress function failed.\n"); + + // Allocate memory for SG List + hostch_data.push_page_table_addr.Page = + (sg_element *)malloc(hostch_data.push_page_table_addr.dwPages * sizeof(sg_element)); + + // Throw an exception in case of malloc failure + if (hostch_data.push_page_table_addr.Page == NULL) throw std::bad_alloc(); + + FPGA_status = fpgaGetPhysicalAddress(m_handle, + hostch_data.push_page_table_addr.WsId, + (uint64_t *)&hostch_data.push_page_table_addr.dwPages, + (void *)hostch_data.push_page_table_addr.Page); + ACL_PCIE_ASSERT(FPGA_status == FPGA_OK, "fpgaGetPhysicalAddress function failed.\n"); + + ACL_PCIE_DMA_DEBUG(":::: [DMA] Successfully locked descriptor table for Hostchannel memory.\n"); + ACL_PCIE_ASSERT(hostch_data.push_page_table_addr.dwPages == 1, + "fpgaPrepareBuffer function for HostChannel allocated more than 1 page.\n"); + + if (hostch_data.push_page_table_addr.Page != NULL) + hostch_data.push_page_table_bus_addr = hostch_data.push_page_table_addr.Page[0].phys_addr; + + // Lock HOSTCH_TABLE pull channel + FPGA_status = fpgaPrepareBuffer(m_handle, + (UINT64)page_table_size, + (PVOID *)&h->pull_page_table, + &hostch_data.pull_page_table_addr.WsId, + FPGA_BUF_QUIET); + ACL_PCIE_ASSERT(FPGA_status == FPGA_OK, "fpgaPrepareBuffer function for Hostchannel failed. 
\n"); + + // IOCTL call to flush CPU buffers + FPGA_status = fpgaProcessDeviceCmd(m_handle, + GUID_TO_FPGA_GUID(GUID_PCI_OPENCL_SYNC_CPU_BUFFERS), + (PVOID)&hostch_data.pull_page_table_addr.WsId, + NULL, + 0); + ACL_PCIE_ASSERT(FPGA_status == FPGA_OK, "fpgaProcessDeviceCmd function failed.\n"); + + // Obtain Physical address for the Page associated with the buffer + FPGA_status = fpgaGetPhysicalAddress( + m_handle, hostch_data.pull_page_table_addr.WsId, (uint64_t *)&hostch_data.pull_page_table_addr.dwPages, NULL); + ACL_PCIE_ASSERT(FPGA_status == FPGA_OK, "fpgaGetPhysicalAddress function failed.\n"); + + // Allocate memory for SG List + hostch_data.pull_page_table_addr.Page = + (sg_element *)malloc(hostch_data.pull_page_table_addr.dwPages * sizeof(sg_element)); + + // Throw an exception in case of malloc failure + if (hostch_data.pull_page_table_addr.Page == NULL) throw std::bad_alloc(); + + FPGA_status = fpgaGetPhysicalAddress(m_handle, + hostch_data.pull_page_table_addr.WsId, + (uint64_t *)&hostch_data.pull_page_table_addr.dwPages, + (void *)hostch_data.pull_page_table_addr.Page); + ACL_PCIE_ASSERT(FPGA_status == FPGA_OK, "fpgaGetPhysicalAddress function failed.\n"); + + ACL_PCIE_DMA_DEBUG(":::: [DMA] Successfully locked descriptor table memory.\n"); + ACL_PCIE_ASSERT(hostch_data.pull_page_table_addr.dwPages == 1, + "fpgaPrepareBuffer function for HostChannel allocated more than 1 page.\n"); + + if (hostch_data.pull_page_table_addr.Page != NULL) + hostch_data.pull_page_table_bus_addr = hostch_data.pull_page_table_addr.Page[0].phys_addr; + + // set idle status to true when finish initialization + m_idle = true; +} + +ACL_PCIE_DMA::~ACL_PCIE_DMA() { + fpga_result FPGA_status; + stall_until_idle(); + + // make sure no more work queued for threadpool + WaitForThreadpoolWorkCallbacks(m_work, FALSE); + + // hostch_destroy is expected to be called by user but to make sure, call in the destructor + hostch_destroy(ACL_HOST_CHANNEL_0_ID); + 
hostch_destroy(ACL_HOST_CHANNEL_1_ID); + + // Unlock all the previously allocated tables from the constructor + FPGA_status = fpgaReleaseBuffer(m_handle, m_table_dma_addr.WsId); + ACL_PCIE_ASSERT(FPGA_status == FPGA_OK, "fpgaReleaseBuffer was not successful\n"); + + if (m_table_dma_addr.Page != NULL) { + free(m_table_dma_addr.Page); + m_table_dma_addr.Page = NULL; + } + + FPGA_status = fpgaReleaseBuffer(m_handle, hostch_data.push_page_table_addr.WsId); + ACL_PCIE_ASSERT(FPGA_status == FPGA_OK, "fpgaReleaseBuffer was not successful\n"); + + if (hostch_data.push_page_table_addr.Page != NULL) { + free(hostch_data.push_page_table_addr.Page); + hostch_data.push_page_table_addr.Page = NULL; + } + + FPGA_status = fpgaReleaseBuffer(m_handle, hostch_data.pull_page_table_addr.WsId); + ACL_PCIE_ASSERT(FPGA_status == FPGA_OK, "fpgaReleaseBuffer was not successful\n"); + + if (hostch_data.pull_page_table_addr.Page != NULL) { + free(hostch_data.pull_page_table_addr.Page); + hostch_data.pull_page_table_addr.Page = NULL; + } + + CloseHandle(m_workqueue_semaphore); + CloseThreadpoolWork(m_work); + CloseThreadpool(m_threadpool); + + CloseThreadpoolWork(m_unpin_work); + CloseThreadpool(m_unpin_threadpool); + + CloseThreadpoolWork(m_pin_work); + CloseThreadpool(m_pin_threadpool); + + if (m_timer) { + delete m_timer; + m_timer = NULL; + } +} + +int ACL_PCIE_DMA::check_dma_interrupt(unsigned int *dma_update) { + if (!m_use_polling) { + if (m_last_id > 0 && m_last_id <= ACL_PCIE_DMA_DESC_MAX_ENTRIES) { + *dma_update = (m_table_virt_addr->header.flags[m_last_id - 1]); + } else { + return 1; + } + } + return 0; +} + +void ACL_PCIE_DMA::unpin_from_queue() { + fpga_result result; + ACL_PCIE_ASSERT(!m_dma_unpin_pending.empty(), "m_dma_unpin_pending is empty but unpin mem thread was called\n"); + + QUEUE_STRUCT entry; + + entry = m_dma_unpin_pending.front(); + m_dma_unpin_pending.pop(); + + // IOCTL call to flush IO buffers + result = fpgaProcessDeviceCmd( + m_handle, 
GUID_TO_FPGA_GUID(GUID_PCI_OPENCL_SYNC_IO_BUFFERS), (PVOID) & (entry.WsId), NULL, 0); + ACL_PCIE_ASSERT(result == FPGA_OK, "fpgaProcessDeviceCmd function failed.\n"); + + // Unlock the allocated tables associated with wsId + result = fpgaReleaseBuffer(m_handle, entry.WsId); + ACL_PCIE_ASSERT(result == FPGA_OK, "fpgaReleaseBuffer function failed.\n"); + if (entry.SGListPtr != NULL) free(entry.SGListPtr); +} + +void ACL_PCIE_DMA::prepin_memory() { pin_memory(&m_pre_pinned_mem, true); } + +void ACL_PCIE_DMA::wait_finish() { + UINT32 wait_timer; + + while (1) { + wait_timer = ACL_PCIE_DMA_TIMEOUT; + while (wait_timer > 0) { + wait_timer--; + + if (m_table_virt_addr->header.flags[m_last_id - 1] == 1) { + ACL_PCIE_DMA_DEBUG(":::: [DMA] Wait done\n"); + set_desc_table_header(); + if (WaitForSingleObject(m_workqueue_semaphore, 0L) == WAIT_OBJECT_0) { + SubmitThreadpoolWork(m_work); + } + return; + } + } + + ACL_PCIE_DMA_DEBUG(":::: [DMA] Wait timed out. Sleeping for 1ms.\n"); + Sleep(1); + } +} + +#if defined(GEN3_x16) + // Add extra descriptor for DMA controller to report 'done status' in the DMA table +void ACL_PCIE_DMA::add_extra_dma_desc() { + /* + One extra descriptor is required to be fetched. Two if using interrupts. + For reads (Host <-- FPGA), the last descriptor sets the DMA done status. + For writes (Host --> FPGA), the last descriptor fetches the status + descriptor which then sets the DMA done status. + When using interrupts, there is an additional descriptor that sends the + interrupt, handled in the same way as the above. + */ + // Clear done status flag. + m_table_virt_addr->header.flags[m_last_id - 1] = 0; // ID = m_last_id - 1 + + if (m_read) { + // descriptor[m_last_id]: write 0x1ULL to flags[m_last_id-1] which is used to indicate DMA done. 
+ set_immediate_desc( // Set status bit + &(m_table_virt_addr->descriptors[m_last_id]), // descriptor[m_last_id] location in user space + m_table_dma_phys_addr + 4*(m_last_id - 1), // physical address for 0x1ULL to write (flags[m_last_id].. flag filed size is 4 byte) + 0x1ULL, + 255 + ); + } else { + // Need to fetch status desc into different destination. + // descriptor[m_last_id]: DMA Descriptor[m_last_id+1](32 byte) to WDP register set in DMA controller. + m_active_descriptor = &(m_table_virt_addr->descriptors[m_last_id]); + set_read_desc(m_table_dma_phys_addr + sizeof(DMA_DESC_HEADER) + (m_last_id + 1) * 32, // src: set_immediate_desc descriptor location + WRITE_DESC_PRIO_OFFSET + DESC_OFFSET, // des, location of WDP register set + 32/4 // copy 32-byte, 8 word + ); + + // descriptor[m_last_id+1]: write 0x1ULL(4-byte) to status[m_last_id-1] which is used to indicate DMA done. + set_immediate_desc( // Set status bit + &(m_table_virt_addr->descriptors[m_last_id + 1]), + m_table_dma_phys_addr + 4*(m_last_id - 1), //4: size per status entry + 0x1ULL, + 255 + ); + } + MemoryBarrier(); +} +#endif + +void ACL_PCIE_DMA::send_dma_desc() { + // Disabling interrupt is used in hostch_create function during polling +#if defined(GEN3_x8) + if (m_read) { + m_io->dma->write32(ACL_PCIE_DMA_RC_WR_DESC_BASE_LOW, m_table_dma_phys_addr & 0xffffffffUL); + m_io->dma->write32(ACL_PCIE_DMA_RC_WR_DESC_BASE_HIGH, m_table_dma_phys_addr >> 32); + m_io->dma->write32(ACL_PCIE_DMA_EP_WR_FIFO_BASE_LOW, ACL_PCIE_DMA_ONCHIP_WR_FIFO_BASE_LO); + m_io->dma->write32(ACL_PCIE_DMA_EP_WR_FIFO_BASE_HIGH, ACL_PCIE_DMA_ONCHIP_WR_FIFO_BASE_HI); + m_io->dma->write32(ACL_PCIE_DMA_WR_TABLE_SIZE, ACL_PCIE_DMA_TABLE_SIZE - 1); + if (m_interrupt_disabled) + m_io->dma->write32(ACL_PCIE_DMA_WR_INT_CONTROL, ACL_PCIE_DMA_DISABLE_INT); + else + m_io->dma->write32(ACL_PCIE_DMA_WR_INT_CONTROL, ACL_PCIE_DMA_ENABLE_INT); + MemoryBarrier(); + m_io->dma->write32(ACL_PCIE_DMA_WR_LAST_PTR, m_last_id - 1); + } else { + 
m_io->dma->write32(ACL_PCIE_DMA_RC_RD_DESC_BASE_LOW, m_table_dma_phys_addr & 0xffffffffUL); + m_io->dma->write32(ACL_PCIE_DMA_RC_RD_DESC_BASE_HIGH, m_table_dma_phys_addr >> 32); + m_io->dma->write32(ACL_PCIE_DMA_EP_RD_FIFO_BASE_LOW, ACL_PCIE_DMA_ONCHIP_RD_FIFO_BASE_LO); + m_io->dma->write32(ACL_PCIE_DMA_EP_RD_FIFO_BASE_HIGH, ACL_PCIE_DMA_ONCHIP_RD_FIFO_BASE_HI); + m_io->dma->write32(ACL_PCIE_DMA_RD_TABLE_SIZE, ACL_PCIE_DMA_TABLE_SIZE - 1); + if (m_interrupt_disabled) + m_io->dma->write32(ACL_PCIE_DMA_RD_INT_CONTROL, ACL_PCIE_DMA_DISABLE_INT); + else + m_io->dma->write32(ACL_PCIE_DMA_RD_INT_CONTROL, ACL_PCIE_DMA_ENABLE_INT); + MemoryBarrier(); + m_io->dma->write32(ACL_PCIE_DMA_RD_LAST_PTR, m_last_id - 1); + } +#elif defined(GEN3_x16) + DMA_DESC_ENTRY dt_fetch_desc; + UINT32 ctrl, *pValue32; + UINT64 dt_fetch_queue_addr64; + int i; + + add_extra_dma_desc(); + // init a descriptor for start dma + dt_fetch_desc.src_addr = m_table_dma_phys_addr + sizeof(DMA_DESC_HEADER); // physical addrees of first desciptor (assume dma always start from ID 0) + dt_fetch_desc.dst_addr = m_read ? WRITE_DESC_NORM_OFFSET : READ_DESC_NORM_OFFSET; + dt_fetch_desc.dst_addr += DESC_OFFSET; + ctrl = ((m_last_id - 1) + 2) * 8; // interrupt is not enabled case ... (ID+3)*8 if interrupted is enabled (note: ID = m_last_id-1) + ctrl |= 1 << 20; // Single destination + ctrl |= 0xFE << 24; // Special descriptor ID + dt_fetch_desc.ctrl = ctrl; + + dt_fetch_queue_addr64 = m_read ? READ_DESC_PRIO_OFFSET : READ_DESC_NORM_OFFSET; + pValue32 = (UINT32 *)(&dt_fetch_desc); + for (i = 0; i < 4; i++) { + m_io->dma->write32(DESC_CTRLLER_BASE + dt_fetch_queue_addr64 + i * 4, *(pValue32 + i)); + } + // Most significant DWord must be written last. 
+ MemoryBarrier(); + m_io->dma->write32(DESC_CTRLLER_BASE + dt_fetch_queue_addr64 + 4 * 4,*(((uint32_t *)(&dt_fetch_desc)) + 4)); + MemoryBarrier(); +#else + #error "Define a PCIe 3.0/4.0/5.0 slot with x1, x2, x4, x8, and x16 lanes option" +#endif +} + +void ACL_PCIE_DMA::setup_dma_desc() { +#if defined(GEN3_x8) + m_io->dma->write32(ACL_PCIE_DMA_RC_WR_DESC_BASE_LOW, m_table_dma_phys_addr & 0xffffffffUL); + m_io->dma->write32(ACL_PCIE_DMA_RC_WR_DESC_BASE_HIGH, m_table_dma_phys_addr >> 32); + m_io->dma->write32(ACL_PCIE_DMA_EP_WR_FIFO_BASE_LOW, ACL_PCIE_DMA_ONCHIP_WR_FIFO_BASE_LO); + m_io->dma->write32(ACL_PCIE_DMA_EP_WR_FIFO_BASE_HIGH, ACL_PCIE_DMA_ONCHIP_WR_FIFO_BASE_HI); + m_io->dma->write32(ACL_PCIE_DMA_WR_TABLE_SIZE, ACL_PCIE_DMA_TABLE_SIZE - 1); + + m_io->dma->write32(ACL_PCIE_DMA_RC_RD_DESC_BASE_LOW, m_table_dma_phys_addr & 0xffffffffUL); + m_io->dma->write32(ACL_PCIE_DMA_RC_RD_DESC_BASE_HIGH, m_table_dma_phys_addr >> 32); + m_io->dma->write32(ACL_PCIE_DMA_EP_RD_FIFO_BASE_LOW, ACL_PCIE_DMA_ONCHIP_RD_FIFO_BASE_LO); + m_io->dma->write32(ACL_PCIE_DMA_EP_RD_FIFO_BASE_HIGH, ACL_PCIE_DMA_ONCHIP_RD_FIFO_BASE_HI); + m_io->dma->write32(ACL_PCIE_DMA_RD_TABLE_SIZE, ACL_PCIE_DMA_TABLE_SIZE - 1); +#endif +} + +void ACL_PCIE_DMA::set_read_desc(DMA_ADDR source, UINT64 dest, UINT32 ctl_dma_len) { +#if defined(GEN3_x8) + m_active_descriptor->src_addr_ldw = (source & 0xffffffffUL); + m_active_descriptor->src_addr_udw = (source >> 32); + m_active_descriptor->dest_addr_ldw = (dest & 0xffffffffUL); + m_active_descriptor->dest_addr_udw = (dest >> 32); + m_active_descriptor->ctl_dma_len = (ctl_dma_len | (m_last_id << 18)); + m_active_descriptor->reserved[0] = 0; + m_active_descriptor->reserved[1] = 0; + m_active_descriptor->reserved[2] = 0; +#elif defined(GEN3_x16) + m_active_descriptor->src_addr = source; + m_active_descriptor->dst_addr = dest; + m_active_descriptor->ctrl = (ctl_dma_len | (m_last_id << 24)); + m_active_descriptor->reserved[0] = 0; + m_active_descriptor->reserved[1] 
= 0; + m_active_descriptor->reserved[2] = 0; +#else + #error "Define a PCIe 3.0/4.0/5.0 slot with x1, x2, x4, x8, and x16 lanes option" +#endif +} + +void ACL_PCIE_DMA::set_write_desc(UINT64 source, DMA_ADDR dest, UINT32 ctl_dma_len) { +#if defined(GEN3_x8) + m_active_descriptor->src_addr_ldw = (source & 0xffffffffUL); + m_active_descriptor->src_addr_udw = (source >> 32); + m_active_descriptor->dest_addr_ldw = (dest & 0xffffffffUL); + m_active_descriptor->dest_addr_udw = (dest >> 32); + m_active_descriptor->ctl_dma_len = (ctl_dma_len | (m_last_id << 18)); + m_active_descriptor->reserved[0] = 0; + m_active_descriptor->reserved[1] = 0; + m_active_descriptor->reserved[2] = 0; +#elif defined(GEN3_x16) + set_read_desc(source, dest, ctl_dma_len); +#else + #error "Define a PCIe 3.0/4.0/5.0 slot with x1, x2, x4, x8, and x16 lanes option" +#endif +} + +#if defined(GEN3_x16) +void ACL_PCIE_DMA::set_immediate_desc(DMA_DESC_ENTRY *desc, UINT64 addr, UINT32 data, UINT32 id) { + uint32_t ctrl; + + desc->src_addr = data; // The data to write to given address + desc->dst_addr = addr; + ctrl = 1; // 1 DW status + ctrl |= 1 << 18; // Immediate access + ctrl |= id << 24; // Status descriptor ID + desc->ctrl = ctrl; + desc->reserved[0] = 0x0; + desc->reserved[1] = 0x0; + desc->reserved[2] = 0x0; +} +#endif + +void ACL_PCIE_DMA::set_hostch_page_entry(HOSTCH_ENTRY *page_entry, UINT64 page_addr, UINT32 page_num) { + page_entry->page_addr_ldw = (page_addr & 0xffffffffUL); + page_entry->page_addr_udw = (page_addr >> 32); + page_entry->page_num = page_num; + page_entry->reserved[0] = 0; + page_entry->reserved[1] = 0; + page_entry->reserved[2] = 1; + page_entry->reserved[3] = 0; + page_entry->reserved[4] = 0; +} + +void ACL_PCIE_DMA::set_desc_table_header() { + int i; + for (i = 0; i < ACL_PCIE_DMA_DESC_MAX_ENTRIES; i++) m_table_virt_addr->header.flags[i] = 0; +} + +// Perform operations required when a DMA interrupt comes +void ACL_PCIE_DMA::service_interrupt() { + if (!m_use_polling) { + 
// only submit a new work to the pool when there is not work in queued + if (WaitForSingleObject(m_workqueue_semaphore, 0L) == WAIT_OBJECT_0) { + set_desc_table_header(); + SubmitThreadpoolWork(m_work); + } + } +} + +void ACL_PCIE_DMA::spin_loop_ns(UINT64 wait_ns) { + cl_ulong start = m_timer->get_time_ns(); + cl_ulong finish; + + do { + finish = m_timer->get_time_ns(); + } while (finish - start < wait_ns); +} + +void ACL_PCIE_DMA::check_last_id(UINT32 *last_id) { + ACL_PCIE_ASSERT(*last_id <= (ACL_PCIE_DMA_RESET_ID + 1), "last id was greater than 255.\n"); + + if (*last_id == (ACL_PCIE_DMA_RESET_ID + 1)) { + *last_id = 0; + return; + } else if (*last_id == ACL_PCIE_DMA_TABLE_SIZE) { + *last_id = 0; + return; + } + ACL_PCIE_ASSERT(*last_id < (ACL_PCIE_DMA_TABLE_SIZE), "last id was greater than 127.\n"); +} + +// Relinquish the CPU to let any other thread to run +// Return 0 since there is no useful work to be performed here +int ACL_PCIE_DMA::yield() { + Sleep(0); + return 0; +} + +// Add a byte-offset to a void* pointer +inline void *ACL_PCIE_DMA::compute_address(void *base, uintptr_t offset) { + uintptr_t p = reinterpret_cast<uintptr_t>(base); + return reinterpret_cast<void *>(p + offset); +} + +int ACL_PCIE_DMA::hostch_buffer_lock(void *addr, size_t len, PINNED_MEM *new_mem) { + fpga_result FPGA_status; + UINT64 wsid; + + // No active segment of pinned memory - pin one + + // Lock HOSTCH_TABLE using WsId + FPGA_status = fpgaPrepareBuffer(m_handle, (UINT64)len, (PVOID *)&addr, &wsid, FPGA_BUF_PREALLOCATED); + ACL_PCIE_ASSERT(FPGA_status == FPGA_OK, "HostCh : fpgaPrepareBuffer function for Hostchannel failed.\n"); + + // Obtain Physical address for the Page associated with the buffer + FPGA_status = fpgaGetPhysicalAddress(m_handle, wsid, (PUINT64)&new_mem->pages_rem, NULL); + ACL_PCIE_ASSERT(FPGA_status == FPGA_OK, "HostCh : fpgaGetPhysicalAddress function for Hostchannel failed.\n"); + + new_mem->dma_page = (sg_element *)malloc(new_mem->pages_rem * 
sizeof(sg_element)); + + FPGA_status = fpgaGetPhysicalAddress(m_handle, wsid, (PUINT64)&new_mem->pages_rem, (void *)new_mem->dma_page); + ACL_PCIE_ASSERT(FPGA_status == FPGA_OK, "HostCh : fpgaGetPhysicalAddress function for Hostchannel failed.\n"); + + new_mem->WsId = wsid; + new_mem->UsrVa = (PVOID)addr; + new_mem->next_page = new_mem->dma_page; + + // IOCTL call to flush CPU buffers + FPGA_status = + fpgaProcessDeviceCmd(m_handle, GUID_TO_FPGA_GUID(GUID_PCI_OPENCL_SYNC_CPU_BUFFERS), (PVOID)&wsid, NULL, 0); + ACL_PCIE_ASSERT(FPGA_status == FPGA_OK, "fpgaProcessDeviceCmd function failed.\n"); + ACL_PCIE_DMA_DEBUG(":::: [DMA] HostCh Pinning 0x%zx bytes at 0x%p.\n", len, addr); + + return 0; +} + +// Only 1 pin_memory can be running at a time +void ACL_PCIE_DMA::pin_memory(PINNED_MEM *new_mem, bool prepin) { + fpga_result result; + UINT64 wsid = 0x0; + + // No active segment of pinned memory - pin one + m_bytes_rem = prepin ? (m_bytes_rem - m_last_pinned_size) : (m_bytes - m_bytes_sent); + UINT32 last_id = prepin ? 0 : m_last_id; + check_last_id(&last_id); + size_t last_id_size_offset = last_id * PAGE_SIZE; + size_t lock_size = (m_bytes_rem > ACL_PCIE_DMA_MAX_PINNED_MEM_SIZE - last_id_size_offset) + ? ACL_PCIE_DMA_MAX_PINNED_MEM_SIZE - last_id_size_offset + : m_bytes_rem; + void *lock_addr = + prepin ? compute_address(m_last_pinned_addr, m_last_pinned_size) : compute_address(m_host_addr, m_bytes_sent); + uintptr_t last_page_portion = (reinterpret_cast<uintptr_t>(lock_addr) + lock_size) & ACL_PCIE_DMA_PAGE_ADDR_MASK; + + // If doing max pinning, check if will *end* on page boundary. If not, better + // to pin a bit less and end up on the boundary. This way, will have fewer + // descriptors to send. 
+ if (lock_size == (ACL_PCIE_DMA_MAX_PINNED_MEM_SIZE - last_id_size_offset) && last_page_portion != 0) { + lock_size -= (size_t)last_page_portion; + } + + assert(lock_size < MAXDWORD); + + // Lock memory using WsId + result = fpgaPrepareBuffer(m_handle, (UINT64)lock_size, (PVOID *)&lock_addr, &wsid, FPGA_BUF_PREALLOCATED); + ACL_PCIE_ASSERT(result == FPGA_OK, "HostCh : fpgaPrepareBuffer function failed.\n"); + + // Obtain Physical address for the Page associated with the buffer + result = fpgaGetPhysicalAddress(m_handle, wsid, (PUINT64)&new_mem->pages_rem, NULL); + ACL_PCIE_ASSERT(result == FPGA_OK, "fpgaGetPhysicalAddress function failed.\n"); + + new_mem->dma_page = (sg_element *)malloc(new_mem->pages_rem * sizeof(sg_element)); + + result = fpgaGetPhysicalAddress(m_handle, wsid, (PUINT64)&new_mem->pages_rem, (void *)new_mem->dma_page); + ACL_PCIE_ASSERT(result == FPGA_OK, "fpgaGetPhysicalAddress function failed.\n"); + + new_mem->WsId = wsid; + new_mem->UsrVa = (PVOID)lock_addr; + new_mem->next_page = new_mem->dma_page; + + // IOCTL call to flush CPU buffers + result = fpgaProcessDeviceCmd(m_handle, GUID_TO_FPGA_GUID(GUID_PCI_OPENCL_SYNC_CPU_BUFFERS), (PVOID)&wsid, NULL, 0); + ACL_PCIE_ASSERT(result == FPGA_OK, "fpgaProcessDeviceCmd function failed.\n"); + + m_last_pinned_size = lock_size; + m_last_pinned_addr = lock_addr; + + ACL_PCIE_DMA_DEBUG(":::: [DMA] Pinning 0x%zx bytes at 0x%p.\n", lock_size, lock_addr); +} + +// Unpin Memory +void ACL_PCIE_DMA::unpin_memory(PINNED_MEM *old_mem) { + fpga_result result = FPGA_OK; + UINT64 wsId = old_mem->WsId; + + // IOCTL call to flush I/O buffers + result = fpgaProcessDeviceCmd(m_handle, GUID_TO_FPGA_GUID(GUID_PCI_OPENCL_SYNC_IO_BUFFERS), (PVOID)&wsId, NULL, 0); + ACL_PCIE_ASSERT(result == FPGA_OK, "fpgaProcessDeviceCmd function failed.\n"); + + // UnLock previously locked memory using WsId + result = fpgaReleaseBuffer(m_handle, wsId); + ACL_PCIE_ASSERT(result == FPGA_OK, "fpgaReleaseBuffer function failed.\n"); + + if 
(old_mem->dma_page != NULL) free(old_mem->dma_page); + + old_mem->next_page = NULL; + old_mem->dma_page = NULL; + old_mem->pages_rem = 0; + old_mem->UsrVa = NULL; +} + +// Check if user's 'ack' API updated end pointer of circular buf +// Update end pointer in IP +int ACL_PCIE_DMA::hostch_push_update() { + HOSTCH_DESC *h = &hostch_data; + + if (h->rd_buf_end_pointer != *h->user_rd_end_pointer) { + h->rd_buf_end_pointer = *h->user_rd_end_pointer; + } else { + h->loop_counter = (h->loop_counter > 0) ? h->loop_counter - 1 : h->loop_counter; + return 1; + } + h->loop_counter = HOSTCH_LOOP_COUNTER; + + m_io->dma->write32(ACL_HOST_CHANNEL_0_HOST_ENDP, (UINT32)h->rd_buf_end_pointer); + + return 0; +} + +// Check if user's 'ack' API updated front pointer of circular buf +// Update end pointer in IP +int ACL_PCIE_DMA::hostch_pull_update() { + HOSTCH_DESC *h = &hostch_data; + + if (h->wr_buf_front_pointer != *h->user_wr_front_pointer) { + h->wr_buf_front_pointer = *h->user_wr_front_pointer; + } else { + h->loop_counter = (h->loop_counter > 0) ? 
h->loop_counter - 1 : h->loop_counter; + return 1; + } + h->loop_counter = HOSTCH_LOOP_COUNTER; + + m_io->dma->write32(ACL_HOST_CHANNEL_1_HOST_FRONTP, (UINT32)h->wr_buf_front_pointer); + return 0; +} + +// Transfer data between host and device +// This function returns right after the transfer is scheduled +// Return 0 on success +int ACL_PCIE_DMA::read_write(void *host_addr, size_t dev_addr, size_t bytes, aocl_mmd_op_t e, bool reading) { + ACL_PCIE_ASSERT(m_event == NULL, "non-empty event before a new DMA read/write.\n"); + + // Copy the parameters over and mark the job as running + m_event = e; + m_read = reading; + m_bytes = bytes; + m_host_addr = host_addr; + m_dev_addr = dev_addr; + + // Start processing the request + m_bytes_sent = 0; + m_last_id = ACL_PCIE_DMA_RESET_ID; + m_prepinned = 0; + +#if defined(GEN3_x8) + if (m_read) { + m_io->dma->read32(ACL_PCIE_DMA_WR_LAST_PTR, &m_last_id); + m_last_id++; + } else { + m_io->dma->read32(ACL_PCIE_DMA_RD_LAST_PTR, &m_last_id); + m_last_id++; + } + +#elif defined(GEN3_x16) + m_last_id = 0; +#else + #error "Define a PCIe 3.0/4.0/5.0 slot with x1, x2, x4, x8, and x16 lanes option" +#endif + m_idle = false; + + // setup the work inside the threadpool to perform the first DMA transaction + ACL_PCIE_ERROR_IF(WaitForSingleObject(m_workqueue_semaphore, 0L) != WAIT_OBJECT_0, + return -1, + "failed to schedule the first work for DMA read/write.\n"); + + SubmitThreadpoolWork(m_work); + + return 0; // success +} + +// function to be scheduled to execute whenever an interrupt arrived +bool ACL_PCIE_DMA::update(bool forced) { + cl_ulong start; + int status; + UINT32 max_transfer; + unsigned int i; + HOSTCH_DESC *h = &hostch_data; + size_t current_transfer_size = 0; + + if (!forced) return false; + + if (h->pull_valid && m_idle) { + // Check user memory to see if there was update to user buffer pointer for pull + status = hostch_pull_update(); + } + + if (h->push_valid && m_idle) { + // Check user memory to see if there was update 
to user buffer pointer for push + status = hostch_push_update(); + } + + if ((h->push_valid | h->pull_valid) && m_idle && (h->thread_sync_valid && h->loop_counter > 0)) { + // setup the work inside the threadpool to perform the first DMA transaction + ACL_PCIE_ERROR_IF(WaitForSingleObject(m_workqueue_semaphore, 0L) != WAIT_OBJECT_0, + return false, + "HostCh : failed to schedule the first work for DMA read/write.\n"); + SubmitThreadpoolWork(m_work); + return false; + + } else if (m_idle && (h->thread_sync_valid && h->loop_counter == 0)) { + *h->user_thread_sync = 0; + return false; + + } else if (m_idle) { + return false; + } + + ACL_PCIE_DMA_DEBUG(":::: [DMA] Bytes left %zu\n", m_bytes - m_bytes_sent); + // Process any descriptors that have completed + set_desc_table_header(); + cl_ulong finish = 0; + if (ACL_PCIE_DEBUG >= VERBOSITY_BLOCKTX) finish = m_timer->get_time_ns(); + + // Check if the transaction is complete + if (m_bytes_sent == m_bytes) { + if (m_active_mem.UsrVa != NULL) unpin_memory(&m_active_mem); + ACL_PCIE_DMA_DEBUG(":::: [DMA] Transaction complete!\n"); + ACL_PCIE_ASSERT(m_active_mem.UsrVa == NULL, "there is still active pinned memory after the DMA read/write.\n"); + WaitForThreadpoolWorkCallbacks(m_unpin_work, false); + if (!m_dma_unpin_pending.empty()) { + ACL_PCIE_DMA_DEBUG(":::: [DMA] Done, but pinned memory still in queue. Wait until queue is empty.\n"); + if (WaitForSingleObject(m_workqueue_semaphore, 0L) == WAIT_OBJECT_0) { + SubmitThreadpoolWork(m_work); + } + + Sleep(0); + return true; + } + + m_last_id = ACL_PCIE_DMA_RESET_ID; + m_idle = true; + + if (m_event) { + // Use a temporary variable to save the event data and reset m_event before calling event_update_fn + // to avoid race condition that the main thread may start a new DMA transfer before this work-thread + // is able to reset the m_event. 
+ aocl_mmd_op_t temp_event = m_event; + m_event = NULL; + + m_pcie->event_update_fn(temp_event, 0); + } + + if ((h->push_valid | h->pull_valid) && (h->thread_sync_valid && h->loop_counter > 0)) { + ACL_PCIE_ERROR_IF(WaitForSingleObject(m_workqueue_semaphore, 0L) != WAIT_OBJECT_0, + return false, + "HostCh : failed to schedule the first work for DMA read/write.\n"); + SubmitThreadpoolWork(m_work); + } + + return true; + } + + // Check if we are done with previously pinned memory. + if (m_active_mem.UsrVa == NULL || m_active_mem.pages_rem == 0) { + m_done_mem = m_active_mem; + + WaitForThreadpoolWorkCallbacks(m_pin_work, false); + + // Get pre-pinned memory if there are any. + if (m_pre_pinned_mem.UsrVa != NULL) { + m_active_mem = m_pre_pinned_mem; + m_pre_pinned_mem.UsrVa = NULL; + m_prepinned = 0; + } else if (m_prepinned) { + if (WaitForSingleObject(m_workqueue_semaphore, 0L) == WAIT_OBJECT_0) { + SubmitThreadpoolWork(m_work); + } + Sleep(1); + return true; + } else { + pin_memory(&m_active_mem, false); + } + } + + // Main DMA execution + // 1. Transfers up to 128 descriptors + // - Each descriptor can transfer up to ACL_PCIE_DMA_MAX_TRANSFER_SIZE bytes + // 2. Launch a thread to unpin memory + // 3. 
Launch a thread to pre-pin next memory + if (m_active_mem.pages_rem > 0) { + // Calculate how many descriptors can be sent + check_last_id(&m_last_id); + ACL_PCIE_DMA_DEBUG(":::: [DMA] last id was %u\n", m_last_id); + max_transfer = ACL_PCIE_DMA_TABLE_SIZE - m_last_id; + + ACL_PCIE_DMA_DEBUG(":::: [DMA] max_transfer %u\n", max_transfer); + + // Build descriptor table + for (i = 0; i < max_transfer; i++) { + if (strcmp(ACL_BSP_TYPE, "Arria10") == 0) { + // A10 DMA + m_active_descriptor = &(m_table_virt_addr->descriptors[i]); + }; + if (strcmp(ACL_BSP_TYPE, "Stratix10") == 0) { + // S10 DMA + m_active_descriptor = &(m_table_virt_addr->descriptors[m_last_id]); + }; + if (m_read) { + if (m_active_mem.next_page->length > ACL_PCIE_DMA_MAX_TRANSFER_SIZE) { + ACL_PCIE_DMA_DEBUG(":::: [DMA] page size is larger than %u for read. Page size is %u bytes\n", + ACL_PCIE_DMA_MAX_TRANSFER_SIZE, + m_active_mem.next_page->length); + set_write_desc(m_dev_addr, m_active_mem.next_page->phys_addr, ACL_PCIE_DMA_MAX_TRANSFER_SIZE / 4); + m_active_mem.next_page->length -= ACL_PCIE_DMA_MAX_TRANSFER_SIZE; + m_active_mem.next_page->phys_addr += ACL_PCIE_DMA_MAX_TRANSFER_SIZE; + m_dev_addr += ACL_PCIE_DMA_MAX_TRANSFER_SIZE; + m_bytes_sent += ACL_PCIE_DMA_MAX_TRANSFER_SIZE; + current_transfer_size += ACL_PCIE_DMA_MAX_TRANSFER_SIZE; + } else { + set_write_desc(m_dev_addr, m_active_mem.next_page->phys_addr, m_active_mem.next_page->length / 4); + m_dev_addr += m_active_mem.next_page->length; + m_bytes_sent += m_active_mem.next_page->length; + current_transfer_size += m_active_mem.next_page->length; + ++m_active_mem.next_page; + m_active_mem.pages_rem--; + } + } else { + if (m_active_mem.next_page->length > ACL_PCIE_DMA_MAX_TRANSFER_SIZE) { + ACL_PCIE_DMA_DEBUG(":::: [DMA] page size is larger than %u for write. 
Page size is %u bytes\n", + ACL_PCIE_DMA_MAX_TRANSFER_SIZE, + m_active_mem.next_page->length); + set_read_desc(m_active_mem.next_page->phys_addr, m_dev_addr, ACL_PCIE_DMA_MAX_TRANSFER_SIZE / 4); + m_active_mem.next_page->length -= ACL_PCIE_DMA_MAX_TRANSFER_SIZE; + m_active_mem.next_page->phys_addr += ACL_PCIE_DMA_MAX_TRANSFER_SIZE; + m_dev_addr += ACL_PCIE_DMA_MAX_TRANSFER_SIZE; + m_bytes_sent += ACL_PCIE_DMA_MAX_TRANSFER_SIZE; + current_transfer_size += ACL_PCIE_DMA_MAX_TRANSFER_SIZE; + } else { + set_read_desc(m_active_mem.next_page->phys_addr, m_dev_addr, m_active_mem.next_page->length / 4); + m_dev_addr += m_active_mem.next_page->length; + m_bytes_sent += m_active_mem.next_page->length; + current_transfer_size += m_active_mem.next_page->length; + ++m_active_mem.next_page; + m_active_mem.pages_rem--; + } + } + m_last_id++; + if (m_active_mem.pages_rem == 0) break; + } + ACL_PCIE_DMA_DEBUG(":::: [DMA] Transferring %zu bytes using %u descriptors\n", current_transfer_size, i); + + MemoryBarrier(); + // Send descriptor table to DMA + start = m_timer->get_time_ns(); + m_interrupt_disabled = FALSE; + send_dma_desc(); + int pinning = 0; + int unpinning = 0; + cl_ulong unpin_start = 0, unpin_finish = 0; + + // Launch unpin thread + if (m_done_mem.UsrVa != NULL) { + unpin_start = m_timer->get_time_ns(); + unpinning = 1; + + // wait for previous unpin to finish + WaitForThreadpoolWorkCallbacks(m_unpin_work, false); + + QUEUE_STRUCT entry; + + entry.WsId = m_done_mem.WsId; + entry.SGListPtr = (PVOID)(m_done_mem.dma_page); + + m_dma_unpin_pending.push(entry); + + // Make sure Push into unpin queue comes before launching unpin thread + MemoryBarrier(); + + // Launch unpin thread + SubmitThreadpoolWork(m_unpin_work); + + m_done_mem.next_page = NULL; + + // if (m_done_mem.dma_page != NULL) + // free(m_done_mem.dma_page); + + m_done_mem.dma_page = NULL; + + m_done_mem.UsrVa = NULL; + unpin_finish = m_timer->get_time_ns(); + } + + // Launch pre-pin thread + cl_ulong pin_start = 
0, pin_finish = 0; + if (((m_bytes_rem - m_last_pinned_size) > 0) && (m_prepinned == 0)) { + pin_start = m_timer->get_time_ns(); + pinning = 1; + m_prepinned = 1; + + // This wait should pass right through. + // There is another wait above, before switching active and prepin memory + WaitForThreadpoolWorkCallbacks(m_pin_work, false); + SubmitThreadpoolWork(m_pin_work); + pin_finish = m_timer->get_time_ns(); + } + + if (m_use_polling) { + wait_finish(); + finish = m_timer->get_time_ns(); + ACL_PCIE_DMA_DEBUG( + ":::: [DMA] Transfer (%zu bytes) completed in %.2f us - %.2f MB/s :: pinning %i in %.2f us :: unpinning %i " + "in %.2f us :: pages rem %li\n", + current_transfer_size, + (finish - start) / 1000.0, + 1000000000.0 * current_transfer_size / (finish - start) / (1024.0 * 1024.0), + pinning, + (pin_finish - pin_start) / 1000.0, + unpinning, + (unpin_finish - unpin_start) / 1000.0, + m_active_mem.pages_rem); + } + + return true; + } + + ACL_PCIE_DMA_DEBUG(":::: [DMA] Nothing happened\n"); + return true; +} + +// Poll DMA transfer +// Only used during host channel create +// Used to transfer the page table of pinned down MMD circular buffer to host channel IP +// The size of this transfer is known to be small +void ACL_PCIE_DMA::poll_wait() { + UINT32 wait_timer; + + while (1) { + wait_timer = ACL_PCIE_DMA_TIMEOUT; + while (wait_timer > 0) { + wait_timer--; + + if (m_table_virt_addr->header.flags[m_last_id - 1] == 1) { + ACL_PCIE_DMA_DEBUG(":::: [DMA] HostCh : Wait done\n"); + set_desc_table_header(); +#if defined(GEN3_x8) + if (m_read) + m_io->dma->write32(ACL_PCIE_DMA_WR_INT_CONTROL, ACL_PCIE_DMA_ENABLE_INT); + else + m_io->dma->write32(ACL_PCIE_DMA_RD_INT_CONTROL, ACL_PCIE_DMA_ENABLE_INT); +#endif + m_interrupt_disabled = FALSE; + + return; + } + // Delay the CPU from checking the memory for 1us. CPU is still running this thread. 
+ // but reduces memory access from CPU + spin_loop_ns(1000); + } + + // If DMA hasn't finished yet, free up the CPU for 1ms + ACL_PCIE_DMA_DEBUG( + ":::: [DMA] HostCh : Poll wait failed while transferring host channel page table to IP. Sleeping for 1ms.\n"); + Sleep(1); + } +} + +// Set IP's parameters for host channel. +// Parameters are txs address to write updated front/end pointer to on host memory, +// Address to DMA data to, to stream data into kernel +void ACL_PCIE_DMA::hostch_start(int channel) { + HOSTCH_DESC *h = &hostch_data; + + if (channel == (int)ACL_HOST_CHANNEL_0_ID) { + // Fix this Line + h->user_rd_front_pointer_bus_addr = h->m_hostch_rd_pointer.dma_page[0].phys_addr; + + m_io->dma->write32(ACL_HOST_CHANNEL_0_TXS_ADDR_LOW, h->user_rd_front_pointer_bus_addr & 0xffffffffUL); + m_io->dma->write32(ACL_HOST_CHANNEL_0_TXS_ADDR_HIGH, (h->user_rd_front_pointer_bus_addr) >> 32); + m_io->dma->write32(ACL_HOST_CHANNEL_0_IP_ADDR_LOW, ACL_HOST_CHANNEL_0_DMA_ADDR & 0xffffffffUL); + m_io->dma->write32(ACL_HOST_CHANNEL_0_IP_ADDR_HIGH, ACL_HOST_CHANNEL_0_DMA_ADDR >> 32); + m_io->dma->write32(ACL_HOST_CHANNEL_0_BUF_SIZE, (UINT32)h->buffer_size); + m_io->dma->write32(ACL_HOST_CHANNEL_0_HOST_ENDP, 0); + m_io->dma->write32(ACL_HOST_CHANNEL_0_LOGIC_EN, 1); + + } else if (channel == (int)ACL_HOST_CHANNEL_1_ID) { + h->user_wr_end_pointer_bus_addr = h->m_hostch_wr_pointer.dma_page[0].phys_addr + sizeof(size_t); + + m_io->dma->write32(ACL_HOST_CHANNEL_1_TXS_ADDR_LOW, h->user_wr_end_pointer_bus_addr & 0xffffffffUL); + m_io->dma->write32(ACL_HOST_CHANNEL_1_TXS_ADDR_HIGH, (h->user_wr_end_pointer_bus_addr) >> 32); + m_io->dma->write32(ACL_HOST_CHANNEL_1_IP_ADDR_LOW, ACL_HOST_CHANNEL_1_DMA_ADDR & 0xffffffffUL); + m_io->dma->write32(ACL_HOST_CHANNEL_1_IP_ADDR_HIGH, ACL_HOST_CHANNEL_1_DMA_ADDR >> 32); + m_io->dma->write32(ACL_HOST_CHANNEL_1_BUF_SIZE, (UINT32)h->buffer_size); + m_io->dma->write32(ACL_HOST_CHANNEL_1_HOST_FRONTP, 0); + 
m_io->dma->write32(ACL_HOST_CHANNEL_1_LOGIC_EN, 1); + } +} + +void ACL_PCIE_DMA::hostch_thread_sync(void *user_addr) { + int status; + HOSTCH_DESC *h = &hostch_data; + + if ((user_addr == NULL) & (h->thread_sync_valid)) { + if ((h->push_valid | h->pull_valid) && m_idle && (*h->user_thread_sync == 0)) { + h->loop_counter = HOSTCH_LOOP_COUNTER; + SubmitThreadpoolWork(m_work); + *h->user_thread_sync = 1; + } + } else { + status = hostch_buffer_lock(user_addr, sizeof(size_t), &(h->m_sync_thread_pointer)); + h->user_thread_sync = (size_t *)h->m_sync_thread_pointer.UsrVa; + h->loop_counter = HOSTCH_LOOP_COUNTER; + *h->user_thread_sync = 0; + h->thread_sync_valid = 1; + } +} + +int ACL_PCIE_DMA::hostch_create(void *user_addr, void *buf_pointer, size_t size, int channel) { + int status; + uint32_t i; + HOSTCH_DESC *h = &hostch_data; + + DMA_ADDR dma_address; + h->buffer_size = size; + + setup_dma_desc(); +#if defined(GEN3_x8) + m_io->dma->read32(ACL_PCIE_DMA_RD_LAST_PTR, &m_last_id); + ACL_PCIE_DMA_DEBUG(":::: [DMA] HostCh: read dma_rd_last_id %u\n", (unsigned)m_last_id); + + // Set variables before calling dma helper functions + m_last_id++; +#endif + m_read = 0; + + // Only create push channel if it's not already open + if ((int)ACL_HOST_CHANNEL_0_ID == channel && !h->push_valid) { + h->user_rd_buffer = user_addr; + + // Pin push user buffer + status = hostch_buffer_lock(user_addr, size, &(h->m_hostch_rd_mem)); + status |= hostch_buffer_lock(buf_pointer, 2 * sizeof(size_t), &(h->m_hostch_rd_pointer)); + + // Map circular push buffer's end pointer so that the driver can poll on it for update from user space + h->user_rd_front_pointer = (size_t *)h->m_hostch_rd_pointer.UsrVa; + h->user_rd_end_pointer = h->user_rd_front_pointer + 1; + + // Send the circular push buffer's pinned address to IP, so IP can initiate DMA transfer by itself. 
+ for (i = 0; i < (size / PAGE_SIZE); i++) { + dma_address = h->m_hostch_rd_mem.next_page->phys_addr; + set_hostch_page_entry(&(h->push_page_table->page_entry[i]), (UINT64)dma_address, (UINT32)i); + ACL_PCIE_DMA_DEBUG(":::: [DMA] HostCh: push page entry[%u] = %#016llx size = %#016x\n", + (unsigned)i, + (UINT64)dma_address, + h->m_hostch_rd_mem.next_page->length); + + // Make 4KB pages from an array of pages of m_hostch_rd_mem + if (h->m_hostch_rd_mem.next_page->length == PAGE_SIZE) { + ++h->m_hostch_rd_mem.next_page; + h->m_hostch_rd_mem.pages_rem--; + } else { + h->m_hostch_rd_mem.next_page->length -= PAGE_SIZE; + h->m_hostch_rd_mem.next_page->phys_addr += PAGE_SIZE; + } + } + + set_desc_table_header(); + check_last_id(&m_last_id); + +#if defined(GEN3_x8) + // Set variable before calling dma helper functions + m_active_descriptor = &(m_table_virt_addr->descriptors[0]); + set_read_desc( + h->push_page_table_bus_addr, (UINT64)(ACL_PCIE_DMA_RD_FIFO_BASE), (UINT32)((32 * size / PAGE_SIZE) / 4)); + m_last_id++; + + // Read Interrupt will be disabled from send_dma_desc till poll_wait + m_interrupt_disabled = TRUE; + send_dma_desc(); + poll_wait(); +#endif + + // Reset and enable the push channel on IP + UINT32 data; + m_io->pcie_cra->write32(HOSTCH_CONTROL_ADDR_PUSH + HOSTCH_BASE, 0); + m_io->pcie_cra->read32(HOSTCH_CONTROL_ADDR_PUSH + HOSTCH_BASE, &data); + m_io->pcie_cra->write32(HOSTCH_CONTROL_ADDR_PUSH + HOSTCH_BASE, 1); + m_io->pcie_cra->read32(HOSTCH_CONTROL_ADDR_PUSH + HOSTCH_BASE, &data); + + // Set IP's control registers for push channel + hostch_start((int)ACL_HOST_CHANNEL_0_ID); + + h->push_valid = 1; + + // Only launch queue if pull channel is not open and if there is no DMA transfer + if (!h->pull_valid && m_idle) { + ACL_PCIE_ERROR_IF(WaitForSingleObject(m_workqueue_semaphore, 0L) != WAIT_OBJECT_0, + return -1, + "HostCh : failed to schedule the first work for DMA read/write.\n"); + SubmitThreadpoolWork(m_work); + } + return 0; + + } else if 
((int)ACL_HOST_CHANNEL_1_ID == channel && !h->pull_valid) { + h->user_wr_buffer = user_addr; + + // Pin pull user buffer + status = hostch_buffer_lock(user_addr, size, &(h->m_hostch_wr_mem)); + status |= hostch_buffer_lock(buf_pointer, 2 * sizeof(size_t), &(h->m_hostch_wr_pointer)); + + // Map circular pull buffer's end pointer so that the driver can poll on it for update from user space + h->user_wr_front_pointer = (size_t *)h->m_hostch_wr_pointer.UsrVa; + h->user_wr_end_pointer = h->user_wr_front_pointer + 1; + + // Send the circular pull buffer's pinned address to IP, so IP can initiate DMA transfer by itself. + for (i = 0; i < (size / PAGE_SIZE); i++) { + dma_address = h->m_hostch_wr_mem.next_page->phys_addr; + set_hostch_page_entry(&(h->pull_page_table->page_entry[i]), (UINT64)dma_address, (UINT32)i); + ACL_PCIE_DMA_DEBUG(":::: [DMA] HostCh: pull page entry[%u] = %#016llx size = %#016x\n", + (unsigned)i, + (UINT64)dma_address, + h->m_hostch_wr_mem.next_page->length); + + // Make 4KB pages from an array of pages of m_hostch_wr_mem + if (h->m_hostch_wr_mem.next_page->length == PAGE_SIZE) { + ++h->m_hostch_wr_mem.next_page; + h->m_hostch_wr_mem.pages_rem--; + } else { + h->m_hostch_wr_mem.next_page->length -= PAGE_SIZE; + h->m_hostch_wr_mem.next_page->phys_addr += PAGE_SIZE; + } + } + + set_desc_table_header(); + check_last_id(&m_last_id); + +#if defined(GEN3_x8) + // Set variable before calling dma helper functions + m_active_descriptor = &(m_table_virt_addr->descriptors[0]); + set_read_desc( + h->pull_page_table_bus_addr, (UINT64)(ACL_PCIE_DMA_WR_FIFO_BASE), (UINT32)((32 * size / PAGE_SIZE) / 4)); + m_last_id++; + + // Read Interrupt will be disabled from send_dma_desc till poll_wait + m_interrupt_disabled = TRUE; + send_dma_desc(); + poll_wait(); +#endif + + // Reset and enable the pull channel on IP + UINT32 temp; + m_io->pcie_cra->write32(HOSTCH_CONTROL_ADDR_PULL + HOSTCH_BASE, 0); + m_io->pcie_cra->read32(HOSTCH_CONTROL_ADDR_PULL + HOSTCH_BASE, &temp); + 
m_io->pcie_cra->write32(HOSTCH_CONTROL_ADDR_PULL + HOSTCH_BASE, 1); + m_io->pcie_cra->read32(HOSTCH_CONTROL_ADDR_PULL + HOSTCH_BASE, &temp); + + // Set IP's control registers for pull channel + hostch_start((int)ACL_HOST_CHANNEL_1_ID); + + h->pull_valid = 1; + + // Only launch queue if push channel is not open and if there is no DMA transfer + if (!h->push_valid && m_idle) { + ACL_PCIE_ERROR_IF(WaitForSingleObject(m_workqueue_semaphore, 0L) != WAIT_OBJECT_0, + return -1, + "HostCh : failed to schedule the first work for DMA read/write.\n"); + SubmitThreadpoolWork(m_work); + } + return 0; + + } else { + return ERROR_INVALID_CHANNEL; + } +} + +// Destroy channel call from user. +// Unlock all buffers and reset IP +int ACL_PCIE_DMA::hostch_destroy(int channel) { + HOSTCH_DESC *h = &hostch_data; + + if ((int)ACL_HOST_CHANNEL_0_ID == channel) { + if (h->push_valid) { + ACL_PCIE_DMA_DEBUG(":::: [DMA] HostCh: destroying push host channel."); + m_io->dma->write32(ACL_HOST_CHANNEL_0_LOGIC_EN, 0); + MemoryBarrier(); + m_io->pcie_cra->write32(HOSTCH_CONTROL_ADDR_PUSH + HOSTCH_BASE, 0); + MemoryBarrier(); + + if (h->m_hostch_rd_mem.UsrVa != NULL) unpin_memory(&h->m_hostch_rd_mem); + if (h->m_hostch_rd_pointer.UsrVa != NULL) unpin_memory(&h->m_hostch_rd_pointer); + h->push_valid = 0; + + if (!h->pull_valid) { + if (h->thread_sync_valid) { + h->thread_sync_valid = 0; + if (h->m_sync_thread_pointer.UsrVa != NULL) unpin_memory(&h->m_sync_thread_pointer); + } + if (m_idle) WaitForThreadpoolWorkCallbacks(m_work, false); + } + } + } else if ((int)ACL_HOST_CHANNEL_1_ID == channel) { + if (h->pull_valid) { + ACL_PCIE_DMA_DEBUG(":::: [DMA] HostCh: destroying pull host channel."); + m_io->dma->write32(ACL_HOST_CHANNEL_1_LOGIC_EN, 0); + MemoryBarrier(); + m_io->pcie_cra->write32(HOSTCH_CONTROL_ADDR_PULL + HOSTCH_BASE, 0); + MemoryBarrier(); + + if (h->m_hostch_wr_mem.UsrVa != NULL) unpin_memory(&h->m_hostch_wr_mem); + if (h->m_hostch_wr_pointer.UsrVa != NULL) 
unpin_memory(&h->m_hostch_wr_pointer); + h->pull_valid = 0; + + if (!h->push_valid) { + if (h->thread_sync_valid) { + h->thread_sync_valid = 0; + if (h->m_sync_thread_pointer.UsrVa != NULL) unpin_memory(&h->m_sync_thread_pointer); + } + if (m_idle) WaitForThreadpoolWorkCallbacks(m_work, false); + } + } + } + + return 0; +} + +#endif // WINDOWS diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_dma_windows.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_dma_windows.h new file mode 100644 index 0000000..311c634 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_dma_windows.h @@ -0,0 +1,262 @@ +#ifndef ACL_PCIE_DMA_WINDOWS_H +#define ACL_PCIE_DMA_WINDOWS_H + +/* (c) 1992-2021 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. 
*/

/* ===- acl_pcie_dma_windows.h --------------------------------------- C++ -*-=== */
/*                                                                                */
/* Intel(R) OpenCL MMD Driver                                                     */
/*                                                                                */
/* ===--------------------------------------------------------------------------=== */
/*                                                                                */
/* This file declares the class to handle Windows-specific DMA operations.        */
/* The actual implementation of the class lives in the acl_pcie_dma_windows.cpp   */
/*                                                                                */
/* ===--------------------------------------------------------------------------=== */
// TODO: update DMA related stuff and add wsid

#if defined(WINDOWS)

#include "hw_host_channel.h"
#include "hw_pcie_dma.h"

#include <windows.h>
#include <queue>

class ACL_PCIE_DEVICE;
class ACL_PCIE_MM_IO_MGR;
class ACL_PCIE_TIMER;

// One physically contiguous region: physical address plus its byte length.
typedef struct _PAGE_INFO {
  ULONG64 pPhysicalAddr;
  UINT32 dwBytes;
} PAGE_INFO, *PPAGE_INFO;

// A pinned scatter-gather list together with the workspace id that owns it.
typedef struct _DMA_PAGE {
  sg_element *Page;
  DWORD dwPages;
  UINT64 WsId;
} DMA_PAGE, *PDMA_PAGE;

// Work item handed to the unpin thread: workspace id + its SG list to free.
typedef struct _QUEUE_STRUCT {
  UINT64 WsId;
  PVOID SGListPtr;

} QUEUE_STRUCT, *PQUEUE_STRUCT;

// Windows-specific DMA engine wrapper: schedules read/write transfers on a
// threadpool, pins/unpins host memory, and drives the host-channel IP.
class ACL_PCIE_DMA {
 public:
  ACL_PCIE_DMA(fpga_handle Handle, ACL_PCIE_MM_IO_MGR *io, ACL_PCIE_DEVICE *pcie);
  ~ACL_PCIE_DMA();

  bool is_idle() { return m_idle; };
  // Spin (yielding the CPU each iteration) until the current transfer is done.
  void stall_until_idle() {
    while (!is_idle()) yield();
  };

  // Called by acl_pcie_device to check dma interrupt status
  int check_dma_interrupt(unsigned int *dma_update);

  // Perform operations required when a DMA interrupt comes
  void service_interrupt();

  // Relinquish the CPU to let any other thread to run
  // Return 0 since there is no useful work to be performed here
  int yield();

  // Transfer data between host and device
  // This function returns right after the transfer is scheduled
  // Return 0 on success
  int read_write(void *host_addr, size_t dev_addr, size_t bytes, aocl_mmd_op_t e, bool reading);

  // the callback function to be scheduled inside the interrupt handler
  friend void CALLBACK myWorkCallback(PTP_CALLBACK_INSTANCE instance, void *context, PTP_WORK work);

  // Separate function to unpin memory
  friend void CALLBACK myWorkUnpinCallback(PTP_CALLBACK_INSTANCE instance, void *context, PTP_WORK work);

  // Separate function to pin memory
  friend void CALLBACK myWorkPinCallback(PTP_CALLBACK_INSTANCE instance, void *context, PTP_WORK work);

  // Host channel functions
  int hostch_create(void *user_addr, void *buf_pointer, size_t size, int reading);
  int hostch_destroy(int reading);
  void hostch_thread_sync(void *m_sync_thread);

 private:
  // Copying is intentionally disabled (pre-C++11 style: private + no-op bodies).
  ACL_PCIE_DMA &operator=(const ACL_PCIE_DMA &) { return *this; }

  ACL_PCIE_DMA(const ACL_PCIE_DMA &src) {}

  // A pinned region currently being consumed: next_page/pages_rem advance as
  // descriptors are built; dma_page keeps the original list for freeing.
  struct PINNED_MEM {
    sg_element *next_page;
    DWORD pages_rem;
    sg_element *dma_page;  // Pointer to the original array
    UINT64 WsId;
    PVOID UsrVa;
  };

  // All bookkeeping for the push (channel 0) and pull (channel 1) host channels.
  struct HOSTCH_DESC {
    size_t buffer_size;
    unsigned int loop_counter;

    // Host channel valid
    // If channel is open, equal to 1
    int push_valid;
    int pull_valid;

    // User memory circular buffer
    void *user_rd_buffer;
    void *user_wr_buffer;

    // Array of physical addresses of locked hostch pages
    HOSTCH_TABLE *push_page_table;
    HOSTCH_TABLE *pull_page_table;

    DMA_PAGE push_page_table_addr;
    DMA_PAGE pull_page_table_addr;

    // Physical address of the page table
    DMA_ADDR push_page_table_bus_addr;
    DMA_ADDR pull_page_table_bus_addr;

    PINNED_MEM m_hostch_rd_mem;
    PINNED_MEM m_hostch_wr_mem;

    // User memory circular buffer front and end pointers
    size_t *user_rd_front_pointer;
    size_t *user_rd_end_pointer;
    size_t *user_wr_front_pointer;
    size_t *user_wr_end_pointer;

    DMA_ADDR user_rd_front_pointer_bus_addr;
    DMA_ADDR user_wr_end_pointer_bus_addr;

    PINNED_MEM m_hostch_rd_pointer;
    PINNED_MEM m_hostch_wr_pointer;

    // Keep track of push end pointer
    size_t rd_buf_end_pointer;

    // Keep track of pull front pointer
    size_t wr_buf_front_pointer;

    // User and driver thread synchronizer
    int thread_sync_valid;
    size_t *user_thread_sync;
    DMA_ADDR user_thread_sync_bus_addr;
    PINNED_MEM m_sync_thread_pointer;
  };

  // function to be scheduled to execute whenever an interrupt arrived
  bool update(bool force_update = false);

  // Helper functions
  inline void *compute_address(void *base, uintptr_t offset);
  void set_read_desc(DMA_ADDR source, UINT64 dest, UINT32 ctl_dma_len);
  void set_write_desc(UINT64 source, DMA_ADDR dest, UINT32 ctl_dma_len);
  void set_desc_table_header();
  void send_dma_desc();
  void check_last_id(UINT32 *last_id);
  void pin_memory(PINNED_MEM *new_mem, bool prepin);
  void unpin_memory(PINNED_MEM *old_mem);
  void wait_finish();
  void unpin_from_queue();
  void prepin_memory();

  void set_immediate_desc(DMA_DESC_ENTRY *desc, UINT64 addr, UINT32 data, UINT32 id);
  void add_extra_dma_desc();
  // Hostchannel helper function
  void hostch_start(int channel);
  int hostch_push_update();
  int hostch_pull_update();
  int hostch_buffer_lock(void *addr, size_t len, PINNED_MEM *new_mem);
  void poll_wait();
  void set_hostch_page_entry(HOSTCH_ENTRY *page_entry, UINT64 page_addr, UINT32 page_num);
  void setup_dma_desc();
  void spin_loop_ns(UINT64 wait_ns);

  // From environment variable
  int m_use_polling;

  // The dma object we are currently building transactions for
  PINNED_MEM m_active_mem;
  PINNED_MEM m_pre_pinned_mem;
  PINNED_MEM m_done_mem;

  // Hostchannel Struct
  HOSTCH_DESC hostch_data;

  // The transaction we are currently working on
  DMA_DESC_TABLE *m_table_virt_addr;
  DMA_PAGE m_table_dma_addr;
  DMA_ADDR m_table_dma_phys_addr;
  DMA_DESC_ENTRY *m_active_descriptor;

  size_t m_last_pinned_size;
  void *m_last_pinned_addr;

  // Signal to stop multiple pre-pinning from running
  bool m_prepinned;

  // Local copy of last transfer id. Read once when DMA transfer starts
  UINT32 m_last_id;

  // variables for the read/write request
  aocl_mmd_op_t m_event;
  size_t m_dev_addr;
  void *m_host_addr;
  size_t m_bytes;
  size_t m_bytes_sent;
  size_t m_bytes_rem;
  bool m_read;
  bool m_idle;
  bool m_interrupt_disabled;

  fpga_handle m_handle;
  ACL_PCIE_DEVICE *m_pcie;
  ACL_PCIE_MM_IO_MGR *m_io;
  ACL_PCIE_TIMER *m_timer;

  // variables needed for the threadpool and works that submitted to it
  TP_CALLBACK_ENVIRON m_callback_env;
  PTP_POOL m_threadpool;
  PTP_WORK m_work;

  // This variable is accessed by the callback function defined in acl_pcie_dma_windows.cpp
  // This semaphore is intended to keep at most 1 work in queued (not running)
  HANDLE m_workqueue_semaphore;

  // Separate thread to unpin

  std::queue<QUEUE_STRUCT> m_dma_unpin_pending;

  TP_CALLBACK_ENVIRON m_unpin_callback_env;
  PTP_POOL m_unpin_threadpool;
  PTP_WORK m_unpin_work;

  // Separate thread to pre-pin

  TP_CALLBACK_ENVIRON m_pin_callback_env;
  PTP_POOL m_pin_threadpool;
  PTP_WORK m_pin_work;
};

#endif  // WINDOWS

#endif  // ACL_PCIE_DMA_WINDOWS_H
diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_hostch.cpp b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_hostch.cpp
new file mode 100644
index 0000000..0dc6d74
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_hostch.cpp
@@ -0,0 +1,764 @@
// (c) 1992-2021 Intel Corporation.
// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words
// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S.
// and/or other countries. Other marks and brands may be claimed as the property
// of others.
See Trademarks on intel.com for full list of Intel trademarks or +// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) +// Your use of Intel Corporation's design tools, logic functions and other +// software and tools, and its AMPP partner logic functions, and any output +// files any of the foregoing (including device programming or simulation +// files), and any associated documentation or information are expressly subject +// to the terms and conditions of the Altera Program License Subscription +// Agreement, Intel MegaCore Function License Agreement, or other applicable +// license agreement, including, without limitation, that your use is for the +// sole purpose of programming logic devices manufactured by Intel and sold by +// Intel or its authorized distributors. Please refer to the applicable +// agreement for further details. + +/* ===- acl_pcie_hostch.cpp ------------------------------------------ C++ -*-=== */ +/* */ +/* Intel(R) OpenCL MMD Driver */ +/* */ +/* ===-------------------------------------------------------------------------=== */ +/* */ +/* This file implements the class to handle Linux-specific DMA operations. */ +/* The declaration of the class lives in the acl_pcie_dma_linux.h */ +/* The actual implementation of DMA operation is inside the Linux kernel driver. 
*/ +/* */ +/* ===-------------------------------------------------------------------------=== */ + +// common and its own header files +#include "acl_pcie_hostch.h" +#include "acl_pcie.h" + +// other header files inside MMD driver +#include "acl_pcie_debug.h" +#include "acl_pcie_device.h" +#include "acl_pcie_mm_io.h" +#include "acl_pcie_timer.h" +#include "hw_host_channel.h" + +// other standard header files +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <iostream> + +#if defined(LINUX) +#include <unistd.h> +#endif // LINUX +#if defined(WINDOWS) +#include "acl_pcie_dma_windows.h" +#endif // WINDOWS + +void acl_aligned_malloc(void **result, size_t size) { +#if defined(LINUX) + int posix_success; + *result = NULL; + posix_success = posix_memalign(result, PAGE_SIZE, size); + ACL_PCIE_ASSERT(posix_success == 0, "posix_memalign has failed.\n"); +#endif // LINUX +#if defined(WINDOWS) + *result = _aligned_malloc(size, PAGE_SIZE); +#endif // WINDOWS +} + +void acl_aligned_free(void *ptr) { +#if defined(LINUX) + free(ptr); +#endif // LINUX +#if defined(WINDOWS) + _aligned_free(ptr); +#endif // WINDOWS +} + +ACL_PCIE_HOSTCH::ACL_PCIE_HOSTCH(fpga_handle handle, ACL_PCIE_MM_IO_MGR *io, ACL_PCIE_DEVICE *pcie, ACL_PCIE_DMA *dma) + : m_push_queue(NULL), + m_push_queue_local_end_p(0), + m_push_queue_size(0), + m_pull_queue(NULL), + m_pull_queue_local_front_p(0), + m_pull_queue_size(0), + m_pull_queue_available(0), + m_pull_queue_pointer(NULL), + m_push_queue_pointer(NULL), + m_pull_queue_front_p(NULL), + m_pull_queue_end_p(NULL), + m_push_queue_front_p(NULL), + m_push_queue_end_p(NULL), + m_sync_thread(NULL) { + ACL_PCIE_ASSERT(handle != INVALID_HANDLE_VALUE, "passed in an invalid device when creating dma object.\n"); + ACL_PCIE_ASSERT(io != NULL, "passed in an empty pointer for io when creating dma object.\n"); + ACL_PCIE_ASSERT(pcie != NULL, "passed in an empty pointer for pcie when creating dma object.\n"); + ACL_PCIE_ASSERT(dma != 
NULL, "passed in an empty pointer for dma when creating dma object.\n"); + + m_handle = handle; + m_pcie = pcie; + m_io = io; + m_dma = dma; + m_timer = new ACL_PCIE_TIMER(); + + // Set the valid for all the channels and helper function that checks status of driver thread + // to 0 + m_hostch_push_valid = 0; + m_hostch_pull_valid = 0; + m_sync_thread_valid = 0; + + const char *dma_timer = getenv("ACL_PCIE_DMA_TIMER"); + if (dma_timer) + m_use_timer = 1; + else + m_use_timer = 0; +} + +ACL_PCIE_HOSTCH::~ACL_PCIE_HOSTCH() { + // If push channel (channel 0) is valid, reset its IP and unpin the MMD buffer + if (m_hostch_push_valid) { +#if defined(LINUX) + struct acl_cmd driver_cmd; + int bytes_read; + // Save the device id for the selected board + driver_cmd.bar_id = ACLPCI_CMD_BAR; + driver_cmd.command = ACLPCI_CMD_HOSTCH_DESTROY_RD; + driver_cmd.device_addr = NULL; + driver_cmd.user_addr = NULL; + driver_cmd.size = 0; + bytes_read = read(m_handle, &driver_cmd, sizeof(driver_cmd)); + ACL_PCIE_ASSERT(bytes_read != -1, "error reading driver command.\n"); +#endif // LINUX +#if defined(WINDOWS) + m_dma->hostch_destroy(ACL_HOST_CHANNEL_0_ID); +#endif // WINDOWS + + if (m_push_queue) { + acl_aligned_free(m_push_queue); + m_push_queue = NULL; + } + + if (m_push_queue_pointer) { + acl_aligned_free(m_push_queue_pointer); + m_push_queue_pointer = NULL; + } + + m_hostch_push_valid = 0; + } + + // If pull channel (channel 1) is valid, reset its IP and unpin the MMD buffer + if (m_hostch_pull_valid) { +#if defined(LINUX) + struct acl_cmd driver_cmd; + int bytes_read; + // Save the device id for the selected board + driver_cmd.bar_id = ACLPCI_CMD_BAR; + driver_cmd.command = ACLPCI_CMD_HOSTCH_DESTROY_WR; + driver_cmd.device_addr = NULL; + driver_cmd.user_addr = NULL; + driver_cmd.size = 0; + bytes_read = read(m_handle, &driver_cmd, sizeof(driver_cmd)); + ACL_PCIE_ASSERT(bytes_read != -1, "error reading driver command.\n"); +#endif // LINUX +#if defined(WINDOWS) + 
m_dma->hostch_destroy(ACL_HOST_CHANNEL_1_ID); +#endif // WINDOWS + + if (m_pull_queue) { + acl_aligned_free(m_pull_queue); + m_pull_queue = NULL; + } + + if (m_pull_queue_pointer) { + acl_aligned_free(m_pull_queue_pointer); + m_pull_queue_pointer = NULL; + } + + m_hostch_pull_valid = 0; + } + + if (m_timer) { + delete m_timer; + m_timer = NULL; + } +} + +// Get host channel version of currently programmed device +unsigned int ACL_PCIE_HOSTCH::get_hostch_version() { + // Make sure version is not what you expect + unsigned int version = ACL_VERSIONID ^ 1; + unsigned int hostch_version = ACL_HOSTCH_ZERO_CHANNELS ^ 1; + + // Read device version + m_io->version->read32(0, &version); + + if (!ACL_HOSTCH_ENABLE) { + return ACL_HOSTCH_ZERO_CHANNELS; + } + + // Read hostchannel version + m_io->hostch_ver->read32(0, &hostch_version); + + return hostch_version; +} + +// Function to check that the driver thread that update host channel IP with +// user's updates to MMD buffer's end and front index, is still running. +// Ack call will call sync_thread() if driver thread has timed out. +// Linux kernel space driver thread is set to timeout in 1ms +// if there hasn't been any changes to circular buffer pointer from the host. 
+int ACL_PCIE_HOSTCH::launch_sync_thread() { + if (m_sync_thread_valid == 0) { + acl_aligned_malloc((void **)&m_sync_thread, sizeof(size_t)); + + if (m_sync_thread == NULL) { + ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_BLOCKTX, ":::: [HOST CHANNEL] Internal buffer memory allocation failed.\n"); + return -1; + } + +#if defined(LINUX) + // Save the device id for the selected board + struct acl_cmd driver_cmd; + int bytes_read; + driver_cmd.bar_id = ACLPCI_CMD_BAR; + driver_cmd.command = ACLPCI_CMD_HOSTCH_THREAD_SYNC; + driver_cmd.device_addr = NULL; + driver_cmd.user_addr = m_sync_thread; + driver_cmd.size = 0; + bytes_read = read(m_handle, &driver_cmd, sizeof(driver_cmd)); + ACL_PCIE_ASSERT(bytes_read != -1, "error reading driver command.\n"); +#endif // LINUX +#if defined(WINDOWS) + m_dma->hostch_thread_sync(m_sync_thread); +#endif // WINDOWS + + m_sync_thread_valid = 1; + } else { + return 1; + } + return 0; +} + +int ACL_PCIE_HOSTCH::sync_thread() { + if (m_sync_thread_valid && (*m_sync_thread == 0)) { +#if defined(LINUX) + // Save the device id for the selected board + struct acl_cmd driver_cmd; + int bytes_read; + driver_cmd.bar_id = ACLPCI_CMD_BAR; + driver_cmd.command = ACLPCI_CMD_HOSTCH_THREAD_SYNC; + driver_cmd.device_addr = NULL; + driver_cmd.user_addr = NULL; + driver_cmd.size = 0; + bytes_read = read(m_handle, &driver_cmd, sizeof(driver_cmd)); + ACL_PCIE_ASSERT(bytes_read != -1, "error reading driver command.\n"); +#endif // LINUX +#if defined(WINDOWS) + m_dma->hostch_thread_sync(NULL); +#endif // WINDOWS + + return 0; + } + return 1; +} + +// This is called only when there aren't any host channels open +// m_sync_thread is unpinned as part of destroy call to driver. Now free it. +void ACL_PCIE_HOSTCH::destroy_sync_thread() { + if (m_sync_thread_valid) { + if (m_sync_thread != NULL) acl_aligned_free(m_sync_thread); + + m_sync_thread_valid = 0; + m_sync_thread = NULL; + } +} + +// Create host channel. Allocate circular buffer and pin it. 
+// Then set channel to valid. +int ACL_PCIE_HOSTCH::create_hostchannel(char *name, size_t queue_depth, int direction) { + int status; + unsigned int hostch_version; + + hostch_version = get_hostch_version(); + ACL_PCIE_DEBUG_MSG_VERBOSE( + VERBOSITY_BLOCKTX, ":::: [HOST CHANNEL] Host Channel version read was %u\n", hostch_version); + + // Check if channel name user wants to open exists + if ((strnlen(name, MAX_NAME_SIZE) == strnlen(ACL_HOST_CHANNEL_0_NAME, MAX_NAME_SIZE)) && + (strncmp(ACL_HOST_CHANNEL_0_NAME, name, strnlen(ACL_HOST_CHANNEL_0_NAME, MAX_NAME_SIZE)) == 0)) { + int channel = ACL_HOST_CHANNEL_0_ID; + // Check if hostchannel version is one that has ACL_HOST_CHANNEL_0_ID + if (hostch_version != ACL_HOSTCH_TWO_CHANNELS) { + ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_BLOCKTX, + ":::: [HOST CHANNEL] Host Channel %s does not exist in currently programmed device.\n", + ACL_HOST_CHANNEL_0_NAME); + return ERROR_INVALID_CHANNEL; + } + + // check if the direction for the channel is correct + if (direction != ACL_HOST_CHANNEL_0_WRITE) return ERROR_INCORRECT_DIRECTION; + + // Check if channel was already opened previously + if (m_hostch_push_valid) { + ACL_PCIE_DEBUG_MSG_VERBOSE( + VERBOSITY_BLOCKTX, ":::: [HOST CHANNEL] Host Channel '%s' already open\n", ACL_HOST_CHANNEL_0_NAME); + return ERROR_CHANNEL_PREVIOUSLY_OPENED; + } + + // Make sure the channel depth is at most 1MB, power-of-2, and divisible by page_size + size_t queue_depth_upper_pow2 = (size_t)pow(2, ceil(log((double)queue_depth) / log(2.))); + size_t channel_depth = (queue_depth_upper_pow2 >= HOSTCH_MAX_BUF_SIZE) + ? 
HOSTCH_MAX_BUF_SIZE + : queue_depth_upper_pow2 & (HOSTCH_MAX_BUF_SIZE - PAGE_SIZE); + + // Make sure the channel depth is at least 4KB + if (!channel_depth) channel_depth = PAGE_SIZE; + + // Create circular buffer for push + acl_aligned_malloc(&m_push_queue, channel_depth); + + if (m_push_queue == NULL) { + ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_BLOCKTX, ":::: [HOST CHANNEL] Internal buffer memory allocation failed.\n"); + return -1; + } + + // Create buffer to hold front and end pointer for the circular buffer + acl_aligned_malloc((void **)&m_push_queue_pointer, sizeof(size_t) * 2); + + if (m_push_queue_pointer == NULL) { + ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_BLOCKTX, ":::: [HOST CHANNEL] Internal buffer memory allocation failed.\n"); + acl_aligned_free(m_push_queue); + return -1; + } + + // Set parameters for the push channel + m_push_queue_size = channel_depth; + m_push_queue_local_end_p = 0; + + m_push_queue_front_p = m_push_queue_pointer; + m_push_queue_end_p = (m_push_queue_pointer + 1); + + *m_push_queue_front_p = 0; + *m_push_queue_end_p = 0; + + // sync_thread() used to check if kernel thread is still running when user has additional data available. 
+ status = launch_sync_thread(); + if (status == -1) { + acl_aligned_free(m_push_queue); + acl_aligned_free(m_push_queue_pointer); + return -1; + } + +#if defined(LINUX) + struct acl_cmd driver_cmd; + int bytes_read; + // Send the pointers for the 2 buffers to driver, along with queue size + driver_cmd.bar_id = ACLPCI_CMD_BAR; + driver_cmd.command = ACLPCI_CMD_HOSTCH_CREATE_RD; + driver_cmd.device_addr = m_push_queue_pointer; + driver_cmd.user_addr = m_push_queue; + driver_cmd.size = channel_depth; + bytes_read = read(m_handle, &driver_cmd, sizeof(driver_cmd)); + ACL_PCIE_ASSERT(bytes_read != -1, "error reading driver command.\n"); +#endif // LINUX +#if defined(WINDOWS) + m_dma->hostch_create(m_push_queue, m_push_queue_pointer, channel_depth, channel); +#endif // WINDOWS + + m_hostch_push_valid = 1; + return channel; + } else if ((strnlen(name, MAX_NAME_SIZE) == strnlen(ACL_HOST_CHANNEL_1_NAME, MAX_NAME_SIZE)) && + (strncmp(ACL_HOST_CHANNEL_1_NAME, name, strnlen(ACL_HOST_CHANNEL_1_NAME, MAX_NAME_SIZE)) == 0)) { + int channel = ACL_HOST_CHANNEL_1_ID; + + // Check if hostchannel version is one that has ACL_HOST_CHANNEL_1_ID + if (hostch_version != ACL_HOSTCH_TWO_CHANNELS) { + ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_BLOCKTX, + ":::: [HOST CHANNEL] Host Channel %s does not exist in currently programmed device.\n", + ACL_HOST_CHANNEL_1_NAME); + return ERROR_INVALID_CHANNEL; + } + + // Check if direction is correct + if (direction != ACL_HOST_CHANNEL_1_WRITE) return ERROR_INCORRECT_DIRECTION; + + // Make sure the channel depth is at most 1MB, power-of-2, and divisible by page_size + size_t queue_depth_upper_pow2 = (size_t)pow(2, ceil(log((double)queue_depth) / log(2.))); + size_t channel_depth = (queue_depth_upper_pow2 >= HOSTCH_MAX_BUF_SIZE) + ? 
HOSTCH_MAX_BUF_SIZE + : queue_depth_upper_pow2 & (HOSTCH_MAX_BUF_SIZE - PAGE_SIZE); + + // Make sure the circular buffer is at least 4KB + if (!channel_depth) channel_depth = PAGE_SIZE; + + // Check if pull channel was previously opened + if (m_hostch_pull_valid) { + ACL_PCIE_DEBUG_MSG_VERBOSE( + VERBOSITY_BLOCKTX, ":::: [HOST CHANNEL] Host Channel '%s' already open\n", ACL_HOST_CHANNEL_1_NAME); + return ERROR_CHANNEL_PREVIOUSLY_OPENED; + } + + // Create circular buffer + acl_aligned_malloc(&m_pull_queue, channel_depth); + + if (m_pull_queue == NULL) { + ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_BLOCKTX, ":::: [HOST CHANNEL] Internal buffer memory allocation failed.\n"); + return -1; + } + + // Create buffer to hold front and end pointer of the circular buffer + acl_aligned_malloc((void **)&m_pull_queue_pointer, sizeof(size_t) * 2); + + if (m_pull_queue_pointer == NULL) { + ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_BLOCKTX, ":::: [HOST CHANNEL] Internal buffer memory allocation failed.\n"); + acl_aligned_free(m_pull_queue); + return -1; + } + + // Set pull channel parameters + m_pull_queue_size = channel_depth; + m_pull_queue_available = 0; + m_pull_queue_local_front_p = 0; + + m_pull_queue_front_p = m_pull_queue_pointer; + m_pull_queue_end_p = (m_pull_queue_pointer + 1); + + *m_pull_queue_front_p = 0; + *m_pull_queue_end_p = 0; + + // sync_thread() used to check if kernel thread is dead or alive when user pulls data + status = launch_sync_thread(); + if (status == -1) { + acl_aligned_free(m_pull_queue); + acl_aligned_free(m_pull_queue_pointer); + return -1; + } + +#if defined(LINUX) + // Send the pointers for the 2 buffers to driver, along with queue size, and initiate IP + struct acl_cmd driver_cmd; + int bytes_read; + driver_cmd.bar_id = ACLPCI_CMD_BAR; + driver_cmd.command = ACLPCI_CMD_HOSTCH_CREATE_WR; + driver_cmd.device_addr = m_pull_queue_pointer; + driver_cmd.user_addr = m_pull_queue; + driver_cmd.size = channel_depth; + bytes_read = read(m_handle, &driver_cmd, 
sizeof(driver_cmd)); + ACL_PCIE_ASSERT(bytes_read != -1, "error reading driver command.\n"); +#endif // LINUX +#if defined(WINDOWS) + m_dma->hostch_create(m_pull_queue, m_pull_queue_pointer, channel_depth, channel); +#endif // WINDOWS + + m_hostch_pull_valid = 1; + return channel; + } else { + ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_BLOCKTX, ":::: [HOST CHANNEL] Channel does not exist.\n"); + return ERROR_INVALID_CHANNEL; + } +} + +// Destroy Channel. Unlock all buffer, and set channel to invalid. +int ACL_PCIE_HOSTCH::destroy_hostchannel(int channel) { + if (channel == ACL_HOST_CHANNEL_0_ID) { + if (m_hostch_push_valid) { + // set pull IP to reset and unlock all buffers +#if defined(LINUX) + struct acl_cmd driver_cmd; + int bytes_read; + driver_cmd.bar_id = ACLPCI_CMD_BAR; + driver_cmd.command = ACLPCI_CMD_HOSTCH_DESTROY_RD; + driver_cmd.device_addr = NULL; + driver_cmd.user_addr = NULL; + driver_cmd.size = 0; + bytes_read = read(m_handle, &driver_cmd, sizeof(driver_cmd)); + ACL_PCIE_ASSERT(bytes_read != -1, "error reading driver command.\n"); +#endif // LINUX +#if defined(WINDOWS) + m_dma->hostch_destroy(channel); +#endif // WINDOWS + + if (m_push_queue) { + acl_aligned_free(m_push_queue); + m_push_queue = NULL; + } + if (m_push_queue_pointer) { + acl_aligned_free(m_push_queue_pointer); + m_push_queue_pointer = NULL; + } + + m_hostch_push_valid = 0; + if (m_hostch_pull_valid == 0) { + destroy_sync_thread(); + } + return 0; + } else { + ACL_PCIE_DEBUG_MSG_VERBOSE( + VERBOSITY_BLOCKTX, ":::: [HOST CHANNEL] Host Channel %s is not open.\n", ACL_HOST_CHANNEL_0_NAME); + return ERROR_CHANNEL_CLOSED; + } + } else if (channel == ACL_HOST_CHANNEL_1_ID) { + if (m_hostch_pull_valid) { +#if defined(LINUX) + // set push IP to reset and unlock all buffers + struct acl_cmd driver_cmd; + int bytes_read; + driver_cmd.bar_id = ACLPCI_CMD_BAR; + driver_cmd.command = ACLPCI_CMD_HOSTCH_DESTROY_WR; + driver_cmd.device_addr = NULL; + driver_cmd.user_addr = NULL; + driver_cmd.size = 0; + 
bytes_read = read(m_handle, &driver_cmd, sizeof(driver_cmd)); + ACL_PCIE_ASSERT(bytes_read != -1, "error reading driver command.\n"); +#endif // LINUX +#if defined(WINDOWS) + m_dma->hostch_destroy(channel); +#endif // WINDOWS + + if (m_pull_queue) { + acl_aligned_free(m_pull_queue); + m_pull_queue = NULL; + } + + if (m_pull_queue_pointer) { + acl_aligned_free(m_pull_queue_pointer); + m_pull_queue_pointer = NULL; + } + + m_hostch_pull_valid = 0; + + if (m_hostch_push_valid == 0) { + destroy_sync_thread(); + } + + return 0; + } else { + ACL_PCIE_DEBUG_MSG_VERBOSE( + VERBOSITY_BLOCKTX, ":::: [HOST CHANNEL] Host Channel %s is not open.\n", ACL_HOST_CHANNEL_1_NAME); + return ERROR_CHANNEL_CLOSED; + } + } else { + ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_BLOCKTX, ":::: [HOST CHANNEL] Channel with ID %i does not exist.\n", channel); + } + + return ERROR_INVALID_CHANNEL; +} + +// Call for user to get pointer to location in circular buffer +// User can then write data or read data from the buffer, depending on direction. +void *ACL_PCIE_HOSTCH::get_buffer(size_t *buffer_size, int channel, int *status) { + // Check if channel exists + if (channel == ACL_HOST_CHANNEL_0_ID) { + // Check if channel was created + if (m_hostch_push_valid == 0) { + ACL_PCIE_DEBUG_MSG_VERBOSE( + VERBOSITY_BLOCKTX, ":::: [HOST CHANNEL] Host Channel %s is not open.\n", ACL_HOST_CHANNEL_0_NAME); + *status = ERROR_CHANNEL_CLOSED; + *buffer_size = 0; + return NULL; + } + *status = 0; + + char *temp_input_queue = (char *)m_push_queue; + + size_t push_queue_end, push_queue_front; + + // m_push_queue_front_p is directly updated by host channel IP + // through write over Txs. 
Save value in local variable, + // so it doesn't get modified in middle of get_buffer call + push_queue_end = *m_push_queue_end_p; + push_queue_front = *m_push_queue_front_p; + + // Calculate available free space in host to device push buffer + size_t push_buf_avail; + if (push_queue_end > push_queue_front) + push_buf_avail = m_push_queue_size - push_queue_end + push_queue_front - 32; + else if (push_queue_end < push_queue_front) + push_buf_avail = push_queue_front - push_queue_end - 32; + else + push_buf_avail = m_push_queue_size - 32; + + // Calculate how much of the free space is before loop around and after loop around + size_t cont_push = (m_push_queue_size > m_push_queue_local_end_p + push_buf_avail) + ? push_buf_avail + : m_push_queue_size - m_push_queue_local_end_p; + size_t loop_push = (m_push_queue_size > m_push_queue_local_end_p + push_buf_avail) + ? 0 + : (m_push_queue_local_end_p + push_buf_avail - m_push_queue_size); + + // Return to user the pointer to circular buffer for + // space that's available without loop around + if (cont_push > 0) { + *buffer_size = cont_push; + return temp_input_queue + m_push_queue_local_end_p; + } else if (loop_push > 0) { + *buffer_size = loop_push; + return temp_input_queue; + } else { + *status = 0; + *buffer_size = 0; + + // See if the driver thread is still running + sync_thread(); + + return NULL; + } + } else if (channel == ACL_HOST_CHANNEL_1_ID) { + if (m_hostch_pull_valid == 0) { + ACL_PCIE_DEBUG_MSG_VERBOSE( + VERBOSITY_BLOCKTX, ":::: [HOST CHANNEL] Host Channel %s is not open.\n", ACL_HOST_CHANNEL_1_NAME); + *status = ERROR_CHANNEL_CLOSED; + *buffer_size = 0; + return NULL; + } + *status = 0; + + char *temp_output_queue = (char *)m_pull_queue; + + size_t pull_queue_end, pull_queue_front; + + // m_pull_queue_end_p is directly updated by host channel IP + // through write over Txs. 
Save value in local variable, + // so it doesn't get modified in middle of get_buffer call + pull_queue_end = *m_pull_queue_end_p; + pull_queue_front = *m_pull_queue_front_p; + + // Calculate available new data in device to host pull buffer + if (pull_queue_end > pull_queue_front) + m_pull_queue_available = pull_queue_end - pull_queue_front; + else if (pull_queue_end < pull_queue_front) + m_pull_queue_available = m_pull_queue_size - pull_queue_front + pull_queue_end; + else + m_pull_queue_available = 0; + + // Calculate how much of the data is before loop around and after loop around + size_t cont_pull = (m_pull_queue_size > m_pull_queue_local_front_p + m_pull_queue_available) + ? m_pull_queue_available + : (m_pull_queue_size - m_pull_queue_local_front_p); + size_t loop_pull = (m_pull_queue_size > m_pull_queue_local_front_p + m_pull_queue_available) + ? 0 + : (m_pull_queue_local_front_p + m_pull_queue_available - m_pull_queue_size); + + // Return to user the pointer to circular buffer for + // data that's available without loop around + if (cont_pull > 0) { + *buffer_size = cont_pull; + return temp_output_queue + m_pull_queue_local_front_p; + } else if (loop_pull > 0) { + *buffer_size = loop_pull; + return temp_output_queue; + } else { + *buffer_size = 0; + + // See if the driver thread is still running + sync_thread(); + + return NULL; + } + } else { + ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_BLOCKTX, ":::: [HOST CHANNEL] Channel with ID %i does not exist.\n", channel); + *status = ERROR_INVALID_CHANNEL; + *buffer_size = 0; + return NULL; + } +} + +// User has acknowledged the buffer, meaning data was written to or read from the buffter. +// Hand off to API using end pointer if push channel, and front pointer if pull channel. 
// @param send_size - bytes the user claims to have written (push) or read (pull)
// @param channel - channel handle returned by create_hostchannel
// @param status - out: 0 or a negative error code
// @return bytes actually committed (clamped to available space/data and to the
//         contiguous region before wrap-around)
size_t ACL_PCIE_HOSTCH::ack_buffer(size_t send_size, int channel, int *status) {
  if (channel == ACL_HOST_CHANNEL_0_ID) {
    if (m_hostch_push_valid == 0) {
      ACL_PCIE_DEBUG_MSG_VERBOSE(
          VERBOSITY_BLOCKTX, ":::: [HOST CHANNEL] Host Channel %s is not open.\n", ACL_HOST_CHANNEL_0_NAME);
      *status = ERROR_CHANNEL_CLOSED;
      return 0;
    }
    *status = 0;

    size_t push_queue_end, push_queue_front;

    // Same calculations as get buffer call to see how much
    // space is available in MMD circular buffer
    push_queue_end = *m_push_queue_end_p;
    push_queue_front = *m_push_queue_front_p;

    // 32 here is the channel word size in bytes; one word is kept free so
    // a full buffer is distinguishable from an empty one
    // (NOTE(review): assumed from the /32 word math below — confirm against IP spec)
    size_t push_buf_avail;
    if (push_queue_end > push_queue_front)
      push_buf_avail = m_push_queue_size - push_queue_end + push_queue_front - 32;
    else if (push_queue_end < push_queue_front)
      push_buf_avail = push_queue_front - push_queue_end - 32;
    else
      push_buf_avail = m_push_queue_size - 32;

    // Check to see if user wants to send more than the space available in buffer
    // Choose lesser of the two to send; only whole 32-byte words are committed
    size_t user_words = send_size / 32;
    size_t current_push = ((user_words * 32) > push_buf_avail) ? push_buf_avail : (user_words * 32);

    // User can't write back to beginning of MMD buffer, since they can't loop around from the pointer
    // they got from get_buffer. Only send up to the end of MMD circular buffer to host channel IP
    size_t cont_push = (m_push_queue_size > m_push_queue_local_end_p + current_push)
                           ? current_push
                           : (m_push_queue_size - m_push_queue_local_end_p);

    // Update the end index that the driver thread will read, to write the update to host channel IP
    // and loop around
    m_push_queue_local_end_p =
        (m_push_queue_local_end_p + current_push >= m_push_queue_size) ? 0 : m_push_queue_local_end_p + current_push;
    *m_push_queue_end_p = m_push_queue_local_end_p;

    // See if the driver thread is still running
    sync_thread();

    return cont_push;
  } else if (channel == ACL_HOST_CHANNEL_1_ID) {
    if (m_hostch_pull_valid == 0) {
      ACL_PCIE_DEBUG_MSG_VERBOSE(
          VERBOSITY_BLOCKTX, ":::: [HOST CHANNEL] Host Channel %s is not open.\n", ACL_HOST_CHANNEL_1_NAME);
      *status = ERROR_CHANNEL_CLOSED;
      return 0;
    }
    *status = 0;

    size_t driver_pulled;

    size_t pull_queue_end, pull_queue_front;

    // Same calculations as get buffer call to see how much
    // data is available in MMD circular buffer
    pull_queue_end = *m_pull_queue_end_p;
    pull_queue_front = *m_pull_queue_front_p;

    if (pull_queue_end > pull_queue_front)
      m_pull_queue_available = pull_queue_end - pull_queue_front;
    else if (pull_queue_end < pull_queue_front)
      m_pull_queue_available = m_pull_queue_size - pull_queue_front + pull_queue_end;
    else
      m_pull_queue_available = 0;

    // Check to see if user read more than the data available in buffer
    // Choose lesser of the two to tell the user how much was actually
    // freed up for host channel IP to write to.
    driver_pulled = (send_size > m_pull_queue_available) ? m_pull_queue_available : send_size;

    // User can't loop around and read from the beginning of MMD buffer
    // Tell the host channel IP that the buffer is free, only up to the end of the circular buffer
    size_t cont_pull = (m_pull_queue_size > m_pull_queue_local_front_p + driver_pulled)
                           ? driver_pulled
                           : (m_pull_queue_size - m_pull_queue_local_front_p);

    // Update the front index that the driver thread will read, to write the update to host channel IP
    // and loop around
    m_pull_queue_local_front_p = (m_pull_queue_local_front_p + driver_pulled >= m_pull_queue_size)
                                     ? 0
                                     : m_pull_queue_local_front_p + driver_pulled;
    *m_pull_queue_front_p = m_pull_queue_local_front_p;

    // See if the driver thread is still running
    sync_thread();

    return cont_pull;
  } else {
    ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_BLOCKTX, ":::: [HOST CHANNEL] Channel with ID %i does not exist.\n", channel);
    *status = ERROR_INVALID_CHANNEL;
    return 0;
  }
}
diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_hostch.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_hostch.h
new file mode 100644
index 0000000..e86fa61
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_hostch.h
@@ -0,0 +1,136 @@
#ifndef ACL_PCIE_HOSTCH_H
#define ACL_PCIE_HOSTCH_H

/* (c) 1992-2021 Intel Corporation. */
/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */
/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */
/* and/or other countries. Other marks and brands may be claimed as the property */
/* of others. See Trademarks on intel.com for full list of Intel trademarks or */
/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */
/* Your use of Intel Corporation's design tools, logic functions and other */
/* software and tools, and its AMPP partner logic functions, and any output */
/* files any of the foregoing (including device programming or simulation */
/* files), and any associated documentation or information are expressly subject */
/* to the terms and conditions of the Altera Program License Subscription */
/* Agreement, Intel MegaCore Function License Agreement, or other applicable */
/* license agreement, including, without limitation, that your use is for the */
/* sole purpose of programming logic devices manufactured by Intel and sold by */
/* Intel or its authorized distributors. Please refer to the applicable */
/* agreement for further details.
*/

/* ===- acl_pcie_hostch.h -------------------------------------------- C++ -*-=== */
/*                                                                               */
/* Intel(R) OpenCL MMD Driver                                                    */
/*                                                                               */
/* ===-------------------------------------------------------------------------=== */
/*                                                                               */
/* This file declares the class to handle the host channel (streaming)          */
/* interface of the MMD.                                                        */
/* The actual implementation of the class lives in the acl_pcie_hostch.cpp      */
/*                                                                               */
/* ===-------------------------------------------------------------------------=== */

#ifdef DLA_MMD
#include <cstddef>  // size_t
#if defined(LINUX)
typedef int fpga_handle;
#else
#include <opae/fpga.h>
#endif
#endif

class ACL_PCIE_DEVICE;
class ACL_PCIE_MM_IO_MGR;
class ACL_PCIE_TIMER;
class ACL_PCIE_DMA;

// Manages the host channel (streaming) interface: a pair of page-aligned
// circular buffers shared with the host channel IP, one per direction
// (push = host to device, pull = device to host).
class ACL_PCIE_HOSTCH {
 public:
  ACL_PCIE_HOSTCH(fpga_handle handle, ACL_PCIE_MM_IO_MGR *io, ACL_PCIE_DEVICE *pcie, ACL_PCIE_DMA *dma);

  ~ACL_PCIE_HOSTCH();

  // Initialize host channel specified by name, and return handle to it
  int create_hostchannel(char *name, size_t queue_depth, int direction);

  // Destroy host channel specified by channel handle
  // return 0 on success and negative otherwise
  int destroy_hostchannel(int channel);

  // Provide pointer to user with pointer to write and read to host channel
  // IP with. Pointer is pointer to MMD circular buffer, that's pre-pinned.
  // Address of this pre-pinned memory is transferred to IP during create
  void *get_buffer(size_t *buffer_size, int channel, int *status);

  // Acknowledge from user that send_size bytes of data has be written to
  // or read from host channel MMD buffer, that's provided by the channel
  // handle. This will move end index for push channel, and front index for
  // pull channel
  size_t ack_buffer(size_t send_size, int channel, int *status);

 private:
  // Non-copyable: instances own per-device pinned buffers.
  ACL_PCIE_HOSTCH &operator=(const ACL_PCIE_HOSTCH &) { return *this; }

  ACL_PCIE_HOSTCH(const ACL_PCIE_HOSTCH &src) {}

  // Host Channel version of programmed device
  unsigned int get_hostch_version();

  // Helper functions to see if the thread that updates
  // host channel IP with user's buffer updates, is still running
  int launch_sync_thread();
  int sync_thread();
  void destroy_sync_thread();

  fpga_handle m_handle;
  ACL_PCIE_DEVICE *m_pcie;
  ACL_PCIE_MM_IO_MGR *m_io;
  ACL_PCIE_DMA *m_dma;

  ACL_PCIE_TIMER *m_timer;
  int m_use_timer;  // set from ACL_PCIE_DMA_TIMER environment variable

  // Host Channel valid
  // If channel is open, equal to 1
  int m_hostch_push_valid;
  int m_hostch_pull_valid;

  // Input Queue
  // Write data into circular buffer in MMD, that host channel
  // can read from
  void *m_push_queue;
  size_t m_push_queue_local_end_p;
  size_t m_push_queue_size;

  // Information to track input queue
  void *m_pull_queue;
  size_t m_pull_queue_local_front_p;
  size_t m_pull_queue_size;
  size_t m_pull_queue_available;

  // Shared front and end pointer with driver
  // Circular buffer in MMD that the host channel IP can
  // write into. Host will then read from it
  size_t *m_pull_queue_pointer;
  size_t *m_push_queue_pointer;

  size_t *m_pull_queue_front_p;
  size_t *m_pull_queue_end_p;
  size_t *m_push_queue_front_p;
  size_t *m_push_queue_end_p;

  // User space memory that Linux kernel space has write
  // access to. Since the MMD buffer is circular, whenever
  // user writes to reads from it, the index for end and front
  // changes, respectively. This needs to be sent to host channel IP
  // and the thread in driver handles that. However, this thread will
  // die after 1ms of inactivity to free up the CPU. When it does that,
  // it will write to m_sync_thread with value of 0, so that MMD knows to
  // launch it again, for subsequent get_buffer and ack_buffer calls.
  int m_sync_thread_valid;
  size_t *m_sync_thread;
};

// Page-aligned allocation helpers (posix_memalign / _aligned_malloc).
void acl_aligned_malloc(void **result, size_t size);
void acl_aligned_free(void *ptr);

#endif  // ACL_PCIE_HOSTCH_H
diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_mm_io.cpp b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_mm_io.cpp
new file mode 100644
index 0000000..92c9cf0
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_mm_io.cpp
@@ -0,0 +1,556 @@
// (c) 1992-2021 Intel Corporation.
// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words
// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S.
// and/or other countries. Other marks and brands may be claimed as the property
// of others. See Trademarks on intel.com for full list of Intel trademarks or
// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera)
// Your use of Intel Corporation's design tools, logic functions and other
// software and tools, and its AMPP partner logic functions, and any output
// files any of the foregoing (including device programming or simulation
// files), and any associated documentation or information are expressly subject
// to the terms and conditions of the Altera Program License Subscription
// Agreement, Intel MegaCore Function License Agreement, or other applicable
// license agreement, including, without limitation, that your use is for the
// sole purpose of programming logic devices manufactured by Intel and sold by
// Intel or its authorized distributors. Please refer to the applicable
// agreement for further details.
+ +/* ===- acl_pcie_mm_io.cpp ------------------------------------------- C++ -*-=== */ +/* */ +/* Intel(R) OpenCL MMD Driver */ +/* */ +/* ===-------------------------------------------------------------------------=== */ +/* */ +/* This file implements the class to handle memory mapped IO over PCIe. */ +/* The declaration of the class lives in the acl_pcie_mm_io.h. */ +/* */ +/* ===-------------------------------------------------------------------------=== */ + +// common and its own header files +#include "acl_pcie_mm_io.h" +#include "acl_pcie.h" + +// other header files inside MMD driver +#include "acl_pcie_debug.h" + +// other standard header files +#include <string.h> + +#if defined(LINUX) +#include <unistd.h> // template +#endif // LINUX + +ACL_PCIE_MM_IO_DEVICE::ACL_PCIE_MM_IO_DEVICE( + fpga_handle handle, DWORD bar, KPTR device_offset, const char *name, bool diff_endian) { + ACL_PCIE_ASSERT(handle != INVALID_HANDLE_VALUE, "passed in an invalid handle when creating mm_io object.\n"); + +#if defined(WINDOWS) + strncpy_s(m_name, MAX_NAME_LENGTH - 1, name, (MAX_NAME_LENGTH - 1)); +#else + strncpy(m_name, name, (MAX_NAME_LENGTH - 1)); +#endif + m_name[(MAX_NAME_LENGTH - 1)] = '\0'; + + m_handle = handle; + m_bar = bar; + m_offset = device_offset; + m_diff_endian = diff_endian; + + ACL_PCIE_DEBUG_MSG(":: [%s] Init: Bar " DWORD_FMT_U ", Total offset 0x%zu, diff_endian is %d \n", + m_name, + m_bar, + (size_t)m_offset, + m_diff_endian ? 
                     1 : 0);
}

ACL_PCIE_MM_IO_DEVICE::~ACL_PCIE_MM_IO_DEVICE() {}

#if defined(LINUX)
// Helper functions to implement all other read/write functions
//
// linux_read: reads sizeof(T) bytes from the given BAR/address on the device
// into *data by issuing a single command to the kernel driver via read().
// Returns the driver's status (FPGA_OK / 0 on success).
template <typename T>
DWORD linux_read(fpga_handle device, DWORD bar, KPTR address, T *data) {
  struct acl_cmd driver_cmd;
  driver_cmd.bar_id = bar;
  driver_cmd.command = ACLPCI_CMD_DEFAULT;
  driver_cmd.device_addr = reinterpret_cast<void *>(address);
  driver_cmd.user_addr = data;
  driver_cmd.size = sizeof(*data);
  // Functions invoking linux_read will not write to global memory.
  // So is_diff_endian is always false
  driver_cmd.is_diff_endian = 0;

  return read(device, &driver_cmd, sizeof(driver_cmd));
}

// linux_write: writes the sizeof(T)-byte value `data` to the given BAR/address
// on the device by issuing a single command to the kernel driver via write().
// Note: `data` is taken by value, so user_addr points at this stack copy for
// the duration of the (synchronous) write() call only.
// Returns the driver's status (FPGA_OK / 0 on success).
template <typename T>
DWORD linux_write(fpga_handle device, DWORD bar, KPTR address, T data) {
  struct acl_cmd driver_cmd;
  driver_cmd.bar_id = bar;
  driver_cmd.command = ACLPCI_CMD_DEFAULT;
  driver_cmd.device_addr = reinterpret_cast<void *>(address);
  driver_cmd.user_addr = &data;
  driver_cmd.size = sizeof(data);
  // Functions invoking linux_write will not write to global memory.
+ // So is_diff_endian is always false + driver_cmd.is_diff_endian = 0; + + return write(device, &driver_cmd, sizeof(driver_cmd)); +} +#endif // LINUX + +int ACL_PCIE_MM_IO_DEVICE::read8(size_t addr, UINT8 *data) { + fpga_result status; + KPTR bar_addr = convert_to_bar_addr(addr); +#if defined(WINDOWS) + status = fpgaReadMmio(m_handle, m_bar, bar_addr, (PVOID)data, sizeof(UINT8)); +#endif // WINDOWS +#if defined(LINUX) + status = linux_read(m_handle, m_bar, bar_addr, data); +#endif // LINUX + + ACL_PCIE_ERROR_IF(status != FPGA_OK, + return -1, + "[%s] Read 8 bits from 0x" SIZE_FMT_X " (0x" SIZE_FMT_X " with offset)\n", + m_name, + addr, + (size_t)bar_addr); + ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_PCIE, + ":::::: [%s] Read 8 bits (0x%x) from 0x" SIZE_FMT_X " (0x" SIZE_FMT_X " with offset)\n", + m_name, + *data, + addr, + (size_t)bar_addr); + + return 0; // success +} + +int ACL_PCIE_MM_IO_DEVICE::write8(size_t addr, UINT8 data) { + fpga_result status; + KPTR bar_addr = convert_to_bar_addr(addr); +#if defined(WINDOWS) + status = fpgaWriteMmio(m_handle, m_bar, bar_addr, (PVOID)&data, sizeof(UINT8)); +#endif // WINDOWS +#if defined(LINUX) + status = linux_write(m_handle, m_bar, bar_addr, data); +#endif // LINUX + + ACL_PCIE_ERROR_IF(status != FPGA_OK, + return -1, + "[%s] Writing 8 bits to 0x" SIZE_FMT_X " (0x" SIZE_FMT_X " with offset)\n", + m_name, + addr, + (size_t)bar_addr); + ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_PCIE, + ":::::: [%s] Wrote 8 bits (0x%x) to 0x" SIZE_FMT_X " (0x" SIZE_FMT_X " with offset)\n", + m_name, + data, + addr, + (size_t)bar_addr); + + return 0; // success +} + +int ACL_PCIE_MM_IO_DEVICE::read16(size_t addr, UINT16 *data) { + fpga_result status; + KPTR bar_addr = convert_to_bar_addr(addr); +#if defined(WINDOWS) + status = fpgaReadMmio(m_handle, m_bar, bar_addr, (PVOID)data, sizeof(UINT16)); +#endif // WINDOWS +#if defined(LINUX) + status = linux_read(m_handle, m_bar, bar_addr, data); +#endif // LINUX + + ACL_PCIE_ERROR_IF(status != FPGA_OK, + 
return -1, + "[%s] Read 16 bits from 0x" SIZE_FMT_X " (0x" SIZE_FMT_X " with offset)\n", + m_name, + addr, + (size_t)bar_addr); + ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_PCIE, + ":::::: [%s] Read 16 bits (0x%x) from 0x" SIZE_FMT_X " (0x" SIZE_FMT_X " with offset)\n", + m_name, + *data, + addr, + (size_t)bar_addr); + + return 0; // success +} + +int ACL_PCIE_MM_IO_DEVICE::write16(size_t addr, UINT16 data) { + fpga_result status; + KPTR bar_addr = convert_to_bar_addr(addr); +#if defined(WINDOWS) + status = fpgaWriteMmio(m_handle, m_bar, bar_addr, (PVOID)&data, sizeof(UINT16)); +#endif // WINDOWS +#if defined(LINUX) + status = linux_write(m_handle, m_bar, bar_addr, data); +#endif // LINUX + + ACL_PCIE_ERROR_IF(status != FPGA_OK, + return -1, + "[%s] Writing 16 bits to 0x" SIZE_FMT_X " (0x" SIZE_FMT_X " with offset)\n", + m_name, + addr, + (size_t)bar_addr); + ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_PCIE, + ":::::: [%s] Wrote 16 bits (0x%x) to 0x" SIZE_FMT_X " (0x" SIZE_FMT_X " with offset)\n", + m_name, + data, + addr, + (size_t)bar_addr); + + return 0; // success +} + +int ACL_PCIE_MM_IO_DEVICE::read32(size_t addr, UINT32 *data) { + fpga_result status; + KPTR bar_addr = convert_to_bar_addr(addr); +#if defined(WINDOWS) + status = fpgaReadMMIO32(m_handle, m_bar, bar_addr, data); +#endif // WINDOWS +#if defined(LINUX) + status = linux_read(m_handle, m_bar, bar_addr, data); +#endif // LINUX + + ACL_PCIE_ERROR_IF(status != FPGA_OK, + return -1, + "[%s] Read 32 bits from 0x" SIZE_FMT_X " (0x" SIZE_FMT_X " with offset)\n", + m_name, + addr, + (size_t)bar_addr); + ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_PCIE, + ":::::: [%s] Read 32 bits (0x%x) from 0x" SIZE_FMT_X " (0x" SIZE_FMT_X " with offset)\n", + m_name, + *data, + addr, + (size_t)bar_addr); + + return 0; // success +} + +int ACL_PCIE_MM_IO_DEVICE::write32(size_t addr, UINT32 data) { + fpga_result status; + KPTR bar_addr = convert_to_bar_addr(addr); +#if defined(WINDOWS) + status = fpgaWriteMMIO32(m_handle, m_bar, bar_addr, 
data); +#endif // WINDOWS +#if defined(LINUX) + status = linux_write(m_handle, m_bar, bar_addr, data); +#endif // LINUX + + ACL_PCIE_ERROR_IF(status != FPGA_OK, + return -1, + "[%s] Writing 32 bits to 0x" SIZE_FMT_X " (0x" SIZE_FMT_X " with offset)\n", + m_name, + addr, + (size_t)bar_addr); + ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_PCIE, + ":::::: [%s] Wrote 32 bits (0x%x) to 0x" SIZE_FMT_X " (0x" SIZE_FMT_X " with offset)\n", + m_name, + data, + addr, + (size_t)bar_addr); + + return 0; // success +} + +int ACL_PCIE_MM_IO_DEVICE::read64(size_t addr, UINT64 *data) { + fpga_result status; + KPTR bar_addr = convert_to_bar_addr(addr); +#if defined(WINDOWS) + // Original code had a 32-bit Read + status = fpgaReadMmio(m_handle, m_bar, bar_addr, data, 8); + +#endif // WINDOWS +#if defined(LINUX) + status = linux_read(m_handle, m_bar, bar_addr, data); +#endif // LINUX + + ACL_PCIE_ERROR_IF(status != FPGA_OK, + return -1, + "[%s] Read 64 bits from 0x" SIZE_FMT_X " (0x" SIZE_FMT_X " with offset)\n", + m_name, + addr, + (size_t)bar_addr); + ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_PCIE, + ":::::: [%s] Read 64 bits (0x%llx) from 0x" SIZE_FMT_X " (0x" SIZE_FMT_X " with offset)\n", + m_name, + *data, + addr, + (size_t)bar_addr); + + return 0; // success +} + +int ACL_PCIE_MM_IO_DEVICE::write64(size_t addr, UINT64 data) { + fpga_result status; + KPTR bar_addr = convert_to_bar_addr(addr); +#if defined(WINDOWS) + // Original code had a 32-bit Write + status = fpgaWriteMmio(m_handle, m_bar, bar_addr, (void *)&data, 8); + +#endif // WINDOWS +#if defined(LINUX) + status = linux_write(m_handle, m_bar, bar_addr, data); +#endif // LINUX + + ACL_PCIE_ERROR_IF(status != FPGA_OK, + return -1, + "[%s] Writing 64 bits to 0x" SIZE_FMT_X " (0x" SIZE_FMT_X " with offset)\n", + m_name, + bar_addr, + (size_t)bar_addr); + ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_PCIE, + ":::::: [%s] Wrote 64 bits (0x%llx) to 0x" SIZE_FMT_X " (0x" SIZE_FMT_X " with offset)\n", + m_name, + data, + addr, + (size_t)bar_addr); + + 
return 0; // success +} + +int ACL_PCIE_MM_IO_DEVICE::write_block(size_t addr, size_t size, void *src) { + fpga_result status; + KPTR bar_addr = convert_to_bar_addr(addr); + + ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_PCIE, + ":::::: [%s] Writing block (" SIZE_FMT_U " bytes) to 0x" SIZE_FMT_X " (0x" SIZE_FMT_X + " with offset)\n", + m_name, + size, + addr, + (size_t)bar_addr); + +#if defined(WINDOWS) + DWORD FP_size = static_cast<DWORD>(size); + size_t alignment_size = size % 4; + DWORD FP_alignment_size = static_cast<DWORD>(alignment_size); + // 32-bit MMIO Write + status = fpgaWriteMmio(m_handle, m_bar, bar_addr, src, FP_size - FP_alignment_size); + if (alignment_size) { + void *alignment_addr = compute_address(src, size - alignment_size); + KPTR alignment_bar_addr = bar_addr + size - alignment_size; + status = fpgaWriteMmio(m_handle, m_bar, alignment_bar_addr, alignment_addr, FP_alignment_size); + } + +#endif // WINDOWS +#if defined(LINUX) + // Can't use templated linux_write here because *src doesn't give you the size to read. + struct acl_cmd driver_cmd {}; + driver_cmd.bar_id = m_bar; + driver_cmd.device_addr = reinterpret_cast<void *>(bar_addr); + driver_cmd.user_addr = src; + driver_cmd.size = size; + // Notify the driver if the host and device's memory have different endianess. + driver_cmd.is_diff_endian = m_diff_endian ? 
1 : 0; + status = write(m_handle, &driver_cmd, sizeof(driver_cmd)); +#endif // LINUX + + ACL_PCIE_ERROR_IF(status != FPGA_OK, + return -1, + "[%s] Writing block (" SIZE_FMT_U " bytes) to 0x" SIZE_FMT_X " (0x" SIZE_FMT_X " with offset)\n", + m_name, + size, + addr, + (size_t)bar_addr); + return 0; // success +} + +inline void *ACL_PCIE_MM_IO_DEVICE::compute_address(void *base, uintptr_t offset) { + uintptr_t p = reinterpret_cast<uintptr_t>(base); + return reinterpret_cast<void *>(p + offset); +} + +int ACL_PCIE_MM_IO_DEVICE::read_block(size_t addr, size_t size, void *dst) { + DWORD status; + KPTR bar_addr = convert_to_bar_addr(addr); + + ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_PCIE, + ":::::: [%s] Reading block (" SIZE_FMT_U " bytes) from 0x" SIZE_FMT_X " (0x" SIZE_FMT_X + " with offset)\n", + m_name, + size, + addr, + (size_t)bar_addr); + +#if defined(WINDOWS) + DWORD FP_size = static_cast<DWORD>(size); + size_t alignment_size = size % 4; + DWORD FP_alignment_size = static_cast<DWORD>(alignment_size); + // 32-bit MMIO Read + status = fpgaReadMmio(m_handle, m_bar, bar_addr, dst, FP_size - FP_alignment_size); + if (alignment_size) { + void *alignment_addr = compute_address(dst, size - alignment_size); + KPTR alignment_bar_addr = bar_addr + size - alignment_size; + status |= fpgaReadMmio(m_handle, m_bar, alignment_bar_addr, alignment_addr, FP_alignment_size); + } + +#endif // WINDOWS +#if defined(LINUX) + // Can't use templated linux_write here because *src doesn't give you the size to read. + struct acl_cmd driver_cmd; + driver_cmd.bar_id = m_bar; + driver_cmd.device_addr = reinterpret_cast<void *>(bar_addr); + driver_cmd.user_addr = dst; + driver_cmd.size = size; + // Notify the driver if the host and device's memory have different endianess. + driver_cmd.is_diff_endian = m_diff_endian ? 
1 : 0; + status = read(m_handle, &driver_cmd, sizeof(driver_cmd)); +#endif // LINUX + + ACL_PCIE_ERROR_IF(status != FPGA_OK, + return -1, + "[%s] Reading block (" SIZE_FMT_U " bytes) to 0x" SIZE_FMT_X " (0x" SIZE_FMT_X " with offset)\n", + m_name, + size, + addr, + (size_t)bar_addr); + return 0; // success +} + +ACL_PCIE_MM_IO_MGR::ACL_PCIE_MM_IO_MGR(fpga_handle handle) + : mem(NULL), + pcie_cra(NULL), + window(NULL), + version(NULL), + pr_base_id(NULL), + pr_region_ctrl(NULL), + quartus_ver(NULL), + cade_id(NULL), + uniphy_status(NULL), + uniphy_reset(NULL), + kernel_if(NULL), + pll(NULL), + temp_sensor(NULL), + hostch_ver(NULL) { + ACL_PCIE_ASSERT(handle != INVALID_HANDLE_VALUE, "passed in an invalid device when creating mm_io_mgr.\n"); + + // This is the PCIe's interface for directly accessing memory (which is + // significantly slower than using DMA). This view of memory is segmented + // so that the size of this address space can be smaller than the amount of + // physical device memory. The window interface controls which region of + // physical memory this interface currently maps to. + // The last flag indicate if the device on both side of transferring have + // different endianess. +#ifdef ACL_BIG_ENDIAN + mem = new ACL_PCIE_MM_IO_DEVICE(handle, ACL_PCI_GLOBAL_MEM_BAR, (KPTR)ACL_PCIE_MEMWINDOW_BASE, "GLOBAL-MEM", true); +#else + mem = new ACL_PCIE_MM_IO_DEVICE(handle, ACL_PCI_GLOBAL_MEM_BAR, (KPTR)ACL_PCIE_MEMWINDOW_BASE, "GLOBAL-MEM", false); +#endif + + // This is the CRA port of our PCIe controller. Used for configuring + // interrupts and things like that. + pcie_cra = new ACL_PCIE_MM_IO_DEVICE(handle, ACL_PCI_CRA_BAR, ACL_PCI_CRA_OFFSET, "PCIE-CRA"); + + // This interface sets the high order address bits for the PCIe's direct + // memory accesses via "mem" (above). 
+ window = new ACL_PCIE_MM_IO_DEVICE(handle, ACL_PCIE_MEMWINDOW_BAR, ACL_PCIE_MEMWINDOW_CRA, "MEMWINDOW"); + + // DMA interfaces + dma = new ACL_PCIE_MM_IO_DEVICE(handle, ACL_PCIE_DMA_INTERNAL_BAR, ACL_PCIE_DMA_INTERNAL_CTR_BASE, "DMA-CTR"); + + // Version ID check + version = new ACL_PCIE_MM_IO_DEVICE(handle, ACL_VERSIONID_BAR, ACL_VERSIONID_OFFSET, "VERSION"); + + // PR base ID check + pr_base_id = new ACL_PCIE_MM_IO_DEVICE(handle, ACL_PRBASEID_BAR, ACL_PRBASEID_OFFSET, "PRBASEID"); + + // PR region controller + pr_region_ctrl = new ACL_PCIE_MM_IO_DEVICE(handle, ACL_PRREGIONFREEZE_BAR, ACL_PRREGIONFREEZE_OFFSET, "PRREGIONCTRL"); + + // Quartus Version + quartus_ver = new ACL_PCIE_MM_IO_DEVICE(handle, ACL_QUARTUSVER_BAR, ACL_QUARTUSVER_OFFSET, "QUARTUS-VERSION"); + + // Quartus Version + hostch_ver = new ACL_PCIE_MM_IO_DEVICE(handle, ACL_HOSTCH_VERSION_BAR, ACL_HOSTCH_VERSION_OFFSET, "HOSTCH-VERSION"); + + // Cable auto detect ID + cade_id = new ACL_PCIE_MM_IO_DEVICE(handle, ACL_CADEID_BAR, ACL_CADEID_OFFSET, "CADEID"); + + // Uniphy Status + uniphy_status = new ACL_PCIE_MM_IO_DEVICE(handle, ACL_UNIPHYSTATUS_BAR, ACL_UNIPHYSTATUS_OFFSET, "UNIPHYSTATUS"); + + // Uniphy Reset + uniphy_reset = new ACL_PCIE_MM_IO_DEVICE(handle, ACL_UNIPHYRESET_BAR, ACL_UNIPHYRESET_OFFSET, "UNIPHYRESET"); + + // Kernel interface + // The DLA BSP eliminates the kernel interface present in the original PR Terasic BSP + // We reuse the kernel_if object here to simplify the DLA-specific changes required +#ifdef DLA_MMD + kernel_if = new ACL_PCIE_MM_IO_DEVICE(handle, ACL_KERNEL_CSR_BAR, ACL_DLA_CSR_OFFSET, "KERNEL"); +#else + kernel_if = new ACL_PCIE_MM_IO_DEVICE(handle, ACL_KERNEL_CSR_BAR, ACL_KERNEL_CSR_OFFSET, "KERNEL"); +#endif // DLA_MMD + + // PLL interface + pll = new ACL_PCIE_MM_IO_DEVICE(handle, ACL_PCIE_KERNELPLL_RECONFIG_BAR, ACL_PCIE_KERNELPLL_RECONFIG_OFFSET, "PLL"); + + // temperature sensor +#ifdef ACL_PCIE_HAS_TEMP_SENSOR + temp_sensor = new ACL_PCIE_MM_IO_DEVICE(handle, 
ACL_VERSIONID_BAR, ACL_PCIE_TEMP_SENSOR_ADDRESS, "TEMP-SENSOR"); +#endif +} + +ACL_PCIE_MM_IO_MGR::~ACL_PCIE_MM_IO_MGR() { + if (mem) { + delete mem; + mem = NULL; + } + if (pcie_cra) { + delete pcie_cra; + pcie_cra = NULL; + } + if (window) { + delete window; + window = NULL; + } + if (version) { + delete version; + version = NULL; + } + if (pr_base_id) { + delete pr_base_id; + pr_base_id = NULL; + } + if (pr_region_ctrl) { + delete pr_region_ctrl; + pr_region_ctrl = NULL; + } + if (quartus_ver) { + delete quartus_ver; + quartus_ver = NULL; + } + if (cade_id) { + delete cade_id; + cade_id = NULL; + } + if (uniphy_status) { + delete uniphy_status; + uniphy_status = NULL; + } + if (uniphy_reset) { + delete uniphy_reset; + uniphy_reset = NULL; + } + if (kernel_if) { + delete kernel_if; + kernel_if = NULL; + } + if (pll) { + delete pll; + pll = NULL; + } + if (temp_sensor) { + delete temp_sensor; + temp_sensor = NULL; + } + if (hostch_ver) { + delete hostch_ver; + hostch_ver = NULL; + } + if (dma) { + delete dma; + dma = NULL; + } +} diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_mm_io.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_mm_io.h new file mode 100644 index 0000000..4db5599 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_mm_io.h @@ -0,0 +1,109 @@ +#ifndef ACL_PCIE_MM_IO_H +#define ACL_PCIE_MM_IO_H + +/* (c) 1992-2021 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. 
See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. */ + +/* ===- acl_pcie_mm_io.h --------------------------------------------- C++ -*-=== */ +/* */ +/* Intel(R) OpenCL MMD Driver */ +/* */ +/* ===-------------------------------------------------------------------------=== */ +/* */ +/* This file declares the class to handle memory mapped IO over PCIe. 
*/ +/* The actual implementation of the class lives in the acl_pcie_mm_io.cpp, */ +/* */ +/* ===-------------------------------------------------------------------------=== */ + +#if defined(LINUX) +typedef int fpga_handle; +#define FPGA_OK 0 +#endif // LINUX + +#ifdef DLA_MMD +#include "acl_pcie.h" +#define ACL_DLA_CSR_OFFSET 0x0000 +#endif +/* + * + */ +class ACL_PCIE_MM_IO_DEVICE { + public: + ACL_PCIE_MM_IO_DEVICE(fpga_handle handle, DWORD bar, KPTR device_offset, const char *name, bool diff_endian = false); + ~ACL_PCIE_MM_IO_DEVICE(); + + DWORD bar_id() { return m_bar; }; + KPTR convert_to_bar_addr(size_t addr) { return addr + m_offset; }; + + // read/write functions to the memory-mapped io device + // return 0 on success, negative on error + int read8(size_t addr, UINT8 *data); + int write8(size_t addr, UINT8 data); + int read16(size_t addr, UINT16 *data); + int write16(size_t addr, UINT16 data); + int read32(size_t addr, UINT32 *data); + int write32(size_t addr, UINT32 data); + int read64(size_t addr, UINT64 *data); + int write64(size_t addr, UINT64 data); + + int read_block(size_t addr, size_t size, void *dst); + int write_block(size_t addr, size_t size, void *src); + + private: + static const int MAX_NAME_LENGTH = 32; + + // Helper functions + inline void *compute_address(void *base, uintptr_t offset); + + char m_name[MAX_NAME_LENGTH]; + fpga_handle m_handle; + DWORD m_bar; + KPTR m_offset; + bool m_diff_endian; // indicates if the host and this device have different endianess +}; + +/* + * Utility functions to clean up the various address translations for reads/writes + */ +class ACL_PCIE_MM_IO_MGR { + private: + ACL_PCIE_MM_IO_MGR &operator=(const ACL_PCIE_MM_IO_MGR &) { return *this; } + + ACL_PCIE_MM_IO_MGR(const ACL_PCIE_MM_IO_MGR &src) {} + + public: + ACL_PCIE_MM_IO_MGR(fpga_handle handle); + ~ACL_PCIE_MM_IO_MGR(); + + ACL_PCIE_MM_IO_DEVICE *mem; + ACL_PCIE_MM_IO_DEVICE *pcie_cra; + ACL_PCIE_MM_IO_DEVICE *dma; + ACL_PCIE_MM_IO_DEVICE *window; + 
ACL_PCIE_MM_IO_DEVICE *version; + ACL_PCIE_MM_IO_DEVICE *pr_base_id; + ACL_PCIE_MM_IO_DEVICE *pr_region_ctrl; + ACL_PCIE_MM_IO_DEVICE *quartus_ver; + ACL_PCIE_MM_IO_DEVICE *cade_id; + ACL_PCIE_MM_IO_DEVICE *uniphy_status; + ACL_PCIE_MM_IO_DEVICE *uniphy_reset; + ACL_PCIE_MM_IO_DEVICE *kernel_if; + ACL_PCIE_MM_IO_DEVICE *pll; + ACL_PCIE_MM_IO_DEVICE *temp_sensor; + ACL_PCIE_MM_IO_DEVICE *hostch_ver; +}; + +#endif // ACL_PCIE_MM_IO_H diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_timer.cpp b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_timer.cpp new file mode 100644 index 0000000..855d6ba --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_timer.cpp @@ -0,0 +1,67 @@ +// (c) 1992-2021 Intel Corporation. +// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words +// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. +// and/or other countries. Other marks and brands may be claimed as the property +// of others. See Trademarks on intel.com for full list of Intel trademarks or +// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) +// Your use of Intel Corporation's design tools, logic functions and other +// software and tools, and its AMPP partner logic functions, and any output +// files any of the foregoing (including device programming or simulation +// files), and any associated documentation or information are expressly subject +// to the terms and conditions of the Altera Program License Subscription +// Agreement, Intel MegaCore Function License Agreement, or other applicable +// license agreement, including, without limitation, that your use is for the +// sole purpose of programming logic devices manufactured by Intel and sold by +// Intel or its authorized distributors. Please refer to the applicable +// agreement for further details. 

/* ===- acl_pcie_timer.cpp ------------------------------------------- C++ -*-=== */
/*                                                                               */
/* Intel(R) OpenCL MMD Driver                                                    */
/*                                                                               */
/* ===-------------------------------------------------------------------------=== */
/*                                                                               */
/* This file implements the class to query the host's system timer.              */
/* The declaration of the class lives in the acl_pcie_timer.h                    */
/*                                                                               */
/* ===-------------------------------------------------------------------------=== */

// common and its own header files
#include "acl_pcie_timer.h"
#include "acl_pcie.h"

// other standard header files
#include <fstream>

// On Windows, caches the QueryPerformanceCounter tick rate needed to convert
// ticks to nanoseconds. On Linux no setup is needed (clock_gettime is used).
ACL_PCIE_TIMER::ACL_PCIE_TIMER() : m_ticks_per_second(0) {
#if defined(WINDOWS)
  // Cache the performance counter frequency
  LARGE_INTEGER li;
  QueryPerformanceFrequency(&li);
  m_ticks_per_second = li.QuadPart;

  ACL_PCIE_ASSERT(m_ticks_per_second != 0, "m_ticks_per_second == 0!\n");
#endif  // WINDOWS
}

ACL_PCIE_TIMER::~ACL_PCIE_TIMER() {}

// Return the current time in nanoseconds.
// NOTE(review): the epoch differs per platform — Windows uses
// QueryPerformanceCounter (arbitrary epoch, monotonic), Linux uses
// CLOCK_REALTIME (Unix epoch, subject to clock adjustments) — so values are
// only meaningful as deltas taken on the same host/platform.
cl_ulong ACL_PCIE_TIMER::get_time_ns() {
#if defined(WINDOWS)
  const INT64 NS_PER_S = 1000000000;
  LARGE_INTEGER li;

  QueryPerformanceCounter(&li);
  INT64 ticks = li.QuadPart;
  double seconds = ticks / (double)m_ticks_per_second;

  return static_cast<cl_ulong>(seconds * NS_PER_S + 0.5);
#endif  // WINDOWS
#if defined(LINUX)
  struct timespec a;
  const cl_ulong NS_PER_S = 1000000000;
  clock_gettime(CLOCK_REALTIME, &a);

  return static_cast<cl_ulong>(a.tv_nsec) + static_cast<cl_ulong>(a.tv_sec * NS_PER_S);
#endif  // LINUX
}
diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_timer.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_timer.h
new file mode 100644
index 0000000..646d681
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_timer.h
@@ -0,0 +1,50 @@
#ifndef ACL_PCIE_TIMER_H
#define ACL_PCIE_TIMER_H

/* (c) 1992-2021 Intel Corporation.
*/ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. */ + +/* ===- acl_pcie_timer.h --------------------------------------------- C++ -*-=== */ +/* */ +/* Intel(R) OpenCL MMD Driver */ +/* */ +/* ===-------------------------------------------------------------------------=== */ +/* */ +/* This file declares the class to query the host's system timer. 
*/ +/* The actual implementation of the class lives in the acl_pcie_timer.cpp */ +/* */ +/* ===-------------------------------------------------------------------------=== */ + +#ifdef DLA_MMD +// don't assume opencl has been installed +#include "acl_pcie.h" +typedef UINT64 cl_ulong; +#endif + +class ACL_PCIE_TIMER { + public: + ACL_PCIE_TIMER(); + ~ACL_PCIE_TIMER(); + + // function to query the host's system timer + cl_ulong get_time_ns(); + + private: + INT64 m_ticks_per_second; +}; + +#endif // ACL_PCIE_TIMER_H diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/version.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/version.h new file mode 100644 index 0000000..ffecc32 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/version.h @@ -0,0 +1 @@ +#define ACL_DRIVER_VERSION "20.4.d41d8cd98f00b204e9800998ecf8427e" diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/aocl_mmd.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/aocl_mmd.h new file mode 100644 index 0000000..6d5c85e --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/aocl_mmd.h @@ -0,0 +1,640 @@ +#ifndef AOCL_MMD_H +#define AOCL_MMD_H + +/* (c) 1992-2021 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. 
See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. */ + +#ifdef DLA_MMD +#include <cstddef> //size_t +#include <cstdint> //uint32_t +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* Support for memory mapped ACL devices. + * + * Typical API lifecycle, from the perspective of the caller. + * + * 1. aocl_mmd_open must be called first, to provide a handle for further + * operations. + * + * 2. The interrupt and status handlers must be set. + * + * 3. Read and write operations are performed. + * + * 4. aocl_mmd_close may be called to shut down the device. No further + * operations are permitted until a subsequent aocl_mmd_open call. + * + * aocl_mmd_get_offline_info can be called anytime including before + * open. aocl_mmd_get_info can be called anytime between open and close. 
+ */ + +#ifndef AOCL_MMD_CALL +#if defined(_WIN32) +#define AOCL_MMD_CALL __declspec(dllimport) +#else +#define AOCL_MMD_CALL __attribute__((visibility ("default"))) +#endif +#endif + +#ifndef WEAK +#if defined(_WIN32) +#define WEAK +#else +/* This normally comes with "__attribute__((weak))" but for reasons not presently + * understood, the shared library is not properly loaded on Ubuntu18 when the functions + * are weak. + */ +#define WEAK +#endif +#endif + +/* The MMD API's version - the runtime expects this string when + * AOCL_MMD_VERSION is queried. This changes only if the API has changed */ +#define AOCL_MMD_VERSION_STRING "20.3" + +/* Memory types that can be supported - bitfield. Other than physical memory + * these types closely align with the OpenCL SVM types. + * + * AOCL_MMD_PHYSICAL_MEMORY - The vendor interface includes IP to communicate + * directly with physical memory such as DDR, QDR, etc. + * + * AOCL_MMD_SVM_COARSE_GRAIN_BUFFER - The vendor interface includes support for + * caching SVM pointer data and requires explicit function calls from the user + * to synchronize the cache between the host processor and the FPGA. This level + * of SVM is not currently supported by Altera except as a subset of + * SVM_FINE_GAIN_SYSTEM support. + * + * AOCL_MMD_SVM_FINE_GRAIN_BUFFER - The vendor interface includes support for + * caching SVM pointer data and requires additional information from the user + * and/or host runtime that can be collected during pointer allocation in order + * to synchronize the cache between the host processor and the FPGA. Once this + * additional data is provided for an SVM pointer, the vendor interface handles + * cache synchronization between the host processor & the FPGA automatically. + * This level of SVM is not currently supported by Altera except as a subset + * of SVM_FINE_GRAIN_SYSTEM support. 
+ * + * AOCL_MMD_SVM_FINE_GRAIN_SYSTEM - The vendor interface includes support for + * caching SVM pointer data and does not require any additional information to + * synchronize the cache between the host processor and the FPGA. The vendor + * interface handles cache synchronization between the host processor & the + * FPGA automatically for all SVM pointers. This level of SVM support is + * currently under development by Altera and some features may not be fully + * supported. + */ +#define AOCL_MMD_PHYSICAL_MEMORY (1 << 0) +#define AOCL_MMD_SVM_COARSE_GRAIN_BUFFER (1 << 1) +#define AOCL_MMD_SVM_FINE_GRAIN_BUFFER (1 << 2) +#define AOCL_MMD_SVM_FINE_GRAIN_SYSTEM (1 << 3) + +/* program modes - bitfield + * + * AOCL_MMD_PROGRAM_PRESERVE_GLOBAL_MEM - preserve contents of global memory + * when this bit is set to 1. If programming can't occur without preserving + * global memory contents, the program function must fail, in which case the + * runtime may re-invoke program with this bit set to 0, allowing programming + * to occur even if doing so destroys global memory contents. + * + * more modes are reserved for stacking on in the future + */ +#define AOCL_MMD_PROGRAM_PRESERVE_GLOBAL_MEM (1 << 0) +typedef int aocl_mmd_program_mode_t; + +typedef void* aocl_mmd_op_t; + +typedef struct { + unsigned lo; /* 32 least significant bits of time value. */ + unsigned hi; /* 32 most significant bits of time value. */ +} aocl_mmd_timestamp_t; + +/* Defines the set of characteristics that can be probed about the board before + * opening a device. The type of data returned by each is specified in + * parentheses in the adjacent comment. + * + * AOCL_MMD_NUM_BOARDS and AOCL_MMD_BOARD_NAMES + * These two fields can be used to implement multi-device support. The MMD + * layer may have a list of devices it is capable of interacting with, each + * identified with a unique name. 
The length of the list should be returned + * in AOCL_MMD_NUM_BOARDS, and the names of these devices returned in + * AOCL_MMD_BOARD_NAMES. The OpenCL runtime will try to call aocl_mmd_open + * for each board name returned in AOCL_MMD_BOARD_NAMES. + */ +typedef enum { + AOCL_MMD_VERSION = 0, /* Version of MMD (char*)*/ + AOCL_MMD_NUM_BOARDS = 1, /* Number of candidate boards (int)*/ + AOCL_MMD_BOARD_NAMES = 2, /* Names of boards available delimiter=; (char*)*/ + AOCL_MMD_VENDOR_NAME = 3, /* Name of vendor (char*) */ + AOCL_MMD_VENDOR_ID = 4, /* An integer ID for the vendor (int) */ + AOCL_MMD_USES_YIELD = 5, /* 1 if yield must be called to poll hw (int) */ + /* The following can be combined in a bit field: + * AOCL_MMD_PHYSICAL_MEMORY, AOCL_MMD_SVM_COARSE_GRAIN_BUFFER, AOCL_MMD_SVM_FINE_GRAIN_BUFFER, + * AOCL_MMD_SVM_FINE_GRAIN_SYSTEM. Prior to 14.1, all existing devices supported physical memory and no types of SVM + * memory, so this is the default when this operation returns '0' for board MMDs with a version prior to 14.1 + */ + AOCL_MMD_MEM_TYPES_SUPPORTED = 6, +} aocl_mmd_offline_info_t; + +/** Possible capabilities to return from AOCL_MMD_*_MEM_CAPABILITIES query */ +/** + * If not set allocation function is not supported, even if other capabilities are set. + */ +#define AOCL_MMD_MEM_CAPABILITY_SUPPORTED (1 << 0) +/** + * Supports atomic access to the memory by either the host or device. + */ +#define AOCL_MMD_MEM_CAPABILITY_ATOMIC (1 << 1) +/** + * Supports concurrent access to the memory either by host or device if the + * accesses are not on the same block. Block granularity is defined by + * AOCL_MMD_*_MEM_CONCURRENT_GRANULARITY., blocks are aligned to this + * granularity + */ +#define AOCL_MMD_MEM_CAPABILITY_CONCURRENT (1 << 2) +/** + * Memory can be accessed by multiple devices at the same time. + */ +#define AOCL_MMD_MEM_CAPABILITY_P2P (1 << 3) + +/* Defines the set of characteristics that can be probed about the board after + * opening a device. 
This can involve communication to the device + * + * AOCL_MMD_NUM_KERNEL_INTERFACES - The number of kernel interfaces, usually 1 + * + * AOCL_MMD_KERNEL_INTERFACES - the handle for each kernel interface. + * param_value will have size AOCL_MMD_NUM_KERNEL_INTERFACES * sizeof int + * + * AOCL_MMD_PLL_INTERFACES - the handle for each pll associated with each + * kernel interface. If a kernel interface is not clocked by acl_kernel_clk + * then return -1 + * + * */ +typedef enum { + AOCL_MMD_NUM_KERNEL_INTERFACES = 1, /* Number of Kernel interfaces (int) */ + AOCL_MMD_KERNEL_INTERFACES = 2, /* Kernel interface (int*) */ + AOCL_MMD_PLL_INTERFACES = 3, /* Kernel clk handles (int*) */ + AOCL_MMD_MEMORY_INTERFACE = 4, /* Global memory handle (int) */ + AOCL_MMD_TEMPERATURE = 5, /* Temperature measurement (float) */ + AOCL_MMD_PCIE_INFO = 6, /* PCIe information (char*) */ + AOCL_MMD_BOARD_NAME = 7, /* Name of board (char*) */ + AOCL_MMD_BOARD_UNIQUE_ID = 8, /* Unique ID of board (int) */ + AOCL_MMD_CONCURRENT_READS = 9, /* # of parallel reads; 1 is serial*/ + AOCL_MMD_CONCURRENT_WRITES = 10, /* # of parallel writes; 1 is serial*/ + AOCL_MMD_CONCURRENT_READS_OR_WRITES = 11, /* total # of concurrent operations read + writes*/ + AOCL_MMD_MIN_HOST_MEMORY_ALIGNMENT = 12, /* Min alignment that the BSP supports for host allocations (size_t) */ + AOCL_MMD_HOST_MEM_CAPABILITIES = 13, /* Capabilities of aocl_mmd_host_alloc() (unsigned int)*/ + AOCL_MMD_SHARED_MEM_CAPABILITIES = 14, /* Capabilities of aocl_mmd_shared_alloc (unsigned int)*/ + AOCL_MMD_DEVICE_MEM_CAPABILITIES = 15, /* Capabilities of aocl_mmd_device_alloc (unsigned int)*/ + AOCL_MMD_HOST_MEM_CONCURRENT_GRANULARITY = 16, /*(size_t)*/ + AOCL_MMD_SHARED_MEM_CONCURRENT_GRANULARITY = 17, /*(size_t)*/ + AOCL_MMD_DEVICE_MEM_CONCURRENT_GRANULARITY = 18, /*(size_t)*/ +} aocl_mmd_info_t; + +typedef struct { + unsigned long long int exception_type; + void* user_private_info; + size_t user_cb; +} aocl_mmd_interrupt_info; + +typedef 
void (*aocl_mmd_interrupt_handler_fn)(int handle, void* user_data); +typedef void (*aocl_mmd_device_interrupt_handler_fn)(int handle, aocl_mmd_interrupt_info* data_in, void* user_data); +typedef void (*aocl_mmd_status_handler_fn)(int handle, void* user_data, aocl_mmd_op_t op, int status); + +/* Get information about the board using the enum aocl_mmd_offline_info_t for + * offline info (called without a handle), and the enum aocl_mmd_info_t for + * info specific to a certain board. + * Arguments: + * + * requested_info_id - a value from the aocl_mmd_offline_info_t enum + * + * param_value_size - size of the param_value field in bytes. This should + * match the size of the return type expected as indicated in the enum + * definition. For example, the AOCL_MMD_TEMPERATURE returns a float, so + * the param_value_size should be set to sizeof(float) and you should + * expect the same number of bytes returned in param_size_ret. + * + * param_value - pointer to the variable that will receive the returned info + * + * param_size_ret - receives the number of bytes of data actually returned + * + * Returns: a negative value to indicate error. + */ +AOCL_MMD_CALL int aocl_mmd_get_offline_info(aocl_mmd_offline_info_t requested_info_id, + size_t param_value_size, + void* param_value, + size_t* param_size_ret) WEAK; + +AOCL_MMD_CALL int aocl_mmd_get_info(int handle, + aocl_mmd_info_t requested_info_id, + size_t param_value_size, + void* param_value, + size_t* param_size_ret) WEAK; + +/* Open and initialize the named device. + * + * The name is typically one specified by the AOCL_MMD_BOARD_NAMES offline + * info. + * + * Arguments: + * name - open the board with this name (provided as a C-style string, + * i.e. NUL terminated ASCII.) + * + * Returns: the non-negative integer handle for the board, otherwise a + * negative value to indicate error. 
Upon receiving the error, the OpenCL + * runtime will proceed to open other known devices, hence the MMD mustn't + * exit the application if an open call fails. + */ +AOCL_MMD_CALL int aocl_mmd_open(const char* name) WEAK; + +/* Close an opened device, by its handle. + * Returns: 0 on success, negative values on error. + */ +AOCL_MMD_CALL int aocl_mmd_close(int handle) WEAK; + +/* Set the interrupt handler for the opened device. + * The interrupt handler is called whenever the client needs to be notified + * of an asynchronous event signaled by the device internals. + * For example, the kernel has completed or is stalled. + * + * Important: Interrupts from the kernel must be ignored until this handler is + * set + * + * Arguments: + * fn - the callback function to invoke when a kernel interrupt occurs + * user_data - the data that should be passed to fn when it is called. + * + * Returns: 0 if successful, negative on error + */ +AOCL_MMD_CALL int aocl_mmd_set_interrupt_handler(int handle, aocl_mmd_interrupt_handler_fn fn, void* user_data) WEAK; + +/* Set the device interrupt handler for the opened device. + * The device interrupt handler is called whenever the client needs to be notified + * of a device event signaled by the device internals. + * For example, an ECC error has been reported. + * + * Important: Interrupts from the device must be ignored until this handler is + * set + * + * Arguments: + * fn - the callback function to invoke when a device interrupt occurs + * user_data - the data that should be passed to fn when it is called. + * + * Returns: 0 if successful, negative on error + */ +AOCL_MMD_CALL int aocl_mmd_set_device_interrupt_handler(int handle, + aocl_mmd_device_interrupt_handler_fn fn, + void* user_data) WEAK; + +/* Set the operation status handler for the opened device. + * The operation status handler is called with + * status 0 when the operation has completed successfully. + * status negative when the operation completed with errors. 
+ * + * Arguments: + * fn - the callback function to invoke when a status update is to be + * performed. + * user_data - the data that should be passed to fn when it is called. + * + * Returns: 0 if successful, negative on error + */ +AOCL_MMD_CALL int aocl_mmd_set_status_handler(int handle, aocl_mmd_status_handler_fn fn, void* user_data) WEAK; + +/* If AOCL_MMD_USES_YIELD is 1, this function is called when the host is idle + * and hence possibly waiting for events to be processed by the device. + * If AOCL_MMD_USES_YIELD is 0, this function is never called and the MMD is + * assumed to provide status/event updates via some other execution thread + * such as through an interrupt handler. + * + * Returns: non-zero if the yield function performed useful work such as + * processing DMA transactions, 0 if there is no useful work to be performed + * + * NOTE: yield may be called continuously as long as it reports that it has useful work + */ +AOCL_MMD_CALL int aocl_mmd_yield(int handle) WEAK; + +/* Read, write and copy operations on a single interface. + * If op is NULL + * - Then these calls must block until the operation is complete. + * - The status handler is not called for this operation. + * + * If op is non-NULL, then: + * - These may be non-blocking calls + * - The status handler must be called upon completion, with status 0 + * for success, and a negative value for failure. + * + * Arguments: + * op - the operation object used to track this operations progress + * + * len - the size in bytes to transfer + * + * src - the host buffer being read from + * + * dst - the host buffer being written to + * + * mmd_interface - the handle to the interface being accessed. E.g. To + * access global memory this handle will be whatever is returned by + * aocl_mmd_get_info when called with AOCL_MMD_MEMORY_INTERFACE. + * + * offset/src_offset/dst_offset - the byte offset within the interface that + * the transfer will begin at. 
+ * + * The return value is 0 if the operation launch was successful, and + * negative otherwise. + */ +AOCL_MMD_CALL int aocl_mmd_read( + int handle, aocl_mmd_op_t op, size_t len, void* dst, int mmd_interface, size_t offset) WEAK; +AOCL_MMD_CALL int aocl_mmd_write( + int handle, aocl_mmd_op_t op, size_t len, const void* src, int mmd_interface, size_t offset) WEAK; +AOCL_MMD_CALL int aocl_mmd_copy( + int handle, aocl_mmd_op_t op, size_t len, int mmd_interface, size_t src_offset, size_t dst_offset) WEAK; + +/* Host Channel create operation + * Opens channel between host and kernel. + * + * Arguments: + * channel_name - name of channel to initialize. Same name as used in board_spec.xml + * + * queue_depth - the size in bytes of pinned memory queue in system memory + * + * direction - the direction of the channel + * + * The return value is negative if initialization was unsuccessful, and + * positive otherwise. Positive return value is handle to the channel to be used for + * subsequent calls for the channel. + */ +AOCL_MMD_CALL int aocl_mmd_hostchannel_create(int handle, char* channel_name, size_t queue_depth, int direction) WEAK; + +/* Host Channel destroy operation + * Closes channel between host and kernel. + * + * Arguments: + * channel - the handle to the channel to close, that was obtained with + * create channel + * + * The return value is 0 if the destroy was successful, and negative + * otherwise. + */ +AOCL_MMD_CALL int aocl_mmd_hostchannel_destroy(int handle, int channel) WEAK; + +/* Host Channel get buffer operation + * Provide host with pointer to buffer they can access to write or + * read from kernel, along with space or data available in the buffer + * in bytes. 
+ * + * Arguments: + * channel - the handle to the channel to get the buffer for + * + * buffer_size - the address that this call will write the amount of + * space or data that's available in the buffer, + * depending on direction of the channel, in bytes + * + * status - the address that this call will write to for result of this + * call. Value will be 0 for success, and negative otherwise + * + * The return value is the pointer to the buffer that host can write + * to or read from. NULL if the status is negative. + */ +AOCL_MMD_CALL void* aocl_mmd_hostchannel_get_buffer(int handle, int channel, size_t* buffer_size, int* status) WEAK; + +/* Host Channel acknowledge buffer operation + * Acknowledge to the channel that the user has written or read data from + * it. This will make the data or additional buffer space available to + * write to or read from kernel. + * + * Arguments: + * channel - the handle to the channel that user is acknowledging + * + * send_size - the size in bytes that the user is acknowledging + * + * status - the address that this call will write to for result of this + * call. Value will be 0 for success, and negative otherwise + * + * The return value is equal to send_size if send_size was less than or + * equal to the buffer_size from get buffer call. If send_size was + * greater, then return value is the amount that was actually sent. + */ +AOCL_MMD_CALL size_t aocl_mmd_hostchannel_ack_buffer(int handle, int channel, size_t send_size, int* status) WEAK; + +/* Program the device + * + * The host will guarantee that no operations are currently executing on the + * device. That means the kernels will be idle and no read/write/copy + * commands are active. Interrupts should be disabled and the FPGA should + * be reprogrammed with the data from user_data which has size size. The host + * will then call aocl_mmd_set_status_handler and aocl_mmd_set_interrupt_handler + * again. At this point interrupts can be enabled. 
+ * + * The new handle to the board after reprogram does not have to be the same as + * the one before. + * + * Arguments: + * user_data - The binary contents of the fpga.bin file created during + * Quartus II compilation. + * size - the size in bytes of user_data + * program_mode - bit field for programming attributes. See + * aocl_mmd_program_mode_t definition + * + * Returns: the new non-negative integer handle for the board, otherwise a + * negative value to indicate error. + */ + +#ifdef DLA_MMD +AOCL_MMD_CALL int aocl_mmd_save_pcie(int handle) WEAK; +AOCL_MMD_CALL int aocl_mmd_restore_pcie(int handle) WEAK; +// CoreDLA BSP has removed some stuff that MMD tries to handshake with, so provide a "raw access" function to +// reprogram the FPGA directly from the sof. Can't call quartus_pgm directly since the MMD still needs to mask +// the PCIe surprise down error (when full-chip programming the FPGA, the CPU thinks a PCIe device has disappeared). +// BEWARE: reprogramming will invalidate the handle +AOCL_MMD_CALL int aocl_mmd_program_sof(int handle, const char* sof_filename, const bool skipSaveRestore = false) WEAK; +#else +AOCL_MMD_CALL int aocl_mmd_program(int handle, void* user_data, size_t size, aocl_mmd_program_mode_t program_mode) WEAK; +#endif + +/** Error values*/ +#define AOCL_MMD_ERROR_SUCCESS 0 +#define AOCL_MMD_ERROR_INVALID_HANDLE -1 +#define AOCL_MMD_ERROR_OUT_OF_MEMORY -2 +#define AOCL_MMD_ERROR_UNSUPPORTED_ALIGNMENT -3 +#define AOCL_MMD_ERROR_UNSUPPORTED_PROPERTY -4 +#define AOCL_MMD_ERROR_INVALID_POINTER -5 +#define AOCL_MMD_ERROR_INVALID_MIGRATION_SIZE -6 + +/** Memory properties*/ +typedef enum { + /** + * Specifies the name of a global memory that can be found in the + * board_spec.xml file for the BSP. Allocations will be allocated to this + * global memory interface. 
+ */ + AOCL_MMD_MEM_PROPERTIES_GLOBAL_MEMORY = 1, + /** + * Specifies the index of a bank inside the global memory interface that can be found in + * the board_spec.xml file for the BSP. Allocations will be allocated to this + * memory bank. It is invalid to specify this property without also specifying + * AOCL_MMD_GLOBAL_MEMORY_INTERFACE. + */ + AOCL_MMD_MEM_PROPERTIES_MEMORY_BANK +} aocl_mmd_mem_properties_t; + +/** + * Host allocations provide memory that is allocated on the host. Host + * allocations are accessible by the host and one or more devices. + * The same pointer to a host allocation may be used on the host and all + * supported devices; they have address equivalence. This memory must be + * deallocated with aocl_mmd_free(); + * + * Once the device has signaled completion through + * aocl_mmd_interrupt_handler_fn() the host can assume it has access to the + * latest contents of the memory, allocated by this call. + * + * @param handles Handles for devices that will need access to this memory + * @param num_devices Number of devices in the handles + * @param size The size of the memory region + * @param alignment The alignment in bytes of the allocation + * @param properties Specifies additional information about the allocated + * memory, described by a property type name and its corresponding value. + * Each property type name is immediately followed by the corresponding + * desired value. The list is terminated with 0. Supported values are + * described above. Example: [<property1>, <value1>, <property2>, <value2>, 0] + * @param error The error code defined by AOCL_MMD_ERROR* + * @return valid pointer, on error NULL + */ +AOCL_MMD_CALL void* aocl_mmd_host_alloc(int* handles, + size_t num_devices, + size_t size, + size_t alignment, + aocl_mmd_mem_properties_t* properties, + int* error) WEAK; + +/** + * Frees memory that has been allocated by MMD + * + * @param mem The pointer to the memory region. Must be a pointer that is + * allocated by the MMD. 
+ * @return AOCL_MMD_ERROR_SUCCESS if success, else error code + */ +AOCL_MMD_CALL int aocl_mmd_free(void* mem) WEAK; + +/** + * Allocate memory that is owned by the device. This pointer can only be + * accessed by the kernel; can't be accessed by the host. The host is able to + * manipulate the pointer (e.g. increment it) just not access the underlying + * data. This memory must be deallocated by aocl_mmd_free(); + * + * @param handle Device that will have access to this memory + * @param size The size of the memory region + * @param alignment The alignment in bytes of the memory region + * @param properties Specifies additional information about the allocated + * memory, described by a property type name and its corresponding value. + * Each property type name is immediately followed by the corresponding + * desired value. The list is terminated with 0. Supported values are + * described above. Example: [<property1>, <value1>, <property2>, <value2>, 0] + * @param error The error code defined by AOCL_MMD_ERROR* + * @return Pointer that can be passed into the kernel. NULL on failure. + */ +AOCL_MMD_CALL void* aocl_mmd_device_alloc( + int handle, size_t size, size_t alignment, aocl_mmd_mem_properties_t* properties, int* error) WEAK; + +/** + * Shared allocations may migrate between the host and one or more associated + * device. The same pointer to a shared allocation may be used on the host and + * the supported device; they have address equivalence. + * + * If the device does not support concurrent access to memory allocated by + * aocl_mmd_shared_alloc() then a call must be made to + * aocl_mmd_shared_mem_migrate() to indicate that the shared allocation should + * be migrated to the device before the device accesses this memory. For + * example, a call to aocl_mmd_shared_mem_migrate() should be made before a + * kernel accessing this memory is launched). 
Conversely, + * aocl_mmd_shared_mem_migrate() should be called again to indicate that the + * shared allocation should be migrated to the host before the host accesses + * this memory again. If the device supports concurrent access to memory + * allocated with aocl_mmd_shared_alloc(), then the call to + * aocl_mmd_shared_mem_migrate() is not necessary, but may still be made. In + * the case of concurrent access, it is the responsibility of the MMD to ensure + * both the device and host can access aocl_mmd_shared_alloc() allocations at + * all times. + * + * Memory allocated by aocl_mmd_shared_alloc() must be deallocated with + * aocl_mmd_free(). + * + * @param handle Device that will have access to this memory + * @param size The size of the memory region + * @param alignment The alignment in bytes of the memory region + * @param properties Specifies additional information about the allocated + * memory, described by a property type name and its corresponding value. + * Each property type name is immediately followed by the corresponding + * desired value. The list is terminated with 0. Supported properties are + * listed above and have the prefix AOCL_MMD_MEM_PROPERTIES_. + * Example: [<property1>, <value1>, <property2>, <value2>, 0] + * @param error The error code defined by AOCL_MMD_ERROR* + * @return valid pointer, on error NULL + */ +AOCL_MMD_CALL void* aocl_mmd_shared_alloc( + int handle, size_t size, size_t alignment, aocl_mmd_mem_properties_t* properties, int* error) WEAK; + +typedef enum { AOCL_MMD_MIGRATE_TO_HOST = 0, AOCL_MMD_MIGRATE_TO_DEVICE = 1 } aocl_mmd_migrate_t; + +/** + * A call to aocl_mmd_shared_migrate() must be made for non-concurrent shared + * allocations any time the accessor of the allocation changes. For example, + * aocl_mmd_shared_migrate() should be called indicating that the allocation + * should be migrated to the device before a kernel accessing the allocation + * is launched on the device. 
Similarly, aocl_mmd_shared_migrate() should be + * called indicating that the allocation is migrated to the host before the + * host accesses the memory after kernel completion. + * + * For concurrent allocations this call may be used as a performance hint, but + * is not strictly required for functionality. + * + * @param handle Device that will have access to this memory + * @param shared_ptr Pointer allocated by aocl_mmd_shared_alloc() + * @param size In bytes, the size of the migration. Must be of multiple of a + * page boundary that the BSP supports. + * @param destination The destination of migration + * @return The error code defined by AOCL_MMD_ERROR* + */ +AOCL_MMD_CALL int aocl_mmd_shared_migrate(int handle, + void* shared_ptr, + size_t size, + aocl_mmd_migrate_t destination) WEAK; + +// CoreDLA modifications +// To support multiple different FPGA boards, anything board specific must be implemented in a +// board-specific MMD instead of the CoreDLA runtime layer. +#ifdef DLA_MMD +// Query functions to get board-specific values +AOCL_MMD_CALL int dla_mmd_get_max_num_instances() WEAK; +AOCL_MMD_CALL uint64_t dla_mmd_get_ddr_size_per_instance() WEAK; +AOCL_MMD_CALL double dla_mmd_get_ddr_clock_freq() WEAK; + +// Wrappers around CSR and DDR reads and writes to abstract away board-specific offsets +AOCL_MMD_CALL int dla_mmd_csr_write(int handle, int instance, uint64_t addr, const uint32_t* data) WEAK; +AOCL_MMD_CALL int dla_mmd_csr_read(int handle, int instance, uint64_t addr, uint32_t* data) WEAK; +AOCL_MMD_CALL int dla_mmd_ddr_write(int handle, int instance, uint64_t addr, uint64_t length, const void* data) WEAK; +AOCL_MMD_CALL int dla_mmd_ddr_read(int handle, int instance, uint64_t addr, uint64_t length, void* data) WEAK; + +// Get the PLL clock frequency in MHz, returns a negative value if there is an error +AOCL_MMD_CALL double dla_mmd_get_coredla_clock_freq(int handle) WEAK; +#endif + +#ifdef __cplusplus +} +#endif + +#endif diff --git 
a/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/access.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/access.h new file mode 100644 index 0000000..dc3eae2 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/access.h @@ -0,0 +1,100 @@ +// Copyright(c) 2017 - 2019, Intel Corporation +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Intel Corporation nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. 
+ +/** + * @file access.h + * @brief Functions to acquire, release, and reset OPAE FPGA resources + */ + +#ifndef __FPGA_ACCESS_H__ +#define __FPGA_ACCESS_H__ + +#include <opae/types.h> + +BEGIN_C_DECL + +/** + * Open an FPGA object + * + * Acquires ownership of the FPGA resource referred to by 'token'. + * + * Most often this will be used to open an accelerator object to directly interact + * with an accelerator function, or to open an FPGA object to perform + * management functions. + * + * @param[in] token Pointer to token identifying resource to acquire + * ownership of + * @param[out] handle Pointer to preallocated memory to place a handle in. + * This handle will be used in subsequent API calls. + * @param[in] flags One of the following flags: + * * FPGA_OPEN_SHARED allows the resource to be opened + * multiple times (not supported in ASE) + * @returns FPGA_OK on success. FPGA_NOT_FOUND if the resource for + * 'token' could not be found. FPGA_INVALID_PARAM if + * 'token' does not refer to a resource that can be + * opened, or if either argument is NULL or invalid. + * FPGA_EXCEPTION if an internal exception occurred while + * creating the handle. FPGA_NO_DRIVER if the driver is + * not loaded. FPGA_BUSY if trying to open a resource that + * has already been opened in exclusive mode. + * FPGA_NO_ACCESS if the current process' privileges are + * not sufficient to open the resource. + */ + __FPGA_API__ fpga_result fpgaOpen(fpga_token token, fpga_handle *handle, + int flags); + +/** + * Close a previously opened FPGA object + * + * Relinquishes ownership of a previously fpgaOpen()ed resource. This enables + * others to acquire ownership if the resource was opened exclusively. + * Also deallocates / unmaps MMIO and UMsg memory areas. + * + * @param[in] handle Handle to previously opened FPGA object + * @returns FPGA_OK on success. FPGA_INVALID_PARAM if handle does + * not refer to an acquired resource, or if handle is NULL. 
+ * FPGA_EXCEPTION if an internal error occurred while + * accessing the handle. + */ +__FPGA_API__ fpga_result fpgaClose(fpga_handle handle); + +/** + * Reset an FPGA object + * + * Performs an accelerator reset. + * + * @param[in] handle Handle to previously opened FPGA object + * @returns FPGA_OK on success. FPGA_INVALID_PARAM if handle does + * not refer to an acquired resource or to a resource that + * cannot be reset. FPGA_EXCEPTION if an internal error + * occurred while trying to access the handle or resetting + * the resource. + */ +__FPGA_API__ fpga_result fpgaReset(fpga_handle handle); + +END_C_DECL + +#endif // __FPGA_ACCESS_H__ diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/buffer.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/buffer.h new file mode 100644 index 0000000..e848182 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/buffer.h @@ -0,0 +1,154 @@ +// Copyright(c) 2017 - 2019, Intel Corporation +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Intel Corporation nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+
+/**
+ * @file buffer.h
+ * @brief Functions for allocating and sharing system memory with an FPGA
+ * accelerator
+ *
+ * To share memory between a software application and an FPGA accelerator,
+ * these functions set up system components (e.g. an IOMMU) to allow
+ * accelerator access to a provided memory region.
+ *
+ * There are a number of restrictions on what memory can be shared, depending
+ * on platform capabilities. Usually, FPGA accelerators do not have access to
+ * virtual address mappings of the CPU, so they can only access physical
+ * addresses. To support this, the OPAE C library on Linux uses hugepages to
+ * allocate large, contiguous pages of physical memory that can be shared with
+ * an accelerator. It also supports sharing memory that has already been
+ * allocated by an application, as long as that memory satisfies the
+ * requirements of being physically contiguous and page-aligned.
+ */
+
+#ifndef __FPGA_BUFFER_H__
+#define __FPGA_BUFFER_H__
+
+#include <opae/types.h>
+
+BEGIN_C_DECL
+
+/**
+ * Prepare a shared memory buffer
+ *
+ * Prepares a memory buffer for shared access between an accelerator and the calling
+ * process. This may either include allocation of physical memory, or
+ * preparation of already allocated memory for sharing. The latter case is
+ * indicated by supplying the FPGA_BUF_PREALLOCATED flag.
+ * + * This function will ask the driver to pin the indicated memory (make it + * non-swappable), and program the IOMMU to allow access from the accelerator. If the + * buffer was not pre-allocated (flag FPGA_BUF_PREALLOCATED), the function + * will also allocate physical memory of the requested size and map the + * memory into the caller's process' virtual address space. It returns in + * 'wsid' an fpga_buffer object that can be used to program address registers + * in the accelerator for shared access to the memory. + * + * When using FPGA_BUF_PREALLOCATED, the input len must be a non-zero multiple + * of the page size, else the function returns FPGA_INVALID_PARAM. When not + * using FPGA_BUF_PREALLOCATED, the input len is rounded up to the nearest + * multiple of page size. + * + * @param[in] handle Handle to previously opened accelerator resource + * @param[in] len Length of the buffer to allocate/prepare in bytes + * @param[inout] buf_addr Virtual address of buffer. Contents may be NULL (OS + * will choose mapping) or non-NULL (OS will take + * contents as a hint for the virtual address). + * @param[out] wsid Handle to the allocated/prepared buffer to be used + * with other functions + * @param[in] flags Flags. FPGA_BUF_PREALLOCATED indicates that memory + * pointed at in '*buf_addr' is already allocated an + * mapped into virtual memory. + * @returns FPGA_OK on success. FPGA_NO_MEMORY if the requested memory could + * not be allocated. FPGA_INVALID_PARAM if invalid parameters were provided, or + * if the parameter combination is not valid. FPGA_EXCEPTION if an internal + * exception occurred while trying to access the handle. + */ +__FPGA_API__ fpga_result fpgaPrepareBuffer(fpga_handle handle, + uint64_t len, + void **buf_addr, uint64_t *wsid, int flags); + +/** + * Release a shared memory buffer + * + * Releases a previously prepared shared buffer. 
If the buffer was allocated
+ * using fpgaPrepareBuffer (FPGA_BUF_PREALLOCATED was not specified), this call
+ * will deallocate/free that memory. Otherwise, it will only be returned to
+ * its previous state (pinned/unpinned, cached/non-cached).
+ *
+ * @param[in] handle Handle to previously opened accelerator resource
+ * @param[in] wsid Handle to the allocated/prepared buffer
+ * @returns FPGA_OK on success. FPGA_INVALID_PARAM if invalid parameters were
+ * provided, or if the parameter combination is not valid. FPGA_EXCEPTION if an
+ * internal exception occurred while trying to access the handle.
+ */
+__FPGA_API__ fpga_result fpgaReleaseBuffer(fpga_handle handle, uint64_t wsid);
+
+/**
+ * Retrieve base IO address for buffer
+ *
+ * This function is used to acquire the physical base address (on some platforms
+ * called IO Virtual Address or IOVA) for a shared buffer identified by wsid.
+ *
+ * @param[in] handle Handle to previously opened accelerator resource
+ * @param[in] wsid Buffer handle / workspace ID referring to the buffer for
+ * which the IO address is requested
+ * @param[out] ioaddr Pointer to memory where the IO address will be returned
+ * @returns FPGA_OK on success. FPGA_INVALID_PARAM if invalid parameters were
+ * provided, or if the parameter combination is not valid. FPGA_EXCEPTION if an
+ * internal exception occurred while trying to access the handle.
+ * FPGA_NOT_FOUND if `wsid` does not refer to a previously shared buffer.
+ */
+__FPGA_API__ fpga_result fpgaGetIOAddress(fpga_handle handle, uint64_t wsid,
+ uint64_t *ioaddr);
+
+/**
+ * Retrieve physical address for buffer
+ *
+ * This function is used to acquire the physical addresses in a scatter gather
+ * list form for a shared buffer identified by wsid.
+ * + * @param[in] handle Handle to previously opened accelerator resource + * @param[in] wsid Buffer handle / workspace ID referring to the buffer for + * which the physical address is requested + * @param[out] num_pages Number of physical pages + * @param[out] sglist SG list structure where physical addresses of pages and + * number of bytes in that page used will be returned. + * + * Note: Call this API with sg_list as NULL to update num_pages. Allocate upto + * (num_pages * sg_list) memory and call the API again with a pointer to this + * memory location as the last argument to retrieve the sg_list struct. + * + * @returns FPGA_OK on success. FPGA_INVALID_PARAM if invalid parameters were + * provided, or if the parameter combination is not valid. FPGA_EXCEPTION if an + * internal exception occurred while trying to access the handle. + * FPGA_NOT_FOUND if `wsid` does not refer to a previously shared buffer. + */ +__FPGA_API__ fpga_result fpgaGetPhysicalAddress(fpga_handle handle, uint64_t wsid, uint64_t *num_pages, + void *sglist); + +END_C_DECL + +#endif // __FPGA_BUFFER_H__ diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/dma.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/dma.h new file mode 100644 index 0000000..8febd44 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/dma.h @@ -0,0 +1,144 @@ +// Copyright(c) 2017 - 2019, Intel Corporation +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// * Neither the name of Intel Corporation nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. + +/** + * @file dma.h + * @brief Functions to acquire, release, and reset OPAE FPGA DMA resources + */ + +#ifndef __DMA_ACCESS_H__ +#define __DMA_ACCESS_H__ + +#include <opae/types.h> + +BEGIN_C_DECL + +/* +* The DMA driver supports host to FPGA, FPGA to host +* and FPGA to FPGA transfers. The FPGA interface can +* be streaming or memory-mapped. Streaming interfaces +* are not currently +* supported. +*/ +typedef enum { + HOST_TO_FPGA_MM = 0, + FPGA_TO_HOST_MM, + FPGA_TO_FPGA_MM, + FPGA_MAX_TRANSFER_TYPE, +}fpga_dma_transfer; + + +typedef enum +{ + DMA_OPEN = 1, + DMA_BUSY, + DMA_CLOSED +}fpga_dma_status; + +/* + * Dma handle in user space that will be populated during fpgaDmaOpen call. 
+ */ +typedef struct _fpga_dma_handle +{ + // + // Stores the handle to the fpga that was opened after fpgaOpen + // + fpga_handle fpga_h; + + // + // Stores the current status of the DMA AFC + // Set to the following values: + // DMA_OPEN - After call to fpgaDmaOpen() and when fpgaDmaTransferSync() exits + // DMA_BUSY - When fpgaDmaTransferSync() is called + // + uint64_t dma_status; +}dma_handle, *fpga_dma_handle; + + + +/** +* +* Opens a handle to DMA +* Sets the status of DMA engine to DMA_OPEN +* @param[in] handle Handle to previously opened FPGA object +* @param[in] dma_h DMA handle allocated by the user +* @returns FPGA_OK on success. FPGA_INVALID_PARAM if handle does +* not refer to an acquired resource. +* +*/ +__FPGA_API__ +fpga_result +fpgaDmaOpen( + fpga_handle handle, + fpga_dma_handle *dma_h +); + +/** +* +* Closes a handle to DMA +* Sets the status of DMA engine to DMA_CLOSED +* @param[in] handle Handle to previously opened FPGA object +* @param[in] dma_h DMA handle allocated by the user +* @returns FPGA_OK on success. FPGA_INVALID_PARAM if handle does +* not refer to an acquired resource. +* +*/ +__FPGA_API__ +fpga_result +fpgaDmaClose( + fpga_dma_handle dma_h +); + + +/** +* +* Performs a synchronous DMA transfer between FPGA and host memory. +* +* @param[in] handle Handle to previously opened FPGA object +* @param[in] dst Destination address for the data transfer +* @param[in] src Source address for the data transfer +* @param[in] count Length of data to be transferred from src to dst +* @param[in] flag Flag to indicate nature of data transfer. Flag types = + HOST_TO_FPGA_MM and FPGA_TO_HOST_MM. +* @returns FPGA_OK on success. FPGA_INVALID_PARAM if handle does +* not refer to an acquired resource or to a resoure that +* cannot be reset. FPGA_EXCEPTION if an internal error +* occurred while trying to access the handle or resetting +* the resource. 
+*/ +__FPGA_API__ +fpga_result +fpgaDmaTransferSync( + fpga_dma_handle handle, + ULONG64 dst, + ULONG64 src, + ULONG64 count, + ULONG64 flag +); + +END_C_DECL + +#endif // __DMA_ACCESS_H__ diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/enum.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/enum.h new file mode 100644 index 0000000..ee3349b --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/enum.h @@ -0,0 +1,129 @@ +// Copyright(c) 2017 - 2019, Intel Corporation +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Intel Corporation nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. + +/** + * @file enum.h + * @brief APIs for resource enumeration and managing tokens + * + * These APIs are the first step for any application using OPAE to discover + * resources that are present on the system. They allow selective enumeration + * (i.e. getting a list of resources that match a given list of criteria) and + * methods to manage the lifecycle of tokens generated by fpgaEnumerate(). + */ + +#ifndef __FPGA_ENUM_H__ +#define __FPGA_ENUM_H__ + +#include <opae/types.h> + +BEGIN_C_DECL + +/** + * Enumerate FPGA resources present in the system + * + * This call allows the user to query the system for FPGA resources that match + * a certain set of criteria, e.g. all accelerators that are assigned to a host + * interface and available, all FPGAs of a specific type, etc. + * + * fpgaEnumerate() will create a number of `fpga_token`s to represent the + * matching resources and populate the array `tokens` with these tokens. The + * `max_tokens` argument can be used to limit the number of tokens + * allocated/returned by fpgaEnumerate(); i.e., the number of tokens in the + * returned `tokens` array will be either `max_tokens` or `num_matches` (the + * number of resources matching the filter), whichever is smaller. Use + * fpgaDestroyToken() to destroy tokens that are no longer needed. + * + * To query the number of matches for a particular set of filters (e.g. 
to + * allocate a `tokens` array of the appropriate size), call fpgaEnumerate() + * with the parameter `tokens` set to NULL; this will only return the number of + * matches in `num_matches`. + * + * @Note fpgaEnumerate() will allocate memory for the created tokens returned + * in `tokens`. It is the responsibility of the using application to free this + * memory after use by calling fpgaDestroyToken() for each of the returned + * tokens. + * + * @param[in] filters Array of `fpga_properties` objects describing the + * properties of the objects that should be returned. A + * resource is considered matching if its properties + * match any one of the supplied filters. Passing NULL + * will match all FPGA resources present in the system. + * @param[in] num_filters Number of entries in the `filters` array. + * @param[out] tokens Pointer to an array of fpga_token variables to be + * populated. If NULL is supplied, fpgaEnumerate() will + * not create any tokens, but it will return the + * number of possible matches in `num_match`. + * @param[in] max_tokens Maximum number of tokens that fpgaEnumerate() shall + * return (length of `tokens` array). There may be more + * or fewer matches than this number; `num_matches` is + * set to the number of actual matches. + * @param[out] num_matches Number of resources matching the `filter` criteria. + * This number can be higher than the number of tokens + * returned in the `tokens` array (depending on the + * value of `max_tokens`). + * @returns FPGA_OK on success. + * FPGA_INVALID_PARAM if invalid pointers or objects + * are passed into the function. + * FPGA_NO_DRIVER if OPAE can't find the respective + * enumeration data structures usually provided by the + * driver. + * FPGA_NO_MEMORY if there was not enough memory to + * create tokens. 
+ */ +__FPGA_API__ fpga_result fpgaEnumerate(const fpga_properties *filters, + uint32_t num_filters, fpga_token *tokens, + uint32_t max_tokens ,uint32_t *num_matches); + +/** + * Clone a fpga_token object + * + * Creates a copy of an fpga_token object. + * + * @Note This call creates a new token object and allocates memory for it. It + * is the responsibility of the using application to free this memory after use + * by calling fpgaDestroyToken() for the cloned token. + * + * @param[in] src fpga_token object to copy + * @param[out] dst New fpga_token object cloned from 'src' + * @returns FPGA_OK on success + */ +__FPGA_API__ fpga_result fpgaCloneToken(fpga_token src, fpga_token *dst); + +/** + * Destroy a Token + * + * This function destroys a token created by fpgaEnumerate() and frees the + * associated memory. + * + * @param[in] token fpga_token to destroy + * @returns FPGA_OK on success + */ +__FPGA_API__ fpga_result fpgaDestroyToken(fpga_token *token); + +END_C_DECL + +#endif // __FPGA_ENUM_H__ + diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/event.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/event.h new file mode 100644 index 0000000..3d53554 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/event.h @@ -0,0 +1,151 @@ +// Copyright(c) 2017 - 2019, Intel Corporation +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// * Neither the name of Intel Corporation nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. + +/** + * @file event.h + * @brief Functions for registering events and managing the lifecycle for + * `fpga_event_handle`s. + * + * OPAE provides an interface to asynchronous events that can be generated by + * different FPGA resources. The event API provides functions to register for + * these events; associated with every event a process has registered for is an + * fpga_event_handle, which encapsulates the OS-specific data structure for + * event objects. On Linux, an fpga_event_handle can be used as a file + * descriptor and passed to select(), poll(), epoll() and similar functions to + * wait for asynchronous events. + */ + +#ifndef __FPGA_EVENT_H__ +#define __FPGA_EVENT_H__ + +#include <opae/types.h> + +BEGIN_C_DECL + +/** + * Initialize an event_handle + * + * Platform independent way to initialize an event_handle used for + * notifications from the driver to application. 
For Linux, this function + * creates an eventfd and returns the eventfd file descriptor in + * `*event_handle`. + * + * @param[out] event_handle Pointer to event handle variable. + * + * @returns FPGA_OK on success. FPGA_INVALID_PARAM if `event_handle` is NULL. + * FPGA_NOT_SUPPORTED if platform does not support events. + */ +__FPGA_API__ fpga_result fpgaCreateEventHandle(fpga_event_handle *event_handle); + +/** + * Destroy an event_handle + * + * Destroy handle and free resources. On Linux this corresponds + * to closing the file descriptor pointed to by handle + * + * @param[in] event_handle Pointer to handle to be destroyed + * + * @returns FPGA_OK on success. FPGA_INVALID_PARAM if `event_handle` is NULL. + */ +__FPGA_API__ fpga_result fpgaDestroyEventHandle(fpga_event_handle *event_handle); + +/** + * Register an FPGA event + * + * This function tells the driver that the caller is interested in notification + * for the event specified by the type and flags pair. + * + * The event_handle points to an OS specific mechanism for event notification. + * An event_handle is associated with only a single event. + * + * @todo define if calling fpgaRegisterEvent multiple times with the + * same event_handle is an error condition or if it is silently ignored. + * + * @note This function is currently not supported. + * + * @param[in] handle Handle to previously opened FPGA resource. + * @param[in] event_type Type of event + * @param[in] event_handle Handle to previously opened resource for event + * notification. + * @param[in] flags Optional argument for specifying additional + * information about event. For example irq number + * for interrupt events. + * @returns FPGA_OK on success. FPGA_INVALID_PARAM if handle does not refer to + * a resource supporting the requested event, or if event_handle is not valid. + * FPGA_EXCEPTION if an internal exception occurred while accessing the handle + * or the event_handle. 
On Linux: FPGA_NO_DAEMON if the driver does not support the
+ * requested event and there is no FPGA Daemon (fpgad) running to proxy it.
+ */
+__FPGA_API__ fpga_result fpgaRegisterEvent(fpga_handle handle,
+ fpga_event_type event_type,
+ fpga_event_handle event_handle,
+ uint32_t flags);
+
+/**
+ * Unregister an FPGA event
+ *
+ * This function tells the driver that the caller is no longer interested in
+ * notification for the event associated with the event_handle.
+ *
+ * The event_handle points to an OS specific mechanism for event notification.
+ * An event_handle is associated with only a single event.
+ *
+ * @todo define if calling fpgaUnregisterEvent multiple times with the
+ * same event_handle is an error condition or if it is silently ignored.
+ *
+ * @note This function is currently not supported.
+ *
+ * @param[in] handle Handle to previously opened FPGA resource.
+ * @param[in] event_type Type of event.
+ * @param[in] event_handle Handle to previously opened resource for event
+ * notification.
+ * @returns FPGA_OK on success. FPGA_INVALID_PARAM if handle does
+ * not refer to a resource supporting the requested event,
+ * or if event_handle is not valid. FPGA_EXCEPTION if an
+ * internal error occurred accessing the handle or the
+ * event_handle.
+ */
+__FPGA_API__ fpga_result fpgaUnregisterEvent(fpga_handle handle, fpga_event_type event_type,
+ fpga_event_handle event_handle);
+
+/**
+* Get OS object from event handle
+*
+* Check validity of event handle, and get the OS object used to
+* subscribe and unsubscribe to events. On Linux, the object corresponds
+* to a file descriptor.
+*
+* @param[in] event_handle Event handle to get the descriptor value from
+* @param[out] fd integer to store the descriptor value
+*
+* @returns FPGA_OK on success. FPGA_INVALID_PARAM if `event_handle` is invalid.
+*/ +__FPGA_API__ fpga_result fpgaGetOSObjectFromEventHandle(const fpga_event_handle event_handle, + int *fd); + +END_C_DECL + +#endif // __FPGA_EVENT_H__ diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/flash.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/flash.h new file mode 100644 index 0000000..f7a2c5c --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/flash.h @@ -0,0 +1,87 @@ +// Copyright(c) 2017 - 2019, Intel Corporation +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Intel Corporation nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+
+/**
+ * @file flash.h
+ * @brief Functions to erase the flash memory and reconfigure a slot with a new bitstream.
+ */
+
+#ifndef __FLASH_H__
+#define __FLASH_H__
+
+BEGIN_C_DECL
+
+/**
+*
+* Erase flash memory
+*
+* This function erases the flash memory of the FPGA device
+*
+* Arguments:
+* @param[in] fpga_handle handle to previously opened FPGA_DEVICE resource
+*
+* Return Value:
+* FPGA_OK on success.
+* FPGA_INVALID_PARAM if the handle does not refer to an owned resource.
+* FPGA_NOT_FOUND if this host interface number is not found.
+* FPGA_NOT_SUPPORTED if functionality not supported
+*
+**/
+__FPGA_API__ fpga_result
+fpgaEraseFlash(
+ fpga_handle fpga_handle
+ );
+
+
+/**
+* Writes flash memory
+*
+* This function programs the flash chip on the FPGA with the provided bitstream.
+*
+* Arguments:
+* @param[in] handle handle to an FPGA_DEVICE resource
+* @param[in] flashBitstream pointer to memory holding the flash bitstream
+* @param[in] flashBitstreamLen length of the bitstream in bytes
+* @param[in] offset offset in flash controller to begin writing from
+*
+* Return Value:
+* FPGA_OK on success.
+* FPGA_INVALID_PARAM if the handle does not refer to an owned resource.
+* FPGA_NOT_FOUND if this host interface number is not found.
+* FPGA_NOT_SUPPORTED if functionality not supported.
+*/ + +__FPGA_API__ fpga_result +fpgaWriteFlash( + fpga_handle handle, + PUINT8 flashBitstream, + UINT64 flashBitstreamLen, + UINT64 offset +); + +END_C_DECL + +#endif // __FLASH_H__ diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/fpga.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/fpga.h new file mode 100644 index 0000000..e6668e8 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/fpga.h @@ -0,0 +1,60 @@ +// Copyright(c) 2017 - 2019, Intel Corporation +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Intel Corporation nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. + +/** + * \file fpga.h + * \brief FPGA API + * + * This conveniently includes all APIs that a part of the OPAE release (base and + * extensions). + */ + +#ifndef __FPGA_FPGA_H__ +#define __FPGA_FPGA_H__ + +#define FPGA_API_VERSION_MAJOR 0 +#define FPGA_API_VERSION_MINOR 1 + +#ifdef _WIN32 +#include <Windows.h> +#endif + +#include <opae/types.h> +#include <opae/access.h> +#include <opae/buffer.h> +#include <opae/dma.h> +#include <opae/enum.h> +#include <opae/event.h> +#include <opae/flash.h> +#include <opae/manage.h> +#include <opae/mmio.h> +#include <opae/properties.h> +#include <opae/umsg.h> +#include <opae/utils.h> +#include <opae/version.h> + +#endif // __FPGA_FPGA_H__ + diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/macrodefs.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/macrodefs.h new file mode 100644 index 0000000..365cdaf --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/macrodefs.h @@ -0,0 +1,70 @@ +// Copyright(c) 2017 - 2019, Intel Corporation +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Intel Corporation nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+
+/**
+ * @file macrodefs.h
+ * @brief Definitions of convenience macros for the OPAE C API
+ *
+ * This file defines convenience macros for the OPAE C API functions.
+ */ + +#ifndef __FPGA_MACRODEFS_H__ +#define __FPGA_MACRODEFS_H__ + +// Check for conflicting definitions +#ifdef BEGIN_C_DECL +#error BEGIN_C_DECL already defined, but used by the OPAE library +#endif + +#ifdef END_C_DECL +#error END_C_DECL already defined, but used by the OPAE library +#endif + +#ifdef __FPGA_API__ +#error __FPGA_API__ already defined, but used by the OPAE library +#endif + +// Macro for symbol visibility +#ifdef _WIN32 +#ifdef FpgaLib_EXPORTS +#define __FPGA_API__ __declspec(dllexport) +#else +#define __FPGA_API__ __declspec(dllimport) +#endif +#else +#define __FPGA_API__ __attribute__((visibility("default"))) +#endif + +// Macro for disabling name mangling +#ifdef __cplusplus +#define BEGIN_C_DECL extern "C" { +#define END_C_DECL } +#else +#define BEGIN_C_DECL +#define END_C_DECL +#endif + +#endif // __FPGA_MACRODEFS_H__ diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/manage.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/manage.h new file mode 100644 index 0000000..f93a1b1 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/manage.h @@ -0,0 +1,176 @@ +// Copyright(c) 2017 - 2019, Intel Corporation +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Intel Corporation nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. + +/** + * @file manage.h + * @brief Functions for managing FPGA configurations + * + * FPGA accelerators can be reprogrammed at run time by providing new partial + * bitstreams ("green bitstreams"). This file defines API functions for + * programming green bitstreams as well as for assigning accelerators to host + * interfaces for more complex deployment setups, such as virtualized systems. + */ + +#ifndef __FPGA_MANAGE_H__ +#define __FPGA_MANAGE_H__ + +#include <opae/types.h> + +BEGIN_C_DECL + +/** +* Assign Port to a host interface. +* +* This function assign Port to a host interface for subsequent use. Only +* Port that have been assigned to a host interface can be opened by +* fpgaOpen(). +* +* @param[in] fpga Handle to an FPGA object previously opened that +* both the host interface and the slot belong to +* @param[in] interface_num Host interface number +* @param[in] slot_num Slot number +* @param[in] flags Flags (to be defined) +* @returns FPGA_OK on success +* FPGA_INVALID_PARAM if input parameter combination +* is not valid. +* FPGA_EXCEPTION if an exception occcurred accessing +* the `fpga` handle. +* FPGA_NOT_SUPPORTED if driver does not support +* assignment. 
+*/ +__FPGA_API__ fpga_result fpgaAssignPortToInterface(fpga_handle fpga, + uint32_t interface_num, + uint32_t slot_num, + int flags); + +/** + * Assign an accelerator to a host interface + * + * This function assigns an accelerator to a host interface for subsequent use. Only + * accelerators that have been assigned to a host interface can be opened by + * fpgaOpen(). + * + * @note This function is currently not supported. + * + * @param[in] fpga Handle to an FPGA object previously opened that + * both the host interface and the accelerator belong to + * @param[in] afc Accelerator to assign + * @param[in] host_interface Host interface to assign accelerator to + * @param[in] flags Flags (to be defined) + * @returns FPGA_OK on success + */ +__FPGA_API__ fpga_result fpgaAssignToInterface(fpga_handle fpga, + fpga_token afc, + uint32_t host_interface, + int flags); + +/** + * Unassign a previously assigned accelerator + * + * This function removes the assignment of an accelerator to an host interface (e.g. to + * be later assigned to a different host interface). As a consequence, the accelerator + * referred to by token 'accelerator' will be reset during the course of this function. + * + * @note This function is currently not supported. + * + * @param[in] fpga Handle to an FPGA object previously opened that + * both the host interface and the accelerator belong to + * @param[in] afc Accelerator to unassign/release + * @returns FPGA_OK on success + */ +__FPGA_API__ fpga_result fpgaReleaseFromInterface(fpga_handle fpga, + fpga_token afc); + +/** + * Reconfigure a slot + * + * Sends a green bitstream file to an FPGA to reconfigure a specific slot. This + * call, if successful, will overwrite the currently programmed AFU in that + * slot with the AFU in the provided bitstream. + * + * As part of the reconfiguration flow, all accelerators associated with this slot will + * be unassigned and reset. 
+ * + * @param[in] fpga Handle to an FPGA object previously opened + * @param[in] slot Token identifying the slot to reconfigure + * @param[in] bitstream Pointer to memory holding the bitstream + * @param[in] bitstream_len Length of the bitstream in bytes + * @param[in] flags Flags (to be defined) + * @returns FPGA_OK on success. FPGA_INVALID_PARAM if the provided parameters + * are not valid. FPGA_EXCEPTION if an internal error occurred accessing the + * handle or while sending the bitstream data to the driver. FPGA_RECONF_ERROR + * on errors reported by the driver (such as CRC or protocol errors). + */ +__FPGA_API__ fpga_result fpgaReconfigureSlot(fpga_handle fpga, + uint32_t slot, + const uint8_t *bitstream, + size_t bitstream_len, int flags); + +/** + * Process device specific commands + * + * Sends a device specific command to the driver and driver performs that action + * and returns if needed with the data. + * + * @param[in] fpga Handle to an FPGA object previously opened + * @param[in] cmd GUID identifying the command to process + * @param[in] buffer Pointer to memory where data will be returned. + * @param[in] buffer_len Length of the buffer passed. + * @returns FPGA_OK on success. FPGA_INVALID_PARAM if the provided parameters + * are not valid. FPGA_EXCEPTION if an internal error occurred accessing the + * handle or while sending the data to the driver. + */ +__FPGA_API__ fpga_result fpgaProcessDeviceCmd(fpga_handle fpga, + fpga_guid cmd, + void *arg, + void *buffer, + size_t buffer_len); + +/** + * Enumerate all the commands supported by the device. + * + * To enumerate all the commands supported by a specific device, call this + * function by passing NULL to buffer arg and it returns the number of bytes + * that needs to be allocated to get all the commands. + * + * Then allocate buffer for that size and call this function to get the list + * of all device supported CMDs. 
+ * + * @param[in] fpga Handle to an FPGA object previously opened + * @param[in] cmds Pointer to memory where cmds will be returned. + * @param[in] num_cmds Pointer to memory where num cmds will be returned. + * @returns FPGA_OK on success. FPGA_INVALID_PARAM if the provided parameters + * are not valid. FPGA_EXCEPTION if an internal error occurred accessing the + * handle or while sending the data to the driver. + */ +__FPGA_API__ fpga_result fpgaGetSupportedCommands(fpga_handle fpga, + fpga_guid *cmds, + uint32_t *num_cmds); + +END_C_DECL + +#endif // __FPGA_MANAGE_H__ + diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/mmio.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/mmio.h new file mode 100644 index 0000000..7c26d3f --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/mmio.h @@ -0,0 +1,342 @@ +// Copyright(c) 2017 - 2019, Intel Corporation +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Intel Corporation nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. + +/** + * @file mmio.h + * @brief Functions for mapping and accessing MMIO space + * + * Most FPGA accelerators provide access to control registers through + * memory-mappable address spaces, commonly referred to as "MMIO spaces". This + * file provides functions to map, unmap, read, and write MMIO spaces. + * + * Note that an accelerator may have multiple MMIO spaces, denoted by the + * `mmio_num` argument of the APIs below. The meaning and properties of each + * MMIO space are up to the accelerator designer. + */ + +#ifndef __FPGA_MMIO_H__ +#define __FPGA_MMIO_H__ + +#include <opae/types.h> + +BEGIN_C_DECL + +/** + * Write 64 bit value to MMIO space + * + * This function will write to MMIO space of the target object at a specified + * offset. + * + * In order to access a resource's MMIO space using this function, it has to be + * mapped to the application's address space using fpgaMapMMIO(). + * + * @param[in] handle Handle to previously opened accelerator resource + * @param[in] mmio_num Number of MMIO space to access + * @param[in] offset Byte offset into MMIO space + * @param[in] value Value to write (64 bit) + * @returns FPGA_OK on success. FPGA_INVALID_PARAM if any of the supplied + * parameters is invalid. FPGA_EXCEPTION if an internal exception occurred + * while trying to access the handle. FPGA_NOT_FOUND if the MMIO space + * `mmio_num` was not mapped using fpgaMapMMIO() before calling this function. 
+ */ +__FPGA_API__ fpga_result fpgaWriteMMIO64(fpga_handle handle, + uint32_t mmio_num, uint64_t offset, + uint64_t value); + +/** + * Read 64 bit value from MMIO space + * + * This function will read from MMIO space of the target object at a specified + * offset. + * + * In order to access a resource's MMIO space using this function, it has to be + * mapped to the application's address space using fpgaMapMMIO(). + * + * @param[in] handle Handle to previously opened accelerator resource + * @param[in] mmio_num Number of MMIO space to access + * @param[in] offset Byte offset into MMIO space + * @param[out] value Pointer to memory where read value is returned (64 bit) + * @returns FPGA_OK on success. FPGA_INVALID_PARAM if any of the supplied + * parameters is invalid. FPGA_EXCEPTION if an internal exception occurred + * while trying to access the handle. FPGA_NOT_FOUND if the MMIO space + * `mmio_num` was not mapped using fpgaMapMMIO() before calling this function. + */ +__FPGA_API__ fpga_result fpgaReadMMIO64(fpga_handle handle, + uint32_t mmio_num, + uint64_t offset, uint64_t *value); + +/** + * Write 32 bit value to MMIO space + * + * This function will write to MMIO space of the target object at a specified + * offset. + * + * In order to access a resource's MMIO space using this function, it has to be + * mapped to the application's address space using fpgaMapMMIO(). + * + * @param[in] handle Handle to previously opened accelerator resource + * @param[in] mmio_num Number of MMIO space to access + * @param[in] offset Byte offset into MMIO space + * @param[in] value Value to write (32 bit) + * @returns FPGA_OK on success. FPGA_INVALID_PARAM if any of the supplied + * parameters is invalid. FPGA_EXCEPTION if an internal exception occurred + * while trying to access the handle. FPGA_NOT_FOUND if the MMIO space + * `mmio_num` was not mapped using fpgaMapMMIO() before calling this function. 
+ */ +__FPGA_API__ fpga_result fpgaWriteMMIO32(fpga_handle handle, +                                         uint32_t mmio_num, uint64_t offset, +                                         uint32_t value); + +/** + * Read 32 bit value from MMIO space + * + * This function will read from MMIO space of the target object at a specified + * offset. + * + * In order to access a resource's MMIO space using this function, it has to be + * mapped to the application's address space using fpgaMapMMIO(). + * + * @param[in] handle Handle to previously opened accelerator resource + * @param[in] mmio_num Number of MMIO space to access + * @param[in] offset Byte offset into MMIO space + * @param[out] value Pointer to memory where read value is returned (32 bit) + * @returns FPGA_OK on success. FPGA_INVALID_PARAM if any of the supplied + * parameters is invalid. FPGA_EXCEPTION if an internal exception occurred + * while trying to access the handle. FPGA_NOT_FOUND if the MMIO space + * `mmio_num` was not mapped using fpgaMapMMIO() before calling this function. + */ +__FPGA_API__ fpga_result fpgaReadMMIO32(fpga_handle handle, +                                        uint32_t mmio_num, +                                        uint64_t offset, uint32_t *value); + +/** + * Map MMIO space + * + * This function will return a pointer to the specified MMIO space of the + * target object in process virtual memory. Some MMIO spaces may be restricted + * to privileged processes, depending on the used handle and type. + * + * After mapping the respective MMIO space, you can access it either through + * direct pointer operations (observing supported access sizes and alignments + * of the target platform and accelerator), or by using fpgaReadMMIO32(), + * fpgaWriteMMIO32(), fpgaReadMMIO64(), fpgaWriteMMIO64(), fpgaReadMmio() + * and fpgaWriteMmio(). + * + * @note This call only supports returning an actual mmio_ptr for hardware + * targets, not for ASE simulation. Use fpgaReadMMIO32(), fpgaWriteMMIO32(), + * fpgaReadMMIO64(), and fpgaWriteMMIO64() if you need ASE simulation + * capabilities.
You will still need to call fpgaMapMMIO() before using these + * functions, though. + * + * If the caller passes in NULL for mmio_ptr, no virtual address will be + * returned. This implies that all accesses will be performed through + * fpgaReadMMIO32(), fpgaWriteMMIO32(), fpgaReadMMIO64(), fpgaWriteMMIO64(), + * fpgaReadMmio() and fpgaWriteMmio(). This is the only supported case for ASE. + * + * The number of available MMIO spaces can be retrieved through the num_mmio + * property (fpgaPropertyGetNumMMIO()). + * + * @param[in] handle Handle to previously opened resource + * @param[in] mmio_num Number of MMIO space to access + * @param[out] mmio_ptr Pointer to memory where a pointer to the MMIO space + * will be returned. May be NULL, in which case no pointer + * is returned. + * @returns FPGA_OK on success. FPGA_INVALID_PARAM if any of the supplied + * parameters is invalid. FPGA_EXCEPTION if an internal exception occurred + * while trying to access the handle. FPGA_NO_ACCESS if the process' + * permissions are not sufficient to map the requested MMIO space. + */ +__FPGA_API__ fpga_result fpgaMapMMIO(fpga_handle handle, +                                     uint32_t mmio_num, uint64_t **mmio_ptr); + +/** + * Unmap MMIO space + * + * This function will unmap a previously mapped MMIO space of the target object, + * rendering any pointers to it invalid. + * + * @note This call is only supported by hardware targets, not by ASE + * simulation. + * + * @param[in] handle Handle to previously opened resource + * @param[in] mmio_num Number of MMIO space to access + * @returns FPGA_OK on success. FPGA_INVALID_PARAM if any of the supplied + * parameters is invalid. FPGA_EXCEPTION if an internal exception occurred + * while trying to access the handle. + */ +__FPGA_API__ fpga_result fpgaUnmapMMIO(fpga_handle handle, +                                       uint32_t mmio_num); + +/** +* Reads the value from MMIO space. +* +* This function will read from MMIO space of the target object at a specified +* offset and length.
+* +* In order to access a resource's MMIO space using this function, it has to be +* mapped to the application's address space using fpgaMapMMIO(). +* +* @param[in] handle Handle to previously opened accelerator resource +* @param[in] mmio_num Number of MMIO space to access +* @param[in] offset Byte offset into MMIO space +* @param[out] buffer Pointer to memory where read value is returned +* @param[in] length Length of the MMIO to read. +* @param[in] accessType Read MMIO as 8/16/32/64-bit reads. +* @returns FPGA_OK on success. FPGA_INVALID_PARAM if any of the supplied +* parameters is invalid. FPGA_EXCEPTION if an internal exception occurred +* while trying to access the handle. FPGA_NOT_FOUND if the MMIO space +* `mmio_num` was not mapped using fpgaMapMMIO() before calling this function. +*/ +__FPGA_API__ fpga_result fpgaReadMmioType(fpga_handle handle, + uint32_t mmio_num, + uint64_t offset, + void* buffer, + uint32_t length, + uint32_t accessType); + +/** +* Write the value to MMIO space. +* +* This function will write to MMIO space of the target object at a specified +* offset and length. +* +* In order to access a resource's MMIO space using this function, it has to be +* mapped to the application's address space using fpgaMapMMIO(). +* +* @param[in] handle Handle to previously opened accelerator resource +* @param[in] mmio_num Number of MMIO space to access +* @param[in] offset Byte offset into MMIO space +* @param[in] buffer Pointer to memory from where data to be written. +* @param[in] length Length of the MMIO to write. +* @param[in] accessType Write MMIO as 8/16/32/64-bit writes. +* @returns FPGA_OK on success. FPGA_INVALID_PARAM if any of the supplied +* parameters is invalid. FPGA_EXCEPTION if an internal exception occurred +* while trying to access the handle. FPGA_NOT_FOUND if the MMIO space +* `mmio_num` was not mapped using fpgaMapMMIO() before calling this function. 
+*/ +__FPGA_API__ fpga_result fpgaWriteMmioType(fpga_handle handle, + uint32_t mmio_num, + uint64_t offset, + void* buffer, + uint32_t length, + uint32_t accessType); + + +/** +* Reads the value from MMIO space. +* +* This function will read from MMIO space of the target object at a specified +* offset and length. +* +* In order to access a resource's MMIO space using this function, it has to be +* mapped to the application's address space using fpgaMapMMIO(). +* +* @param[in] handle Handle to previously opened accelerator resource +* @param[in] mmio_num Number of MMIO space to access +* @param[in] offset Byte offset into MMIO space +* @param[out] buffer Pointer to memory where read value is returned +* @param[in] length Length of the MMIO to read. +* @returns FPGA_OK on success. FPGA_INVALID_PARAM if any of the supplied +* parameters is invalid. FPGA_EXCEPTION if an internal exception occurred +* while trying to access the handle. FPGA_NOT_FOUND if the MMIO space +* `mmio_num` was not mapped using fpgaMapMMIO() before calling this function. +*/ +__FPGA_API__ fpga_result fpgaReadMmio(fpga_handle handle, + uint32_t mmio_num, + uint64_t offset, + void *buffer, + uint32_t length); + +/** +* Write the value to MMIO space. +* +* This function will write to MMIO space of the target object at a specified +* offset and length. +* +* In order to access a resource's MMIO space using this function, it has to be +* mapped to the application's address space using fpgaMapMMIO(). +* +* @param[in] handle Handle to previously opened accelerator resource +* @param[in] mmio_num Number of MMIO space to access +* @param[in] offset Byte offset into MMIO space +* @param[in] buffer Pointer to memory from where data to be written. +* @param[in] length Length of the MMIO to write. +* @returns FPGA_OK on success. FPGA_INVALID_PARAM if any of the supplied +* parameters is invalid. FPGA_EXCEPTION if an internal exception occurred +* while trying to access the handle. 
FPGA_NOT_FOUND if the MMIO space +* `mmio_num` was not mapped using fpgaMapMMIO() before calling this function. +*/ +__FPGA_API__ fpga_result fpgaWriteMmio(fpga_handle handle, +                                       uint32_t mmio_num, +                                       uint64_t offset, +                                       void *buffer, +                                       uint32_t length); + +/** +* Read the config space of the device. +* +* This function will read the configuration space of the FPGA device +* +* @note This call is only supported by PCIe hardware targets, not by ASE +* simulation. +* +* @param[in] handle Handle to previously opened resource +* @param[in] offset Offset within the config space of the device. +* @param[in] buffer Pointer to the buffer where data read will be returned. +* @param[in] length Number of bytes to read. +* @returns FPGA_OK on success. FPGA_INVALID_PARAM if any of the supplied +* parameters is invalid. FPGA_EXCEPTION if an internal exception occurred +* while trying to access the handle. +*/ +__FPGA_API__ fpga_result fpgaReadPciConfigSpace(fpga_handle handle, +                                                 uint32_t offset, +                                                 void* buffer, +                                                 uint32_t length); + +/** +* Write to config space of the device. +* +* This function will write to configuration space of the FPGA device +* +* @note This call is only supported by PCIe hardware targets, not by ASE +* simulation. +* +* @param[in] handle Handle to previously opened resource +* @param[in] offset Offset within the config space of the device. +* @param[in] buffer Pointer to the buffer holding the data to be written. +* @param[in] length Number of bytes to write. +* @returns FPGA_OK on success. FPGA_INVALID_PARAM if any of the supplied +* parameters is invalid. FPGA_EXCEPTION if an internal exception occurred +* while trying to access the handle.
+*/ +__FPGA_API__ fpga_result fpgaWritePciConfigSpace(fpga_handle handle, + uint32_t offset, + void* buffer, + uint32_t length); + +END_C_DECL + +#endif // __FPGA_MMIO_H__ diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/properties.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/properties.h new file mode 100644 index 0000000..03e5e79 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/properties.h @@ -0,0 +1,689 @@ +// Copyright(c) 2017 - 2019, Intel Corporation +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Intel Corporation nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. + +/** + * @file properties.h + * @brief Functions for examining and manipulating `fpga_properties` objects + * + * In OPAE, `fpga_properties` objects are used both for obtaining information + * about resources and for selectively enumerating resources based on their + * properties. This file provides accessor functions (get/set) to allow reading + * and writing individual items of an `fpga_properties` object. Generally, not + * all object types supported by OPAE carry all properties. If you call a + * property accessor method on a `fpga_properties` object that does not support + * this particular property, it will return FPGA_INVALID_PARAM. + * + * # Accessor Return Values + * In addition to the return values specified in the documentation below, all + * accessor functions return FPGA_OK on success, FPGA_INVALID_PARAM if you pass + * NULL or invalid parameters (i.e. non-initialized properties objects), + * FPGA_EXCEPTION if an internal exception occurred trying to access the + * properties object, FPGA_NOT_FOUND if the requested property is not part of + * the supplied properties object. + */ + +#ifndef __FPGA_PROPERTIES_H__ +#define __FPGA_PROPERTIES_H__ + +#include <opae/types.h> + +BEGIN_C_DECL + +/** + * Create a fpga_properties object + * + * Initializes the memory pointed at by `prop` to represent a properties + * object, and populates it with the properties of the resource referred to by + * `token`. 
Individual properties can then be queried using fpgaPropertiesGet*() + * accessor functions. + * + * If `token` is NULL, an "empty" properties object is created to be used as a + * filter for fpgaEnumerate(). All individual fields are set to `don't care`, + * which implies that the fpga_properties object would match all FPGA resources + * if used for an fpgaEnumerate() query. The matching criteria can be further + * refined by using fpgaSet* functions on the properties object, or the + * object can be populated with the actual properties of a resource by using + * fpgaUpdateProperties(). + * + * @Note fpgaGetProperties() will allocate memory for the created properties + * object returned in `prop`. It is the responsibility of the using application + * to free this memory after use by calling fpgaDestroyProperties(). + * + * @param[in] token Token to get properties for. Can be NULL, which will + * create an empty properties object to be used as a + * filter for fpgaEnumerate(). + * @param[out] prop Pointer to a variable of type fpga_properties + * @returns FPGA_OK on success. FPGA_NO_MEMORY if no memory could be allocated + * to create the `fpga_properties` object. FPGA_EXCEPTION if an exception + * happened while initializing the `fpga_properties` object. + */ +__FPGA_API__ fpga_result fpgaGetProperties(fpga_token token, fpga_properties *prop); + +/** + * Update a fpga_properties object + * + * Populates the properties object 'prop' with properties of the resource + * referred to by 'token'. Unlike fpgaGetProperties(), this call will not create + * a new properties object or allocate memory for it, but use a previously + * created properties object. + * + * @param[in] token Token to retrieve properties for + * @param[in] prop fpga_properties object to update + * @returns FPGA_OK on success. FPGA_INVALID_PARAM if `token` or `prop` are not + * valid objects. FPGA_NOT_FOUND if the resource referred to by `token` was + * not found.
FPGA_NO_DRIVER if no driver is loaded. FPGA_EXCEPTION if an + * internal exception occurred when trying to update `prop`. + */ +__FPGA_API__ fpga_result fpgaUpdateProperties(fpga_token token, fpga_properties prop); + +/** + * Clear a fpga_properties object + * + * Sets all fields of the properties object pointed at by 'prop' to 'don't + * care', which implies that the fpga_properties object would match all FPGA + * resources if used for an fpgaEnumerate() query. The matching criteria can be + * further refined by using fpgaSet* functions on the properties object. + * + * Instead of creating a new fpga_properties object every time, this function + * can be used to re-use fpga_properties objects from previous queries. + * + * @param[in] prop fpga_properties object to clear + * @returns FPGA_OK on success. FPGA_INVALID_PARAM if `prop` is not a valid + * object. FPGA_EXCEPTION if an internal exception occurred when trying to + * access `prop`. + */ +__FPGA_API__ fpga_result fpgaClearProperties(fpga_properties prop); + +/** + * Clone a fpga_properties object + * + * Creates a copy of an fpga_properties object. + * + * @Note This call creates a new properties object and allocates memory for it. + * Both the 'src' and the newly created 'dst' objects will eventually need to be + * destroyed using fpgaDestroyProperties(). + * + * @param[in] src fpga_properties object to copy + * @param[out] dst New fpga_properties object cloned from 'src' + * @returns FPGA_OK on success. FPGA_INVALID_PARAM if `src` is not a valid + * object, or if `dst` is NULL. FPGA_NO_MEMORY if there was not enough memory + * to allocate an `fpga_properties` object for `dst`. FPGA_EXCEPTION if an + * internal exception occurred either accessing `src` or updating `dst`.
+ */ +__FPGA_API__ fpga_result fpgaCloneProperties(fpga_properties src, fpga_properties *dst); + +/** + * Destroy a fpga_properties object + * + * Destroys an existing fpga_properties object that the caller has previously + * created using fpgaGetProperties() or fpgaCloneProperties(). + * + * @param[inout] prop Pointer to the fpga_properties object to destroy + * @returns FPGA_OK on success. FPGA_INVALID_PARAM if `prop` is not a valid + * object. FPGA_EXCEPTION if an internal exception occurred while trying to + * access `prop`. + */ +__FPGA_API__ fpga_result fpgaDestroyProperties(fpga_properties *prop); + +/** + * Get the token of the parent object + * + * Returns the token of the parent of the queried resource in '*parent'. + * + * @param[in] prop Properties object to query + * @param[out] parent Pointer to a token variable of the resource 'prop' is + * associated with + * @returns FPGA_NOT_FOUND if resource does not have a + * parent (e.g. an FPGA_DEVICE resource does not have parents). Also see + * "Accessor Return Values" in [properties.h](#properties-h). + */ +__FPGA_API__ fpga_result fpgaPropertiesGetParent(const fpga_properties prop, +                                                 fpga_token *parent); + +/** + * Set the token of the parent object + * + * @param[in] prop Properties object to modify + * @param[out] parent Pointer to a token variable of the resource 'prop' is + * associated with + * @returns See "Accessor Return Values" in [properties.h](#properties-h). + */ +__FPGA_API__ fpga_result fpgaPropertiesSetParent(fpga_properties prop, +                                                 fpga_token parent); + +/** + * Get the object type of a resource + * + * Returns the object type of the queried resource. + * + * @param[in] prop Properties object to query + * @param[out] objtype Pointer to an object type variable of the resource + * 'prop' is associated with + * @returns See "Accessor Return Values" in [properties.h](#properties-h).
+ */ +__FPGA_API__ fpga_result fpgaPropertiesGetObjectType(const fpga_properties prop, + fpga_objtype *objtype); + +/** + * Set the object type of a resource + * + * Sets the object type of the resource. * Currently supported object types are + * FPGA_DEVICE and FPGA_ACCELERATOR. + * + * @param[in] prop Properties object to modify + * @returns See "Accessor Return Values" in [properties.h](#properties-h). + */ +__FPGA_API__ fpga_result fpgaPropertiesSetObjectType(fpga_properties prop, + fpga_objtype objtype); + +/** + * Get the PCI bus number of a resource + * + * Returns the bus number the queried resource. + * + * @param[in] prop Properties object to query + * @param[out] bus Pointer to a PCI bus variable of the resource 'prop' + * is associated with + * @returns See "Accessor Return Values" in [properties.h](#properties-h). + */ +__FPGA_API__ fpga_result fpgaPropertiesGetBus(const fpga_properties prop, uint8_t *bus); + +/** + * Set the PCI bus number of a resource + * + * @param[in] prop Properties object to modify + * @param[in] bus PCI bus number of the resource 'prop' is associated with + * @returns See "Accessor Return Values" in [properties.h](#properties-h). + */ +__FPGA_API__ fpga_result fpgaPropertiesSetBus(fpga_properties prop, uint8_t bus); + +/** + * Get the PCI device number of a resource + * + * Returns the device number the queried resource. + * + * @param[in] prop Properties object to query + * @param[out] device Pointer to a PCI device variable of the resource 'prop' + * is associated with + * @returns See "Accessor Return Values" in [properties.h](#properties-h). + */ +__FPGA_API__ fpga_result fpgaPropertiesGetDevice(const fpga_properties prop, + uint8_t *device); + +/** + * Set the PCI device number of a resource + * + * Enforces the limitation on the number of devices as specified in the + * PCI spec. 
+ * + * @param[in] prop Properties object to modify + * @param[in] device PCI device number of the resource 'prop' is associated + * with + * @returns See "Accessor Return Values" in [properties.h](#properties-h). + */ +__FPGA_API__ fpga_result fpgaPropertiesSetDevice(fpga_properties prop, + uint8_t device); + +/** + * Get the PCI function number of a resource + * + * Returns the function number the queried resource. + * + * @param[in] prop Properties object to query + * @param[out] function Pointer to PCI function variable of the + * resource 'prop' is associated with + * @returns See "Accessor Return Values" in [properties.h](#properties-h). + */ +__FPGA_API__ fpga_result fpgaPropertiesGetFunction(const fpga_properties prop, + uint8_t *function); + +/** + * Set the PCI function number of a resource + * + * Enforces the limitation on the number of functions as specified in the + * PCI spec. + * + * @param[in] prop Properties object to modify + * @param[in] function PCI function number of the resource 'prop' is + * associated with + * @returns See "Accessor Return Values" in [properties.h](#properties-h). + */ +__FPGA_API__ fpga_result fpgaPropertiesSetFunction(fpga_properties prop, + uint8_t function); + +/** + * Get the socket id of a resource + * + * Returns the socket id of the queried resource. + * + * @param[in] prop Properties object to query + * @param[out] socket_id Pointer to a socket id variable of the + * resource 'prop' + * is associated with + * @returns See "Accessor Return Values" in [properties.h](#properties-h). + * See also "Accessor Return Values" in [properties.h](#properties-h). + */ +__FPGA_API__ fpga_result fpgaPropertiesGetSocketID(const fpga_properties prop, + uint8_t *socket_id); + +/** + * Set the socket id of the resource + * + * @param[in] prop Properties object to modify + * @param[in] socket_id Socket id of the resource 'prop' is + * associated with + * @returns See "Accessor Return Values" in [properties.h](#properties-h). 
+ */ +__FPGA_API__ fpga_result fpgaPropertiesSetSocketID(fpga_properties prop, + uint8_t socket_id); + +/** + * Get the device id of the resource + * + * @param[in] prop Properties object to query + * @param[out] device_id Pointer to a device id variable of the + * resource 'prop' is associated with + * @returns See "Accessor Return Values" in [properties.h](#properties-h). + */ +__FPGA_API__ fpga_result fpgaPropertiesGetDeviceID(const fpga_properties prop, + uint32_t *device_id); + +/** + * Set the device id of the resource + * + * @param[in] prop Properties object to modify + * @param[in] device_id Device id of the resource 'prop' is associated with + * @returns See "Accessor Return Values" in [properties.h](#properties-h). + */ +__FPGA_API__ fpga_result fpgaPropertiesSetDeviceID(fpga_properties prop, + uint32_t device_id); + +/** + * Get the number of slots of an FPGA resource property + * + * Returns the number of slots present in an FPGA. + * + * @param[in] prop Properties object to query - must be of type FPGA_DEVICE + * @param[out] num_slots Pointer to number of slots variable of the FPGA + * @returns FPGA_INVALID_PARAM if object type is not FPGA_DEVICE. See also + * "Accessor Return Values" in [properties.h](#properties-h). + */ +__FPGA_API__ fpga_result fpgaPropertiesGetNumSlots(const fpga_properties prop, + uint32_t *num_slots); + +/** + * Set the number of slots of an FPGA resource property + * + * @param[in] prop Properties object to modify - must be of type + * FPGA_DEVICE + * @param[in] num_slots Number of slots of the FPGA + * @returns FPGA_INVALID_PARAM if object type is not FPGA_DEVICE. See also + * "Accessor Return Values" in [properties.h](#properties-h). + */ +__FPGA_API__ fpga_result fpgaPropertiesSetNumSlots(fpga_properties prop, + uint32_t num_slots); + +/** + * Get the BBS ID of an FPGA resource property + * + * Returns the blue bitstream id of an FPGA. 
+ * + * @param[in] prop Properties object to query - must be of type FPGA_DEVICE + * @param[out] bbs_id Pointer to a bbs id variable of the FPGA + * @returns FPGA_INVALID_PARAM if object type is not FPGA_DEVICE. See also + * "Accessor Return Values" in [properties.h](#properties-h). + */ +__FPGA_API__ fpga_result fpgaPropertiesGetBBSID(const fpga_properties prop, + uint64_t *bbs_id); + +/** + * Set the BBS ID of an FPGA resource property + * + * @param[in] prop Properties object to modify - must be of type + * FPGA_DEVICE + * @param[in] bbs_id Blue bitstream id of the FPGA resource + * @returns FPGA_INVALID_PARAM if object type is not FPGA_DEVICE. See also + * "Accessor Return Values" in [properties.h](#properties-h). + */ +__FPGA_API__ fpga_result fpgaPropertiesSetBBSID(fpga_properties prop, + uint64_t bbs_id); + +/** + * Get the BBS Version of an FPGA resource property + * + * Returns the blue bitstream version of an FPGA. + * + * @param[in] prop Properties object to query - must be of type + * FPGA_DEVICE + * @param[out] bbs_version Pointer to a bbs version variable of the FPGA + * @returns FPGA_INVALID_PARAM if object type is not FPGA_DEVICE. See also + * "Accessor Return Values" in [properties.h](#properties-h). + */ +__FPGA_API__ fpga_result fpgaPropertiesGetBBSVersion(const fpga_properties prop, + fpga_version *bbs_version); + +/** + * Set the BBS Version of an FPGA resource property + * + * @param[in] prop Properties object to modify - must be of type + * FPGA_DEVICE + * @param[in] bbs_version Blue bitstream version of the FPGA resource + * @returns FPGA_INVALID_PARAM if object type is not FPGA_DEVICE. See also + * "Accessor Return Values" in [properties.h](#properties-h). + */ +__FPGA_API__ fpga_result fpgaPropertiesSetBBSVersion(fpga_properties prop, + fpga_version version); + +/** + * Get the vendor id of an FPGA resource property + * + * Returns the vendor id of an FPGA. 
+ * + * @param[in] prop Properties object to query - must be of type FPGA_DEVICE + * @param[out] vendor_id Pointer to a vendor id variable of the FPGA + * @returns FPGA_INVALID_PARAM if object type is not FPGA_DEVICE. See also + * "Accessor Return Values" in [properties.h](#properties-h). + * + * @note This API is not currently supported. + */ +__FPGA_API__ fpga_result fpgaPropertiesGetVendorID(const fpga_properties prop, + uint16_t *vendor_id); + +/** + * Set the vendor id of an FPGA resource property + * + * @param[in] prop Properties object to modify - must be of type FPGA_DEVICE + * @param[in] vendor_id Vendor id of the FPGA resource + * @returns FPGA_INVALID_PARAM if object type is not FPGA_DEVICE. See also + * "Accessor Return Values" in [properties.h](#properties-h). + * + * @note This API is not currently supported. + */ +__FPGA_API__ fpga_result fpgaPropertiesSetVendorID(fpga_properties prop, + uint16_t vendor_id); + +/** + * Get the model of an FPGA resource property + * + * Returns the model of an FPGA. + * + * @param[in] prop Properties object to query - must be of type FPGA_DEVICE + * @param[in] model Model of the FPGA resource (string of minimum + * FPGA_MODEL_LENGTH length + * @returns FPGA_INVALID_PARAM if object type is not FPGA_DEVICE. See also + * "Accessor Return Values" in [properties.h](#properties-h). + * + * @note This API is not currently supported. + */ +__FPGA_API__ fpga_result fpgaPropertiesGetModel(const fpga_properties prop, + char *model); + +/** + * Set the model of an FPGA resource property + * + * @param[in] prop Properties object to modify - must be of type FPGA_DEVICE + * @param[in] model Model of the FPGA resource (string of maximum + * FPGA_MODEL_LENGTH length + * @returns FPGA_INVALID_PARAM if object type is not FPGA_DEVICE. See also + * "Accessor Return Values" in [properties.h](#properties-h). + * + * @note This API is not currently supported. 
+ */ +__FPGA_API__ fpga_result fpgaPropertiesSetModel(fpga_properties prop, + char *model); + +/** + * Get the local memory size of an FPGA resource property + * + * Returns the local memory size of an FPGA. + * + * @param[in] prop Properties object to query - must be of type FPGA_DEVICE + * @param[out] lms Pointer to a memory size variable of the FPGA + * @returns FPGA_INVALID_PARAM if object type is not FPGA_DEVICE. See also + * "Accessor Return Values" in [properties.h](#properties-h). + * + * @note This API is not currently supported. + */ +__FPGA_API__ fpga_result fpgaPropertiesGetLocalMemorySize(const fpga_properties prop, + uint64_t *lms); + +/** + * Set the local memory size of an FPGA resource property + * + * @param[in] prop Properties object to modify - must be of type FPGA_DEVICE + * @param[in] lms Local memory size of the FPGA resource + * @returns FPGA_INVALID_PARAM if object type is not FPGA_DEVICE. See also + * "Accessor Return Values" in [properties.h](#properties-h). + * + * @note This API is not currently supported. + */ +__FPGA_API__ fpga_result fpgaPropertiesSetLocalMemorySize(fpga_properties prop, + uint64_t lms); + +/** + * Get the capabilities FPGA resource property + * + * Returns the capabilities of an FPGA. + * Capabilities is a bitfield value + * + * @param[in] prop Properties object to query - must be of type + * FPGA_DEVICE + * @param[out] capabilities Pointer to a capabilities variable of the FPGA + * @returns FPGA_INVALID_PARAM if object type is not FPGA_DEVICE. See also + * "Accessor Return Values" in [properties.h](#properties-h). + * + * @note This API is not currently supported. 
+ */ +__FPGA_API__ fpga_result fpgaPropertiesGetCapabilities(const fpga_properties prop, + uint64_t *capabilities); + +/** + * Set the capabilities of an FPGA resource property + * + * Capabilities is a bitfield value + * + * @param[in] prop Properties object to modify - must be of type + * FPGA_DEVICE + * @param[in] capabilities Capabilities of the FPGA resource + * @returns FPGA_INVALID_PARAM if object type is not FPGA_DEVICE. See also + * "Accessor Return Values" in [properties.h](#properties-h). + * + * @note This API is not currently supported. + */ +__FPGA_API__ fpga_result fpgaPropertiesSetCapabilities(fpga_properties prop, + uint64_t capabilities); + +/** + * Get the GUID of a resource + * + * Returns the GUID of an FPGA or accelerator object. + * + * For an accelerator, the GUID uniquely identifies a specific accelerator context type, + * i.e. different accelerators will have different GUIDs. For an FPGA, the GUID + * is used to identify a certain instance of an FPGA, e.g. to determine whether + * a given bitstream would be compatible. + * + * @param[in] prop Properties object to query + * @param[out] guid Pointer to a GUID of the slot variable + * @returns See "Accessor Return Values" in [properties.h](#properties-h). + */ +__FPGA_API__ fpga_result fpgaPropertiesGetGUID(const fpga_properties prop, + fpga_guid *guid); + +/** + * Set the GUID of a resource + * + * Sets the GUID of an FPGA or accelerator object. + * + * For an accelerator, the GUID uniquely identifies a specific accelerator context type, + * i.e. different accelerators will have different GUIDs. For an FPGA, the GUID + * is used to identify a certain instance of an FPGA, e.g. to determine whether + * a given bitstream would be compatible. + * + * @param[in] prop Properties object to modify + * @param[out] guid Pointer to a GUID of the slot variable + * @returns See "Accessor Return Values" in [properties.h](#properties-h). 
+ */ +__FPGA_API__ fpga_result fpgaPropertiesSetGUID(fpga_properties prop, fpga_guid guid); + +/** + * Get the number of mmio spaces + * + * Returns the number of mmio spaces of an AFU properties structure. + * + * @param[in] prop Properties object to query - must be of type FPGA_ACCELERATOR + * @param[out] mmio_spaces Pointer to a variable for number of mmio spaces + * @returns FPGA_INVALID_PARAM if object type is not FPGA_ACCELERATOR. See also + * "Accessor Return Values" in [properties.h](#properties-h). + */ +__FPGA_API__ fpga_result fpgaPropertiesGetNumMMIO(const fpga_properties prop, + uint32_t *mmio_spaces); + +/** + * Set the number of mmio spaces + * + * Sets the number of mmio spaces of an AFU properties structure. + * + * @param[in] prop Properties object to modify - must be of type FPGA_ACCELERATOR + * @param[in] mmio_spaces Number of MMIO spaces of the accelerator + * @returns FPGA_INVALID_PARAM if object type is not FPGA_ACCELERATOR. See also + * "Accessor Return Values" in [properties.h](#properties-h). + */ +__FPGA_API__ fpga_result fpgaPropertiesSetNumMMIO(fpga_properties prop, + uint32_t mmio_spaces); + +/** + * Get the number of interrupts + * + * Returns the number of interrupts of an accelerator properties structure. + * + * @param[in] prop Properties object to query - must be of type FPGA_ACCELERATOR + * @param[out] num_interrupts Pointer to a variable for number of interrupts + * @returns FPGA_INVALID_PARAM if object type is not FPGA_ACCELERATOR. See also + * "Accessor Return Values" in [properties.h](#properties-h). + */ +__FPGA_API__ fpga_result fpgaPropertiesGetNumInterrupts(const fpga_properties prop, + uint32_t *num_interrupts); + +/** + * Set the number of interrupts + * + * Sets the number of interrupts of an accelerator properties structure. 
+ * + * @param[in] prop Properties object to modify - must be of type FPGA_ACCELERATOR + * @param[in] num_interrupts Number of interrupts of the accelerator + * @returns FPGA_INVALID_PARAM if object type is not FPGA_ACCELERATOR. See also + * "Accessor Return Values" in [properties.h](#properties-h). + */ +__FPGA_API__ fpga_result fpgaPropertiesSetNumInterrupts(fpga_properties prop, + uint32_t num_interrupts); + +/** + * Get the state of an accelerator resource property + * + * Returns the accelerator state of an accelerator. + * + * @param[in] prop Properties object to query - must be of type FPGA_ACCELERATOR + * @param[out] state Pointer to an accelerator state variable of the accelerator + * @returns FPGA_INVALID_PARAM if object type is not FPGA_ACCELERATOR. See also + * "Accessor Return Values" in [properties.h](#properties-h). + */ +__FPGA_API__ fpga_result fpgaPropertiesGetAcceleratorState(const fpga_properties prop, + fpga_accelerator_state *state); + + +/** + * Set the state of an accelerator resource property + * + * @param[in] prop Properties object to modify - must be of type FPGA_ACCELERATOR + * @param[in] state accelerator state of the accelerator resource + * @returns FPGA_INVALID_PARAM if object type is not FPGA_ACCELERATOR. See also + * "Accessor Return Values" in [properties.h](#properties-h). + */ +__FPGA_API__ fpga_result fpgaPropertiesSetAcceleratorState(fpga_properties prop, + fpga_accelerator_state state); + +/** +* Get the object ID of a resource +* +* Returns the object ID of a resource. The object ID is a 64 bit identifier +* that is unique within a single node or system. It represents a similar +* concept as the token, but can be used across processes (e.g. passed on the +* command line). +* +* @param[in] prop Properties object to query +* @param[out] object_id Pointer to a 64bit memory location to store the object +* ID in +* @returns See "Accessor Return Values" in [properties.h](#properties-h). 
+*/ +__FPGA_API__ fpga_result fpgaPropertiesGetObjectID(fpga_properties prop, + uint64_t *object_id); + + +/** +* Set the object ID of a resource +* +* Sets the object ID of a resource. The object ID is a 64 bit identifier +* that is unique within a single node or system. It represents a similar +* concept as the token, but can be used across processes (e.g. passed on the +* command line). +* +* @param[in] prop Properties object to query +* @param[in] object_id A 64bit value to use as the object ID +* @returns See "Accessor Return Values" in [properties.h](#properties-h). +*/ +__FPGA_API__ fpga_result fpgaPropertiesSetObjectID(fpga_properties prop, + uint64_t object_id); + +/** +* Create a fpga_properties object +* +* Initializes the memory pointed at by `prop` to represent a properties +* object, and populates it with the properties of the resource referred to by +* `handle`. Individual properties can then be queried using fpgaPropertiesGet*() +* accessor functions. +* +* @note fpgaGetPropertiesFromHandle() will allocate memory for the created properties +* object returned in `prop`. It is the responsibility of the caller +* to free this memory after use by calling fpgaDestroyProperties(). +* +* @param[in] handle Open handle to get properties for. +* @param[out] prop Pointer to a variable of type fpga_properties +* @returns FPGA_OK on success. FPGA_NO_MEMORY if no memory could be allocated +* to create the `fpga_properties` object. FPGA_EXCEPTION if an exception +* happend while initializing the `fpga_properties` object. 
+**/ +__FPGA_API__ +fpga_result +fpgaGetPropertiesFromHandle( + fpga_handle handle, + fpga_properties *prop + ); + +END_C_DECL + +#endif // __FPGA_PROPERTIES_H__ + diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/types.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/types.h new file mode 100644 index 0000000..481e6ae --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/types.h @@ -0,0 +1,173 @@ +// Copyright(c) 2017 - 2019, Intel Corporation +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Intel Corporation nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. + +/** + * @file types.h + * @brief Type definitions for FPGA API + * + * OPAE uses the three opaque types fpga_properties, fpga_token, and + * fpga_handle to create a hierarchy of objects that can be used to enumerate, + * reference, acquire, and query FPGA resources. This object model is designed + * to be extensible to account for different FPGA architectures and platforms. + * + * Initialization + * -------------- + * OPAEs management of the opaque types `fpga_properties`, + * `fpga_token`, and `fpga_handle` relies on the proper initialization of + * variables of these types. In other words, before doing anything with a + * variable of one of these opaque types, you need to first initialize them. + * + * The respective functions that initizalize opaque types are: + * + * * fpgaGetProperties() and fpgaCloneProperties() for `fpga_properties` + * * fpgaEnumerate() and fpgaCloneToken() for `fpga_token` + * * fpgaOpen() for `fpga_handle` + * + * This should intuitively make sense - fpgaGetProperties() creates + * `fpga_properties` objects, fpgaEnumerate() creates `fpga_token` objects, + * fpgaOpen() creates `fpga_handle` objects, and fpgaCloneProperties() and + * fpgaCloneToken() clone (create) `fpga_properties` and `fpga_token` objects, + * respectively. 
+ * + * Since these opaque types are interpreted as pointers (they are typedef'd to + * a `void *`), passing an uninitialized opaque type into any function except + * the respective initailzation function will result in undefined behaviour, + * because OPAE will try to follow an invalid pointer. Undefined behaviour in + * this case may include an unexpected error code, or an application crash. + * + */ + +#ifndef __FPGA_TYPES_H__ +#define __FPGA_TYPES_H__ + +#include <stdint.h> +#include <stddef.h> +#include <opae/types_enum.h> + +/** + * Object for expressing FPGA resource properties + * + * `fpga_properties` objects encapsulate all enumerable information about an + * FPGA resources. They can be used for two purposes: selective enumeration + * (discovery) and querying information about existing resources. + * + * For selective enumeration, usually an empty `fpga_properties` object is + * created (using fpgaGetProperties()) and then populated with the desired + * criteria for enumeration. An array of `fpga_properties` can then be passed + * to fpgaEnumerate(), which will return a list of `fpga_token` objects + * matching these criteria. + * + * For querying properties of existing FPGA resources, fpgaGetProperties() can + * also take an `fpga_token` and will return an `fpga_properties` object + * populated with information about the resource referenced by that token. + * + * After use, `fpga_properties` objects should be destroyed using + * fpga_destroyProperties() to free backing memory used by the + * `fpga_properties` object. + */ +typedef void *fpga_properties; + +/** + * Token for referencing FPGA resources + * + * An `fpga_token` serves as a reference to a specific FPGA resource present in + * the system. Holding an `fpga_token` does not constitute ownership of the + * FPGA resource - it merely allows the user to query further information about + * a resource, or to use fpgaOpen() to acquire ownership. 
+ * + * `fpga_token`s are usually returned by fpgaEnumerate() or + * fpgaPropertiesGetParent(), and used by fpgaOpen() to acquire ownership and + * yield a handle to the resource. Some API calls also take `fpga_token`s as + * arguments if they don't require ownership of the resource in question. + */ +typedef void *fpga_token; + +/** + * Handle to an FPGA resource + * + * A valid `fpga_handle` object, as populated by fpgaOpen(), denotes ownership + * of an FPGA resource. Note that ownership can be exclusive or shared, + * depending on the flags used in fpgaOpen(). Ownership can be released by + * calling fpgaClose(), which will render the underlying handle invalid. + * + * Many OPAE C API functions require a valid token (which is synonymous with + * ownership of the resource). + */ +typedef void *fpga_handle; + +/** + * Globally unique identifier (GUID) + * + * GUIDs are used widely within OPAE for helping identify FPGA resources. For + * example, every FPGA resource has a `guid` property, which can be (and in the + * case of FPGA_ACCELERATOR resource primarily is) used for enumerating a resource of a + * specific type. + * + * `fpga_guid` is compatible with libuuid's uuid_t, so users can use libuuid + * functions like uuid_parse() to create and work with GUIDs. + */ +typedef uint8_t fpga_guid[16]; + +/** + * Semantic version + * + * Data structure for expressing version identifiers following the semantic + * versioning scheme. Used in various properties for tracking component + * versions. 
+ */ +typedef struct { + uint8_t major; /**< Major version */ + uint8_t minor; /**< Minor version */ + uint16_t patch; /**< Revision or patchlevel */ +} fpga_version; + +/* + * Scatter Gather list in userspace that will be populated during fpgaGetPhysicalAddress call + */ +typedef struct _sg_element { + uint64_t phys_addr; /**< Starting physical address of this scatter/gather region */ + uint32_t length; /**< length, in bytes, of a physically contiguous SG region */ +} sg_element, *psg_element; + +/** Handle to an event object + * + * OPAE provides an interface to asynchronous events that can be generated by + * different FPGA resources. The event API provides functions to register for + * these events; associated with every event a process has registered for is an + * `fpga_event_handle`, which encapsulates the OS-specific data structure for + * event objects. + * + * On Linux, an `fpga_event_handle` can be used as a file descriptor and passed + * to select(), poll(), epoll() and similar functions to wait for asynchronous + * events. + */ +#ifndef _WIN32 +typedef int fpga_event_handle; +#else +typedef HANDLE fpga_event_handle; +#endif + +#endif // __FPGA_TYPES_H__ diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/types_enum.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/types_enum.h new file mode 100644 index 0000000..6fc4de2 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/types_enum.h @@ -0,0 +1,196 @@ +// Copyright(c) 2017 - 2019, Intel Corporation +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Intel Corporation nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. + +/** + * @file types_enum.h + * @brief Definitions of enumerated types for the OPAE C API + * + * This file defines return and error codes, event and object types, states, + * and flags as used or reported by OPAE C API functions. 
+ */ + +#ifndef __FPGA_TYPES_ENUM_H__ +#define __FPGA_TYPES_ENUM_H__ + +#ifdef _WIN32 +#ifdef FpgaLib_EXPORTS +#define __FPGA_API__ __declspec(dllexport) +#else +#define __FPGA_API__ __declspec(dllimport) +#endif +#else +#define __FPGA_API__ __attribute__((visibility("default"))) +#endif + +#ifdef __cplusplus +#define BEGIN_C_DECL extern "C" { +#define END_C_DECL } +#else +#define BEGIN_C_DECL +#define END_C_DECL +#endif + +/** + * OPAE C API function return codes + * + * Every public API function exported by the OPAE C library will return one of + * these codes. Usually, FPGA_OK denotes successful completion of the requested + * operation, while any return code *other* than FPGA_OK indicates an error or + * other deviation from the expected behavior. Users of the OPAE C API should + * always check the return codes of the APIs they call, and not use output + * parameters of functions that did not execute successfully. + + * The fpgaErrStr() function converts error codes into printable messages. + * + * OPAE also has a logging mechanism that allows a developer to get more + * information about why a particular call failed with a specific message. If + * enabled, any function that returns an error code different from FPGA_OK will + * also print out a message with further details. This mechanism can be enabled + * by setting the environment variable `LIBOPAE_LOG` to 1 before running the + * respective application. + */ + +// +// Minimum alignment requirement for DMA BBB +// +#define FPGA_DMA_ALIGN_BYTES 64 + +// +// Maximum size (in bytes0 descriptor of each SGDMA +// block can transfer. For pre-alpha maximum transfer size is +// One Meg minus some bytes. + +#define FPGA_DMA_BUF_SIZE (1020*1024) + +// +// Number of DMA blocks supported by SGDMA. 
+// Currently only one is supported by pre-alpha +// bitstream +// +#define NDMA 1 + +typedef enum { + FPGA_OK = 0, /**< Operation completed successfully */ + FPGA_INVALID_PARAM, /**< Invalid parameter supplied */ + FPGA_BUSY, /**< Resource is busy */ + FPGA_EXCEPTION, /**< An exception occurred */ + FPGA_NOT_FOUND, /**< A required resource was not found */ + FPGA_NO_MEMORY, /**< Not enough memory to complete operation */ + FPGA_NOT_SUPPORTED, /**< Requested operation is not supported */ + FPGA_NO_DRIVER, /**< Driver is not loaded */ + FPGA_NO_DAEMON, /**< FPGA Daemon (fpgad) is not running */ + FPGA_NO_ACCESS, /**< Insufficient privileges or permissions */ + FPGA_RECONF_ERROR /**< Error while reconfiguring FPGA */ +} fpga_result; + + /* + * FPGA events + * + * OPAE currently defines the following event types that applications can + * register for.Note that not all FPGA resources and target platforms may + * support all event types. + */ +typedef enum +{ + FPGA_NO_EVENT = 0, + FPGA_EVENT_INTERRUPT, /**< Interrupt generated by an accelerator */ + FPGA_EVENT_ERROR, /**< Infrastructure error event */ + FPGA_EVENT_POWER_THERMAL, /**< Infrastructure thermal event */ + FPGA_EVENT_PORT_ERROR, + FPGA_EVENT_FME_ERROR, + FPGA_LIFECYCLE_APPEAR_EVENT, + FPGA_LIFECYCLE_DISAPPEAR_EVENT, + FPGA_EVENT_AFC_INTERRUPT, + FPGA_EVENT_TYPE_MAX, + FPGA_EVENT_AP_EVENT, + FPGA_MAX_EVENT +} fpga_event_type; + +/* TODO: consider adding lifecycle events in the future + * to help with orchestration. Need a complete specification + * before including them in the API. 
Proposed events: + * FPGA_EVENT_APPEAR + * FPGA_EVENT_DISAPPEAR + * FPGA_EVENT_CHANGE + */ + +/** accelerator state */ +typedef enum { + FPGA_ACCELERATOR_ASSIGNED = 0, /**< accelerator is opened exclusively by another process */ + FPGA_ACCELERATOR_UNASSIGNED, /**< accelerator is free to be opened */ + FPGA_ACCELERATOR_STATE_MAX +} fpga_accelerator_state; + +/** + * OPAE FPGA resources (objects) + * + * These are the FPGA resources currently supported by the OPAE object model. + */ +typedef enum { + /** FPGA_DEVICE objects represent FPGA devices and their management functionality. + * These objects can be opened (typically requires a certain privilege level or + * access permissions) and used for management functions like fpgaReconfigreSlot(). */ + FPGA_DEVICE = 0, + /** FPGA_ACCELERATOR objects represent allocatable units for accessing + * accelerated functions on the FPGA. They are frequently opened for + * interacting via control registers (MMIO), shared memory, or other, + * possibly platform-specific functions. */ + FPGA_ACCELERATOR, + FPGA_OBJTYPE_MAX +} fpga_objtype; + +/** + * Buffer flags + * + * These flags can be passed to the fpgaPrepareBuffer() function. + */ +enum fpga_buffer_flags { + FPGA_BUF_PREALLOCATED = (1u << 0), /**< Use existing buffer */ + FPGA_BUF_QUIET = (1u << 1), /**< Suppress error messages */ + FPGA_BUF_NOCACHE = (1u << 2), + FPGA_BUF_LARGE_PAGE = (1u << 4) /*< For 2MB page support in VTP */ +}; + +/** + * Open flags + * + * These flags can be passed to the fpgaOpen() function. + */ +enum fpga_open_flags { + FPGA_OPEN_SHARED = (1u << 0) /**< Open FPGA resource for shared access */ +}; + +/** + * Reconfiguration flags + * + * These flags can be passed to the fpgaReconfigure() function. 
+ */ +enum fpga_reconf_flags { + /** Reconfigure the slot without checking if it is in use */ + FPGA_RECONF_FORCE = (1u << 0) +}; + +#endif // __FPGA_TYPES_ENUM_H__ diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/umsg.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/umsg.h new file mode 100644 index 0000000..6e073ee --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/umsg.h @@ -0,0 +1,112 @@ +// Copyright(c) 2017 - 2019, Intel Corporation +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Intel Corporation nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+
+/**
+ * \file umsg.h
+ * \brief FPGA UMsg API
+ */
+
+#ifndef __FPGA_UMSG_H__
+#define __FPGA_UMSG_H__
+
+#include <opae/types.h>
+
+BEGIN_C_DECL
+
+/**
+ * Get number of Umsgs
+ *
+ * Returns number of UMsgs supported by AFU.
+ *
+ *
+ * @param[in] handle Handle to previously opened accelerator resource
+ * @param[out] value Returns number of UMsgs
+ * @returns FPGA_OK on success.
+ * FPGA_INVALID_PARAM if input parameter combination
+ * is not valid.
+ * FPGA_EXCEPTION if input parameter fpga handle is not
+ * valid.
+ */
+__FPGA_API__ fpga_result fpgaGetNumUmsg(fpga_handle handle, uint64_t *value);
+
+/**
+ * Sets Umsg hint
+ *
+ * Writes UMsg hint bit.
+ *
+ *
+ * @param[in] handle Handle to previously opened accelerator resource
+ * @param[in] value Value to use for UMsg hint, UMsg hint is N wide bitvector
+ * where N = number of Umsgs.
+ * @returns FPGA_OK on success.
+ * FPGA_INVALID_PARAM if input parameter combination
+ * is not valid.
+ * FPGA_EXCEPTION if input parameter fpga handle is not
+ * valid.
+ */
+__FPGA_API__ fpga_result fpgaSetUmsgAttributes(fpga_handle handle,
+ uint64_t value);
+
+/**
+ * Trigger Umsg
+ *
+ * Writes a 64-bit value to trigger low-latency accelerator notification mechanism
+ * (UMsgs).
+ *
+ * @param[in] handle Handle to previously opened accelerator resource
+ * @param[in] value Value to use for UMsg
+ * @returns FPGA_OK on success. 
+ * FPGA_INVALID_PARAM if input parameter combination + * is not valid. + * FPGA_EXCEPTION if input parameter fpga handle is not + * valid. + */ +__FPGA_API__ fpga_result fpgaTriggerUmsg(fpga_handle handle, uint64_t value); + +/** + * Access UMsg memory directly + * + * This function will return a pointer to the memory allocated for low latency + * accelerator notifications (UMsgs). + * @note This call is only supported by hardware targets, not by ASE + * simulation. Use fpgaTriggerUmsg() if you need ASE simulation capabilities. + * + * @param[in] handle Handle to previously opened accelerator resource + * @param[out] umsg_ptr Pointer to memory where a pointer to the virtual + * address space will be returned + * @returns FPGA_OK on success. + * FPGA_INVALID_PARAM if input parameter combination + * is not valid. + * FPGA_EXCEPTION if input parameter fpga handle is not + * valid. + * FPGA_NO_MEMORY if memory allocation fails or system + * doesn't configure huge pages. + */ +__FPGA_API__ fpga_result fpgaGetUmsgPtr(fpga_handle handle, uint64_t **umsg_ptr); + +END_C_DECL + +#endif // __FPGA_UMSG_H__ diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/utils.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/utils.h new file mode 100644 index 0000000..5b57cbd --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/utils.h @@ -0,0 +1,54 @@ +// Copyright(c) 2017 - 2019, Intel Corporation +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// * Neither the name of Intel Corporation nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. + +/** + * \file utils.h + * \brief Utility functions and macros for the FPGA API + */ + +#ifndef __FPGA_UTILS_H__ +#define __FPGA_UTILS_H__ + +#include <opae/types.h> +#include <stdio.h> + +BEGIN_C_DECL + +/** + * Return human-readable error message + * + * Returns a pointer to a human-readable error message corresponding to the + * provided fpga_error error code. 
+ *
+ * @param[in] e Error code (as returned by another FPGA API function)
+ * @returns Pointer to a descriptive error message string
+ */
+__FPGA_API__ const char *fpgaErrStr(fpga_result e);
+
+END_C_DECL
+
+#endif // __FPGA_UTILS_H__
+
diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/version.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/version.h
new file mode 100644
index 0000000..66bd18b
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/version.h
@@ -0,0 +1,79 @@
+// Copyright(c) 2017 - 2019, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Intel Corporation nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. + +#ifndef __FPGA_VERSION_H__ +#define __FPGA_VERSION_H__ + +#include <opae/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Get version information about the OPAE library + * + * Retrieve major version, minor version, and revision information about the + * OPAE library. + * + * @param[out] version FPGA version + * @returns FPGA_INVALID_PARAM if any of the output parameters is NULL, FPGA_OK + * otherwise. + */ +__FPGA_API__ fpga_result fpgaGetOPAECVersion(fpga_version *version); + +/** + * Get version information about the OPAE library as a string + * + * Retrieve major version, minor version, and revision information about the + * OPAE library, encoded in a human-readable string (e.g. "1.0.0"). + * + * @param[out] version_str String to copy version information into + * @param[in] len Length of `version_str` + * @returns FPGA_INVALID_PARAM if `version_str` is NULL, FPGA_EXCEPTION if the + * version string cannot be copied into `version_str`, FPGA_OK otherwise. + */ +__FPGA_API__ fpga_result fpgaGetOPAECVersionString(char *version_str, size_t len); +#define FPGA_VERSION_STR_MAX 10 + +/** + * Get build information about the OPAE library as a string + * + * Retrieve the build identifier of the OPAE library. 
+ * + * @param[out] build_str String to copy build information into + * @param[in] len Length of `build_str` + * @returns FPGA_INVALID_PARAM if `build_str` is NULL, FPGA_EXCEPTION if the + * version string cannot be copied into `build_str`, FPGA_OK otherwise. + */ +__FPGA_API__ fpga_result fpgaGetOPAECBuildString(char *build_str, size_t len); +#define FPGA_BUILD_STR_MAX 41 + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + +#endif // __FPGA_VERSION_H__ diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/pkg_editor.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/pkg_editor.h new file mode 100644 index 0000000..27f4f1e --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/pkg_editor.h @@ -0,0 +1,170 @@ +/* Editor for Altera OpenCL package files + * + * Dmitry Denisenko, June 2012. + * + * This provides higher-level functions for ELF work. + * The idea is to put content into sections, one "piece" of content + * per section, and use section names to identify the content. + * The interface enforces unique section names (not true for generic ELFs) + * and hides all the ugly ELF interface calls and structures. + */ + +#ifndef PKG_FILE_EDITOR_H +#define PKG_FILE_EDITOR_H + +#ifdef __cplusplus +extern "C" { +#endif + +#define MAX_STRING_LENGTH 100000 + +/* Modes for open_struct acl_pkg_file() call. + * Exactly one of ACL_PKG_READ, ACL_PKG_READ_WRITE must be supplied. + * Other flags may be bitwise OR'd into the mode. + * + * You can combine other modes with ACL_PKG_SHOW_* to control messages. + */ +#define ACL_PKG_READ (1<<0) /* Only reading the package */ +#define ACL_PKG_READ_WRITE (1<<1) /* Expect to read and write the binary. File must already exist. */ +#define ACL_PKG_CREATE (1<<2) /* Also creating. 
Can only be used with ACL_PKG_READ_WRITE */ + +#define ACL_PKG_SHOW_ERROR (1<<8) /*print errors to stderr*/ +#define ACL_PKG_SHOW_INFO (1<<9) /*print info messages to stdout*/ + +#define ACL_PKG_SECTION_ACL_VERSION ".acl.version" +#define ACL_PKG_SECTION_ACL_BUILD ".acl.build" +#define ACL_PKG_SECTION_QVERSION ".acl.qversion" +#define ACL_PKG_SECTION_HASH ".acl.hash" +#define ACL_PKG_SECTION_BOARD ".acl.board" +#define ACL_PKG_SECTION_COMPILEOPTIONS ".acl.compileoptions" +#define ACL_PKG_SECTION_SOURCE ".acl.source" +#define ACL_PKG_SECTION_LLVMIR ".acl.llvmir" +#define ACL_PKG_SECTION_VERILOG ".acl.verilog" +#define ACL_PKG_SECTION_PROFILE_BASE ".acl.profile_base" +#define ACL_PKG_SECTION_AUTODISCOVERY ".acl.autodiscovery" +#define ACL_PKG_SECTION_RBF ".acl.rbf" +#define ACL_PKG_SECTION_CORE_RBF ".acl.core.rbf" +#define ACL_PKG_SECTION_PERIPH_RBF ".acl.periph.rbf" +#define ACL_PKG_SECTION_BASE_RBF ".acl.base_revision.rbf" +#define ACL_PKG_SECTION_SOF ".acl.sof" +#define ACL_PKG_SECTION_VFABRIC ".acl.vfabric" +#define ACL_PKG_SECTION_PLL_CONFIG ".acl.pll_config" +#define ACL_PKG_SECTION_FPGA_BIN ".acl.fpga.bin" +#define ACL_PKG_SECTION_EMULATOR_OBJ_LINUX ".acl.emulator_object.linux" +#define ACL_PKG_SECTION_EMULATOR_OBJ_WINDOWS ".acl.emulator_object.windows" +#define ACL_PKG_SECTION_AUTODISCOVERY_XML ".acl.autodiscovery.xml" +#define ACL_PKG_SECTION_BOARDSPEC_XML ".acl.board_spec.xml" +#define ACL_PKG_SECTION_PERIPH_HASH ".acl.periph.hash" +#define ACL_PKG_SECTION_PROFILER_XML ".acl.profiler.xml" +#define ACL_PKG_SECTION_COMPILE_REV ".acl.compile_revision" +#define ACL_PKG_SECTION_PCIE_DEV_ID ".acl.pcie.dev_id" +#define ACL_PKG_SECTION_BASE_PERIPH_HASH ".acl.base_revision.periph.hash" +#define ACL_PKG_SECTION_ADJUST_PLLS_OUTPUT ".acl.quartus_report" +#define ACL_PKG_SECTION_KERNEL_ARG_INFO_XML ".acl.kernel_arg_info.xml" +#define ACL_PKG_SECTION_FAST_COMPILE ".acl.fast_compile" + +/* Minimum alignment in memory. 
*/ +#define ACL_PKG_MIN_SECTION_ALIGNMENT 128 + +/* Open and close the pkg file */ +struct acl_pkg_file *acl_pkg_open_file (const char *fname, int mode); +/* You can call close on a NULL pointer: it will do nothing. + * Closing the package file will also free its memory, so you better lose + * the pointer reference. + */ +int acl_pkg_close_file (struct acl_pkg_file *pkg); + +/* Set message output mode: show_mode is some combination of the bits + * in ACL_PKG_SHOW_INFO and ACL_PKG_SHOW_ERROR + */ +void acl_pkg_set_show_mode( struct acl_pkg_file* pkg, int show_mode ); + +/* Open memory image of pkg file. Only good for reading! + * The show_mode argument is an OR combination of zero or more of + * ACL_PKG_SHOW_INFO, + * ACL_PKG_SHOW_ERROR. + */ +struct acl_pkg_file *acl_pkg_open_file_from_memory (char *pkg_image, size_t pkg_image_size, int show_mode); + + +/* Does the given named section exist? + * Returns 1 for yes, 0 for no. + * If the section exists, and size_ret is not-NULL, then the size (in bytes) of the + * section is stored into *size_ret. The size does NOT include NULL terminator, just like strlen(). + */ +int acl_pkg_section_exists (const struct acl_pkg_file *pkg, const char *sect_name, size_t* size_ret); + +/* Return list of ALL (useful) section names in the package. + * The buffer must be pre-allocated by the caller upto max_len bytes. + * Each section name is separated by '\n' + * Returns 1 on success, 0 on failure. + */ +int acl_pkg_section_names (const struct acl_pkg_file *pkg, char *buf, size_t max_len); + + +/* Add a new section with specified content. + * If a section with such name already exists, nothing is done. + * Returns 0 on failure, non-zero on success. + */ +int acl_pkg_add_data_section (struct acl_pkg_file *pkg, const char *sect_name, const void* content, size_t len); +int acl_pkg_add_data_section_from_file (struct acl_pkg_file *pkg, const char *sect_name, const char *in_file); + +/* Read content of an existing section. 
+ * For read_section(), the buffer must be pre-allocated by caller to hold at least len bytes. + * This function will add '\0' at the end, therefore, the 'len' argument passed to this function + * must be one larger than the value returned by acl_pkg_section_exists. + * Returns 0 on failure, non-zero on success. + */ +int acl_pkg_read_section (const struct acl_pkg_file *pkg, const char *sect_name, char *buf, size_t len); +int acl_pkg_read_section_into_file (struct acl_pkg_file *pkg, const char *sect_name, const char *out_file); + +/* Get a transient pointer to a section's data, via buf_ptr. + * The pointer is transient: It might move if you update the package in any way. + * This is a "fast" path in comparison to acl_pkg_read_section, so you + * don't have to allocate space to copy into. + * Returns 0 on failure, non-zero on success. + */ +int acl_pkg_read_section_transient(const struct acl_pkg_file *pkg, const char *sect_name, char** buf_ptr); + +/* Update content of an existing section. + * Old content is discarded. The section must already exist. + * Returns 0 on failure, non-zero on success. + */ +int acl_pkg_update_section (struct acl_pkg_file *pkg, const char *sect_name, const void *new_content, size_t new_len); +int acl_pkg_update_section_from_file (struct acl_pkg_file *pkg, const char *sect_name, const char *in_file); + +/* List all pkg sections to stdout. + * Returns 0 on failure, non-zero on success. + */ +int acl_pkg_list_file_sections (struct acl_pkg_file *pkg); + +/* Read full content of file into a buffer. + * The buffer is allocated by this function but must be freed by the caller. + * File length is returned in the second argument */ +void *acl_pkg_read_file_into_buffer (const char *in_file, size_t *file_size_out); + +/* support for package/unpackage */ + +/* Package the input files and directory trees (NULL terminated list in input_files_dirs) + * and put them into the output file (out_file). 
+ * Returns 0 on failure, non-zero on success + */ +int acl_pkg_pack (const char* out_file, const char** input_files_dirs); + +/* Unpack the input file (or stdin if filename is ACL_PKG_UNPACKAGE_STDIN) + * created by acl_pkg_pack into directory out_dir. + * Returns 0 on failure, non-zero on success + */ +#define ACL_PKG_UNPACKAGE_STDIN "-" +int acl_pkg_unpack (const char* in_file, const char* out_dir); + +/* Unpack the buffer created by acl_pkg_pack into directory out_dir. + * Returns 0 on failure, non-zero on success + */ +int acl_pkg_unpack_buffer (const char* buffer, size_t buffer_size, const char* out_dir); + +#ifdef __cplusplus +} +#endif + +#endif /* PKG_FILE_EDITOR_H */ diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/lib/win64/FpgaLib.lib b/python/openvino/runtime/coredla_device/mmd/de10_agilex/lib/win64/FpgaLib.lib Binary files differnew file mode 100755 index 0000000..2f26b62 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/lib/win64/FpgaLib.lib diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/lib/win64/acl_check_sys_cmd.lib b/python/openvino/runtime/coredla_device/mmd/de10_agilex/lib/win64/acl_check_sys_cmd.lib Binary files differnew file mode 100755 index 0000000..6c7f423 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/lib/win64/acl_check_sys_cmd.lib diff --git a/python/openvino/runtime/coredla_device/mmd/hps_platform/.gitignore b/python/openvino/runtime/coredla_device/mmd/hps_platform/.gitignore new file mode 100644 index 0000000..0948b39 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/hps_platform/.gitignore @@ -0,0 +1,20 @@ +*~ +*# +*.marks +release_build/ +build/ +example_designs/mem_bandwidth/bin/ +example_designs/mem_bandwidth/simulation.tar.gz +example_designs/mem_bandwidth/temp_simulation/ +linux64/lib/ +linux64/libexec/diagnose +linux64/libexec/program +ase/mpf_src +*.pyc +*.swp +*.kwlp +*.kwps +temp_simulation/ +simulation.tar.gz + +backup 
diff --git a/python/openvino/runtime/coredla_device/mmd/hps_platform/CMakeLists.txt b/python/openvino/runtime/coredla_device/mmd/hps_platform/CMakeLists.txt new file mode 100644 index 0000000..d8bf50d --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/hps_platform/CMakeLists.txt @@ -0,0 +1,59 @@ +# (C) 2017 Intel Corporation. All rights reserved. +# Your use of Intel Corporation's design tools, logic functions and other +# software and tools, and its AMPP partner logic functions, and any output +# files any of the foregoing (including device programming or simulation +# files), and any associated documentation or information are expressly subject +# to the terms and conditions of the Intel Program License Subscription +# Agreement, Intel MegaCore Function License Agreement, or other applicable +# license agreement, including, without limitation, that your use is for the +# sole purpose of programming logic devices manufactured by Intel and sold by +# Intel or its authorized distributors. Please refer to the applicable +# agreement for further details. 
+ +cmake_minimum_required(VERSION 2.8.12) +project(mmd) + +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules") + +# DLA specific modifications made to the MMD +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDLA_MMD") + +# Select PCIE Gen3 x8 +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGEN3_x8") + +# from the opencl makefile +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DKERNEL_64BIT -DOPTION3=1 -DACL_USE_DMA=1 -DACL_COMPILER_IS_MSVC=0 -Wall -Wno-unknown-pragmas -DACL_HAS_STDLIB_STDIO") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fstack-protector -Wformat -Wformat-security -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -DACL_HOST_RUNTIME_IS_STATIC=0") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DACL_OPENCL_HOST_SYS=linux -DACL_OPENCL_HOST_BIT=64 -DACL_TARGET_SYS=linux -DACL_TARGET_BIT=64 -DLINUX -DACL_MAX_DEVICE=128") + +set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -D_FORTIFY_SOURCE=2 -O3") +enable_language(C ASM) + +set(ASM_OPTIONS "-x assembler-with-cpp") +if(${CMAKE_C_COMPILER_ID} STREQUAL "Clang") + set(ASM_OPTIONS "${ASM_OPTIONS} -no-integrated-as") +endif() + +set(CMAKE_ASM_FLAGS "${CFLAGS} ${ASM_OPTIONS}") + +if(RUNTIME_POLLING) + add_definitions(-DRUNTIME_POLLING) +endif(RUNTIME_POLLING) + +set(MMD_SRC + ./host/acl_hps.cpp + ./host/mmd_device.cpp + ./host/dma_device.cpp + ./host/uio_device.cpp +) + +add_library(hps_platform_mmd SHARED ${MMD_SRC}) + +target_include_directories(hps_platform_mmd PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include) + +target_link_libraries(hps_platform_mmd) + +install(TARGETS hps_platform_mmd + LIBRARY DESTINATION lib + COMPONENT hps_platform_mmd +) diff --git a/python/openvino/runtime/coredla_device/mmd/hps_platform/host/acl_hps.cpp b/python/openvino/runtime/coredla_device/mmd/hps_platform/host/acl_hps.cpp new file mode 100644 index 0000000..53055ef --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/hps_platform/host/acl_hps.cpp @@ -0,0 +1,473 @@ +// (c) 1992-2021 Intel Corporation. 
+// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words +// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. +// and/or other countries. Other marks and brands may be claimed as the property +// of others. See Trademarks on intel.com for full list of Intel trademarks or +// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) +// Your use of Intel Corporation's design tools, logic functions and other +// software and tools, and its AMPP partner logic functions, and any output +// files any of the foregoing (including device programming or simulation +// files), and any associated documentation or information are expressly subject +// to the terms and conditions of the Altera Program License Subscription +// Agreement, Intel MegaCore Function License Agreement, or other applicable +// license agreement, including, without limitation, that your use is for the +// sole purpose of programming logic devices manufactured by Intel and sold by +// Intel or its authorized distributors. Please refer to the applicable +// agreement for further details. 
+ +/* ===- HPS.cpp ------------------------------------------------- C++ -*-=== */ +/* */ +/* Intel(R) HPS MMD Driver */ +/* */ +/* ===-------------------------------------------------------------------------=== */ +/* */ +/* This file implements the functions that are defined in aocl_mmd.h */ +/* */ +/* ===-------------------------------------------------------------------------=== */ + +// common and its own header files +#include "acl_hps.h" + +// other standard header files +#include <stdarg.h> +#include <stdlib.h> +#include <string.h> + +#include <memory> +#include <map> +#include <sstream> +#include <string> +#include <utility> + +#include "mmd_device.h" + +#ifdef DLA_MMD +#include <chrono> +#include <thread> +#endif + +#if defined(LINUX) +#include <fcntl.h> +#include <semaphore.h> +#include <signal.h> +#include <unistd.h> +#endif // LINUX + +#define MAX_HPS_FPGA_DEVICES (1) + +// MAX size of line read from pipe-ing the output of system call to MMD +#define BUF_SIZE 1024 +// MAX size of command passed to system for invoking system call from MMD +#define SYSTEM_CMD_SIZE 4 * 1024 + +#ifndef DLA_MMD +// static helper functions +static bool blob_has_elf_signature(void *data, size_t data_size); +#endif + + +// Function to return the number of boards installed in the system +unsigned int get_offline_num_boards() { + board_names names = mmd_get_devices(MAX_HPS_FPGA_DEVICES); + return (unsigned int)names.size(); +} + +// Get information about the board using the enum aocl_mmd_offline_info_t for +// offline info (called without a handle), and the enum aocl_mmd_info_t for +// info specific to a certain board. 
+#define RESULT_INT(X) \ + { \ + *((int *)param_value) = X; \ + if (param_size_ret) *param_size_ret = sizeof(int); \ + } +#define RESULT_UNSIGNED(X) \ + { \ + *((unsigned *)param_value) = X; \ + if (param_size_ret) *param_size_ret = sizeof(unsigned); \ + } +#define RESULT_SIZE_T(X) \ + { \ + *((size_t *)param_value) = X; \ + if (param_size_ret) *param_size_ret = sizeof(size_t); \ + } +#if defined(WINDOWS) +#define RESULT_STR(X) \ + do { \ + size_t Xlen = strnlen(X, MAX_NAME_SIZE) + 1; \ + memcpy_s((void *)param_value, param_value_size, X, (param_value_size <= Xlen) ? param_value_size : Xlen); \ + if (param_size_ret) *param_size_ret = Xlen; \ + } while (0) +#else +#define RESULT_STR(X) \ + do { \ + size_t Xlen = strnlen(X, MAX_NAME_SIZE) + 1; \ + memcpy((void *)param_value, X, (param_value_size <= Xlen) ? param_value_size : Xlen); \ + if (param_size_ret) *param_size_ret = Xlen; \ + } while (0) +#endif +#define ACL_VENDOR_NAME "Intel" +int aocl_mmd_get_offline_info(aocl_mmd_offline_info_t requested_info_id, + size_t param_value_size, + void *param_value, + size_t *param_size_ret) { + unsigned int num_boards; + switch (requested_info_id) { + case AOCL_MMD_VERSION: + RESULT_STR(MMD_VERSION); + break; + case AOCL_MMD_NUM_BOARDS: { + num_boards = MAX_HPS_FPGA_DEVICES; + RESULT_INT((int)num_boards); + break; + } + case AOCL_MMD_BOARD_NAMES: { + // Retrieve all the CoreDLA cores in the system + board_names names = mmd_get_devices(MAX_HPS_FPGA_DEVICES); + // Construct a list of all possible devices supported by this MMD layer + std::ostringstream board; + auto name = names.begin(); + while(name != names.end() ) + { + board << *name; + name++; + if( name != names.end() ) + { + board << ";"; + } + } + + RESULT_STR(board.str().c_str()); + break; + } + case AOCL_MMD_VENDOR_NAME: { + RESULT_STR(ACL_VENDOR_NAME); + break; + } + case AOCL_MMD_VENDOR_ID: + RESULT_INT(0); + break; + case AOCL_MMD_USES_YIELD: + RESULT_INT(0); /* TODO: Can we yield? 
*/ + break; + case AOCL_MMD_MEM_TYPES_SUPPORTED: + RESULT_INT(AOCL_MMD_PHYSICAL_MEMORY); /* TODO: Confirm this is the right memory type */ + break; + } + return 0; +} + +// If the MMD is loaded dynamically, destructors in the MMD will execute before the destructors in the runtime +// upon program termination. The DeviceMapManager guards accesses to the device/handle maps to make sure +// the runtime doesn't get to reference them after MMD destructors have been called. +// Destructor makes sure that all devices are closed at program termination regardless of what the runtime does. +// Implemented as a singleton. +class DeviceMapManager final { +public: + typedef std::map<int, mmd_device_ptr> map_handle_to_dev_t; + ~DeviceMapManager() + { + } + + int add_device(const char *name) + { + int handle = idx++; + + mmd_device_ptr spDevice = std::make_shared<mmd_device>(name, handle); + if( spDevice->bValid() ) + { + auto it = handle_to_dev.find(handle); + HPS_ERROR_IF( it != handle_to_dev.end(), return FAILURE, "Error: Handle already used.\n" ); + handle_to_dev.insert({handle, spDevice}); + return handle; + } + return FAILURE; + } + + mmd_device_ptr get_device(const int handle) + { + auto it = handle_to_dev.find(handle); + HPS_ERROR_IF( it == handle_to_dev.end(), return nullptr, "Error: Invalid handle.\n" ); + return it->second; + } + + bool remove_device(const int handle) + { + auto it = handle_to_dev.find(handle); + HPS_ERROR_IF( it == handle_to_dev.end(), return false, "Error: Handle does not exist.\n" ); + handle_to_dev.erase(it); + return true; + } + + DeviceMapManager() + { + } +private: + map_handle_to_dev_t handle_to_dev = {}; + int idx = {0}; +}; +static DeviceMapManager _gDeviceMapManager; + +int aocl_mmd_get_info( + int handle, aocl_mmd_info_t requested_info_id, size_t param_value_size, void *param_value, size_t *param_size_ret) { + HPS_ERROR_IF(true, + return FAILURE, + "aocl_mmd_get_info not supported on platform. 
\n"); +} + +#undef RESULT_INT +#undef RESULT_STR + + +// Open and initialize the named device. +int AOCL_MMD_CALL aocl_mmd_open(const char *name) { + return _gDeviceMapManager.add_device(name); +} + +// Close an opened device, by its handle. +int AOCL_MMD_CALL aocl_mmd_close(int handle) { + if ( _gDeviceMapManager.remove_device(handle) ) + return SUCCESS; + return FAILURE; +} + +// Set the interrupt handler for the opened device. +int AOCL_MMD_CALL aocl_mmd_set_interrupt_handler(int handle, aocl_mmd_interrupt_handler_fn fn, void *user_data) { + mmd_device_ptr spDevice = _gDeviceMapManager.get_device(handle); + if( nullptr == spDevice ) { + return FAILURE; + } + return spDevice->set_interrupt_handler(fn, user_data); +} + +// Set the device interrupt handler for the opened device. +int AOCL_MMD_CALL aocl_mmd_set_device_interrupt_handler(int handle, + aocl_mmd_device_interrupt_handler_fn fn, + void *user_data) { + printf("%s:%s:%d\n", __FILE__, __PRETTY_FUNCTION__, __LINE__); + return -1; +} + +// Set the operation status handler for the opened device. +int AOCL_MMD_CALL aocl_mmd_set_status_handler(int handle, aocl_mmd_status_handler_fn fn, void *user_data) { + printf("%s:%s:%d\n", __FILE__, __PRETTY_FUNCTION__, __LINE__); + return -1; +} + +// Called when the host is idle and hence possibly waiting for events to be +// processed by the device +int AOCL_MMD_CALL aocl_mmd_yield(int handle) +{ + printf("%s:%s:%d\n", __FILE__, __PRETTY_FUNCTION__, __LINE__); + return -1; +} + +// Read, write and copy operations on a single interface. 
+int AOCL_MMD_CALL aocl_mmd_read(int handle, aocl_mmd_op_t op, size_t len, void *dst, int mmd_interface, size_t offset) { + mmd_device_ptr spDevice = _gDeviceMapManager.get_device(handle); + if( nullptr == spDevice ) { + return FAILURE; + } + return spDevice->read_block(op, mmd_interface, dst, offset, len); +} + +int AOCL_MMD_CALL +aocl_mmd_write(int handle, aocl_mmd_op_t op, size_t len, const void *src, int mmd_interface, size_t offset) { + mmd_device_ptr spDevice = _gDeviceMapManager.get_device(handle); + if( nullptr == spDevice ) { + return FAILURE; + } + return spDevice->write_block(op, mmd_interface, src, offset, len); +} + +int AOCL_MMD_CALL +aocl_mmd_copy(int handle, aocl_mmd_op_t op, size_t len, int mmd_interface, size_t src_offset, size_t dst_offset) { + printf("%s:%s:%d\n", __FILE__, __PRETTY_FUNCTION__, __LINE__); + /* Not called by CoreDLA, so not implementing */ + return -1; +} + +// Initialize host channel specified in channel_name +int AOCL_MMD_CALL aocl_mmd_hostchannel_create(int handle, char *channel_name, size_t queue_depth, int direction) { + printf("%s:%s:%d\n", __FILE__, __PRETTY_FUNCTION__, __LINE__); + /* Not called by CoreDLA, so not implementing */ + return -1; +} + +// reset the host channel specified with channel handle +int AOCL_MMD_CALL aocl_mmd_hostchannel_destroy(int handle, int channel) { + printf("%s:%s:%d\n", __FILE__, __PRETTY_FUNCTION__, __LINE__); + /* Not called by CoreDLA, so not implementing */ + return -1; +} + +// Get the pointer to buffer the user can write/read from the kernel with +AOCL_MMD_CALL void *aocl_mmd_hostchannel_get_buffer(int handle, int channel, size_t *buffer_size, int *status) { + printf("%s:%s:%d\n", __FILE__, __PRETTY_FUNCTION__, __LINE__); + /* Not called by CoreDLA, so not implementing */ + return NULL; +} + +// Acknolwedge from the user that they have written/read send_size amount of buffer obtained from get_buffer +size_t AOCL_MMD_CALL aocl_mmd_hostchannel_ack_buffer(int handle, int channel, size_t 
send_size, int *status) { + printf("%s:%s:%d\n", __FILE__, __PRETTY_FUNCTION__, __LINE__); + /* Not called by CoreDLA, so not implementing */ + return -1; +} + +#ifdef DLA_MMD +// Reprogram the device given the sof file name +int AOCL_MMD_CALL aocl_mmd_program_sof(int handle, const char *sof_filename) { + printf("%s:%s:%d\n", __FILE__, __PRETTY_FUNCTION__, __LINE__); + /* We don't support reprogramming the SOF on a HPS device */ + return -1; +} +#else +// Reprogram the device based on the program mode +int AOCL_MMD_CALL aocl_mmd_program(int handle, void *data, size_t data_size, aocl_mmd_program_mode_t program_mode) { + printf("%s:%s:%d\n", __FILE__, __PRETTY_FUNCTION__, __LINE__); + /* We don't support reprogramming the SOF on a HPS device */ + return -1; +} +#endif +// Shared memory allocator +AOCL_MMD_CALL void *aocl_mmd_shared_mem_alloc(int handle, size_t size, unsigned long long *device_ptr_out) { + printf("%s:%s:%d\n", __FILE__, __PRETTY_FUNCTION__, __LINE__); + /* Not called by CoreDLA, so not implementing */ + return NULL; +} + +// Shared memory de-allocator +AOCL_MMD_CALL void aocl_mmd_shared_mem_free(int handle, void *host_ptr, size_t size) { + printf("%s:%s:%d\n", __FILE__, __PRETTY_FUNCTION__, __LINE__); + /* Not called by CoreDLA, so not implementing */ + return; +} + +#ifndef DLA_MMD +// This function checks if the input data has an ELF-formatted blob. +// Return true when it does. +static bool blob_has_elf_signature(void *data, size_t data_size) { + bool result = false; + printf("%s:%s:%d\n", __FILE__, __PRETTY_FUNCTION__, __LINE__); + if (data && data_size > 4) { + unsigned char *cdata = (unsigned char *)data; + const unsigned char elf_signature[4] = {0177, 'E', 'L', 'F'}; // Little endian + result = (cdata[0] == elf_signature[0]) && (cdata[1] == elf_signature[1]) && (cdata[2] == elf_signature[2]) && + (cdata[3] == elf_signature[3]); + } + return result; +} +#endif + +// Return a positive number when single device open. 
Otherwise, return -1 +AOCL_MMD_CALL int get_open_handle() { + printf("%s:%s:%d\n", __FILE__, __PRETTY_FUNCTION__, __LINE__); + return -1; +} + +AOCL_MMD_CALL void *aocl_mmd_host_alloc(int *handles, + size_t num_devices, + size_t size, + size_t alignment, + aocl_mmd_mem_properties_t *properties, + int *error) { + printf("%s:%s:%d\n", __FILE__, __PRETTY_FUNCTION__, __LINE__); + // Not supported on this BSP + return NULL; +} + +AOCL_MMD_CALL int aocl_mmd_free(void *mem) { + printf("%s:%s:%d\n", __FILE__, __PRETTY_FUNCTION__, __LINE__); + // Not supported on this BSP + return 0; +} + +AOCL_MMD_CALL void *aocl_mmd_device_alloc( + int handle, size_t size, size_t alignment, aocl_mmd_mem_properties_t *properties, int *error) { + printf("%s:%s:%d\n", __FILE__, __PRETTY_FUNCTION__, __LINE__); + // Not supported on this BSP + return NULL; +} + +AOCL_MMD_CALL void *aocl_mmd_shared_alloc( + int handle, size_t size, size_t alignment, aocl_mmd_mem_properties_t *properties, int *error) { + printf("%s:%s:%d\n", __FILE__, __PRETTY_FUNCTION__, __LINE__); + // Not supported on this BSP + return NULL; +} + +AOCL_MMD_CALL int aocl_mmd_shared_migrate(int handle, void *shared_ptr, size_t size, aocl_mmd_migrate_t destination) { + printf("%s:%s:%d\n", __FILE__, __PRETTY_FUNCTION__, __LINE__); + // Not supported on this BSP + return 0; +} + +#ifdef DLA_MMD +// Query functions to get board-specific values +AOCL_MMD_CALL int dla_mmd_get_max_num_instances() +{ + return 1; +} + +AOCL_MMD_CALL uint64_t dla_mmd_get_ddr_size_per_instance() { + return 1ULL << 29; +} + +// AGX7 HPS board uses 333.3325 MHz (1333.33/4) for the DLA DDR Clock +// All other boards use 266.666666 MHz (1066.66666/4) +AOCL_MMD_CALL double dla_mmd_get_ddr_clock_freq() { +#ifdef HPS_AGX7 + return 333.332500; +#else + return 266.666666; +#endif +} // MHz + +// Helper functions for the wrapper functions around CSR and DDR +uint64_t dla_get_raw_csr_address(int instance, uint64_t addr) { + return (0x1000 * instance) + addr; +} 
+uint64_t dla_get_raw_ddr_address(int instance, uint64_t addr) { + return addr; +} + +// Wrappers around CSR and DDR reads and writes to abstract away board-specific offsets +AOCL_MMD_CALL int dla_mmd_csr_write(int handle, int instance, uint64_t addr, const uint32_t *data) { + return aocl_mmd_write( + handle, NULL, sizeof(uint32_t), data, HPS_MMD_COREDLA_CSR_HANDLE, dla_get_raw_csr_address(instance, addr)); +} +AOCL_MMD_CALL int dla_mmd_csr_read(int handle, int instance, uint64_t addr, uint32_t *data) { + return aocl_mmd_read( + handle, NULL, sizeof(uint32_t), data, HPS_MMD_COREDLA_CSR_HANDLE, dla_get_raw_csr_address(instance, addr)); +} +AOCL_MMD_CALL int dla_mmd_ddr_write(int handle, int instance, uint64_t addr, uint64_t length, const void *data) { + return aocl_mmd_write(handle, NULL, length, data, HPS_MMD_MEMORY_HANDLE, dla_get_raw_ddr_address(instance, addr)); +} +AOCL_MMD_CALL int dla_mmd_ddr_read(int handle, int instance, uint64_t addr, uint64_t length, void *data) { + return aocl_mmd_read(handle, NULL, length, data, HPS_MMD_MEMORY_HANDLE, dla_get_raw_ddr_address(instance, addr)); +} + +#ifdef STREAM_CONTROLLER_ACCESS +AOCL_MMD_CALL bool dla_is_stream_controller_valid(int handle, int instance) { + mmd_device_ptr spDevice = _gDeviceMapManager.get_device(handle); + if( nullptr == spDevice ) { + return FAILURE; + } + return spDevice->bStreamControllerValid(); +} + +AOCL_MMD_CALL int dla_mmd_stream_controller_write(int handle, int instance, uint64_t addr, uint64_t length, const void *data) { + return aocl_mmd_write(handle, NULL, length, data, HPS_MMD_STREAM_CONTROLLER_HANDLE, addr); +} + +AOCL_MMD_CALL int dla_mmd_stream_controller_read(int handle, int instance, uint64_t addr, uint64_t length, void* data) { + return aocl_mmd_read( + handle, NULL, length, data, HPS_MMD_STREAM_CONTROLLER_HANDLE, addr); +} +#endif + +AOCL_MMD_CALL double dla_mmd_get_coredla_clock_freq(int handle) { + return 200; +} + +#endif diff --git 
a/python/openvino/runtime/coredla_device/mmd/hps_platform/host/acl_hps.h b/python/openvino/runtime/coredla_device/mmd/hps_platform/host/acl_hps.h new file mode 100644 index 0000000..7c85a24 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/hps_platform/host/acl_hps.h @@ -0,0 +1,111 @@ +#ifndef ACL_HPS_H +#define ACL_HPS_H + +/* (c) 1992-2021 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. 
*/ + +/* ===- acl_hps.h --------------------------------------------------- C++ -*-=== */ +/* */ +/* Intel(R) HPS MMD Driver */ +/* */ +/* ===-------------------------------------------------------------------------=== */ +/* */ +/* This file defines macros and types that are used inside the MMD driver */ +/* */ +/* ===-------------------------------------------------------------------------=== */ + +#ifndef ACL_HPS_EXPORT +#define ACL_HPS_EXPORT __declspec(dllimport) +#endif + +#define MMD_VERSION AOCL_MMD_VERSION_STRING + +#include <assert.h> +#include <stddef.h> +#include <stdio.h> +#ifdef DLA_MMD +#include <cstdint> +#endif +#include "aocl_mmd.h" + +#include "hps_types.h" + +#if defined(WINDOWS) +#error Currently not available for windows +#endif + +#if defined(LINUX) +typedef uintptr_t KPTR; +typedef int fpga_handle; +typedef unsigned int fpga_result; +#define FPGA_OK 0 + +typedef unsigned int DWORD; +typedef unsigned long long QWORD; +typedef char INT8; +typedef unsigned char UINT8; +typedef int16_t INT16; +typedef uint16_t UINT16; +typedef int INT32; +typedef unsigned int UINT32; +typedef long long INT64; +typedef unsigned long long UINT64; + +#define INVALID_HANDLE_VALUE ((int)(-1)) + +#define INVALID_DEVICE (-1) +#define WD_STATUS_SUCCESS 0 + +// define for the format string for DWORD type +#define DWORD_FMT_U "%u" +#define DWORD_FMT_X "%x" +#define DWORD_FMT_4X "%04X" + +// define for the format string for size_t type +#define SIZE_FMT_U "%zu" +#define SIZE_FMT_X "%zx" + +#endif // LINUX + +#define MAX_NAME_SIZE (1204) + +#define HPS_ASSERT(COND, ...) \ + do { \ + if (!(COND)) { \ + printf("\nMMD FATAL: %s:%d: ", __FILE__, __LINE__); \ + printf(__VA_ARGS__); \ + fflush(stdout); \ + assert(0); \ + } \ + } while (0) + +#define HPS_ERROR_IF(COND, NEXT, ...) \ + do { \ + if (COND) { \ + printf("\nMMD ERROR: " __VA_ARGS__); \ + fflush(stdout); \ + NEXT; \ + } \ + } while (0) + +#define HPS_INFO(...) 
\ + do { \ + printf("MMD INFO : " __VA_ARGS__); \ + fflush(stdout); \ + } while (0) + +#endif // ACL_HPS_H diff --git a/python/openvino/runtime/coredla_device/mmd/hps_platform/host/dma_device.cpp b/python/openvino/runtime/coredla_device/mmd/hps_platform/host/dma_device.cpp new file mode 100644 index 0000000..e403823 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/hps_platform/host/dma_device.cpp @@ -0,0 +1,120 @@ +/* (c) 1992-2021 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. 
*/ + +/* ===- dma_device.h ------------------------------------------------- C++ -*-=== */ +/* */ +/* dma device access functions */ +/* */ +/* ===-------------------------------------------------------------------------=== */ +/* */ +/* This file implements the functions used access the dma device objects */ +/* */ +/* ===-------------------------------------------------------------------------=== */ + +// common and its own header files +#include "dma_device.h" +#include <unistd.h> +#include <glob.h> +#include <sys/mman.h> +#include <fcntl.h> +#include <stdio.h> + +#include <memory.h> + +// Copied from Linux driver: /drivers/dma/altera-msgdma.c +#define MSGDMA_DESC_NUM 1024 + +// Same page size as used in /meta-intel-fpga-coredla/recipes-drivers/msgdma-userio/files/msgdma_userio_chr.c +#define PAGE_SIZE 4096 + +////////////////////////////////////////////////////// + +#define ERR(format, ...) \ +printf("%s:%u() **ERROR** : " format, \ + __func__, __LINE__, ##__VA_ARGS__) + +////////////////////////////////////////////////////// +dma_device::dma_device(std::string &name) +{ + _pFile = fopen(name.c_str(), "r+"); + if( _pFile == nullptr ) + { + ERR("dma_device::dma_device failed to open %s\n", name.c_str()); + return; + } + + // Turn off buffering + setvbuf(_pFile, NULL, _IONBF, 0); +} + +dma_device::~dma_device() +{ + if( _pFile ) + { + fclose(_pFile); + _pFile = NULL; + } +} + +int dma_device::read_block(void *host_addr, size_t offset, size_t size) +{ + // Use 32bit seek as DDR memory current < 32bits + if( fseek(_pFile, (uint32_t)offset, SEEK_SET) != 0 ) { + return FAILURE; + } + + size_t read_size = fread(host_addr, 1, size, _pFile); + return (read_size == size) ? SUCCESS : FAILURE; +} + +int dma_device::write_block(const void *host_addr, size_t offset, size_t size) +{ + // The MSGDMA driver only supports a maximum of 1024 x 4096 = 4MBytes in the worst case scenario, + // in the event that the virtual buffer is fully fragmented. 
As the buffer gets more fragmented it's + // possible to run out of DMA descriptors. To prevent this, slice the data into 4MB chunks. + + // chunk_size is chosen based on the size of a page (12 bits) and default number of descriptors (1024). + // The descriptor count is reduced by 1 since if the host_addr is not aligned to a page then an extra page + // will be added at the end. This would then increase the descriptor count by 1. + size_t chunk_size = PAGE_SIZE * (MSGDMA_DESC_NUM - 1); + size_t write_size = 0; + + // Use 32bit seek as DDR memory current < 32bits + if( fseek(_pFile, (uint32_t)offset, SEEK_SET) != 0 ) { + return FAILURE; + } + + for (size_t host_addr_offset = 0; host_addr_offset < size; host_addr_offset += chunk_size) { + size_t current_size = chunk_size; + + // If the current address is within one chunk_size from the end of the data, set current_size + // to the bytes left to send + if (size - host_addr_offset < chunk_size) { + current_size = size - host_addr_offset; + } + + size_t current_write_size = fwrite((uint8_t *)host_addr + host_addr_offset, 1, current_size, _pFile); + + if (current_write_size != current_size) { + return FAILURE; + } + + write_size += current_write_size; + } + + return (write_size == size) ? SUCCESS : FAILURE; +} diff --git a/python/openvino/runtime/coredla_device/mmd/hps_platform/host/dma_device.h b/python/openvino/runtime/coredla_device/mmd/hps_platform/host/dma_device.h new file mode 100644 index 0000000..24f89e4 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/hps_platform/host/dma_device.h @@ -0,0 +1,56 @@ +#ifndef DMA_DEVICE_H_ +#define DMA_DEVICE_H_ + +/* (c) 1992-2021 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. 
See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. */ + +/* ===- dma_device.h ------------------------------------------------- C++ -*-=== */ +/* */ +/* dma device access functions */ +/* */ +/* ===-------------------------------------------------------------------------=== */ +/* */ +/* This file implements the functions used access the dma device objects */ +/* */ +/* ===-------------------------------------------------------------------------=== */ +#include <vector> +#include <string> +#include <memory> + +#include "hps_types.h" + +class dma_device +{ +public: + dma_device(std::string &name); + ~dma_device(); + + int read_block(void *host_addr, size_t offset, size_t size); + int write_block(const void *host_addr, size_t offset, size_t size); + + bool bValid() { return _pFile != nullptr; }; +private: + + dma_device() = delete; + dma_device(dma_device const&) = delete; + void operator=(dma_device const &) = delete; + + FILE *_pFile = {nullptr}; // File pointer to UIO - Used to indicate the the uio_device is valid +}; +typedef std::shared_ptr<dma_device> dma_device_ptr; + +#endif // DMA_DEVICE_H_ diff --git 
a/python/openvino/runtime/coredla_device/mmd/hps_platform/host/hps_types.h b/python/openvino/runtime/coredla_device/mmd/hps_platform/host/hps_types.h new file mode 100644 index 0000000..3f11c4a --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/hps_platform/host/hps_types.h @@ -0,0 +1,44 @@ +#ifndef HPS_TYPES_H_ +#define HPS_TYPES_H_ + +/* (c) 1992-2021 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. 
*/ + +/* ===- hps_types.h -------------------------------------------------- C++ -*-=== */ +/* */ +/* Useful HPS Types */ +/* */ +/* ===-------------------------------------------------------------------------=== */ +/* */ +/* This file contains useful type definition */ +/* */ +/* ===-------------------------------------------------------------------------=== */ +#include <vector> +#include <string> + +#define SUCCESS (0) +#define FAILURE (1) + +typedef std::vector<std::string> board_names; + +typedef enum { + HPS_MMD_COREDLA_CSR_HANDLE = 1, // COREDLA CSR Interface + HPS_MMD_MEMORY_HANDLE = 2, // Device Memory transfers + HPS_MMD_STREAM_CONTROLLER_HANDLE = 3 // Stream Controller Interface +} hps_mmd_interface_t; + +#endif // HPS_TYPES_H_ diff --git a/python/openvino/runtime/coredla_device/mmd/hps_platform/host/mmd_device.cpp b/python/openvino/runtime/coredla_device/mmd/hps_platform/host/mmd_device.cpp new file mode 100644 index 0000000..b52c1d8 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/hps_platform/host/mmd_device.cpp @@ -0,0 +1,129 @@ +/* (c) 1992-2021 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. 
See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. */ + +/* ===- mmd_device.h ------------------------------------------------- C++ -*-=== */ +/* */ +/* mmd device access functions */ +/* */ +/* ===-------------------------------------------------------------------------=== */ +/* */ +/* This file implements the functions used access the mmd device object */ +/* */ +/* ===-------------------------------------------------------------------------=== */ + +#include "mmd_device.h" + +// Defined names of the UIO Nodes +#define UIO_COREDLA_PREFIX "coredla" +#define STREAM_CONTROLLER_PREFIX "stream_controller" + +// Defined name of the msgdma device +#define DMA_DEVICE_PREFIX "/dev/msgdma_coredla" +#define UIO_DEVICE_PREFIX "uio" + +board_names mmd_get_devices(const int max_fpga_devices) +{ + return uio_get_devices(UIO_COREDLA_PREFIX, max_fpga_devices); +} + + +///////////////////////////////////////////////////////// +mmd_device::mmd_device(std::string name, const int mmd_handle) +: _name(name), _mmd_handle(mmd_handle) { + _spCoredlaDevice = std::make_shared<uio_device>(name, _mmd_handle, true); + int32_t index = 
extract_index(_name); + if( (index >= 0) && _spCoredlaDevice && _spCoredlaDevice->bValid() ) + { + std::string dma_name(DMA_DEVICE_PREFIX); + dma_name += std::to_string(index); + _spDmaDevice = std::make_shared<dma_device>(dma_name); + + if( (_spDmaDevice==nullptr) || (!_spDmaDevice->bValid()) ) { + _spDmaDevice = nullptr; + return; + } + std::string stream_controller_name = uio_get_device(STREAM_CONTROLLER_PREFIX, index); + if( !stream_controller_name.empty() ) { + // Create a uio_device but don't attach any interrupt support as the stream controller + // does not require interrupts + _spStreamControllerDevice = std::make_shared<uio_device>(stream_controller_name, _mmd_handle, false); + if( _spStreamControllerDevice && !_spStreamControllerDevice->bValid() ) { + // The stream controller does not exist + _spStreamControllerDevice = nullptr; + } + } + } +} + +int mmd_device::read_block(aocl_mmd_op_t op, int mmd_interface, void *host_addr, size_t offset, size_t size) +{ + if( op ) { + LOG_ERR("op not support : %s\n", __func__ ); + return FAILURE; + } + if( mmd_interface == HPS_MMD_MEMORY_HANDLE ) { + return _spDmaDevice->read_block(host_addr, offset, size); + } else if( mmd_interface == HPS_MMD_COREDLA_CSR_HANDLE ) { + return _spCoredlaDevice->read_block(host_addr, offset, size); + } else if( mmd_interface == HPS_MMD_STREAM_CONTROLLER_HANDLE ) { + if ( _spStreamControllerDevice ) { + return _spStreamControllerDevice->read_block(host_addr, offset, size); + } + } + + return FAILURE; +} + +int mmd_device::write_block(aocl_mmd_op_t op, int mmd_interface, const void *host_addr, size_t offset, size_t size) +{ + if( op ) { + LOG_ERR("op not support : %s\n", __func__ ); + return FAILURE; + } + if( mmd_interface == HPS_MMD_MEMORY_HANDLE ) { + return _spDmaDevice->write_block(host_addr, offset, size); + } else if ( mmd_interface == HPS_MMD_COREDLA_CSR_HANDLE ) { + return _spCoredlaDevice->write_block(host_addr, offset, size); + } else if ( mmd_interface == 
HPS_MMD_STREAM_CONTROLLER_HANDLE ) { + if( _spStreamControllerDevice ) { + return _spStreamControllerDevice->write_block(host_addr, offset, size); + } + } + return FAILURE; +} + +int mmd_device::set_interrupt_handler(aocl_mmd_interrupt_handler_fn fn, void *user_data) { + if( _spCoredlaDevice ) { + return _spCoredlaDevice->set_interrupt_handler(fn, user_data); + } + return FAILURE; +} + +// Returns the index of a uio device +// If index cannot be found then returns -1 +int mmd_device::extract_index(const std::string name) { + std::string prefix(UIO_DEVICE_PREFIX); + + if (name.length() <= prefix.length() && name.compare(0, prefix.length(), prefix)) { + LOG_ERR("Error parsing device name '%s'\n", name.c_str()); + return -1; + } + + std::string device_num_str = name.substr(prefix.length()); + int32_t index = std::stoi(device_num_str, 0, 10); + return index; +} diff --git a/python/openvino/runtime/coredla_device/mmd/hps_platform/host/mmd_device.h b/python/openvino/runtime/coredla_device/mmd/hps_platform/host/mmd_device.h new file mode 100644 index 0000000..9cb0c71 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/hps_platform/host/mmd_device.h @@ -0,0 +1,75 @@ +#ifndef MMD_DEVICE_H_ +#define MMD_DEVICE_H_ + +/* (c) 1992-2021 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. 
See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. */ + +/* ===- mmd_device.h ------------------------------------------------- C++ -*-=== */ +/* */ +/* mmd device access functions */ +/* */ +/* ===-------------------------------------------------------------------------=== */ +/* */ +/* This file implements the functions used access the mmd device object */ +/* */ +/* ===-------------------------------------------------------------------------=== */ +#include <memory> +#include <string> + +#include "hps_types.h" +#include "dma_device.h" +#include "uio_device.h" + +#include "aocl_mmd.h" + +// LOG ERRORS +#define MMD_ERR_LOGGING 1 +#ifdef MMD_ERR_LOGGING +#define LOG_ERR(...) fprintf(stderr, __VA_ARGS__) +#else +#define LOG_ERR(...) 
+#endif + +class mmd_device { +public: + mmd_device(std::string name, const int mmd_handle); + + bool bValid() { return _spCoredlaDevice && _spCoredlaDevice->bValid() && _spDmaDevice && _spDmaDevice->bValid(); }; + bool bStreamControllerValid() { return _spCoredlaDevice && _spStreamControllerDevice && _spStreamControllerDevice->bValid(); }; + int write_block(aocl_mmd_op_t op, int mmd_interface, const void *host_addr, size_t offset, size_t size); + int read_block(aocl_mmd_op_t op, int mmd_interface, void *host_addr, size_t offset, size_t size); + + int set_interrupt_handler(aocl_mmd_interrupt_handler_fn fn, void *user_data); +private: + int32_t extract_index(const std::string name); + + mmd_device() = delete; + mmd_device(mmd_device const&) = delete; + void operator=(mmd_device const &) = delete; + std::string _name; + + uio_device_ptr _spCoredlaDevice; + uio_device_ptr _spStreamControllerDevice; + dma_device_ptr _spDmaDevice; + int _mmd_handle; +}; + +typedef std::shared_ptr<mmd_device> mmd_device_ptr; + +extern board_names mmd_get_devices(const int max_fpga_devices); + +#endif // MMD_DEVICE_H_ diff --git a/python/openvino/runtime/coredla_device/mmd/hps_platform/host/uio_device.cpp b/python/openvino/runtime/coredla_device/mmd/hps_platform/host/uio_device.cpp new file mode 100644 index 0000000..95a9567 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/hps_platform/host/uio_device.cpp @@ -0,0 +1,469 @@ +// (c) 1992-2021 Intel Corporation. +// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words +// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. +// and/or other countries. Other marks and brands may be claimed as the property +// of others. 
See Trademarks on intel.com for full list of Intel trademarks or +// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) +// Your use of Intel Corporation's design tools, logic functions and other +// software and tools, and its AMPP partner logic functions, and any output +// files any of the foregoing (including device programming or simulation +// files), and any associated documentation or information are expressly subject +// to the terms and conditions of the Altera Program License Subscription +// Agreement, Intel MegaCore Function License Agreement, or other applicable +// license agreement, including, without limitation, that your use is for the +// sole purpose of programming logic devices manufactured by Intel and sold by +// Intel or its authorized distributors. Please refer to the applicable +// agreement for further details. + +/* ===- uio_device.cpp ----------------------------------------------- C++ -*-=== */ +/* */ +/* uio device access functions */ +/* */ +/* ===-------------------------------------------------------------------------=== */ +/* */ +/* This file implements the functions used access the uio device objects */ +/* */ +/* ===-------------------------------------------------------------------------=== */ + +// common and its own header files +#include "uio_device.h" +#include <unistd.h> +#include <glob.h> +#include <sys/mman.h> +#include <fcntl.h> +#include <stdio.h> +#include <poll.h> + +#include <cinttypes> +#include <memory.h> + +////////////////////////////////////////////////////// +#define UIO_BASE_NAME "uio*" +#define UIO_BASE_PATH "/sys/class/uio/" +#define UIO_BASE_SEARCH UIO_BASE_PATH UIO_BASE_NAME +#define UIO_MAX_PATH (256) + +#define ERR(format, ...) 
\ +fprintf(stderr, "%s:%u **ERROR** : " format, \ + __FILE__, __LINE__, ##__VA_ARGS__) + +////////////////////////////////////////////////////// +#define MAX_NAME (20) +bool uio_read_sysfs_uint64(const char *device_name, const char *sysfs_name, uint64_t &value) +{ + FILE *fp; + char param_path[UIO_MAX_PATH]; + + if( snprintf(param_path, sizeof(param_path), "%s/%s", device_name, sysfs_name) < 0 ) + { + ERR("Path too long. %s, %s\n", device_name, sysfs_name); + return false; + } + + fp = fopen(param_path, "r"); + if( !fp ) + { + ERR("Failed to fopen - %s\n", param_path); + return false; + } + + if( fscanf(fp, "%" PRIx64, &value) != 1 ) + { + ERR("Failed fscanf - %s\n", param_path); + fclose(fp); + return false; + } + + fclose(fp); + return true; +} + +bool uio_read_sysfs_string(const char *uio_path, const char *sysfs_name, std::string &result) +{ + char uio_name[MAX_NAME]; + FILE *fp; + char param_path[UIO_MAX_PATH]; + + if( snprintf(param_path, sizeof(param_path), "%s/%s", uio_path, sysfs_name) < 0 ) + { + ERR("Path too long. 
%s, %s\n", uio_path, sysfs_name); + return false; + } + + fp = fopen(param_path, "r"); + if( !fp ) + { + ERR("Failed to fopen - %s\n", param_path); + return false; + } + + int num_read = fread(uio_name, 1, MAX_NAME, fp); + if( num_read <= 0 ) + { + ERR("Failed to read name - %s\n", param_path); + fclose(fp); + return false; + } + + uio_name[num_read-1] = '\0'; // Terminate + result = std::string(uio_name); + fclose(fp); + + return true; +} + +std::string uio_get_device(const std::string prefix, const int32_t index) +{ + glob_t globbuf = {0}; + std::string uio_name; + + int glob_res = glob(UIO_BASE_SEARCH, GLOB_NOSORT, NULL, &globbuf); + if( (glob_res == 0) && (globbuf.gl_pathc) ) + { + std::string device_name; + device_name = prefix + std::to_string(index); + + for( size_t i=0; i<globbuf.gl_pathc; i++ ) + { + std::string name; + uio_read_sysfs_string(globbuf.gl_pathv[i], "name", name); + + if( name.find(device_name) != std::string::npos ) + { + // We will return just the device name without the UIO_BASE_PATH + std::string name = std::string(globbuf.gl_pathv[i]); + uio_name = name.substr(sizeof(UIO_BASE_PATH)-1); + } + } + } + return uio_name; +} + +board_names uio_get_devices(const std::string device_name, const int max_devices) +{ + board_names names; + int device = 0; + + glob_t globbuf = {0}; + + int glob_res = glob(UIO_BASE_SEARCH, GLOB_NOSORT, NULL, &globbuf); + if( (glob_res == 0) && (globbuf.gl_pathc) ) + { + for( size_t i=0; (i<globbuf.gl_pathc) && (device < max_devices); i++ ) + { + std::string name; + uio_read_sysfs_string(globbuf.gl_pathv[i], "name", name); + + if( name.find(device_name) != std::string::npos ) + { + // We will return just the device name without the UIO_BASE_PATH + std::string name = std::string(globbuf.gl_pathv[i]); + name = name.substr(sizeof(UIO_BASE_PATH)-1); + names.push_back(name); + device++; + } + } + } + return names; +} + +////////////////////////////////////////////////////////////// +uio_device::uio_device(std::string &name, 
const int mmd_handle, const bool bEnableIRQ) +: _mmd_handle(mmd_handle) +{ + // Map the first address space + if ( !map_region(name, 0) ) { + ERR("Failed to map region 0 on %s\n", name.c_str()); + return; + } +#ifndef RUNTIME_POLLING + if( bEnableIRQ ) { + _spInterrupt = std::make_shared<uio_interrupt>(_fd, _mmd_handle); + if( !_spInterrupt->initialized() ) { + _spInterrupt = nullptr; // If the uio_interrupt failed to initialize then delete + } + _bIrqEnabled = bEnableIRQ; + } +#endif +} + +bool uio_device::bValid() { + bool bValid = (_fd >=0); +#ifndef RUNTIME_POLLING // If we're not polling check that the interrupt handling is working + if( _bIrqEnabled ) { + bValid &= (_spInterrupt != nullptr); + } +#endif + return bValid; +}; + +uio_device::~uio_device() +{ +#ifndef RUNTIME_POLLING + _spInterrupt = nullptr; // Shutdown the interrupt handler +#endif + unmap_region(); +} + +uint32_t uio_device::read(const uint32_t reg) +{ + // NOT YET IMPLEMENTED + return 0; +} + +void uio_device::write(const uint32_t reg, const uint32_t value) +{ + // NOT YET IMPLEMENTED + return; +} + +// Copies the block of data from the FPGA to the host +// memcpy is not used as this can cause multiple transfers of the AXI bus depending +// on the implementation of memcpy +int uio_device::read_block(void *host_addr, size_t offset, size_t size) +{ + // Support for only 32bit aligned transfers + if( (offset % sizeof(uint32_t)) || (size % sizeof(uint32_t)) ){ + return FAILURE; + } + + // Transfer the data in 32bit chunks + volatile const uint32_t *pDeviceMem32 = reinterpret_cast<volatile const uint32_t*>(reinterpret_cast<uint8_t*>(_pPtr) + offset); + uint32_t *host_addr32 = reinterpret_cast<uint32_t *>(host_addr); + while (size >= sizeof(uint32_t)) { + *host_addr32++ = *pDeviceMem32++; + size -= sizeof(uint32_t); + } + + return SUCCESS; +} + +// Copies the block of data from the host to the FPGA +// memcpy is not used as this can cause multiple transfers of the AXI bus depending +// on the 
implementation of memcpy +int uio_device::write_block(const void *host_addr, size_t offset, size_t size) +{ + // Support for only 32bit aligned transfers + if( (offset % sizeof(uint32_t)) || (size % sizeof(uint32_t)) ){ + return FAILURE; + } + + // Transfer the remaining 32bits of data + volatile uint32_t *pDeviceMem32 = reinterpret_cast<volatile uint32_t*>(reinterpret_cast<uint8_t*>(_pPtr) + offset); + const uint32_t *host_addr32 = reinterpret_cast<const uint32_t*>(host_addr); + while( size >= sizeof(uint32_t) ) { + *pDeviceMem32++ = *host_addr32++; + size -= sizeof(uint32_t); + } + return SUCCESS; +} + +int uio_device::set_interrupt_handler(aocl_mmd_interrupt_handler_fn fn, void* user_data) { +#ifndef RUNTIME_POLLING + if( _spInterrupt ) { + return _spInterrupt->set_interrupt_handler(fn, user_data); + } +#endif + return FAILURE; +} + +///////////////////////////////////////////////////////////////// +void uio_device::unmap_region() +{ + if( _pBase ) + { + munmap(_pBase, _size); + _pBase = nullptr; + } + + if( _fd >= 0 ) + { + close(_fd); + _fd = -1; + } +} + +bool uio_device::map_region( std::string &name, const uint32_t index) +{ + char map_path[UIO_MAX_PATH]; + + std::string uio_params_path(UIO_BASE_PATH); + uio_params_path += name; + + // char device_path[UIO_MAX_PATH]; + // const char *p; + + if( snprintf(map_path, sizeof(map_path), "maps/map%d/size", index ) < 0 ) + { + ERR("Failed to make map addr name.\n"); + return false; + } + if( !uio_read_sysfs_uint64(uio_params_path.c_str(), map_path, _size) ) + { + ERR("Failed to read size\n"); + return false; + } + // Make sure that the size doesn't exceed 32bits, as this will fail the mapping + // call on 32bit systems + if( _size > UINT32_MAX ) { + ERR("Invalid size value\n"); + return false; + } + + if( snprintf(map_path, sizeof(map_path), "maps/map%d/offset", index ) < 0 ) + { + ERR("Failed to make map offset name.\n"); + return false; + } + if( !uio_read_sysfs_uint64(uio_params_path.c_str(), map_path, _offset) 
) + { + ERR("Failed to read offset\n"); + return false; + } + + std::string uio_dev_path("/dev/"); + uio_dev_path += name; + + _fd = open(uio_dev_path.c_str(), O_RDWR ); + if( _fd < 0 ) + { + ERR("Failed to open - %s\n", uio_dev_path.c_str()); + return false; + } + // Map the region into userspace + // The base of the region is the page_size offset of the index + uint32_t page_size = (uint32_t)sysconf(_SC_PAGESIZE); + + _pBase = (uint8_t*)mmap(NULL, (size_t)_size, PROT_READ|PROT_WRITE, MAP_SHARED, _fd, (off_t) (index * page_size)); + if( _pBase == MAP_FAILED ) + { + ERR("Failed to map uio region.\n"); + close(_fd); + _fd = -1; + return false; + } + // CST base address is at _pBase + _offset + _pPtr = (uint32_t*)(_pBase + _offset); + + return true; +}; + +#ifndef RUNTIME_POLLING +/////////////////////////////////////////////////////////////////////////////////// +uio_interrupt::uio_interrupt(const int fd, const int mmd_handle) +: _device_fd(fd), _mmd_handle(mmd_handle) { + if( is_irq_available() ) { + // Create a eventfd_object to be used for shutting down the work_thread + _spShutdown_event = std::make_shared<eventfd_object>(); + if( _spShutdown_event->initialized() ) { + _pThread = new std::thread(work_thread, std::ref(*this)); + } else { + _spShutdown_event = nullptr; + } + } else { + ERR("No device interrupt found.\n"); + } +} + +uio_interrupt::~uio_interrupt() { + // kill the thread + if (_pThread && _spShutdown_event) { + // send message to thread to end it + _spShutdown_event->notify(1); + + // join with thread until it ends + _pThread->join(); + + delete _pThread; + _pThread = NULL; + + _spShutdown_event = nullptr; + } +} + +bool uio_interrupt::is_irq_available() { + // Disable the interrupt handling, this will fail if the IRQ has not been setup correctly. + // For example devicetree is incorrect. 
+ return disable_irq(); +} + +bool uio_interrupt::enable_irq() { + // Enable interrupts from the device + uint32_t info = 1; + ssize_t nb = write(_device_fd, &info, sizeof(info)); + if( nb != (ssize_t)sizeof(info) ) { + ERR( "Failed in enable CoreDLA Interrupt = %s\n", strerror(errno)); + return false; + } + return true; +} + +bool uio_interrupt::disable_irq() { + // Disable interrupts from the device + uint32_t info = 0; + ssize_t nb = write(_device_fd, &info, sizeof(info)); + if( nb != (ssize_t)sizeof(info) ) { + ERR( "Failed in disable CoreDLA Interrupt = %s\n", strerror(errno)); + return false; + } + return true; +} + +void uio_interrupt::work_thread(uio_interrupt& obj) { + obj.run_thread(); +} + +#define UIO_INTERRUPT_TIMEOUT (-1) +void uio_interrupt::run_thread() { + while( true ) { + // Need to re-enable the UIO interrupt handling as UIO disables the IRQ each time it is fired + if ( !enable_irq() ) { + exit(-1); + } + // Poll for the shutdown_event and uio interrupt + struct pollfd pollfd_arr[2]; + pollfd_arr[0].fd = _spShutdown_event->get_fd(); + pollfd_arr[0].events = POLLIN; + pollfd_arr[0].revents = 0; + pollfd_arr[1].fd = _device_fd; + pollfd_arr[1].events = POLLIN; + pollfd_arr[1].revents = 0; + + int res = poll(pollfd_arr, 2, UIO_INTERRUPT_TIMEOUT); + if (res < 0) { + ERR( "Poll error errno = %s\n", strerror(errno)); + exit(-1); + } else if (res > 0 && (pollfd_arr[0].revents & POLLIN)) { + uint64_t count; + ssize_t bytes_read = read(pollfd_arr[0].fd, &count, sizeof(count)); + if (bytes_read > 0) { + break; // We've been asked to shutdown + } else { + ERR( "Error: poll failed: %s\n", bytes_read < 0 ? 
 strerror(errno) : "zero bytes read"); + exit(-1); + } + } else if (res > 0 && (pollfd_arr[1].revents & POLLIN)) { + uint32_t count; + ssize_t bytes_read = read(pollfd_arr[1].fd, &count, sizeof(count)); + if (bytes_read > 0) { + if( _interrupt_fn ) { // Run the callback to the application + _interrupt_fn(get_mmd_handle(), _interrupt_fn_user_data ); + } + } else { + ERR( "Error: poll failed: %s\n", bytes_read < 0 ? strerror(errno) : "zero bytes read"); + exit(-1); + } + } + } + // Disable interrupt handling in UIO + if( !disable_irq() ){ + exit(-1); + } +} + +int uio_interrupt::set_interrupt_handler(aocl_mmd_interrupt_handler_fn fn, void* user_data) { + _interrupt_fn = fn; + _interrupt_fn_user_data = user_data; + return SUCCESS; +} +#endif diff --git a/python/openvino/runtime/coredla_device/mmd/hps_platform/host/uio_device.h b/python/openvino/runtime/coredla_device/mmd/hps_platform/host/uio_device.h new file mode 100644 index 0000000..c5f3ed5 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/hps_platform/host/uio_device.h @@ -0,0 +1,162 @@ +#ifndef UIO_DEVICE_H_ +#define UIO_DEVICE_H_ + +/* (c) 1992-2021 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. 
See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. */ + +/* ===- uio_device.h ------------------------------------------------- C++ -*-=== */ +/* */ +/* uio device access functions */ +/* */ +/* ===-------------------------------------------------------------------------=== */ +/* */ +/* This file implements the functions used access the uio device objects */ +/* */ +/* ===-------------------------------------------------------------------------=== */ +#include <vector> +#include <string> +#include <string.h> +#include <memory> +#include <thread> +#include <mutex> +#include <sys/eventfd.h> +#include <unistd.h> + +#include "aocl_mmd.h" +#include "hps_types.h" + +// simple wrapper class for managing eventfd objects +class eventfd_object final { + public: + eventfd_object() { + m_initialized = false; + // Note: EFD_SEMAPHORE and EFD_NONBLOCK are not set + // The implementation of functions using eventfd assumes that + m_fd = eventfd(0, 0); + if (m_fd < 0) { + fprintf(stderr, "eventfd : %s", strerror(errno)); + return; + } + + m_initialized = true; + } + + ~eventfd_object() { + if (m_initialized) { + if (close(m_fd) 
< 0) { + fprintf(stderr, "eventfd : %s", strerror(errno)); + } + } + } + + bool notify(uint64_t count) { + ssize_t res = write(m_fd, &count, sizeof(count)); + if (res < 0) { + fprintf(stderr, "eventfd : %s", strerror(errno)); + return false; + } + return true; + } + + int get_fd() { return m_fd; } + bool initialized() { return m_initialized; } + + private: + // not used and not implemented + eventfd_object(eventfd_object& other); + eventfd_object& operator=(const eventfd_object& other); + + // member varaibles + int m_fd; + int m_initialized; +}; // class eventfd_object +typedef std::shared_ptr<eventfd_object> eventfd_object_ptr; + +#ifndef RUNTIME_POLLING +class uio_interrupt final { + public: + uio_interrupt(const int fd, const int mmd_handle); + ~uio_interrupt(); + bool initialized() { return _pThread != nullptr; }; // If the thread is not created then must be invalid + int set_interrupt_handler(aocl_mmd_interrupt_handler_fn fn, void* user_data); + + private: + bool is_irq_available(); // Checks that the interrupt has been mapped into userspace + bool enable_irq(); // Enables UIO Irq handling + bool disable_irq(); // Disabled UIO Irq handling + + static void work_thread(uio_interrupt &obj); + void run_thread(); // Function which handles waiting for interrupts + + uio_interrupt() = delete; + uio_interrupt(uio_interrupt const&) = delete; + void operator=(uio_interrupt const&) = delete; + + int get_mmd_handle() {return _mmd_handle; }; + + std::thread *_pThread = {nullptr}; // Pointer to a thread object for waiting for interrupts + int _device_fd = {-1}; // /dev/uio* device pointer + int _mmd_handle = {-1}; // handle to the parent mmd_device + eventfd_object_ptr _spShutdown_event = {nullptr}; // Shutdown thread event object + + aocl_mmd_interrupt_handler_fn _interrupt_fn = {nullptr}; + void *_interrupt_fn_user_data = {nullptr}; +}; +typedef std::shared_ptr<uio_interrupt> uio_interrupt_ptr; +#endif + +class uio_device +{ +public: + uio_device(std::string &name, const 
int mmd_handle, const bool bEnableIrq=false); + ~uio_device(); + + uint32_t read(const uint32_t reg); + void write(const uint32_t reg, const uint32_t value); + + int read_block(void *host_addr, size_t offset, size_t size); + int write_block(const void *host_addr, size_t offset, size_t size); + int set_interrupt_handler(aocl_mmd_interrupt_handler_fn fn, void* user_data); + + bool bValid(); + +private: + bool map_region( std::string &name, const uint32_t index ); + void unmap_region(); + + uio_device() = delete; + uio_device(uio_device const&) = delete; + void operator=(uio_device const &) = delete; + + int _mmd_handle; // Handle to the parent mmd device + int _fd = {-1}; // File pointer to UIO - Used to indicate that the uio_device is valid + uint64_t _size = {0}; // Size of the mmapped region + uint64_t _offset = {0}; // Offset of the first register + uint8_t *_pBase = {nullptr}; // Base of the mmapped region + + uint32_t *_pPtr = {nullptr}; // The first register +#ifndef RUNTIME_POLLING + bool _bIrqEnabled = {false}; // Indicates that we tried to create with IRQ + uio_interrupt_ptr _spInterrupt; // Object to handle UIO Interrupts +#endif +}; +typedef std::shared_ptr<uio_device> uio_device_ptr; + +extern board_names uio_get_devices(const std::string name, const int max_devices); +extern std::string uio_get_device(const std::string prefix, const int32_t index); + +#endif // UIO_DEVICE_H_ diff --git a/python/openvino/runtime/coredla_device/mmd/hps_platform/include/aocl_mmd.h b/python/openvino/runtime/coredla_device/mmd/hps_platform/include/aocl_mmd.h new file mode 100644 index 0000000..7c1c73d --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/hps_platform/include/aocl_mmd.h @@ -0,0 +1,645 @@ +#ifndef AOCL_MMD_H +#define AOCL_MMD_H + +/* (c) 1992-2021 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. 
Other marks and brands may be claimed as the property */ +/* of others. See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* Support for memory mapped ACL devices. + * + * Typical API lifecycle, from the perspective of the caller. + * + * 1. aocl_mmd_open must be called first, to provide a handle for further + * operations. + * + * 2. The interrupt and status handlers must be set. + * + * 3. Read and write operations are performed. + * + * 4. aocl_mmd_close may be called to shut down the device. No further + * operations are permitted until a subsequent aocl_mmd_open call. + * + * aocl_mmd_get_offline_info can be called anytime including before + * open. aocl_mmd_get_info can be called anytime between open and close. 
+ */ + +#ifndef AOCL_MMD_CALL +#if defined(_WIN32) +#define AOCL_MMD_CALL __declspec(dllimport) +#else +#define AOCL_MMD_CALL __attribute__((visibility ("default"))) +#endif +#endif + +#ifndef WEAK +#if defined(_WIN32) +#define WEAK +#else +/* This normally comes with "__attribute__((weak))" but for reasons not presently + * understood, the shared library is not properly loaded on Ubuntu18 when the functions + * are weak. + */ +#define WEAK +#endif +#endif + +#ifdef DLA_MMD +#include <cstddef> //size_t +#include <cstdint> //uint32_t +#endif + +/* The MMD API's version - the runtime expects this string when + * AOCL_MMD_VERSION is queried. This changes only if the API has changed */ +#define AOCL_MMD_VERSION_STRING "20.3" + +/* Memory types that can be supported - bitfield. Other than physical memory + * these types closely align with the OpenCL SVM types. + * + * AOCL_MMD_PHYSICAL_MEMORY - The vendor interface includes IP to communicate + * directly with physical memory such as DDR, QDR, etc. + * + * AOCL_MMD_SVM_COARSE_GRAIN_BUFFER - The vendor interface includes support for + * caching SVM pointer data and requires explicit function calls from the user + * to synchronize the cache between the host processor and the FPGA. This level + * of SVM is not currently supported by Altera except as a subset of + * SVM_FINE_GAIN_SYSTEM support. + * + * AOCL_MMD_SVM_FINE_GRAIN_BUFFER - The vendor interface includes support for + * caching SVM pointer data and requires additional information from the user + * and/or host runtime that can be collected during pointer allocation in order + * to synchronize the cache between the host processor and the FPGA. Once this + * additional data is provided for an SVM pointer, the vendor interface handles + * cache synchronization between the host processor & the FPGA automatically. + * This level of SVM is not currently supported by Altera except as a subset + * of SVM_FINE_GRAIN_SYSTEM support. 
+ * + * AOCL_MMD_SVM_FINE_GRAIN_SYSTEM - The vendor interface includes support for + * caching SVM pointer data and does not require any additional information to + * synchronize the cache between the host processor and the FPGA. The vendor + * interface handles cache synchronization between the host processor & the + * FPGA automatically for all SVM pointers. This level of SVM support is + * currently under development by Altera and some features may not be fully + * supported. + */ +#define AOCL_MMD_PHYSICAL_MEMORY (1 << 0) +#define AOCL_MMD_SVM_COARSE_GRAIN_BUFFER (1 << 1) +#define AOCL_MMD_SVM_FINE_GRAIN_BUFFER (1 << 2) +#define AOCL_MMD_SVM_FINE_GRAIN_SYSTEM (1 << 3) + +/* program modes - bitfield + * + * AOCL_MMD_PROGRAM_PRESERVE_GLOBAL_MEM - preserve contents of global memory + * when this bit is set to 1. If programming can't occur without preserving + * global memory contents, the program function must fail, in which case the + * runtime may re-invoke program with this bit set to 0, allowing programming + * to occur even if doing so destroys global memory contents. + * + * more modes are reserved for stacking on in the future + */ +#define AOCL_MMD_PROGRAM_PRESERVE_GLOBAL_MEM (1 << 0) +typedef int aocl_mmd_program_mode_t; + +typedef void* aocl_mmd_op_t; + +typedef struct { + unsigned lo; /* 32 least significant bits of time value. */ + unsigned hi; /* 32 most significant bits of time value. */ +} aocl_mmd_timestamp_t; + +/* Defines the set of characteristics that can be probed about the board before + * opening a device. The type of data returned by each is specified in + * parentheses in the adjacent comment. + * + * AOCL_MMD_NUM_BOARDS and AOCL_MMD_BOARD_NAMES + * These two fields can be used to implement multi-device support. The MMD + * layer may have a list of devices it is capable of interacting with, each + * identified with a unique name. 
The length of the list should be returned + * in AOCL_MMD_NUM_BOARDS, and the names of these devices returned in + * AOCL_MMD_BOARD_NAMES. The OpenCL runtime will try to call aocl_mmd_open + * for each board name returned in AOCL_MMD_BOARD_NAMES. + */ +typedef enum { + AOCL_MMD_VERSION = 0, /* Version of MMD (char*)*/ + AOCL_MMD_NUM_BOARDS = 1, /* Number of candidate boards (int)*/ + AOCL_MMD_BOARD_NAMES = 2, /* Names of boards available delimiter=; (char*)*/ + AOCL_MMD_VENDOR_NAME = 3, /* Name of vendor (char*) */ + AOCL_MMD_VENDOR_ID = 4, /* An integer ID for the vendor (int) */ + AOCL_MMD_USES_YIELD = 5, /* 1 if yield must be called to poll hw (int) */ + /* The following can be combined in a bit field: + * AOCL_MMD_PHYSICAL_MEMORY, AOCL_MMD_SVM_COARSE_GRAIN_BUFFER, AOCL_MMD_SVM_FINE_GRAIN_BUFFER, + * AOCL_MMD_SVM_FINE_GRAIN_SYSTEM. Prior to 14.1, all existing devices supported physical memory and no types of SVM + * memory, so this is the default when this operation returns '0' for board MMDs with a version prior to 14.1 + */ + AOCL_MMD_MEM_TYPES_SUPPORTED = 6, +} aocl_mmd_offline_info_t; + +/** Possible capabilities to return from AOCL_MMD_*_MEM_CAPABILITIES query */ +/** + * If not set allocation function is not supported, even if other capabilities are set. + */ +#define AOCL_MMD_MEM_CAPABILITY_SUPPORTED (1 << 0) +/** + * Supports atomic access to the memory by either the host or device. + */ +#define AOCL_MMD_MEM_CAPABILITY_ATOMIC (1 << 1) +/** + * Supports concurrent access to the memory either by host or device if the + * accesses are not on the same block. Block granularity is defined by + * AOCL_MMD_*_MEM_CONCURRENT_GRANULARITY., blocks are aligned to this + * granularity + */ +#define AOCL_MMD_MEM_CAPABILITY_CONCURRENT (1 << 2) +/** + * Memory can be accessed by multiple devices at the same time. + */ +#define AOCL_MMD_MEM_CAPABILITY_P2P (1 << 3) + +/* Defines the set of characteristics that can be probed about the board after + * opening a device. 
This can involve communication to the device + * + * AOCL_MMD_NUM_KERNEL_INTERFACES - The number of kernel interfaces, usually 1 + * + * AOCL_MMD_KERNEL_INTERFACES - the handle for each kernel interface. + * param_value will have size AOCL_MMD_NUM_KERNEL_INTERFACES * sizeof int + * + * AOCL_MMD_PLL_INTERFACES - the handle for each pll associated with each + * kernel interface. If a kernel interface is not clocked by acl_kernel_clk + * then return -1 + * + * */ +typedef enum { + AOCL_MMD_NUM_KERNEL_INTERFACES = 1, /* Number of Kernel interfaces (int) */ + AOCL_MMD_KERNEL_INTERFACES = 2, /* Kernel interface (int*) */ + AOCL_MMD_PLL_INTERFACES = 3, /* Kernel clk handles (int*) */ + AOCL_MMD_MEMORY_INTERFACE = 4, /* Global memory handle (int) */ + AOCL_MMD_TEMPERATURE = 5, /* Temperature measurement (float) */ + AOCL_MMD_PCIE_INFO = 6, /* PCIe information (char*) */ + AOCL_MMD_BOARD_NAME = 7, /* Name of board (char*) */ + AOCL_MMD_BOARD_UNIQUE_ID = 8, /* Unique ID of board (int) */ + AOCL_MMD_CONCURRENT_READS = 9, /* # of parallel reads; 1 is serial*/ + AOCL_MMD_CONCURRENT_WRITES = 10, /* # of parallel writes; 1 is serial*/ + AOCL_MMD_CONCURRENT_READS_OR_WRITES = 11, /* total # of concurrent operations read + writes*/ + AOCL_MMD_MIN_HOST_MEMORY_ALIGNMENT = 12, /* Min alignment that the BSP supports for host allocations (size_t) */ + AOCL_MMD_HOST_MEM_CAPABILITIES = 13, /* Capabilities of aocl_mmd_host_alloc() (unsigned int)*/ + AOCL_MMD_SHARED_MEM_CAPABILITIES = 14, /* Capabilities of aocl_mmd_shared_alloc (unsigned int)*/ + AOCL_MMD_DEVICE_MEM_CAPABILITIES = 15, /* Capabilities of aocl_mmd_device_alloc (unsigned int)*/ + AOCL_MMD_HOST_MEM_CONCURRENT_GRANULARITY = 16, /*(size_t)*/ + AOCL_MMD_SHARED_MEM_CONCURRENT_GRANULARITY = 17, /*(size_t)*/ + AOCL_MMD_DEVICE_MEM_CONCURRENT_GRANULARITY = 18, /*(size_t)*/ +} aocl_mmd_info_t; + +typedef struct { + unsigned long long int exception_type; + void* user_private_info; + size_t user_cb; +} aocl_mmd_interrupt_info; + +typedef 
void (*aocl_mmd_interrupt_handler_fn)(int handle, void* user_data); +typedef void (*aocl_mmd_device_interrupt_handler_fn)(int handle, aocl_mmd_interrupt_info* data_in, void* user_data); +typedef void (*aocl_mmd_status_handler_fn)(int handle, void* user_data, aocl_mmd_op_t op, int status); + +/* Get information about the board using the enum aocl_mmd_offline_info_t for + * offline info (called without a handle), and the enum aocl_mmd_info_t for + * info specific to a certain board. + * Arguments: + * + * requested_info_id - a value from the aocl_mmd_offline_info_t enum + * + * param_value_size - size of the param_value field in bytes. This should + * match the size of the return type expected as indicated in the enum + * definition. For example, the AOCL_MMD_TEMPERATURE returns a float, so + * the param_value_size should be set to sizeof(float) and you should + * expect the same number of bytes returned in param_size_ret. + * + * param_value - pointer to the variable that will receive the returned info + * + * param_size_ret - receives the number of bytes of data actually returned + * + * Returns: a negative value to indicate error. + */ +AOCL_MMD_CALL int aocl_mmd_get_offline_info(aocl_mmd_offline_info_t requested_info_id, + size_t param_value_size, + void* param_value, + size_t* param_size_ret) WEAK; + +// AOCL_MMD_CALL int aocl_mmd_get_info(int handle, +// aocl_mmd_info_t requested_info_id, +// size_t param_value_size, +// void* param_value, +// size_t* param_size_ret) WEAK; + +/* Open and initialize the named device. + * + * The name is typically one specified by the AOCL_MMD_BOARD_NAMES offline + * info. + * + * Arguments: + * name - open the board with this name (provided as a C-style string, + * i.e. NUL terminated ASCII.) + * + * Returns: the non-negative integer handle for the board, otherwise a + * negative value to indicate error. 
Upon receiving the error, the OpenCL + * runtime will proceed to open other known devices, hence the MMD mustn't + * exit the application if an open call fails. + */ +AOCL_MMD_CALL int aocl_mmd_open(const char* name) WEAK; + +/* Close an opened device, by its handle. + * Returns: 0 on success, negative values on error. + */ +AOCL_MMD_CALL int aocl_mmd_close(int handle) WEAK; + +/* Set the interrupt handler for the opened device. + * The interrupt handler is called whenever the client needs to be notified + * of an asynchronous event signaled by the device internals. + * For example, the kernel has completed or is stalled. + * + * Important: Interrupts from the kernel must be ignored until this handler is + * set + * + * Arguments: + * fn - the callback function to invoke when a kernel interrupt occurs + * user_data - the data that should be passed to fn when it is called. + * + * Returns: 0 if successful, negative on error + */ +AOCL_MMD_CALL int aocl_mmd_set_interrupt_handler(int handle, aocl_mmd_interrupt_handler_fn fn, void* user_data) WEAK; + +/* Set the device interrupt handler for the opened device. + * The device interrupt handler is called whenever the client needs to be notified + * of a device event signaled by the device internals. + * For example, an ECC error has been reported. + * + * Important: Interrupts from the device must be ignored until this handler is + * set + * + * Arguments: + * fn - the callback function to invoke when a device interrupt occurs + * user_data - the data that should be passed to fn when it is called. + * + * Returns: 0 if successful, negative on error + */ +// AOCL_MMD_CALL int aocl_mmd_set_device_interrupt_handler(int handle, +// aocl_mmd_device_interrupt_handler_fn fn, +// void* user_data) WEAK; + +/* Set the operation status handler for the opened device. + * The operation status handler is called with + * status 0 when the operation has completed successfully. + * status negative when the operation completed with errors. 
+ * + * Arguments: + * fn - the callback function to invoke when a status update is to be + * performed. + * user_data - the data that should be passed to fn when it is called. + * + * Returns: 0 if successful, negative on error + */ +//AOCL_MMD_CALL int aocl_mmd_set_status_handler(int handle, aocl_mmd_status_handler_fn fn, void* user_data) WEAK; + +/* If AOCL_MMD_USES_YIELD is 1, this function is called when the host is idle + * and hence possibly waiting for events to be processed by the device. + * If AOCL_MMD_USES_YIELD is 0, this function is never called and the MMD is + * assumed to provide status/event updates via some other execution thread + * such as through an interrupt handler. + * + * Returns: non-zero if the yield function performed useful work such as + * processing DMA transactions, 0 if there is no useful work to be performed + * + * NOTE: yield may be called continuously as long as it reports that it has useful work + */ +//AOCL_MMD_CALL int aocl_mmd_yield(int handle) WEAK; + +/* Read, write and copy operations on a single interface. + * If op is NULL + * - Then these calls must block until the operation is complete. + * - The status handler is not called for this operation. + * + * If op is non-NULL, then: + * - These may be non-blocking calls + * - The status handler must be called upon completion, with status 0 + * for success, and a negative value for failure. + * + * Arguments: + * op - the operation object used to track this operations progress + * + * len - the size in bytes to transfer + * + * src - the host buffer being read from + * + * dst - the host buffer being written to + * + * mmd_interface - the handle to the interface being accessed. E.g. To + * access global memory this handle will be whatever is returned by + * aocl_mmd_get_info when called with AOCL_MMD_MEMORY_INTERFACE. + * + * offset/src_offset/dst_offset - the byte offset within the interface that + * the transfer will begin at. 
+ * + * The return value is 0 if the operation launch was successful, and + * negative otherwise. + */ +AOCL_MMD_CALL int aocl_mmd_read( + int handle, aocl_mmd_op_t op, size_t len, void* dst, int mmd_interface, size_t offset) WEAK; +AOCL_MMD_CALL int aocl_mmd_write( + int handle, aocl_mmd_op_t op, size_t len, const void* src, int mmd_interface, size_t offset) WEAK; +// AOCL_MMD_CALL int aocl_mmd_copy( +// int handle, aocl_mmd_op_t op, size_t len, int mmd_interface, size_t src_offset, size_t dst_offset) WEAK; + +/* Host Channel create operation + * Opens channel between host and kernel. + * + * Arguments: + * channel_name - name of channel to initialize. Same name as used in board_spec.xml + * + * queue_depth - the size in bytes of pinned memory queue in system memory + * + * direction - the direction of the channel + * + * The return value is negative if initialization was unsuccessful, and + * positive otherwise. Positive return value is handle to the channel to be used for + * subsequent calls for the channel. + */ +//AOCL_MMD_CALL int aocl_mmd_hostchannel_create(int handle, char* channel_name, size_t queue_depth, int direction) WEAK; + +/* Host Channel destroy operation + * Closes channel between host and kernel. + * + * Arguments: + * channel - the handle to the channel to close, that was obtained with + * create channel + * + * The return value is 0 if the destroy was successful, and negative + * otherwise. + */ +//AOCL_MMD_CALL int aocl_mmd_hostchannel_destroy(int handle, int channel) WEAK; + +/* Host Channel get buffer operation + * Provide host with pointer to buffer they can access to write or + * read from kernel, along with space or data available in the buffer + * in bytes. 
+ * + * Arguments: + * channel - the handle to the channel to get the buffer for + * + * buffer_size - the address that this call will write the amount of + * space or data that's available in the buffer, + * depending on direction of the channel, in bytes + * + * status - the address that this call will write to for result of this + * call. Value will be 0 for success, and negative otherwise + * + * The return value is the pointer to the buffer that host can write + * to or read from. NULL if the status is negative. + */ +//AOCL_MMD_CALL void* aocl_mmd_hostchannel_get_buffer(int handle, int channel, size_t* buffer_size, int* status) WEAK; + +/* Host Channel acknowledge buffer operation + * Acknowledge to the channel that the user has written or read data from + * it. This will make the data or additional buffer space available to + * write to or read from kernel. + * + * Arguments: + * channel - the handle to the channel that user is acknowledging + * + * send_size - the size in bytes that the user is acknowledging + * + * status - the address that this call will write to for result of this + * call. Value will be 0 for success, and negative otherwise + * + * The return value is equal to send_size if send_size was less than or + * equal to the buffer_size from get buffer call. If send_size was + * greater, then return value is the amount that was actually sent. + */ +//AOCL_MMD_CALL size_t aocl_mmd_hostchannel_ack_buffer(int handle, int channel, size_t send_size, int* status) WEAK; + +/* Program the device + * + * The host will guarantee that no operations are currently executing on the + * device. That means the kernels will be idle and no read/write/copy + * commands are active. Interrupts should be disabled and the FPGA should + * be reprogrammed with the data from user_data which has size size. The host + * will then call aocl_mmd_set_status_handler and aocl_mmd_set_interrupt_handler + * again. At this point interrupts can be enabled. 
+ * + * The new handle to the board after reprogram does not have to be the same as + * the one before. + * + * Arguments: + * user_data - The binary contents of the fpga.bin file created during + * Quartus II compilation. + * size - the size in bytes of user_data + * program_mode - bit field for programming attributes. See + * aocl_mmd_program_mode_t definition + * + * Returns: the new non-negative integer handle for the board, otherwise a + * negative value to indicate error. + */ + +// #ifdef DLA_MMD +// // CoreDLA BSP has removed some stuff that MMD tries to handshake with, so provide a "raw access" function to +// // reprogram the FPGA directly from the sof. Can't call quartus_pgm directly since the MMD still needs to mask +// // the PCIe surprise down error (when full-chip programming the FPGA, the CPU thinks a PCIe device has disappeared). +// // BEWARE: reprogramming will invalidate the handle +// AOCL_MMD_CALL int aocl_mmd_program_sof(int handle, const char* sof_filename) WEAK; +// #else +// AOCL_MMD_CALL int aocl_mmd_program(int handle, void* user_data, size_t size, aocl_mmd_program_mode_t program_mode) WEAK; +// #endif + +/** Error values*/ +#define AOCL_MMD_ERROR_SUCCESS 0 +#define AOCL_MMD_ERROR_INVALID_HANDLE -1 +#define AOCL_MMD_ERROR_OUT_OF_MEMORY -2 +#define AOCL_MMD_ERROR_UNSUPPORTED_ALIGNMENT -3 +#define AOCL_MMD_ERROR_UNSUPPORTED_PROPERTY -4 +#define AOCL_MMD_ERROR_INVALID_POINTER -5 +#define AOCL_MMD_ERROR_INVALID_MIGRATION_SIZE -6 + +/** Memory properties*/ +typedef enum { + /** + * Specifies the name of a global memory that can be found in the + * board_spec.xml file for the BSP. Allocations will be allocated to this + * global memory interface. + */ + AOCL_MMD_MEM_PROPERTIES_GLOBAL_MEMORY = 1, + /** + * Specifies the index of a bank inside the global memory interface that can be found in + * the board_spec.xml file for the BSP. Allocations will be allocated to this + * memory bank. 
It is invalid to specify this property without also specifying + * AOCL_MMD_GLOBAL_MEMORY_INTERFACE. + */ + AOCL_MMD_MEM_PROPERTIES_MEMORY_BANK +} aocl_mmd_mem_properties_t; + +/** + * Host allocations provide memory that is allocated on the host. Host + * allocations are accessible by the host and one or more devices. + * The same pointer to a host allocation may be used on the host and all + * supported devices; they have address equivalence. This memory must be + * deallocated with aocl_mmd_free(); + * + * Once the device has signaled completion through + * aocl_mmd_interrupt_handler_fn() the host can assume it has access to the + * latest contents of the memory, allocated by this call. + * + * @param handles Handles for devices that will need access to this memory + * @param num_devices Number of devices in the handles + * @param size The size of the memory region + * @param alignment The alignment in bytes of the allocation + * @param properties Specifies additional information about the allocated + * memory, described by a property type name and its corresponding value. + * Each property type name is immediately followed by the corresponding + * desired value. The list is terminated with 0. Supported values are + * described above. Example: [<property1>, <value1>, <property2>, <value2>, 0] + * @param error The error code defined by AOCL_MMD_ERROR* + * @return valid pointer, on error NULL + */ +// AOCL_MMD_CALL void* aocl_mmd_host_alloc(int* handles, +// size_t num_devices, +// size_t size, +// size_t alignment, +// aocl_mmd_mem_properties_t* properties, +// int* error) WEAK; + +/** + * Frees memory that has been allocated by MMD + * + * @param mem The pointer to the memory region. Must be a pointer that is + * allocated by the MMD. + * @return AOCL_MMD_ERROR_SUCCESS if success, else error code + */ +// AOCL_MMD_CALL int aocl_mmd_free(void* mem) WEAK; + +/** + * Allocate memory that is owned by the device. 
This pointer can only be + * accessed by the kernel; can't be accessed by the host. The host is able to + * manipulate the pointer (e.g. increment it) just not access the underlying + * data. This memory must be deallocated by aocl_mmd_free(); + * + * @param handle Device that will have access to this memory + * @param size The size of the memory region + * @param alignment The alignment in bytes of the memory region + * @param properties Specifies additional information about the allocated + * memory, described by a property type name and its corresponding value. + * Each property type name is immediately followed by the corresponding + * desired value. The list is terminated with 0. Supported values are + * described above. Example: [<property1>, <value1>, <property2>, <value2>, 0] + * @param error The error code defined by AOCL_MMD_ERROR* + * @return Pointer that can be passed into the kernel. NULL on failure. + */ +// AOCL_MMD_CALL void* aocl_mmd_device_alloc( +// int handle, size_t size, size_t alignment, aocl_mmd_mem_properties_t* properties, int* error) WEAK; + +/** + * Shared allocations may migrate between the host and one or more associated + * device. The same pointer to a shared allocation may be used on the host and + * the supported device; they have address equivalence. + * + * If the device does not support concurrent access to memory allocated by + * aocl_mmd_shared_alloc() then a call must be made to + * aocl_mmd_shared_mem_migrate() to indicate that the shared allocation should + * be migrated to the device before the device accesses this memory. For + * example, a call to aocl_mmd_shared_mem_migrate() should be made before a + * kernel accessing this memory is launched). Conversely, + * aocl_mmd_shared_mem_migrate() should be called again to indicate that the + * shared allocation should be migrated to the host before the host accesses + * this memory again. 
If the device supports concurrent access to memory + * allocated with aocl_mmd_shared_alloc(), then the call to + * aocl_mmd_shared_mem_migrate() is not necessary, but may still be made. In + * the case of concurrent access, it is the responsibility of the MMD to ensure + * both the device and host can access aocl_mmd_shared_alloc() allocations at + * all times. + * + * Memory allocated by aocl_mmd_shared_alloc() must be deallocated with + * aocl_mmd_free(). + * + * @param handle Device that will have access to this memory + * @param size The size of the memory region + * @param alignment The alignment in bytes of the memory region + * @param properties Specifies additional information about the allocated + * memory, described by a property type name and its corresponding value. + * Each property type name is immediately followed by the corresponding + * desired value. The list is terminated with 0. Supported properties are + * listed above and have the prefix AOCL_MMD_MEM_PROPERTIES_. + * Example: [<property1>, <value1>, <property2>, <value2>, 0] + * @param error The error code defined by AOCL_MMD_ERROR* + * @return valid pointer, on error NULL + */ +// AOCL_MMD_CALL void* aocl_mmd_shared_alloc( +// int handle, size_t size, size_t alignment, aocl_mmd_mem_properties_t* properties, int* error) WEAK; + +typedef enum { AOCL_MMD_MIGRATE_TO_HOST = 0, AOCL_MMD_MIGRATE_TO_DEVICE = 1 } aocl_mmd_migrate_t; + +/** + * A call to aocl_mmd_shared_migrate() must be made for non-concurrent shared + * allocations any time the accessor of the allocation changes. For example, + * aocl_mmd_shared_migrate() should be called indicating that the allocation + * should be migrated to the device before a kernel accessing the allocation + * is launched on the device. Similarly, aocl_mmd_shared_migrate() should be + * called indicating that the allocation is migrated to the host before the + * host accesses the memory after kernel completion. 
+ * + * For concurrent allocations this call may be used as a performance hint, but + * is not strictly required for functionality. + * + * @param handle Device that will have access to this memory + * @param shared_ptr Pointer allocated by aocl_mmd_shared_alloc() + * @param size In bytes, the size of the migration. Must be of multiple of a + * page boundary that the BSP supports. + * @param destination The destination of migration + * @return The error code defined by AOCL_MMD_ERROR* + */ +// AOCL_MMD_CALL int aocl_mmd_shared_migrate(int handle, +// void* shared_ptr, +// size_t size, +// aocl_mmd_migrate_t destination) WEAK; + +// CoreDLA modifications +// To support multiple different FPGA boards, anything board specific must be implemented in a +// board-specific MMD instead of the CoreDLA runtime layer. +#ifdef DLA_MMD +// Query functions to get board-specific values +AOCL_MMD_CALL int dla_mmd_get_max_num_instances() WEAK; +AOCL_MMD_CALL uint64_t dla_mmd_get_ddr_size_per_instance() WEAK; +AOCL_MMD_CALL double dla_mmd_get_ddr_clock_freq() WEAK; + +// Wrappers around CSR and DDR reads and writes to abstract away board-specific offsets +AOCL_MMD_CALL int dla_mmd_csr_write(int handle, int instance, uint64_t addr, const uint32_t* data) WEAK; +AOCL_MMD_CALL int dla_mmd_csr_read(int handle, int instance, uint64_t addr, uint32_t* data) WEAK; +AOCL_MMD_CALL int dla_mmd_ddr_write(int handle, int instance, uint64_t addr, uint64_t length, const void* data) WEAK; +AOCL_MMD_CALL int dla_mmd_ddr_read(int handle, int instance, uint64_t addr, uint64_t length, void* data) WEAK; + +#define STREAM_CONTROLLER_ACCESS +#ifdef STREAM_CONTROLLER_ACCESS +AOCL_MMD_CALL bool dla_is_stream_controller_valid(int handle, int instance) WEAK; +AOCL_MMD_CALL int dla_mmd_stream_controller_write(int handle, int instance, uint64_t addr, uint64_t length, const void* data) WEAK; +AOCL_MMD_CALL int dla_mmd_stream_controller_read(int handle, int instance, uint64_t addr, uint64_t length, void* data) 
WEAK; +#endif + +// Get the PLL clock frequency in MHz, returns a negative value if there is an error +AOCL_MMD_CALL double dla_mmd_get_coredla_clock_freq(int handle) WEAK; +#endif + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/python/openvino/runtime/coredla_device/mmd/system_console/CMakeLists.txt b/python/openvino/runtime/coredla_device/mmd/system_console/CMakeLists.txt new file mode 100644 index 0000000..d8be216 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/system_console/CMakeLists.txt @@ -0,0 +1,2 @@ + +add_library(system_console_mmd INTERFACE) diff --git a/python/openvino/runtime/coredla_device/mmd/system_console/mmd_wrapper.cpp b/python/openvino/runtime/coredla_device/mmd/system_console/mmd_wrapper.cpp new file mode 100644 index 0000000..64c6631 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/system_console/mmd_wrapper.cpp @@ -0,0 +1,320 @@ +// Copyright 2020-2024 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. 
+ +#include "mmd_wrapper.h" +#include "dla_dma_constants.h" // DLA_DMA_CSR_OFFSET_*** + +#include <cassert> // assert +#include <cstddef> // size_t +#include <iostream> // std::cerr +#include <stdexcept> // std::runtime_error +#include <string> // std::string + +#include <boost/process.hpp> +#include <boost/filesystem.hpp> +#include <boost/format.hpp> +#include <boost/filesystem/fstream.hpp> +#include <boost/process/environment.hpp> +#include <string> +#include <iostream> +#include <string> +#include <cstdio> +#include <sstream> +#include <ostream> + +#define xstr(s) _str(s) +#define _str(s) #s + +// All board variants must obey the CoreDLA CSR spec, which says that all access must be +// - 32 bits in size +// - address must be 4 byte aligned +// - within the address range, CSR size is 2048 bytes +constexpr uint64_t DLA_CSR_ALIGNMENT = 4; +constexpr uint64_t DLA_CSR_SIZE = 2048; +namespace bp = boost::process; //we will assume this for all further examples + +constexpr auto max_size = std::numeric_limits<std::streamsize>::max(); + +static const boost::filesystem::path system_console_path("/home/pmclean/intelfpga_pro/23.4/qprogrammer/syscon/bin/system-console"); +static boost::filesystem::path temp_file_path; +static boost::filesystem::path tcl_file_path; +static boost::filesystem::path sof_file_path; +static uint32_t enable_pmon; +static bool preserve_temp_files; + +const uint32_t DLA_CSR_BASE_ADDRESS = 0x80000000; +const uint32_t DLA_DDR_BASE_ADDRESS = 0x0; + + +static bp::opstream in; +static bp::ipstream out; +static bp::child subprocess; + +static int capture_till_prompt(bp::ipstream& out, std::ostream& capture) +{ + std::array<char, 4096> line_buffer; + if (out.fail()) { + std::cout << "EOF" << std::endl; + return 1; + } + + do { + out.clear(); + out.getline(&line_buffer[0], (std::streamsize)line_buffer.size(), '%'); + capture.write(&line_buffer[0], out.gcount()); + // If out.getline fills the line buffer without encountering the delimiter + // then the 
failbit of out will be set, causing out.fail() to return true. + // bp::ipstream indirectly inherits std::ios_base::iostate, which defines failbit/badbit + } while (out.fail() && (static_cast<long unsigned int> (out.gcount()) == line_buffer.size()-1)); + + if (out.fail()) { + std::cout << "EOF" << std::endl; + return 1; + } + return 0; +} + +static int wait_for_prompt(bp::ipstream& out) +{ + return capture_till_prompt(out, std::cout); +} + +std::string remove_non_alphanumeric(const std::string& input) { + std::string result = input; + result.erase(std::remove_if(result.begin(), result.end(), [](unsigned char c) { + return !std::isalnum(c); + }), result.end()); + return result; +} + +static void send_command(bp::opstream& in, std::string command) +{ + in << command << "\n"; + in.flush(); +} + +static void write_to_csr(bp::opstream& in, bp::ipstream& out, uint32_t addr, uint32_t data) { + addr += DLA_CSR_BASE_ADDRESS; + send_command(in, "master_write_32 $::g_dla_csr_service " + str( boost::format("0x%|08x| 0x%|08x|") % addr % data)); + if (0 != wait_for_prompt(out)) + { + throw std::runtime_error("Unexpected EOF"); + } +} + +static uint32_t read_from_csr(bp::opstream& in, bp::ipstream& out, uint32_t addr) { + if (addr == DLA_DMA_CSR_OFFSET_INTERRUPT_MASK) + { + return 3; + } + if (addr == DLA_DMA_CSR_OFFSET_LICENSE_FLAG) + { + return 1; + } + addr += DLA_CSR_BASE_ADDRESS; + send_command(in, "master_read_32 $::g_dla_csr_service " + str( boost::format("0x%|08x|") % addr ) + " 1"); + std::basic_stringstream<char> s1; + std::string captured; + do { + if (0 != capture_till_prompt(out, s1)) + { + throw std::runtime_error("Unexpected EOF"); + } + captured = s1.str(); + } while (std::all_of(captured.begin(), captured.end(), [](unsigned char c){return (std::isspace(c) || std::iscntrl(c));})); + std::string trimmed = remove_non_alphanumeric(captured); + + uint32_t data = std::stoul(trimmed, nullptr, 16); + + return data; +} + +static void read_from_ddr(bp::opstream& in, 
bp::ipstream& out, uint64_t addr, uint64_t length, void* data) +{ + if (data == nullptr) + { + throw std::runtime_error("null data"); + } + boost::filesystem::path temp_file_name = boost::filesystem::unique_path(); + boost::filesystem::path temppath = temp_file_path / temp_file_name; + send_command(in, "master_read_to_file $::g_emif_ddr_service " + temppath.generic_string() + str( boost::format(" 0x%|08x| 0x%|08x|") % addr % length ) ); + if (0 != wait_for_prompt(out)) { + throw std::runtime_error("Unexpected EOF"); + } + boost::filesystem::ifstream ifs(temppath, std::ios::in | std::ios::binary); + ifs.read(static_cast<char *>(data), length); + ifs.close(); + + if (!preserve_temp_files) { + try { + boost::filesystem::remove(temppath); + } catch (const boost::filesystem::filesystem_error& ex) { + std::cerr << "Error removing file: " << ex.what() << std::endl; + } + } +} + +static void write_to_ddr(bp::opstream& in, bp::ipstream& out, uint64_t addr, uint64_t length, const void* data) +{ + boost::filesystem::path temp_file_name = boost::filesystem::unique_path(); + boost::filesystem::path temppath = temp_file_path / temp_file_name; + boost::filesystem::ofstream ofs(temppath, std::ios::out | std::ios::binary); + if (ofs.fail()) { + throw std::runtime_error("Failed to access the temporary file " + temppath.generic_string()); + } + ofs.write(static_cast<const char *>(data), length); + ofs.close(); + send_command(in, "master_write_from_file $::g_emif_ddr_service " + temppath.generic_string() + str( boost::format(" 0x%|08x|") % addr ) ); + if (0 != wait_for_prompt(out)) + { + throw std::runtime_error("Unexpected EOF"); + } + + if (!preserve_temp_files) { + try { + boost::filesystem::remove(temppath); + } catch (const boost::filesystem::filesystem_error& ex) { + std::cerr << "Error removing file: " << ex.what() << std::endl; + } + } +} + +MmdWrapper::MmdWrapper() { + // Check for the envrionment variable + auto env = boost::this_process::environment(); + tcl_file_path = 
env.find("DLA_SYSCON_SOURCE_FILE") != env.end() ? + boost::filesystem::path(env["DLA_SYSCON_SOURCE_FILE"].to_string()) : + boost::filesystem::path(xstr(DLA_SYSCON_SOURCE_ROOT)) / "system_console_script.tcl"; + if (!boost::filesystem::exists(tcl_file_path)) { + throw std::runtime_error("Cannot locate " + tcl_file_path.generic_string() + ". Please specify the path of the Tcl setup script by defining the environment variable DLA_SYSCON_SOURCE_FILE\n"); + } else { + std::cout <<"Using the Tcl setup script at "<<tcl_file_path.generic_string()<<std::endl; + } + + temp_file_path = env.find("DLA_TEMP_DIR") != env.end() ? + boost::filesystem::path(env["DLA_TEMP_DIR"].to_string()) : + boost::filesystem::current_path(); + if (!boost::filesystem::exists(temp_file_path)) { + throw std::runtime_error("The temporary file storage directory specified via the environment variable DLA_TEMP_DIR does not exist.\n"); + } else { + std::cout <<"Saving temporary files to "<<temp_file_path.generic_string()<<std::endl; + } + + sof_file_path = env.find("DLA_SOF_PATH") != env.end() ? + boost::filesystem::path(env["DLA_SOF_PATH"].to_string()): + boost::filesystem::current_path() / "top.sof"; + if (!boost::filesystem::exists(sof_file_path)) { + throw std::runtime_error("Cannot find the FPGA bitstream (.sof). Please specify its location via the environment variable DLA_SOF_PATH,"\ + " or copy it as top.sof to the current working directory.\n"); + } else { + std::cout <<"Using the FPGA bitstream at "<<sof_file_path.generic_string()<<" to configure the JTAG connection"<<std::endl; + } + + boost::filesystem::path system_console_path = bp::search_path("system-console"); + if (system_console_path.empty()) { + throw std::runtime_error("Cannot find system-console in system PATH!\n"); + + } + enable_pmon = env.find("DLA_ENABLE_PMON") != env.end() ? 1 : 0; + + preserve_temp_files = env.find("DLA_PRESERVE_TEMP_FILES") != env.end() ? 
true : false; + + subprocess = bp::child(system_console_path, "-cli", bp::std_out > out, bp::std_in < in); + if (wait_for_prompt(out)) + { + throw std::runtime_error("Could not find initial prompt"); + } + send_command(in, "set ::cl(sof) " + sof_file_path.generic_string()); + if (enable_pmon == 1) { + send_command(in, "set ::cl(enable_pmon) 1"); + } + send_command(in, "source " + tcl_file_path.generic_string()); + std::basic_stringstream<char> s1; + if (0 != capture_till_prompt(out, s1)) + { + throw std::runtime_error("Could not find prompt after source"); + } + std::string captured(s1.str()); + + // Reset the IP + write_to_csr(in, out, DLA_DMA_CSR_OFFSET_IP_RESET, 1); + // Constants of the design + maxInstances_ = 1; + ddrSizePerInstance_ = 0x80000000; + // Need to change the frequencies below when their counterparts in the Qsys system are modified + coreDlaClockFreq_ = 200; + ddrClockFreq_ = 200; + // Initialize the handle_ object to a dummy value. It is not relevant to this MMD + handle_ = 0; +} + +MmdWrapper::~MmdWrapper() { + send_command(in, "close_services"); + if (wait_for_prompt(out)) + { + std::cout << "Could not find prompt after attempting to close system console services\n"; + } + send_command(in, "exit"); + try { + subprocess.terminate(); + std::cout << "Successfully closed JTAG services.\n"; + } catch (const boost::process::process_error& e) { + std::cerr << "Failed to terminate the system-console process due to reason: " << e.what() << std::endl; + } +} + +void MmdWrapper::RegisterISR(interrupt_service_routine_signature func, void *data) const { + throw std::runtime_error("System Console plugin requires polling"); +} + +void MmdWrapper::WriteToCsr(int instance, uint32_t addr, uint32_t data) const { + write_to_csr(in, out, addr, data); +} + +uint32_t MmdWrapper::ReadFromCsr(int instance, uint32_t addr) const { + return read_from_csr(in, out, addr); +} + +void MmdWrapper::WriteToDDR(int instance, uint64_t addr, uint64_t length, const void *data) const 
{ + write_to_ddr(in, out, addr, length, data); +} + +void MmdWrapper::ReadFromDDR(int instance, uint64_t addr, uint64_t length, void *data) const { + read_from_ddr(in, out, addr, length, data); +} + +#ifndef STREAM_CONTROLLER_ACCESS +// Stream controller access is not supported by the platform abstraction +bool MmdWrapper::bIsStreamControllerValid(int instance) const { return false; } + +// 32-bit handshake with each Stream Controller CSR +void MmdWrapper::WriteToStreamController(int instance, uint32_t addr, uint64_t length, const void *data) const { + assert(false); +} + +void MmdWrapper::ReadFromStreamController(int instance, uint32_t addr, uint64_t length, void *data) const { + assert(false); +} +#else +// If the mmd layer supports accesses to the Stream Controller +bool MmdWrapper::bIsStreamControllerValid(int instance) const { + return false; +} + +// 32-bit handshake with each Stream Controller CSR +void MmdWrapper::WriteToStreamController(int instance, uint32_t addr, uint64_t length, const void *data) const { +} + +void MmdWrapper::ReadFromStreamController(int instance, uint32_t addr, uint64_t length, void *data) const { +} +#endif diff --git a/python/openvino/runtime/coredla_device/mmd/system_console/system_console_script.tcl b/python/openvino/runtime/coredla_device/mmd/system_console/system_console_script.tcl new file mode 100644 index 0000000..9e0e386 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/system_console/system_console_script.tcl @@ -0,0 +1,79 @@ +# Author: linqiaol +# Purpose: Perform write-read tests on external memory and CoreDLA CSR to make sure the registers can be accessed from host. 
+
+# Declare and initialize CL arguments
+# (defaults used when the C++ MmdWrapper does not set them before sourcing)
+if {![info exists ::cl(sof)]} {
+    set ::cl(sof) "top.sof"
+}
+
+if {![info exists ::cl(enable_pmon)]} {
+    set ::cl(enable_pmon) 0
+}
+
+# Declare global variables
+# Handles of the claimed JTAG master services; empty until initialization runs.
+set ::g_emif_calip_service ""
+set ::g_emif_ddr_service ""
+set ::g_dla_csr_service ""
+set ::g_pmon_service ""
+
+# Declare some constants
+# Address windows claimed on the JTAG master: DDR at 0x0 (2 GB) and the DLA CSR
+# window at 0x080000000 -- must match DLA_CSR_BASE_ADDRESS in mmd_wrapper.cpp.
+set ::g_const_master_offset_emif 0x0
+set ::g_const_master_range_emif 0x080000000
+set ::g_const_master_offset_dla 0x080000000
+set ::g_const_master_range_dla 0x000001000
+
+#{{{ load_sof
+# Program the FPGA with the bitstream named in ::cl(sof).
+proc load_sof {} {
+    puts "loading sof: $::cl(sof)"
+    design_load $::cl(sof)
+}
+#}}}
+
+#{{{claim_emif_ddr_service
+# Claim exclusive access to the DDR window on the first JTAG master found.
+proc claim_emif_ddr_service {} {
+    set all_master_paths [get_service_paths master]
+    set path [lindex $all_master_paths [lsearch -glob $all_master_paths *jtag*master*]]
+    set service [claim_service master $path {} "\{${::g_const_master_offset_emif} ${::g_const_master_range_emif} EXCLUSIVE\}"]
+    return $service
+}
+#}}}
+
+#{{{claim_dla_csr_service
+# Claim exclusive access to the DLA CSR window on the first JTAG master found.
+proc claim_dla_csr_service {} {
+    set all_master_paths [get_service_paths master]
+    set path [lindex $all_master_paths [lsearch -glob $all_master_paths *jtag*master*]]
+    set service [claim_service master $path {} "\{${::g_const_master_offset_dla} ${::g_const_master_range_dla} EXCLUSIVE\}"]
+    return $service
+}
+#}}}
+
+#{{{claim_pmon_service
+# Claim the AXI4 performance-monitor master (only when ::cl(enable_pmon) is 1).
+proc claim_pmon_service {} {
+    set all_master_paths [get_service_paths master]
+    set path [lindex $all_master_paths [lsearch -glob $all_master_paths *pmon*master*]]
+    set service [claim_service master $path {} {{0x0 0x00001000 EXCLUSIVE}}]
+    return $service
+}
+#}}}
+
+# Program the device and claim every service the C++ side will use.
+proc initialization {} {
+    load_sof
+    puts "Claim required services"
+    set ::g_dla_csr_service [claim_dla_csr_service]
+    set ::g_emif_ddr_service [claim_emif_ddr_service]
+    if {$::cl(enable_pmon) == 1} {
+        puts "Claiming JTAG service to the AXI4 performance monitor"
+        set ::g_pmon_service [claim_pmon_service]
+    }
+}
+
+# Release every claimed service; invoked by the MmdWrapper destructor.
+proc close_services {} {
+    close_service master $::g_dla_csr_service
+    if {$::cl(enable_pmon) == 1} {
+        close_service master $::g_pmon_service
+    }
+    close_service master $::g_emif_ddr_service
+    puts "Closed DLA JTAG services"
+}
+
+initialization
\ No newline at end of file diff --git a/python/openvino/runtime/coredla_device/src/coredla_batch_job.cpp b/python/openvino/runtime/coredla_device/src/coredla_batch_job.cpp new file mode 100644 index 0000000..9ac7598 --- /dev/null +++ b/python/openvino/runtime/coredla_device/src/coredla_batch_job.cpp @@ -0,0 +1,125 @@ +// Copyright 2020-2023 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. + +#include "coredla_batch_job.h" //CoreDlaBatchJob +#include "dla_dma_constants.h" //DLA_DMA_CSR_OFFSET_*** +#include "stream_controller_comms.h" + +static constexpr int CONFIG_READER_DATA_BYTES = 8; + +std::unique_ptr<BatchJob> CoreDlaBatchJob::MakeUnique(MmdWrapper* mmdWrapper, + uint64_t totalConfigWords, + uint64_t configBaseAddrDDR, + uint64_t inputAddrDDR, + uint64_t outputAddrDDR, + uint64_t inputSizeDDR, + uint64_t outputSizeDDR, + const bool enableIstream, + const bool enableOstream, + int instance, + std::shared_ptr<StreamControllerComms> spStreamControllerComms) { + return std::unique_ptr<BatchJob>(new CoreDlaBatchJob(mmdWrapper, + totalConfigWords, + configBaseAddrDDR, + inputAddrDDR, + outputAddrDDR, + inputSizeDDR, + outputSizeDDR, + enableIstream, + enableOstream, + instance, + spStreamControllerComms)); +} +CoreDlaBatchJob::CoreDlaBatchJob(MmdWrapper* mmdWrapper, + uint64_t totalConfigWords, + uint64_t configBaseAddrDDR, + uint64_t inputAddrDDR, + uint64_t outputAddrDDR, + uint64_t inputSizeDDR, + uint64_t outputSizeDDR, + const 
bool enableIstream, + const bool enableOstream, + int instance, + std::shared_ptr<StreamControllerComms> spStreamControllerComms) +: mmdWrapper_(mmdWrapper) +, instance_(instance) +, totalConfigWords_(totalConfigWords) +, configBaseAddrDDR_(configBaseAddrDDR) +, inputAddrDDR_(inputAddrDDR) +, outputAddrDDR_(outputAddrDDR) +, inputSizeDDR_(inputSizeDDR) +, outputSizeDDR_(outputSizeDDR) +, enableIstream_(enableIstream) +, enableOstream_(enableOstream) +, lastJobQueueNumber_(0) +, spStreamControllerComms_(spStreamControllerComms) { +} + +// This function must be called by a single thread +// It can be called on a different thread than StartDla or WaitForDla +void CoreDlaBatchJob::LoadInputFeatureToDDR(void* inputArray) { + mmdWrapper_->WriteToDDR(instance_, inputAddrDDR_, inputSizeDDR_, inputArray); + StartDla(); +} + +void CoreDlaBatchJob::ScheduleInputFeature() const { + if (spStreamControllerComms_) { + // Send message to NIOS-V + uint64_t configurationSize64 = (totalConfigWords_ / CONFIG_READER_DATA_BYTES) - 2; + uint32_t configurationBaseAddressDDR = static_cast<uint32_t>(configBaseAddrDDR_); + uint32_t configurationSize = static_cast<uint32_t>(configurationSize64); + uint32_t inputAddressDDR = static_cast<uint32_t>(inputAddrDDR_); + uint32_t outputAddressDDR = static_cast<uint32_t>(outputAddrDDR_); + + Payload<CoreDlaJobPayload> item; + item._configurationBaseAddressDDR = configurationBaseAddressDDR; + item._configurationSize = configurationSize; + item._inputAddressDDR = inputAddressDDR; + item._outputAddressDDR = outputAddressDDR; + + spStreamControllerComms_->ScheduleItems( { item } ); + } +} + +// This function must be called by a single thread +// It can be called on a different thread than WaitForDla or LoadInputFeatureToDDR +void CoreDlaBatchJob::StartDla() { + ////////////////////////////////////// + // Write to CSR to start the FPGA // + ////////////////////////////////////// + + // interrupt mask was already enabled in the DlaDevice constructor + + // 
intermediate buffer address was already set when the graph was loaded + + // base address for config reader + mmdWrapper_->WriteToCsr(instance_, DLA_DMA_CSR_OFFSET_CONFIG_BASE_ADDR, configBaseAddrDDR_); + + // how many words for config reader to read + // hardware wants the number of words minus 2 since the implementation is a down counter which ends at -1, the sign + // bit is used to denote the end of the counter range + mmdWrapper_->WriteToCsr(instance_, DLA_DMA_CSR_OFFSET_CONFIG_RANGE_MINUS_TWO, (totalConfigWords_ / CONFIG_READER_DATA_BYTES) - 2); + + if (enableIstream_ && enableOstream_) { + // Arm the streaming interface. Will continuously load configs. + const unsigned int enable = 1; + mmdWrapper_->WriteToCsr(instance_, DLA_CSR_OFFSET_READY_STREAMING_IFACE, enable); + } else { + // base address for feature reader -- this will trigger one run of DLA + mmdWrapper_->WriteToCsr(instance_, DLA_DMA_CSR_OFFSET_INPUT_OUTPUT_BASE_ADDR, inputAddrDDR_); + } +} + +void CoreDlaBatchJob::ReadOutputFeatureFromDDR(void* outputArray) const { + mmdWrapper_->ReadFromDDR(instance_, outputAddrDDR_, outputSizeDDR_, outputArray); +} diff --git a/python/openvino/runtime/coredla_device/src/coredla_device.cpp b/python/openvino/runtime/coredla_device/src/coredla_device.cpp new file mode 100644 index 0000000..b28d8a2 --- /dev/null +++ b/python/openvino/runtime/coredla_device/src/coredla_device.cpp @@ -0,0 +1,574 @@ +// Copyright 2020-2023 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. 
+// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. + +#include "coredla_device.h" //CoreDlaDevice +#include "coredla_batch_job.h" //CoreDlaBatchJob +#include "coredla_graph_job.h" //CoreDlaBatchJob +#include "dla_dma_constants.h" //DLA_DMA_CSR_OFFSET_*** +#include "stream_controller_comms.h" + +#include <algorithm> //std::count +#include <cassert> //assert +#include <chrono> //std::chrono::seconds +#include <cstddef> //size_t +#include <cstdlib> //std::getenv +#ifndef USE_OLD_COREDLA_DEVICE +#include <cinttypes> //printf formatters +#endif +#include <mutex> //std::mutex +#include <stdexcept> //std::runtime_error +#include <string> //std::string +#include <iostream> //std::cerr +#include <stdint.h> // +#include <thread> +#include <cinttypes> + +std::unique_ptr<Device> Device::MakeUnique(const arch_params* archParams, + uint32_t waitForDlaTimeoutSeconds) { + return std::unique_ptr<Device>(new CoreDlaDevice(waitForDlaTimeoutSeconds)); +} + +void InterruptServiceRoutine(int handle, void* data) { + InterruptServiceRoutineData* isrData = static_cast<InterruptServiceRoutineData*>(data); + // clear interrupt status -- write 1 to clear that bit + constexpr int writeDataToClearInterruptStatus = 3; + const int numInstances = static_cast<int>(isrData->jobsFinished.size()); + for (int i = 0; i < numInstances; i++) { + isrData->mmdWrapper->WriteToCsr(i, DLA_DMA_CSR_OFFSET_INTERRUPT_CONTROL, writeDataToClearInterruptStatus); + } + for (int i = 0; i < numInstances; i++) { + isrData->desc_queue_diag[i] = isrData->mmdWrapper->ReadFromCsr(i, DLA_DMA_CSR_OFFSET_DESC_DIAGNOSTICS); + // ask the csr how many jobs have finished + uint32_t completionCount = isrData->mmdWrapper->ReadFromCsr(i, DLA_DMA_CSR_OFFSET_COMPLETION_COUNT); + // check if the completionCount wraps around (overflow detection) and save this information + if (isrData->prevCount[i] > completionCount) 
+ isrData->base_multiplier[i] ++; + isrData->prevCount[i] = completionCount; + // we add base_multiplier to account for the fact that a wrap around is actually an increment of 1 + std::unique_lock<std::mutex> isrMutexLock(isrData->isrMutex[i]); + isrData->jobsFinished[i] = (uint64_t) isrData->base_multiplier[i] * UINT32_MAX + completionCount + isrData->base_multiplier[i]; + isrData->isrCondVar[i].notify_all(); + } +} + +CoreDlaDevice::CoreDlaDevice(uint32_t waitForDlaTimeoutSeconds) +: waitForDlaTimeoutSeconds_(waitForDlaTimeoutSeconds) { +#ifdef COREDLA_RUNTIME_POLLING + runtimePolling_ = true; +#else + runtimePolling_ = false; +#endif + // mmdWrapper_ ctor runs first, which will open a handle to the MMD. Now determine the number of hardware instances + // by writing a nonzero value to some offset and then reading it back. While trying to enable the interrupt + // mask, test for this. + numInstances_ = 0; + for (int i = 0; i < mmdWrapper_.GetMaxInstances(); i++) { + constexpr uint32_t allInterruptsMask = (1<<DLA_DMA_CSR_INTERRUPT_ERROR_BIT) | (1<<DLA_DMA_CSR_INTERRUPT_DONE_BIT); + // clear any pending interrupts (there may be pending interrupts from last run), then enable mask for instance count + mmdWrapper_.WriteToCsr(i, DLA_DMA_CSR_OFFSET_INTERRUPT_CONTROL, allInterruptsMask); + mmdWrapper_.WriteToCsr(i, DLA_DMA_CSR_OFFSET_INTERRUPT_MASK, allInterruptsMask); + uint32_t readData = mmdWrapper_.ReadFromCsr(i, DLA_DMA_CSR_OFFSET_INTERRUPT_MASK); + if (allInterruptsMask == readData) numInstances_ = i + 1; + } + LOG_AND_PRINT(Logger::INFO, "numInstances_: %d\n", numInstances_); + assert(numInstances_ >= 1); + jobsWaited_.resize(numInstances_, 0); + + uint32_t license = mmdWrapper_.ReadFromCsr(0, DLA_DMA_CSR_OFFSET_LICENSE_FLAG); + if (license == 0) { + DLA_LOG("Using unlicensed IP\n"); + } + else if (license == 1) { + DLA_LOG("Using licensed IP\n"); + } + else { + throw std::runtime_error("Unrecongnized license flag"); + } +#ifndef USE_OLD_COREDLA_DEVICE + 
startClocksActive.resize(numInstances_, 0); + startClockAllJobs.resize(numInstances_, 0); +#endif + startNumInputFeatureMemoryReads.resize(numInstances_, 0); + startNumFilterMemoryReads.resize(numInstances_, 0); + startNumOutputFeatureMemoryWrites.resize(numInstances_, 0); + + // Package up the data that interrupt service routine needs + isrData_.mmdWrapper = &mmdWrapper_; + isrData_.jobsFinished = std::vector<uint64_t>(numInstances_, 0); + isrData_.base_multiplier = std::vector<uint32_t>(numInstances_, 0); + isrData_.prevCount = std::vector<uint32_t>(numInstances_, 0); + isrData_.desc_queue_diag = std::vector<uint32_t>(numInstances_, 0); + isrData_.isrMutex = std::vector<std::mutex>(numInstances_); + isrData_.isrCondVar = std::vector<std::condition_variable>(numInstances_); + + if (runtimePolling_) { + // disable the interrupt mask -- it was originally enabled to determine how many instances were present + for (int i = 0; i < mmdWrapper_.GetMaxInstances(); i++) { + constexpr uint32_t disableInterruptMaskValue = 0; + mmdWrapper_.WriteToCsr(i, DLA_DMA_CSR_OFFSET_INTERRUPT_MASK, disableInterruptMaskValue); + } + } + else { + // register an interrupt handler + mmdWrapper_.RegisterISR(&InterruptServiceRoutine, &isrData_); + } + + // Record the current counters + for(int i=0; i < numInstances_; i++) { +#ifndef USE_OLD_COREDLA_DEVICE + jobsWaited_[i] = mmdWrapper_.ReadFromCsr(i, DLA_DMA_CSR_OFFSET_COMPLETION_COUNT); + isrData_.jobsFinished[i] = jobsWaited_[i]; + + startClocksActive[i] = GetClocksActive(i); + startClockAllJobs[i] = GetClocksAllJobs(i); +#endif + startNumInputFeatureMemoryReads.at(i) = GetNumInputFeatureMemoryReadsTotal(i); + startNumFilterMemoryReads.at(i) = GetNumFilterMemoryReadsTotal(i); + startNumOutputFeatureMemoryWrites.at(i) = GetNumOutputFeatureMemoryWritesTotal(i); + } + + // Allocator needs access to mmd to write to CSR the start address of the shared intermediate buffer allocated in DDR + ddrAllocator_ = 
std::unique_ptr<DeviceMemoryAllocator[]>(new DeviceMemoryAllocator[numInstances_]); + for (int i = 0; i < numInstances_; i++) { + ddrAllocator_[i].Initialize(mmdWrapper_.GetDDRSizePerInstance(), &mmdWrapper_); + } + +// Choose which data pattern you want, all zeros or all ones can also be useful for IP debug purposes +#define DEBUG_RUNTIME_MEMORY_TEST_PATTERN(ADDR, INDEX) ((ADDR * 12345) + (INDEX * 6789)) + //#define DEBUG_RUNTIME_MEMORY_TEST_PATTERN(ADDR,INDEX) (0) + //#define DEBUG_RUNTIME_MEMORY_TEST_PATTERN(ADDR,INDEX) (0xffffffffffffffffULL) + bool run_memory_test = getenv("COREDLA_RUNTIME_MEMORY_TEST") != nullptr; + if (run_memory_test) { + // Ensure host can access all of the device memory that is accessible by all CoreDLA instances + // This is not necessarily the total device memory e.g. only 1 CoreDLA instance but 2 DDR banks + DLA_LOG("starting memory test with %d instances\n", numInstances_); + constexpr uint64_t CHUNK_SIZE = 1ULL << 20; // one address check is 1 MB + const uint64_t ADDR_LIMIT = mmdWrapper_.GetDDRSizePerInstance(); + int mismatch = 0; + uint64_t expected; + uint64_t* data = new uint64_t[CHUNK_SIZE / sizeof(uint64_t)]; + + for (int inst = 0; inst < numInstances_; ++inst) { + // write to entire fpga ddr + for (uint64_t addr = 0; addr < ADDR_LIMIT; addr += CHUNK_SIZE) { + for (uint64_t index = 0; index < CHUNK_SIZE / sizeof(uint64_t); index++) + data[index] = DEBUG_RUNTIME_MEMORY_TEST_PATTERN(addr, index); + mmdWrapper_.WriteToDDR(inst, addr, CHUNK_SIZE, static_cast<const void*>(data)); + } + // read back entire fpga ddr and compare to expected + for (uint64_t addr = 0; addr < ADDR_LIMIT; addr += CHUNK_SIZE) { + mmdWrapper_.ReadFromDDR(inst, addr, CHUNK_SIZE, data); + for (uint64_t index = 0; index < CHUNK_SIZE / sizeof(uint64_t); index++) { + expected = DEBUG_RUNTIME_MEMORY_TEST_PATTERN(addr, index); + if (data[index] != expected) { + if (mismatch < 10) { +#if (!defined(USE_OLD_COREDLA_DEVICE) || defined(_WIN32)) + DLA_LOG("memory test 
mismatch, addr %" PRIu64 ", index %" PRIu64 ", got %" PRIu64 ", expected %" PRIu64 + "\n", + addr, + index, + data[index], + expected); +#else + DLA_LOG("memory test mismatch, addr %lu, index %lu, got %lu, expected %lu\n", + addr, + index, + data[index], + expected); +#endif + } + mismatch++; + } + } + } + } + delete[] data; + DLA_LOG("finished memory test "); + if (mismatch == 0) { + DLA_LOG("SUCCESS\n"); + } else { + DLA_LOG("FAILURE (%d mismatches)\n", mismatch); + } + } +} + +CoreDlaDevice::~CoreDlaDevice() { + // Avoid the scenario where some CoreDLA job has been started but something goes wrong + // in the runtime which causes it to exit, e.g. assertion failure or uncaught exception. + // CoreDLA will still raise an interrupt when the job finishes, yet the runtime will no + // longer be able to deal with it. Better to shut off interurpts. + for (int instance = 0; instance < numInstances_; instance++) { + // MmDWrapper.WriteToCSR might throw exception, and the destructor should not have + // unhandled exception, so we need to handle exceptions internally + try { + mmdWrapper_.WriteToCsr(instance, DLA_DMA_CSR_OFFSET_INTERRUPT_MASK, 0); + } catch (const std::exception& e) { + std::cerr << "Failed to shut off the DMA CSR interrupt mask due to " << e.what() << std::endl; + } + } +} + +GraphJob* CoreDlaDevice::CreateGraphJob(const dla::CompiledResult* compiledResult, +#ifndef USE_OLD_COREDLA_DEVICE + size_t numPipelines, +#else + uint64_t numPipelines, +#endif + int instance, + std::string AES_key, + std::string IV_key, + bool encryption_enabled, + const std::string export_dir, + const std::string parameter_rom_export_dir) { + assert(instance < numInstances_); + (void) export_dir; // unused in HW runtime. CoreDLA utilizes base pointers, which the SW emulator utilizes this variable. We void it here. 
+ allGraphJobs_.push_back(move( + CoreDlaGraphJob::MakeUnique(&ddrAllocator_[instance], &mmdWrapper_, compiledResult, numPipelines, instance, spStreamControllerComms_))); + return (allGraphJobs_.back()).get(); +} + +// This function must be called by a single thread +void CoreDlaDevice::WaitForDla(int instance, size_t threadId, std::function<bool()> isCancelledPredicate) { + // ISR updates jobsFinished, if not enough jobs have finished then sleep until ISR runs again + // it is possible that several hardware jobs could finish around the same time + // by the time software handles the first interrupt, hardware could report that 2 jobs have + // finished, for example the second time that waitForInterrupt runs, software already tracks + // that the second job has finished and therefore don't need to sleep waiting for ISR + std::unique_lock<std::mutex> isrMutexLock(isrData_.isrMutex[instance]); + uint32_t completionCount = 0; + bool timedOut = false; + auto timeoutDuration = std::chrono::seconds(waitForDlaTimeoutSeconds_); + + if (runtimePolling_) { + std::chrono::time_point<std::chrono::system_clock> pollingEndingTime = + std::chrono::system_clock::now() + timeoutDuration; + + while (isrData_.jobsFinished[instance] == jobsWaited_[instance]) { + // Update isrData_.jobsFinished[instance] here (polling) + if (isCancelledPredicate and isCancelledPredicate()) { + break; + } + + completionCount = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_COMPLETION_COUNT); + isrData_.jobsFinished[instance] = completionCount; + if (std::chrono::system_clock::now() > pollingEndingTime) { + timedOut = true; + break; + } + } + } else { + while (isrData_.jobsFinished[instance] == jobsWaited_[instance]) { + // isrData_.jobsFinished[instance] is updated in the ISR + if (std::cv_status::timeout == isrData_.isrCondVar[instance].wait_for(isrMutexLock, timeoutDuration)) { + timedOut = true; + break; + } + } + } + + if (timedOut) { + std::string str_poll_vs_int = "interrupt"; + if 
(runtimePolling_) { + str_poll_vs_int = "polling"; + } + std::string timeoutMsg = "WaitForDla " + str_poll_vs_int + " timeout with threadId_" + std::to_string(threadId) + "\n"; + + // Timeout has happened if we get here + timeoutMsg += "If inference on one batch is expected to take more than " + + std::to_string(waitForDlaTimeoutSeconds_) + + " seconds, then increase WAIT_FOR_DLA_TIMEOUT in dlia_plugin.cpp and " + "recompile the runtime.\n"; + DLA_LOG("%s", timeoutMsg.c_str()); // this should always print, even if logging + // verbosity is too low + LOG(Logger::WARNING, "%s", timeoutMsg.c_str()); + std::string exceptionMsg = "FATAL ERROR: inference on FPGA did not complete"; + exceptionMsg += ", jobs finished " + std::to_string(isrData_.jobsFinished[instance]); + exceptionMsg += ", jobs waited " + std::to_string(jobsWaited_[instance]); + throw std::runtime_error(exceptionMsg); + } + + if ((isrData_.desc_queue_diag[instance] >> DLA_DMA_CSR_DESC_DIAGNOSTICS_OUT_OF_INFERENCES_BIT) & 0x01) { + std::cerr << "ERROR: Out of free inferences on this IP. " << + "The Intel FPGA AI suite cannot continue without a license!" << std::endl; + std::string exceptionMsg = "Inference on FPGA exited with a license error"; + exceptionMsg += ", jobs finished " + std::to_string(isrData_.jobsFinished[instance]); + exceptionMsg += ", jobs waited " + std::to_string(jobsWaited_[instance]); + exceptionMsg += "\nPlease check your license. 
The Intel FPGA AI suite cannot continue without a license!"; + throw std::runtime_error(exceptionMsg); + } + + jobsWaited_[instance]++; +} + +#ifndef USE_OLD_COREDLA_DEVICE +uint64_t CoreDlaDevice::GetClocksActive(int instance) const { + //Important: To satisfy the anti-rollover feature of the 64-bit counters in the DMA CSR + //the host must first read the lower 32-bit of the counter, + //then immediately read the higher 32-bit of the counter + uint32_t clocksActiveLo = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_CLOCKS_ACTIVE_LO); + uint32_t clocksActiveHi = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_CLOCKS_ACTIVE_HI); + return (((uint64_t)clocksActiveHi) << 32) | clocksActiveLo; +} + +double CoreDlaDevice::GetActiveHWTimeMs(int instance) const { + uint64_t clocksActive = GetClocksActive(instance) - startClocksActive[instance]; + // DDR clock freq is in MHz, so dividing by that would give microseconds, multiply by 1000 to get milliseconds + return clocksActive / (1000.0 * mmdWrapper_.GetDDRClockFreq()); +} + +uint64_t CoreDlaDevice::GetClocksAllJobs(int instance) const { + //Important: To satisfy the anti-rollover feature of the 64-bit counters in the DMA CSR + //the host must first read the lower 32-bit of the counter, + //then immediately read the higher 32-bit of the counter + uint32_t clocksAllJobsLo = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_CLOCKS_ALL_JOBS_LO); + uint32_t clocksAllJobsHi = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_CLOCKS_ALL_JOBS_HI); + return (((uint64_t)clocksAllJobsHi) << 32) | clocksAllJobsLo; +} + +double CoreDlaDevice::GetAvgHWTimePerJobMs(uint64_t num_jobs, int instance) const { + uint64_t clocksAllJobs = GetClocksAllJobs(instance) - startClockAllJobs[instance]; + // DDR clock freq is in MHz, so dividing by that would give microseconds, multiply by 1000 to get milliseconds + return clocksAllJobs / (1000.0 * mmdWrapper_.GetDDRClockFreq() * num_jobs); +} +#else +double 
CoreDlaDevice::GetActiveHWTimeMs(int instance) const { + //Important: To satisfy the anti-rollover feature of the 64-bit counters in the DMA CSR + //the host must first read the lower 32-bit of the counter, + //then immediately read the higher 32-bit of the counter + uint32_t clocksActiveLo = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_CLOCKS_ACTIVE_LO); + uint32_t clocksActiveHi = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_CLOCKS_ACTIVE_HI); + uint64_t clocksActive = (((uint64_t)clocksActiveHi) << 32) | clocksActiveLo; + // DDR clock freq is in MHz, so dividing by that would give microseconds, multiply by 1000 to get milliseconds + return clocksActive / (1000.0 * mmdWrapper_.GetDDRClockFreq()); +} + +double CoreDlaDevice::GetAvgHWTimePerJobMs(uint64_t num_jobs, int instance) const { + //Important: To satisfy the anti-rollover feature of the 64-bit counters in the DMA CSR + //the host must first read the lower 32-bit of the counter, + //then immediately read the higher 32-bit of the counter + uint32_t clocksAllJobsLo = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_CLOCKS_ALL_JOBS_LO); + uint32_t clocksAllJobsHi = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_CLOCKS_ALL_JOBS_HI); + uint64_t clocksAllJobs = (((uint64_t)clocksAllJobsHi) << 32) | clocksAllJobsLo; + // DDR clock freq is in MHz, so dividing by that would give microseconds, multiply by 1000 to get milliseconds + return clocksAllJobs / (1000.0 * mmdWrapper_.GetDDRClockFreq() * num_jobs); +} +#endif + +uint64_t CoreDlaDevice::GetNumInputFeatureMemoryReads(int instance) const { + return GetNumInputFeatureMemoryReadsTotal(instance) - startNumInputFeatureMemoryReads.at(instance); +} + +uint64_t CoreDlaDevice::GetNumFilterMemoryReads(int instance) const { + return GetNumFilterMemoryReadsTotal(instance) - startNumFilterMemoryReads.at(instance); +} + +uint64_t CoreDlaDevice::GetNumOutputFeatureMemoryWrites(int instance) const { + return 
GetNumOutputFeatureMemoryWritesTotal(instance) - startNumOutputFeatureMemoryWrites.at(instance); +} + +uint64_t CoreDlaDevice::GetNumInputFeatureMemoryReadsTotal(int instance) const { + //Important: To satisfy the anti-rollover feature of the 64-bit counters in the DMA CSR + //the host must first read the lower 32-bit of the counter, + //then immediately read the higher 32-bit of the counter + uint32_t numIFReadsLo = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_INPUT_FEATURE_READ_COUNT_LO); + uint32_t numIFReadsHi = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_INPUT_FEATURE_READ_COUNT_HI); + uint64_t numIFReads = (((uint64_t) numIFReadsHi) << 32) | ((uint64_t) numIFReadsLo); + return numIFReads; +} + +uint64_t CoreDlaDevice::GetNumFilterMemoryReadsTotal(int instance) const { + //Important: To satisfy the anti-rollover feature of the 64-bit counters in the DMA CSR + //the host must first read the lower 32-bit of the counter, + //then immediately read the higher 32-bit of the counter + uint32_t numWeightReadsLo = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_INPUT_FILTER_READ_COUNT_LO); + uint32_t numWeightReadsHi = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_INPUT_FILTER_READ_COUNT_HI); + uint64_t numWeightReads = (((uint64_t) numWeightReadsHi) << 32) | ((uint64_t) numWeightReadsLo); + return numWeightReads; +} + +uint64_t CoreDlaDevice::GetNumOutputFeatureMemoryWritesTotal(int instance) const { + //Important: To satisfy the anti-rollover feature of the 64-bit counters in the DMA CSR + //the host must first read the lower 32-bit of the counter, + //then immediately read the higher 32-bit of the counter + uint32_t numOFReadsLo = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_OUTPUT_FEATURE_WRITE_COUNT_LO); + uint32_t numOFReadsHi = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_OUTPUT_FEATURE_WRITE_COUNT_HI); + uint64_t numOFReads = (((uint64_t) numOFReadsHi) << 32) | ((uint64_t) numOFReadsLo); + return numOFReads; +} + +// 
Read one 32-bit value from the debug network, return value indicates whether read was successful. A read can fail if +// the module number and address have not been implemented. The debug network is fault tolerant to both read requests +// never being accepted as well as read responses never being produced. +bool CoreDlaDevice::ReadDebugCsr( + uint32_t moduleNum, uint32_t address, int instance, uint32_t& readData, bool verbose) const { + assert(moduleNum <= 0xff); + assert(address <= 0xffffff); + uint32_t addr = ((moduleNum & 0xff) << 24) | (address & 0xffffff); + + // Step 1: send the address that the debug network will use to issue a read request. Writing once to this CSR offset + // will cause the debug network to issue one read request. + mmdWrapper_.WriteToCsr(instance, DLA_DMA_CSR_OFFSET_DEBUG_NETWORK_ADDR, addr); + + // Optional step: read back the value sent to CSR, sanity check that it is correct. Note this is all handled + // internally to the CSR, e.g. the CSR does not go ask the debug network what address it sent. + uint32_t addrCheck = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_DEBUG_NETWORK_ADDR); + if (addr != addrCheck) { + if (verbose) DLA_LOG("ReadDebugCsr addr read back check failed, expected %u, got %u\n", addr, addrCheck); + return false; + } + + // Step 2: the debug network should produce a read response which is cached by the CSR. Poll the corresponding status + // register inside the CSR until this happens, or until the runtime decides to give up and declare the read a failure. + // Do not throw an exception if the read fails, it is allowed to fail if the runtime is trying to figure out which + // external debug-capable modules are attached to the debug network. Once the runtime has determined that a module is + // attached, only then should read failures should cause an exception. 
+ uint32_t isValid = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_DEBUG_NETWORK_VALID); + int retry = 5; + while (!isValid && retry) { + --retry; + isValid = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_DEBUG_NETWORK_VALID); + } + if (!isValid) { + if (verbose) DLA_LOG("ReadDebugCsr failed to read at addr %u\n", addr); + return false; + } + + // Step 3: runtime has confirmed the CSR has a cached the read response from debug network, now go and get the value. + readData = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_DEBUG_NETWORK_DATA); + if (verbose) DLA_LOG("ReadDebugCsr, addr %u, data %u\n", addr, readData); + return true; +} + +// This is a helper function that throws an exception if runtime fails to read from the debug network. This should only +// be called if the runtime has already confirmed that a module is attached to the debug network i.e. a previous read to +// this module number had succeeded. +void ReadDebugNetworkError(int moduleNum, int address, int instance) { + std::string msg = "ReadDebugNetwork failure, instance " + std::to_string(instance) + + ", failed to read at module number " + std::to_string(moduleNum) + " address " + + std::to_string(address); + throw std::runtime_error(msg); +} + +// Modules attached to the debug network have a ROM to specify the offset and description of the registers. Traverse +// this ROM, then return a map of key/value pairs, where the key is a human readable string describing what kind of +// information the debug register contains, and the value is the data of the debug register. Note that the runtime must +// completely tranverse the ROM before reading any of the debug register values, and the runtime must read the debug +// register values in the order that they occur inside the ROM. Usually profiling counters are 64-bit values, and since +// there is only a 32-bit read available, it takes more than one read to get all the data. 
The counters could still be +// updating when the runtime wants to read them, so typically there is a freeze register which can be activated by +// reading from a special address (hardware will see an incoming read request to this address, that is how it knows to +// freeze the counters). The offset for the freeze register will typically go first in the ROM, even if it is not the +// first offset in the address space. +DebugNetworkData CoreDlaDevice::ReadDebugNetwork(int instance) const { + DebugNetworkData result; + for (uint32_t moduleNum = 0; moduleNum < 256; moduleNum++) { + // Read the ROM to get the offsets and descriptions + std::vector<uint32_t> offset; + std::vector<std::string> description; + uint32_t address = 0, readData = 0; + bool first = true, success = false; + while (1) { + // Parse the offset + success = ReadDebugCsr(moduleNum, address, instance, readData); + if (!success) { + // Failure to read is allowed on the very first time, it is assumed that no external debug-capable module is + // attached to the debug network at this moduleNum + if (first) + break; + else + ReadDebugNetworkError(moduleNum, address, instance); + } + if (!readData) break; // end of list is indicated with offset = 0 + first = false; + address += 4; + offset.push_back(readData); + + // Parse the description string + std::string str; + bool endOfStringSeen = false; + while (!endOfStringSeen) { + success = ReadDebugCsr(moduleNum, address, instance, readData); + if (!success) ReadDebugNetworkError(moduleNum, address, instance); + address += 4; + for (int i = 0; i < 4; i++) { + if (readData & 0xff) { + str += ((char)(readData & 0xff)); + readData >>= 8; + } else { + endOfStringSeen = true; + break; + } + } + } + description.push_back(str); + } + + assert(offset.size() == description.size()); + + // Read the profiling counters + for (size_t i = 0; i < offset.size(); i++) { + address = offset[i]; + success = ReadDebugCsr(moduleNum, address, instance, readData); + if (!success) 
ReadDebugNetworkError(moduleNum, address, instance); + + int descriptionOccurenceCnt = result.count(description[i]); + // Same description name should show up 2 times in maximum + if (descriptionOccurenceCnt == 2) { + throw std::runtime_error("More than 2 profiling counter descriptions are the same."); + } else if (descriptionOccurenceCnt && (address - offset[i - 1] != 4)) { + // same description existed before + // check if the two addresses associatede with the same decription are consecutive (offset by 4) + throw std::runtime_error("Profiling counter addresses with name: " + description[i] + " are not consecutive"); + } else if (std::count(offset.begin(), offset.end(), address) > 1) { + // same address shows up more than once + throw std::runtime_error("Duplicate profiling counter address: " + address); + } + + // Avoid printing special stuff like _Freeze and _Unfreeze + if (description[i].at(0) != '_') { + if (descriptionOccurenceCnt) { + // This key has existed before, concatenate 2 uint32_t into uint64_t + result[description[i]] |= (((uint64_t)readData) << 32); + } else { + result[description[i]] = readData; + } + } + } + } + return result; +} + +int CoreDlaDevice::GetSizeCsrDescriptorQueue() const { return DLA_DMA_CSR_DESCRIPTOR_QUEUE_LOGICAL_SIZE; } + +double CoreDlaDevice::GetCoreDlaClockFreq() const { return mmdWrapper_.GetCoreDlaClockFreq(); } + +std::string CoreDlaDevice::SchedulerGetStatus() const { + if (!spStreamControllerComms_) return ""; + + Payload<StatusMessagePayload> statusPayload = spStreamControllerComms_->GetStatus(); + return spStreamControllerComms_->GetStatusString(statusPayload); +} + +bool CoreDlaDevice::InitializeScheduler(uint32_t sourceBufferSize, + uint32_t dropSourceBuffers, + uint32_t numInferenceRequests, + const std::string source_fifo_file) { + spStreamControllerComms_ = std::make_shared<StreamControllerComms>(); + if (spStreamControllerComms_->IsPresent()) { + bool initOK = 
spStreamControllerComms_->Initialize(sourceBufferSize, dropSourceBuffers, numInferenceRequests); + return initOK; + } else { + spStreamControllerComms_.reset(); + return false; + } +} diff --git a/python/openvino/runtime/coredla_device/src/coredla_graph_job.cpp b/python/openvino/runtime/coredla_device/src/coredla_graph_job.cpp new file mode 100644 index 0000000..c1f349f --- /dev/null +++ b/python/openvino/runtime/coredla_device/src/coredla_graph_job.cpp @@ -0,0 +1,279 @@ +// Copyright 2020-2023 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. 
+ +#include "coredla_graph_job.h" //CoreDlaGraphJob + +#include <cinttypes> +#include <cstdlib> //std::getenv +#include <iomanip> //std::hex +#include <iostream> //std::cerr +#include <sstream> //std::stringstream +#include <string> //std::string + +#define BUILD_VERSION_CSR_OFFSET (ARCH_HASH_SIZE) +#define ARCH_NAME_CSR_OFFSET (ARCH_HASH_SIZE + BUILD_VERSION_SIZE) + +#define FLAG_DISABLE_ARCH_CHECK "DLA_DISABLE_ARCH_CHECK" +#define FLAG_DISABLE_VERSION_CHECK "DLA_DISABLE_VERSION_CHECK" + +std::unique_ptr<GraphJob> CoreDlaGraphJob::MakeUnique(DeviceMemoryAllocator *ddrBufferAllocator, + MmdWrapper *mmdWrapper, + const dla::CompiledResult *compiledResult, + uint64_t numPipelines, + int instance, + std::shared_ptr<StreamControllerComms> spStreamControllerComms) { + return std::unique_ptr<GraphJob>(new CoreDlaGraphJob( + ddrBufferAllocator, mmdWrapper, compiledResult, numPipelines, instance, spStreamControllerComms)); +} + +std::string get_env_var_wrapper(const std::string &env_var) { + const char *env_var_ptr = std::getenv(env_var.c_str()); + if (env_var_ptr == nullptr) { + return ""; + } + + return std::string(env_var_ptr); +} + +std::string arch_hash_to_string(const std::vector<int> &arch_hash) { + std::stringstream s; + for (size_t i = 0; i < ARCH_HASH_WORD_SIZE; ++i) { + s << std::setfill('0') << std::setw(8) << std::hex << std::right << arch_hash[i] << " "; + } + + return s.str(); +} + +std::string read_string_from_bitstream_rom(MmdWrapper *mmdWrapper, + const int instance, + const uint32_t str_word_size_in_bytes, + const uint32_t str_offset_in_rom) { + std::string str_from_rom; + bool done = false; + for (uint32_t i = 0; i < str_word_size_in_bytes && (!done); ++i) { + int chunk = mmdWrapper->ReadFromCsr(instance, str_offset_in_rom + i * 4); + // Parse the int word into chars. Stops at any NUL char. 
+ for (int j = 0; j < 4; ++j) { + char rom_char = (chunk >> (j * 8)) & 0xFF; + if (rom_char == 0) { + done = true; + break; + } else { + str_from_rom.push_back(rom_char); + } + } + } + return str_from_rom; +} + +CoreDlaGraphJob::CoreDlaGraphJob(DeviceMemoryAllocator *ddrBufferAllocator, + MmdWrapper *mmdWrapper, + const dla::CompiledResult *compiledResult, + uint64_t numPipelines, + int instance, + std::shared_ptr<StreamControllerComms> spStreamControllerComms) + : configFilterBiasBufferSizeDDR_(0), + intermediateBufferSizeDDR_(0), + ddrBufferAllocator_(ddrBufferAllocator), + mmdWrapper_(mmdWrapper), + batchJobsRequested_(0), + instance_(instance) { + // First read the arch_md5, build_version_string and arch_name string from + // the metadata stored in the bitstream discovery ROM, then compare them + // against the information present in the compiled result. Fail if it does not match. + + // ARCH_HASH_SIZE bytes for the arch hash. + std::vector<int> bitstream_arch_hash; + DLA_LOG("Read hash from bitstream ROM...\n"); + for (size_t i = 0; i < ARCH_HASH_WORD_SIZE; ++i) { + bitstream_arch_hash.push_back(mmdWrapper_->ReadFromCsr(instance_, i * 4)); + } + + // Next BUILD_VERSION_SIZE bytes are for the build version string + DLA_LOG("Read build version string from bitstream ROM...\n"); + std::string bitstream_build_version = + read_string_from_bitstream_rom(mmdWrapper_, instance_, BUILD_VERSION_WORD_SIZE, BUILD_VERSION_CSR_OFFSET); + + // Next ARCH_NAME_SIZE bytes are for the arch name string + DLA_LOG("Read arch name string from bitstream ROM...\n"); + std::string bitstream_arch_name = + read_string_from_bitstream_rom(mmdWrapper_, instance_, ARCH_NAME_WORD_SIZE, ARCH_NAME_CSR_OFFSET); + + // ************************ Perform all checks ******************************* + // *************************************************************************** + if (get_env_var_wrapper(FLAG_DISABLE_ARCH_CHECK) != "1") { + DLA_LOG("Runtime arch check is enabled. 
Check started...\n"); + + for (size_t i = 0; i < ARCH_HASH_WORD_SIZE; ++i) { + if (compiledResult->get_arch_hash()[i] != bitstream_arch_hash[i]) { + std::cerr << "Arch check failed: " + << "compiledResult arch hash is " << arch_hash_to_string(compiledResult->get_arch_hash()) + << ", compiledResult arch is " << compiledResult->get_arch_name() << ", bitstream arch_hash is " + << arch_hash_to_string(bitstream_arch_hash) << ", bitstream arch is " << bitstream_arch_name + << std::endl; + + std::cerr << "This check can be disabled by setting environment variable " << FLAG_DISABLE_ARCH_CHECK << "=1." + << std::endl; + std::exit(1); + } + } + DLA_LOG("Runtime arch check passed.\n"); + } else { + DLA_ERROR( + "Environment variable %s is set to 1; " + "architecture check will be skipped. " + "This might cause undefined behavior including hanging, " + "and the user should only disable the check if " + "they understand the potential consequences.\n", + FLAG_DISABLE_ARCH_CHECK); + } + + if (get_env_var_wrapper(FLAG_DISABLE_VERSION_CHECK) != "1") { + DLA_LOG( + "Runtime build version check is enabled. " + "Check started...\n"); + if (bitstream_build_version != compiledResult->get_build_version_string()) { + std::cerr << "Build version check failed:" + << "compiledResult build version is " << compiledResult->get_build_version_string() + << ", bitstream build version is " << bitstream_build_version << std::endl; + + std::cerr << "This check can be disabled by setting environment variable " << FLAG_DISABLE_VERSION_CHECK << "=1." + << std::endl; + + std::exit(1); + } + DLA_LOG("Runtime build version check passed.\n"); + } else { + DLA_ERROR( + "Environment variable %s is set to 1; " + "build version check will be skipped. " + "This might cause undefined behavior including hanging, " + "and the user should only disable the check if " + "they understand the potential consequences.\n", + FLAG_DISABLE_VERSION_CHECK); + } + + // Checks completed. 
Allocate buffers and write to DDR + intermediateBufferSizeDDR_ = compiledResult->get_conv_intermediate_size_in_bytes(); + uint64_t totalConfigBytes = compiledResult->get_ddrfree_header().enable_parameter_rom ? + 0 : + compiledResult->get_config_size_in_bytes(); + auto &config_fbs_array = compiledResult->get_config_filter_bias_scale_array(); + auto config_fbs_raw_array = compiledResult->get_ddrfree_header().enable_parameter_rom ? + nullptr : + config_fbs_array[0].data(); + configFilterBiasBufferSizeDDR_ = compiledResult->get_ddrfree_header().enable_parameter_rom ? + 0 : + config_fbs_array[0].size(); + + // TODO: uncomment when buffer_t object is added + // assert(config_filter_bias_graph_buffer_size_ddr == config_filter_bias_buffer->size_in_bytes()); + // Allocate graph buffer (config, filter, bias, io) in DDR + uint64_t inputSizeDDR = compiledResult->get_conv_input_size_in_bytes(); + uint64_t outputSizeDDR = compiledResult->get_conv_output_size_in_bytes(); + + // DMA data path width in bytes for feature and filter data + // TODO: move this into the arch + constexpr uint64_t featureWordSize = 32; + constexpr uint64_t filterWordSize = 64; + + // Sanity check that buffer sizes are sufficiently aligned to ensure address alignment. + // Input, output, and intermediate buffers contain feature words. + assert(inputSizeDDR % featureWordSize == 0); + assert(outputSizeDDR % featureWordSize == 0); + assert(intermediateBufferSizeDDR_ % featureWordSize == 0); + // filter contains filter words, and config must be padded to a filter word size + assert(totalConfigBytes % filterWordSize == 0); + assert(configFilterBiasBufferSizeDDR_ % filterWordSize == 0); + + // Allocate the intermediate buffer. + ddrBufferAllocator_->AllocateSharedBuffer(intermediateBufferSizeDDR_, instance_); + + // Allocate the input/output buffer. + // Output buffer must come immediately after the input buffer, so from an allocation perspective this is one buffer. 
+ // Note there is an input/output buffer pair allocated for each pipeline. The input/output pair must be contiguous for + // each pipeline, but input/output pairs from different pipelines are allowed to have a gap. We could call the + // allocator for each input/output buffer pair, however because everything is sized and aligned to the feature word + // size, we won't get gaps between them due to alignment. Calling the allocator once per pipeline would result in the + // same allocation as calling the allocator just once and using offsets within this big buffer for each pipeline. + uint64_t inputOutputBufferSize = numPipelines * (inputSizeDDR + outputSizeDDR); // how much space to allocate + uint64_t inputOutputBufferAlignment = featureWordSize; // starting address must be aligned to this + uint64_t inputOutputBufferAddr; // where did the allocator place this buffer + ddrBufferAllocator_->AllocatePrivateBuffer(inputOutputBufferSize, inputOutputBufferAlignment, inputOutputBufferAddr); + + // Allocate the config/filter buffer. + // Filter buffer must come immediately after the config buffer, so from an allocation perspective this is one buffer. 
+ uint64_t configFilterBufferSize = configFilterBiasBufferSizeDDR_; + uint64_t configFilterBufferAlignment = filterWordSize; + uint64_t configFilterBufferAddr; + ddrBufferAllocator_->AllocatePrivateBuffer( + configFilterBufferSize, configFilterBufferAlignment, configFilterBufferAddr); + + // Print the allocation results + bool print_allocation_result = getenv("COREDLA_RUNTIME_DEBUG") != nullptr; + ios_base::fmtflags coutFlags = cout.flags(); // printing in both decimal and hex, save cout state to undo it later + if (print_allocation_result) { + DLA_LOG("FPGA DDR allocation results\n"); + // Intermediate buffer address is hardcoded to 0 in device_memory_allocator.cpp, don't bother printing this + DLA_LOG(" Config buffer is at address %" PRIu64, configFilterBufferAddr); + DLA_LOG(" (%#" PRIx64 ")\n", configFilterBufferAddr); + const uint64_t filter_buffer_address = configFilterBufferAddr + totalConfigBytes; + DLA_LOG(" Filter/bias/scale buffer is at address %" PRIu64, filter_buffer_address); + DLA_LOG(" (%#" PRIx64 ")\n", filter_buffer_address); + } + + const bool enable_istream = compiledResult->get_input_configuration().begin()->second.enable_input_streaming; + const bool enable_ostream = compiledResult->get_output_configuration().output_streaming_enabled; + + // Write graph buffer to DDR + if (!compiledResult->get_ddrfree_header().enable_parameter_rom) { + mmdWrapper_->WriteToDDR(instance_, configFilterBufferAddr, configFilterBiasBufferSizeDDR_, config_fbs_raw_array); + } else { + DLA_LOG(" Ddrfree graph constants are not written to DDR.\n"); + } + + for (uint64_t i = 0; i < numPipelines; i++) { + uint64_t inputAddrDDR = inputOutputBufferAddr + i * (inputSizeDDR + outputSizeDDR); + uint64_t outputAddrDDR = inputAddrDDR + inputSizeDDR; + if (print_allocation_result) { + DLA_LOG(" Input buffer %" PRIu64 " is at address %" PRIu64, i, inputAddrDDR); + DLA_LOG(" (%#" PRIx64 ")\n", inputAddrDDR); + DLA_LOG(" Output buffer %" PRIu64 " is at address %" PRIu64, i, 
outputAddrDDR); + DLA_LOG(" (%#" PRIx64 ")\n", outputAddrDDR); + } + batchJobs_.push_back(move(CoreDlaBatchJob::MakeUnique(mmdWrapper_, + totalConfigBytes, + configFilterBufferAddr, + inputAddrDDR, + outputAddrDDR, + inputSizeDDR, + outputSizeDDR, + enable_istream, + enable_ostream, + instance_, + spStreamControllerComms))); + } + cout.flags(coutFlags); // restore the state of cout +} + +BatchJob *CoreDlaGraphJob::GetBatchJob() { + graphJobMutex.lock(); + if (batchJobsRequested_ >= batchJobs_.size()) { + graphJobMutex.unlock(); + return nullptr; + } + auto *batchJob = batchJobs_[batchJobsRequested_].get(); + batchJobsRequested_++; + graphJobMutex.unlock(); + return batchJob; +} diff --git a/python/openvino/runtime/coredla_device/src/device_memory_allocator.cpp b/python/openvino/runtime/coredla_device/src/device_memory_allocator.cpp new file mode 100644 index 0000000..48844f4 --- /dev/null +++ b/python/openvino/runtime/coredla_device/src/device_memory_allocator.cpp @@ -0,0 +1,80 @@ +// Copyright 2020 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. 

#include "device_memory_allocator.h"  //DeviceMemoryAllocator
#include "dla_dma_constants.h"        //DLA_DMA_CSR_OFFSET_***

#include <stdexcept>  //std::runtime_error
#include <string>     //std::string

// Bind this allocator to one device: record the device memory capacity and the
// MMD access layer, and reset both allocation watermarks. The shared
// (intermediate) region starts empty and grows up from address 0; private
// (per-graph) buffers are carved downwards from totalSize.
void DeviceMemoryAllocator::Initialize(uint64_t totalSize, MmdWrapper* mmdWrapper) {
  totalGlobalMemSize_ = totalSize;
  mmdWrapper_ = mmdWrapper;
  currentIntermediateMaxBufferSizeAllocated_ = 0;
  currentStartAddressGraphBufferSpace_ = totalSize;
}

// The intermediate buffer is shared among all graphs. It gets placed at the lowest address
// and grows upwards (if a new graph is added which needs a bigger intermediate buffer).
// Throws std::runtime_error if growing the shared region would collide with the
// private-buffer region growing down from the top of memory.
void DeviceMemoryAllocator::AllocateSharedBuffer(uint64_t bufferSize, int instance) {
  // Only ever grow: a smaller request is already covered by the current high-water mark.
  if (bufferSize > currentIntermediateMaxBufferSizeAllocated_) {
    currentIntermediateMaxBufferSizeAllocated_ = bufferSize;

    // error intermediate buffer grows into the region of memory used for private buffers
    if (currentIntermediateMaxBufferSizeAllocated_ > currentStartAddressGraphBufferSpace_) {
      std::string msg = "FPGA DDR allocation failed, intermediate buffer grew upwards to " +
                        std::to_string(currentIntermediateMaxBufferSizeAllocated_) +
                        ", remaining unallocated space is limited to " +
                        std::to_string(currentStartAddressGraphBufferSpace_);
      throw std::runtime_error(msg);
    }

    // tell the fpga where the intermediate buffer is located. At address 0 now. Will change in future with multiple
    // pe_arrays
    mmdWrapper_->WriteToCsr(instance, DLA_DMA_CSR_OFFSET_INTERMEDIATE_BASE_ADDR, 0);
  }
}

// The config, filter, input, and output buffers are specific to a graph and therefore require
// their own space in device memory. Note that filter must come immediately after config, so the
// allocator allocates both of these together as one buffer. Likewise output must come immediately
// after input. Private buffers are allocated from the highest to lowest address since the size is
// known at allocation time. Hardware requires the address to have some alignment, which is
// specified by the bufferAlignment argument.
// @param bufferSize      bytes to reserve
// @param bufferAlignment required address alignment (power-of-two expected by the modulo below)
// @param bufferAddr      [out] device address chosen for the buffer
// @throws std::runtime_error when the request cannot fit between the two watermarks
void DeviceMemoryAllocator::AllocatePrivateBuffer(uint64_t bufferSize, uint64_t bufferAlignment, uint64_t& bufferAddr) {
  // Worst case the downward alignment correction consumes up to bufferAlignment
  // extra bytes, so the fit check uses the inflated size.
  uint64_t maxInflatedBufferSize = bufferSize + bufferAlignment;  // be conservative for how much space buffer may take

  // error if the graph does not fit in fpga ddr
  if (currentIntermediateMaxBufferSizeAllocated_ + maxInflatedBufferSize > currentStartAddressGraphBufferSpace_) {
    std::string msg =
        "FPGA DDR allocation failed, allocating buffer of size " + std::to_string(maxInflatedBufferSize) +
        " exceeds the remaining space available of size " +
        std::to_string(currentStartAddressGraphBufferSpace_ - currentIntermediateMaxBufferSizeAllocated_) +
        ". This could be caused by the graph being too large or splitting the graph into too many subgraphs. " +
        "Memory requirements for large graphs can be reduced by selecting different folding options, " +
        "reducing batch size or selecting architectures with less padding.";
    throw std::runtime_error(msg);
  }

  currentStartAddressGraphBufferSpace_ -= bufferSize;  // allocate from highest to lowest address
  currentStartAddressGraphBufferSpace_ -=
      (currentStartAddressGraphBufferSpace_ % bufferAlignment);  // correct for alignment
  bufferAddr = currentStartAddressGraphBufferSpace_;
}

// Forget all allocations; device memory contents are not touched.
void DeviceMemoryAllocator::Clear() {
  currentIntermediateMaxBufferSizeAllocated_ = 0;
  currentStartAddressGraphBufferSpace_ = totalGlobalMemSize_;
}

DeviceMemoryAllocator::~DeviceMemoryAllocator() { Clear(); }
diff --git a/python/openvino/runtime/coredla_device/src/mmd_wrapper.cpp b/python/openvino/runtime/coredla_device/src/mmd_wrapper.cpp
new file mode 100644
index 0000000..bbb052a
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/src/mmd_wrapper.cpp
@@ -0,0 +1,172 @@
// Copyright 2020-2023 Intel Corporation.
+// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. + +#include "mmd_wrapper.h" +#include "aocl_mmd.h" // aocl_mmd_*** +#include "dla_dma_constants.h" // DLA_DMA_CSR_OFFSET_*** + +#include <cassert> // assert +#include <cstddef> // size_t +#include <iostream> // std::cerr +#include <stdexcept> // std::runtime_error +#include <string> // std::string + +// All board variants must obey the CoreDLA CSR spec, which says that all access must be +// - 32 bits in size +// - address must be 4 byte aligned +// - within the address range, CSR size is 2048 bytes +constexpr uint64_t DLA_CSR_ALIGNMENT = 4; +constexpr uint64_t DLA_CSR_SIZE = 2048; + +// assert(status == 0) is removed by the c++ processor when compiling in release mode +// this is a handy workaround for suppressing the compiler warning about an unused variable +template <class T> +void suppress_warning_unused_varible(const T &) {} + +MmdWrapper::MmdWrapper() { + // Open the MMD + constexpr size_t MAX_BOARD_NAMES_LEN = 4096; + char name[MAX_BOARD_NAMES_LEN]; + size_t sz; + int status = aocl_mmd_get_offline_info(AOCL_MMD_BOARD_NAMES, MAX_BOARD_NAMES_LEN, name, &sz); + if (status) { + std::string msg = "Failed to query a board name from MMD. 
Perhaps no FPGA device is available?"; + throw std::runtime_error(msg); + } + int handle = aocl_mmd_open(name); + if (handle < 0) { + std::string msg = "Failed to open MMD"; + throw std::runtime_error(msg); + } + handle_ = handle; + + // Query some board-specific information from the MMD. Some values can be hardcoded constants + // where different boards have different constants, e.g. capacity of FPGA DDR. Others values may + // be determined experimentally e.g. start and stop a counter with a known duration in between to + // measure the clk_dla frequency. + maxInstances_ = dla_mmd_get_max_num_instances(); + ddrSizePerInstance_ = dla_mmd_get_ddr_size_per_instance(); + coreDlaClockFreq_ = dla_mmd_get_coredla_clock_freq(handle_); + + // On DE10 Agilex boards with GCC 8.3.0, we noticed that the clock frequency was being read as 0, + // around 50% of the time, and around 10% of the time on GCC 9.2.0, causing failures on perf_est + // tests. This retry loop will recall the function until the coreDlaClockFreq is non zero, or + // it exhausts 10 retries. + // We have no idea why this happens currently, but it typically passes by the second try. + int clockFreqRetries = 10; + while (coreDlaClockFreq_ == 0 && clockFreqRetries > 0) { + coreDlaClockFreq_ = dla_mmd_get_coredla_clock_freq(handle_); + clockFreqRetries--; + } + ddrClockFreq_ = dla_mmd_get_ddr_clock_freq(); +} + +MmdWrapper::~MmdWrapper() { + // Close the MMD + int status = aocl_mmd_close(handle_); + if (status) { + // Avoid throwning an exception from a Destructor. We are ultimately + // part of a (virtual) OpenVINO destructor, so we should follow the + // noexcept(true) that it advertises. Perhaps we can close the mmd + // as a separate step prior to destruction to make signaling errors + // easier? 
+ std::cerr << "Failed to close MMD" << std::endl; + std::cerr << "Error status " << status << std::endl; + std::exit(1); + } +} + +void MmdWrapper::RegisterISR(interrupt_service_routine_signature func, void *data) const { + // register an interrupt handler + int status = aocl_mmd_set_interrupt_handler(handle_, func, data); + if (status) { + std::string msg = "Failed to register an interrupt handler with MMD"; + throw std::runtime_error(msg); + } +} + +void MmdWrapper::WriteToCsr(int instance, uint32_t addr, uint32_t data) const { + assert(instance >= 0 && instance < maxInstances_); + assert(addr + sizeof(uint32_t) <= DLA_CSR_SIZE); + assert(addr % DLA_CSR_ALIGNMENT == 0); + int status = dla_mmd_csr_write(handle_, instance, addr, &data); + assert(status == 0); + suppress_warning_unused_varible(status); +} + +uint32_t MmdWrapper::ReadFromCsr(int instance, uint32_t addr) const { + assert(instance >= 0 && instance < maxInstances_); + assert(addr + sizeof(uint32_t) <= DLA_CSR_SIZE); + assert(addr % DLA_CSR_ALIGNMENT == 0); + uint32_t data; + int status = dla_mmd_csr_read(handle_, instance, addr, &data); + assert(status == 0); + suppress_warning_unused_varible(status); + return data; +} + +void MmdWrapper::WriteToDDR(int instance, uint64_t addr, uint64_t length, const void *data) const { + assert(instance >= 0 && instance < maxInstances_); + assert(addr + length <= ddrSizePerInstance_); + int status = dla_mmd_ddr_write(handle_, instance, addr, length, data); + assert(status == 0); + suppress_warning_unused_varible(status); +} + +void MmdWrapper::ReadFromDDR(int instance, uint64_t addr, uint64_t length, void *data) const { + assert(instance >= 0 && instance < maxInstances_); + assert(addr + length <= ddrSizePerInstance_); + int status = dla_mmd_ddr_read(handle_, instance, addr, length, data); + assert(status == 0); + suppress_warning_unused_varible(status); +} + +#ifndef STREAM_CONTROLLER_ACCESS +// Stream controller access is not supported by the platform abstraction 
+bool MmdWrapper::bIsStreamControllerValid(int instance) const { return false; } + +// 32-bit handshake with each Stream Controller CSR +void MmdWrapper::WriteToStreamController(int instance, uint32_t addr, uint64_t length, const void *data) const { + assert(false); +} + +void MmdWrapper::ReadFromStreamController(int instance, uint32_t addr, uint64_t length, void *data) const { + assert(false); +} +#else +// If the mmd layer supports accesses to the Stream Controller +bool MmdWrapper::bIsStreamControllerValid(int instance) const { + assert(instance >= 0 && instance < maxInstances_); + bool status = dla_is_stream_controller_valid(handle_, instance); + return status; +} + +// 32-bit handshake with each Stream Controller CSR +void MmdWrapper::WriteToStreamController(int instance, uint32_t addr, uint64_t length, const void *data) const { + assert(instance >= 0 && instance < maxInstances_); + assert(addr % sizeof(uint32_t) == 0); + assert(length % sizeof(uint32_t) == 0); + int status = dla_mmd_stream_controller_write(handle_, instance, addr, length, data); + assert(status == 0); + suppress_warning_unused_varible(status); +} + +void MmdWrapper::ReadFromStreamController(int instance, uint32_t addr, uint64_t length, void *data) const { + assert(instance >= 0 && instance < maxInstances_); + assert(addr % sizeof(uint32_t) == 0); + assert(length % sizeof(uint32_t) == 0); + int status = dla_mmd_stream_controller_read(handle_, instance, addr, length, data); + assert(status == 0); + suppress_warning_unused_varible(status); +} +#endif diff --git a/python/openvino/runtime/coredla_device/src/stream_controller_comms.cpp b/python/openvino/runtime/coredla_device/src/stream_controller_comms.cpp new file mode 100644 index 0000000..677f6e4 --- /dev/null +++ b/python/openvino/runtime/coredla_device/src/stream_controller_comms.cpp @@ -0,0 +1,274 @@ +// Copyright 2023 Intel Corporation. 
+// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. + +#include "stream_controller_comms.h" +#include <chrono> +#include <cstring> +#include <iostream> +#include <sstream> +#include <thread> + +// StreamControllerComms provides an interface to the Stream Controller +// microcode running in the NIOS-V + +static const uint32_t messageReadyMagicNumber = 0x55225522; +static constexpr uint32_t mailboxRamSize = 0x1000; + +StreamControllerComms::StreamControllerComms() {} + +bool StreamControllerComms::IsPresent() { + // Check there is an interface to the stream controller + if (!_mmdWrapper.bIsStreamControllerValid(_streamControllerInstance)) { + return false; + } + + // Check that the stream controller responds + bool isPresent = Ping(); + return isPresent; +} + +// Query for the current status +Payload<StatusMessagePayload> StreamControllerComms::GetStatus() { + BusyCheck busyCheck(_busyFlag); + if (!busyCheck) { + return {}; + } + + if (SendMessage(MessageType_GetStatus)) { + if (ReceiveMessage() == MessageType_Status) { + return _receivedStatusMessage; + } + } + + return {}; +} + +// Schedule an inference request with the stream controller +bool StreamControllerComms::ScheduleItems(std::vector<Payload<CoreDlaJobPayload>> items) { + BusyCheck busyCheck(_busyFlag); + if (!busyCheck) { + return false; + } + + bool status = true; + + for (auto& job : items) { + bool thisJobStatus = false; + + if (SendMessage(MessageType_ScheduleItem, job.GetPayload(), 
job.GetSize())) { + if (ReceiveMessage() == MessageType_NoOperation) { + thisJobStatus = true; + } + } + + if (!thisJobStatus) { + status = false; + } + } + + return status; +} + +// Send a ping command to the stream controller and wait for a pong +// response. +bool StreamControllerComms::Ping() { + BusyCheck busyCheck(_busyFlag); + if (!busyCheck) { + return false; + } + + if (SendMessage(MessageType_Ping)) { + return (ReceiveMessage() == MessageType_Pong); + } + + return false; +} + +// Initialize and reset the stream controller +// +// sourceBufferSize: +// The size of the MSGDMA buffers that the stream +// controller will receive from the layout transform +// dropSourceBuffers: +// How many source buffers to drop between each +// processed one. 0 by default unless set in the configuration +// by the app with DLIAPlugin::properties::streaming_drop_source_buffers.name() +// numInferenceRequest: +// A constant value set in the executable network. The +// stream controller will start executing once it has +// received this number of inference requests from OpenVINO +bool StreamControllerComms::Initialize(uint32_t sourceBufferSize, + uint32_t dropSourceBuffers, + uint32_t numInferenceRequests) { + BusyCheck busyCheck(_busyFlag); + if (!busyCheck) { + return false; + } + + Payload<InitializeStreamControllerPayload> initializePayload{}; + initializePayload._sourceBufferSize = sourceBufferSize; + initializePayload._dropSourceBuffers = dropSourceBuffers; + initializePayload._numInferenceRequests = numInferenceRequests; + + if (SendMessage( + MessageType_InitializeStreamController, initializePayload.GetPayload(), initializePayload.GetSize())) { + if (ReceiveMessage() == MessageType_NoOperation) { + return true; + } + } + + return false; +} + +// Receive a message from the stream controller by reading from the +// mailbox memory until the magic number is set to indicate a message is ready. 
+// Only the Status return message has a payload +MessageType StreamControllerComms::ReceiveMessage() { + uint32_t receiveMessageOffset = mailboxRamSize / 2; + MessageHeader* pReceiveMessage = nullptr; + uint32_t messageReadyMagicNumberOffset = receiveMessageOffset; + uint32_t payloadOffset = static_cast<uint32_t>(receiveMessageOffset + (size_t)&pReceiveMessage->_payload); + uint32_t waitCount = 0; + + while (waitCount < 100) { + MessageHeader messageHeader; + _mmdWrapper.ReadFromStreamController( + _streamControllerInstance, receiveMessageOffset, sizeof(messageHeader), &messageHeader); + if (messageHeader._messageReadyMagicNumber == messageReadyMagicNumber) { + MessageType messageType = static_cast<MessageType>(messageHeader._messageType); + uint32_t sequenceId = messageHeader._sequenceID; + + bool ok = false; + + if (messageType == MessageType_Status) { + ok = StatusMessageHandler(payloadOffset); + } else if (messageType == MessageType_Pong) { + ok = true; + } + + if (!ok) { + _numBadMessages++; + } + + _mmdWrapper.WriteToStreamController( + _streamControllerInstance, messageReadyMagicNumberOffset, sizeof(sequenceId), &sequenceId); + _lastReceiveSequenceID = sequenceId; + return messageType; + } + + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + waitCount++; + } + + return MessageType_Invalid; +} + +// Send a message to the stream controller by writing to the mailbox memory, +// and wait for the message to be received/processed +bool StreamControllerComms::SendMessage(MessageType messageType, void* pPayload, size_t payloadSize) { + uint32_t sendMessageOffset = 0; + MessageHeader* pSendMessage = nullptr; + uint32_t messageReadyMagicNumberOffset = 0; + uint32_t messageTypeOffset = static_cast<uint32_t>((size_t)&pSendMessage->_messageType); + uint32_t sequenceIDOffset = static_cast<uint32_t>((size_t)&pSendMessage->_sequenceID); + uint32_t payloadOffset = static_cast<uint32_t>((size_t)&pSendMessage->_payload); + + uint32_t uintMessageType = 
static_cast<uint32_t>(messageType); + + _mmdWrapper.WriteToStreamController( + _streamControllerInstance, messageTypeOffset, sizeof(uintMessageType), &uintMessageType); + _mmdWrapper.WriteToStreamController( + _streamControllerInstance, sequenceIDOffset, sizeof(_sendSequenceID), &_sendSequenceID); + + if (payloadSize > 0) { + _mmdWrapper.WriteToStreamController(_streamControllerInstance, payloadOffset, payloadSize, pPayload); + } + + // Signal the message as ready + _mmdWrapper.WriteToStreamController(_streamControllerInstance, + messageReadyMagicNumberOffset, + sizeof(messageReadyMagicNumber), + &messageReadyMagicNumber); + + // Wait until the message has been processed by looking for the sequence ID + // in the magic number position + uint32_t waitCount = 0; + while (waitCount < 100) { + MessageHeader messageHeader; + _mmdWrapper.ReadFromStreamController( + _streamControllerInstance, sendMessageOffset, sizeof(messageHeader), &messageHeader); + + if (messageHeader._messageReadyMagicNumber == _sendSequenceID) { + _sendSequenceID++; + return true; + } + + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + waitCount++; + } + + return false; +} + +// Read the status message payload +bool StreamControllerComms::StatusMessageHandler(uint32_t payloadOffset) { + _mmdWrapper.ReadFromStreamController( + _streamControllerInstance, payloadOffset, sizeof(_receivedStatusMessage), &_receivedStatusMessage); + return true; +} + +// Parse the status message payload into a string +std::string StreamControllerComms::GetStatusString(Payload<StatusMessagePayload>& statusPayload) { + std::ostringstream stringStream; + stringStream << static_cast<uint32_t>(statusPayload._status) << "," << statusPayload._statusLineNumber << "," + << statusPayload._numReceivedSourceBuffers << "," << statusPayload._numScheduledInferences << "," + << statusPayload._numExecutedJobs; + return stringStream.str(); +} + +/////////////////////////////////////////////////////////////////////////////// + 
+// BusyFlag is used to prevent concurrent access to the stream controller, +// without holding a mutex when sending/receiving commands +using LockGuard = std::lock_guard<std::recursive_mutex>; + +bool BusyFlag::Lock() { + LockGuard lock(_mutex); + if (_busy) { + return false; + } + + _busy = true; + return true; +} + +void BusyFlag::Release() { + LockGuard lock(_mutex); + _busy = false; +} + +BusyCheck::BusyCheck(BusyFlag& busyFlag) : _busyFlag(busyFlag), _haveLocked(false) {} + +BusyCheck::~BusyCheck() { + if (_haveLocked) { + _busyFlag.Release(); + } +} + +BusyCheck::operator bool() { + bool locked = _busyFlag.Lock(); + if (locked) { + _haveLocked = true; + } + return locked; +} diff --git a/python/openvino/runtime/coredla_device/stream_controller/app/dla_registers.h b/python/openvino/runtime/coredla_device/stream_controller/app/dla_registers.h new file mode 100644 index 0000000..d77c5ab --- /dev/null +++ b/python/openvino/runtime/coredla_device/stream_controller/app/dla_registers.h @@ -0,0 +1,45 @@ +// Copyright 2023 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. 
+ +//the numbers below are byte addresses, must be a multiple of 4 since each access is 32 bits +static const uint32_t DLA_DMA_CSR_OFFSET_INTERRUPT_CONTROL = 512; //0x200 +static const uint32_t DLA_DMA_CSR_OFFSET_INTERRUPT_MASK = 516; +static const uint32_t DLA_DMA_CSR_OFFSET_CONFIG_BASE_ADDR = 528; //0x210 +static const uint32_t DLA_DMA_CSR_OFFSET_CONFIG_RANGE_MINUS_TWO = 532; +static const uint32_t DLA_DMA_CSR_OFFSET_INPUT_OUTPUT_BASE_ADDR = 536; +static const uint32_t DLA_DMA_CSR_OFFSET_DESC_DIAGNOSTICS = 540; +static const uint32_t DLA_DMA_CSR_OFFSET_INTERMEDIATE_BASE_ADDR = 544; //0x220 +static const uint32_t DLA_DMA_CSR_OFFSET_COMPLETION_COUNT = 548; +static const uint32_t DLA_DMA_CSR_OFFSET_CLOCKS_ACTIVE_LO = 576; //0x240 +static const uint32_t DLA_DMA_CSR_OFFSET_CLOCKS_ACTIVE_HI = 580; +static const uint32_t DLA_DMA_CSR_OFFSET_CLOCKS_ALL_JOBS_LO = 584; +static const uint32_t DLA_DMA_CSR_OFFSET_CLOCKS_ALL_JOBS_HI = 588; +static const uint32_t DLA_DMA_CSR_OFFSET_DEBUG_NETWORK_ADDR = 592; //0x250 +static const uint32_t DLA_DMA_CSR_OFFSET_DEBUG_NETWORK_VALID = 596; +static const uint32_t DLA_DMA_CSR_OFFSET_DEBUG_NETWORK_DATA = 600; + +//bit positions in interrupt control and mask +static const uint32_t DLA_DMA_CSR_INTERRUPT_ERROR_BIT = 0; +static const uint32_t DLA_DMA_CSR_INTERRUPT_DONE_BIT = 1; + +//bit positions in descriptor diagnostic +static const uint32_t DLA_DMA_CSR_DESC_DIAGNOSTICS_OVERFLOW_BIT = 0; +static const uint32_t DLA_DMA_CSR_DESC_DIAGNOSTICS_ALMOST_FULL_BIT = 1; +static const uint32_t DLA_DMA_CSR_DESC_DIAGNOSTICS_OUT_OF_INFERENCES_BIT = 2; + +//descriptor queue +//runtime knows how many jobs it has enqueued and how many jobs have finished +//runtime is responsible for not overflowing the descriptor queue, it must limit the number of outstanding jobs queued in hardware +static const uint32_t DLA_DMA_CSR_DESCRIPTOR_QUEUE_LOGICAL_SIZE = 64; //max number of jobs that runtime can enqueue +static const uint32_t 
DLA_DMA_CSR_DESCRIPTOR_QUEUE_WORDS_PER_JOB = 8; //how many words in the queue are needed to enqueue 1 job +static const uint32_t DLA_DMA_CSR_DESCRIPTOR_QUEUE_PHYSICAL_SIZE = DLA_DMA_CSR_DESCRIPTOR_QUEUE_LOGICAL_SIZE * DLA_DMA_CSR_DESCRIPTOR_QUEUE_WORDS_PER_JOB; //number of words in the hardware queue diff --git a/python/openvino/runtime/coredla_device/stream_controller/app/message_handlers.c b/python/openvino/runtime/coredla_device/stream_controller/app/message_handlers.c new file mode 100644 index 0000000..1a12def --- /dev/null +++ b/python/openvino/runtime/coredla_device/stream_controller/app/message_handlers.c @@ -0,0 +1,80 @@ +// Copyright 2023 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. 
+
+#include "message_handlers.h"
+#include "stream_controller_messages.h"
+
+// Handlers for host->Nios mailbox messages. Each handler decodes the raw
+// payload words, acts on the StreamController, and sends a reply message so
+// the host knows the command was consumed. All handlers return true.
+
+// (Re)configure buffer size, drop policy and total job count.
+bool InitializeStreamControllerMessageHandler(StreamController* this, volatile uint32_t* pPayload)
+{
+  InitializeStreamControllerPayload* pInitializePayload = (InitializeStreamControllerPayload*)pPayload;
+  this->InitializeStreamController(this,
+                                   pInitializePayload->_sourceBufferSize,
+                                   pInitializePayload->_dropSourceBuffers,
+                                   pInitializePayload->_numInferenceRequests);
+  this->SendMessage(this, MessageType_NoOperation, NULL, 0);
+  return true;
+}
+
+// The host has prepared one inference request (job payload).
+bool ScheduleItemMessageHandler(StreamController* this, volatile uint32_t* pPayload)
+{
+  volatile CoreDlaJobPayload* pCoreDlaJobPayload = (volatile CoreDlaJobPayload*)pPayload;
+  this->NewInferenceRequestReceived(this, pCoreDlaJobPayload);
+  this->SendMessage(this, MessageType_NoOperation, NULL, 0);
+  return true;
+}
+
+// Liveness check: answered with Pong, payload ignored.
+bool PingMessageHandler(StreamController* this, volatile uint32_t* pPayload)
+{
+  this->SendMessage(this, MessageType_Pong, NULL, 0);
+  return true;
+}
+
+// Snapshot the controller status and counters into a Status reply.
+bool GetStatusMessageHandler(StreamController* this, volatile uint32_t* pPayload)
+{
+  StatusMessagePayload statusMessagePayload;
+  statusMessagePayload._status = this->_status;
+  statusMessagePayload._statusLineNumber = this->_statusLineNumber;
+  statusMessagePayload._numReceivedSourceBuffers = this->_numReceivedSourceBuffers;
+  statusMessagePayload._numScheduledInferences = this->_numScheduledInferences;
+  statusMessagePayload._numExecutedJobs = this->_numExecutedJobs;
+  this->SendMessage(this, MessageType_Status, &statusMessagePayload, sizeof(statusMessagePayload));
+  return true;
+}
+
+// Debug command: arm a single DMA capture into the scratch _debugJob.
+// NOTE(review): this overwrites _sourceBufferSize, which the normal
+// streaming path also uses -- confirm it is only issued in debug sessions.
+bool ManualArmDmaTransferMessageHandler(StreamController* this, volatile uint32_t* pPayload)
+{
+  ManualArmDmaTransferPayload* pManualArmDmaTransferPayload = (ManualArmDmaTransferPayload*)pPayload;
+  CoreDlaJobItem emptyJob = {};
+  this->_debugJob = emptyJob;
+  this->_debugJob._payload._inputAddressDDR = pManualArmDmaTransferPayload->_inputAddressDDR;
+  this->_sourceBufferSize = pManualArmDmaTransferPayload->_sourceBufferSize;
+  bool fromHPS = (pManualArmDmaTransferPayload->_fromHPS != 0);
+  this->ArmDmaTransfer(this, &this->_debugJob, fromHPS);
+  this->SendMessage(this, MessageType_NoOperation, NULL, 0);
+  return true;
+}
+
+// Debug command: enqueue a single job with the DLA using the scratch _debugJob.
+bool ManualScheduleDlaInferenceMessageHandler(StreamController* this, volatile uint32_t* pPayload)
+{
+  ManualScheduleDlaInferencePayload* pManualScheduleDlaInferencePayload = (ManualScheduleDlaInferencePayload*)pPayload;
+  CoreDlaJobItem emptyJob = {};
+  this->_debugJob = emptyJob;
+  this->_debugJob._payload._configurationBaseAddressDDR = pManualScheduleDlaInferencePayload->_configurationBaseAddressDDR;
+  this->_debugJob._payload._configurationSize = pManualScheduleDlaInferencePayload->_configurationSize;
+  this->_debugJob._payload._inputAddressDDR = pManualScheduleDlaInferencePayload->_inputAddressDDR;
+  this->ScheduleDlaInference(this, &this->_debugJob);
+  this->SendMessage(this, MessageType_NoOperation, NULL, 0);
+  return true;
+}
+
+
diff --git a/python/openvino/runtime/coredla_device/stream_controller/app/message_handlers.h b/python/openvino/runtime/coredla_device/stream_controller/app/message_handlers.h
new file mode 100644
index 0000000..a7e5187
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/stream_controller/app/message_handlers.h
@@ -0,0 +1,22 @@
+// Copyright 2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#pragma once
+#include "stream_controller.h"
+
+// One handler per host->Nios mailbox MessageType; main() installs these as
+// function pointers on StreamController. Each handler consumes the payload,
+// sends a reply message and returns true.
+extern bool InitializeStreamControllerMessageHandler(StreamController* this, volatile uint32_t* pPayload);
+extern bool ScheduleItemMessageHandler(StreamController* this, volatile uint32_t* pPayload);
+extern bool PingMessageHandler(StreamController* this, volatile uint32_t* pPayload);
+extern bool GetStatusMessageHandler(StreamController* this, volatile uint32_t* pPayload);
+extern bool ManualArmDmaTransferMessageHandler(StreamController* this, volatile uint32_t* pPayload);
+extern bool ManualScheduleDlaInferenceMessageHandler(StreamController* this, volatile uint32_t* pPayload);
diff --git a/python/openvino/runtime/coredla_device/stream_controller/app/stream_controller.c b/python/openvino/runtime/coredla_device/stream_controller/app/stream_controller.c
new file mode 100644
index 0000000..ad8b372
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/stream_controller/app/stream_controller.c
@@ -0,0 +1,426 @@
+// Copyright 2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+ +#include "stream_controller.h" +#include "message_handlers.h" +#include "sys/alt_cache.h" +#include "dla_registers.h" +#include <string.h> + +static const uint32_t messageReadyMagicNumber = 0x55225522; +static const uint32_t mailboxBaseAddress = 0x40000; +static const uint32_t mailboxSize = 0x1000; +static const uint32_t dlaBaseAddress = 0x30000; + +static void Start(StreamController* this); +static void Reset(StreamController* this); +static bool InitializeMsgDma(StreamController* this); +static bool ArmDmaTransfer(StreamController* this, CoreDlaJobItem* pFillJob, bool fromHPS); +static void RunEventLoop(StreamController* this); +static void WriteToDlaCsr(StreamController* this, uint32_t addr, uint32_t data); +static void InitializeStreamController(StreamController* this, uint32_t sourceBufferSize, uint32_t dropSourceBuffers, uint32_t numInferenceRequests); +static void SetStatus(StreamController* this, NiosStatusType statusType, uint32_t lineNumber); +static MessageType ReceiveMessage(StreamController* this, volatile MessageHeader* pReceiveMessage); +static bool SendMessage(StreamController* this, + MessageType messageType, + void* pPayload, + size_t payloadSize); +static void NewSourceBuffer(StreamController* this); +static void ScheduleDlaInference(StreamController* this, CoreDlaJobItem* pJob); +static void NewInferenceRequestReceived(StreamController* this, volatile CoreDlaJobPayload* pJobPayload); +static void MsgDmaIsr(void* pContext); + +int main() +{ + StreamController streamController = {}; + StreamController* this = &streamController; + + this->Start = Start; + this->Reset = Reset; + this->InitializeMsgDma = InitializeMsgDma; + this->ArmDmaTransfer = ArmDmaTransfer; + this->RunEventLoop = RunEventLoop; + this->WriteToDlaCsr = WriteToDlaCsr; + this->InitializeStreamController = InitializeStreamController; + this->SetStatus = SetStatus; + this->ReceiveMessage = ReceiveMessage; + this->SendMessage = SendMessage; + this->NewSourceBuffer = NewSourceBuffer; 
+ this->ScheduleDlaInference = ScheduleDlaInference; + this->NewInferenceRequestReceived = NewInferenceRequestReceived; + + // Message handlers + this->GetStatusMessageHandler = GetStatusMessageHandler; + this->ScheduleItemMessageHandler = ScheduleItemMessageHandler; + this->PingMessageHandler = PingMessageHandler; + this->InitializeStreamControllerMessageHandler = InitializeStreamControllerMessageHandler; + this->ManualArmDmaTransferMessageHandler = ManualArmDmaTransferMessageHandler; + this->ManualScheduleDlaInferenceMessageHandler = ManualScheduleDlaInferenceMessageHandler; + + this->Reset(this); + this->Start(this); + + return 0; +} + +static void Start(StreamController* this) +{ + // Clear the mailbox memory + uint8_t* pMailbox = (uint8_t*)(mailboxBaseAddress); + memset(pMailbox, 0, mailboxSize); + + if (this->InitializeMsgDma(this)) + { + // Run the main event loop + this->RunEventLoop(this); + } +} + +static bool InitializeMsgDma(StreamController* this) +{ + this->_pMsgDevice = alt_msgdma_open(DLA_MSGDMA_0_CSR_NAME); + if (this->_pMsgDevice) + { + alt_msgdma_register_callback(this->_pMsgDevice, MsgDmaIsr, 0, this); + alt_dcache_flush_all(); + return true; + } + else + { + this->SetStatus(this, NiosStatusType_MsgDmaFailed, __LINE__); + return false; + } +} + +static bool ArmDmaTransfer(StreamController* this, CoreDlaJobItem* pFillJob, bool fromHPS) +{ + this->_pFillingImageJob = pFillJob; + + alt_u32* pWriteBuffer = (alt_u32*)this->_pFillingImageJob->_payload._inputAddressDDR; + alt_u32 length = this->_sourceBufferSize; + alt_u32 control = ALTERA_MSGDMA_DESCRIPTOR_CONTROL_TRANSFER_COMPLETE_IRQ_MASK; + + int r = 0; + if (fromHPS) + { + r = alt_msgdma_construct_extended_st_to_mm_descriptor(this->_pMsgDevice, + &this->_msgdmaDescriptor, + pWriteBuffer, + length, + control, + 0, + 0, + 1); + } + else + { + r = alt_msgdma_construct_extended_mm_to_st_descriptor(this->_pMsgDevice, + &this->_msgdmaDescriptor, + pWriteBuffer, + length, + control, + 0, + 0, + 1); + } + 
+ if (r == 0) + { + r = alt_msgdma_extended_descriptor_async_transfer(this->_pMsgDevice, &this->_msgdmaDescriptor); + if (r != 0) + { + this->SetStatus(this, NiosStatusType_AsyncTransferFailed, __LINE__); + } + } + else + { + this->SetStatus(this, NiosStatusType_BadDescriptor, __LINE__); + } + + return (r == 0); +} + +static void RunEventLoop(StreamController* this) +{ + volatile MessageHeader* pReceiveMessage = (MessageHeader*)(mailboxBaseAddress); + + uint32_t previousIsrCount = this->_isrCount; + + while (true) + { + uint32_t isrCount = this->_isrCount; + + if (isrCount != previousIsrCount) + { + this->NewSourceBuffer(this); + } + + if (pReceiveMessage->_messageReadyMagicNumber == messageReadyMagicNumber) + { + this->ReceiveMessage(this, pReceiveMessage); + } + + previousIsrCount = isrCount; + } +} + +static MessageType ReceiveMessage(StreamController* this, volatile MessageHeader* pReceiveMessage) +{ + MessageType messageType = pReceiveMessage->_messageType; + uint32_t sequenceId = pReceiveMessage->_sequenceID; + this->_commandCounter++; + + bool ok = false; + + volatile uint32_t* pPayload = &pReceiveMessage->_payload; + + if (messageType == MessageType_GetStatus) + ok = this->GetStatusMessageHandler(this, pPayload); + else if (messageType == MessageType_ScheduleItem) + ok = this->ScheduleItemMessageHandler(this, pPayload); + else if (messageType == MessageType_Ping) + ok = this->PingMessageHandler(this, pPayload); + else if (messageType == MessageType_InitializeStreamController) + ok = this->InitializeStreamControllerMessageHandler(this, pPayload); + else if (messageType == MessageType_ManualArmDmaTransfer) + ok = this->ManualArmDmaTransferMessageHandler(this, pPayload); + else if (messageType == MessageType_ManualScheduleDlaInference) + ok = this->ManualScheduleDlaInferenceMessageHandler(this, pPayload); + + if (!ok) + this->SetStatus(this, NiosStatusType_BadMessage, __LINE__); + + pReceiveMessage->_messageReadyMagicNumber = sequenceId; + + if 
((this->_lastReceiveSequenceID != 0) && ((this->_lastReceiveSequenceID + 1) != sequenceId)) + { + // If the DLA plugin has restarted, the first message will be InitializeStreamController + // with a sequence ID of 0 + if ((sequenceId != 0) || (messageType != MessageType_InitializeStreamController)) + this->SetStatus(this, NiosStatusType_BadMessageSequence, __LINE__); + } + + this->_lastReceiveSequenceID = sequenceId; + return messageType; +} + +static bool SendMessage(StreamController* this, + MessageType messageType, + void *pPayload, + size_t payloadSize) +{ + uint32_t mailboxSendAddress = mailboxBaseAddress + (mailboxSize / 2); + uint32_t* pMailbox = (uint32_t*)mailboxSendAddress; + MessageHeader* pSendMessage = (MessageHeader*)(pMailbox); + void* pPayloadDestination = &pSendMessage->_payload; + + pSendMessage->_messageType = messageType; + pSendMessage->_sequenceID = this->_sendSequenceID; + + if (payloadSize > 0) + memcpy(pPayloadDestination, pPayload, payloadSize); + + // Signal the message as ready + pSendMessage->_messageReadyMagicNumber = messageReadyMagicNumber; + + this->_sendSequenceID++; + return true; +} + +// We have received a new source buffer via the msgdma +static void NewSourceBuffer(StreamController* this) +{ + // Read the response to flush the buffer + CoreDlaJobItem* pJustFilledJob = this->_pFillingImageJob; + CoreDlaJobItem* pNextFillJob = NULL; + + uint32_t bufferSequence = this->_numReceivedSourceBuffers; + this->_numReceivedSourceBuffers++; + + // Have we just captured a manually armed DMA transfer? 
+ if (pJustFilledJob == &this->_debugJob) + return; + + if (this->_dropSourceBuffers > 0) + { + // If _dropSourceBuffers = 1, we process 1, drop 1 etc + // if _dropSourceBuffers = 2, we process 1, drop 2, process 1, drop 2 etc + if (bufferSequence % (this->_dropSourceBuffers + 1) != 0) + { + // Drop this buffer, capture the next one in its place + this->ArmDmaTransfer(this, pJustFilledJob, true); + return; + } + } + + pJustFilledJob->_hasSourceBuffer = true; + + if (pJustFilledJob->_pNextJob->_hasSourceBuffer) + { + // No space in the next job, so keep filling the same job + pNextFillJob = pJustFilledJob; + + // It already has a buffer but we have to + // consider this as dropped as we will write another + // in its place + pNextFillJob->_hasSourceBuffer = false; + } + else + { + pNextFillJob = pJustFilledJob->_pNextJob; + } + + // Re-arm the DMA transfer + this->ArmDmaTransfer(this, pNextFillJob, true); + + // If there are less than two scheduled buffers, then we can schedule another one + // _pNextInferenceRequestJob is the executing job if it is marked as scheduled + + uint32_t nScheduled = 0; + if (this->_pNextInferenceRequestJob->_scheduledWithDLA) + nScheduled++; + if (this->_pNextInferenceRequestJob->_pNextJob->_scheduledWithDLA) + nScheduled++; + + if (nScheduled < 2) + this->ScheduleDlaInference(this, pJustFilledJob); +} + +static void NewInferenceRequestReceived(StreamController* this, volatile CoreDlaJobPayload* pJobPayload) +{ + // Once we have received all '_totalNumInferenceRequests' inference requests, + // we set the state to running and can now capture the input dma's + bool wasRunning = this->_running; + this->_numInferenceRequests++; + this->_running = (this->_numInferenceRequests >= this->_totalNumInferenceRequests); + + CoreDlaJobItem* pThisJob = this->_pNextInferenceRequestJob; + + // Store the job details and move to the next + uint32_t previousAddress = pThisJob->_payload._inputAddressDDR; + pThisJob->_payload = *pJobPayload; + + // This job 
has just completed so clear its state + pThisJob->_scheduledWithDLA = false; + pThisJob->_hasSourceBuffer = false; + + // The jobs are recycled by the DLA plugin so the inputAddrDDR should + // stay the same for each _jobs[n] + if ((pThisJob->_payload._inputAddressDDR != previousAddress) && (previousAddress != 0)) + this->SetStatus(this, NiosStatusType_Error, __LINE__); + + this->_pNextInferenceRequestJob = this->_pNextInferenceRequestJob->_pNextJob; + + if (wasRunning) + { + this->_numExecutedJobs++; + + // Check if we have any jobs ready to be scheduled. Maximum of 2 can have _scheduledWithDLA set + if (!this->_pNextInferenceRequestJob->_scheduledWithDLA && this->_pNextInferenceRequestJob->_hasSourceBuffer) + { + this->ScheduleDlaInference(this, this->_pNextInferenceRequestJob); + } + else if (!this->_pNextInferenceRequestJob->_pNextJob->_scheduledWithDLA && this->_pNextInferenceRequestJob->_pNextJob->_hasSourceBuffer) + { + this->ScheduleDlaInference(this, this->_pNextInferenceRequestJob->_pNextJob); + } + } + else if (this->_running) + { + // We have just started running + // Arm the DMA transfer to start receiving source buffers + this->ArmDmaTransfer(this, &this->_jobs[0], true); + } +} + +static void ScheduleDlaInference(StreamController* this, CoreDlaJobItem* pJob) +{ + // The DLA has an input FIFO. 
By setting the base address register, + // we add this request to the FIFO + pJob->_scheduledWithDLA = true; + this->_numScheduledInferences++; + + CoreDlaJobPayload* pJobPayload = &pJob->_payload; + this->WriteToDlaCsr(this, DLA_DMA_CSR_OFFSET_CONFIG_BASE_ADDR, pJobPayload->_configurationBaseAddressDDR); + this->WriteToDlaCsr(this, DLA_DMA_CSR_OFFSET_CONFIG_RANGE_MINUS_TWO, pJobPayload->_configurationSize); + this->WriteToDlaCsr(this, DLA_DMA_CSR_OFFSET_INPUT_OUTPUT_BASE_ADDR, pJobPayload->_inputAddressDDR); +} + +static void SetStatus(StreamController* this, NiosStatusType statusType, uint32_t lineNumber) +{ + this->_status = statusType; + this->_statusLineNumber = lineNumber; +} + +static void InitializeStreamController(StreamController* this, + uint32_t sourceBufferSize, + uint32_t dropSourceBuffers, + uint32_t numInferenceRequests) +{ + // This is called once when the inference app is run, + // so acts like a reset + this->_sourceBufferSize = sourceBufferSize; + this->_dropSourceBuffers = dropSourceBuffers; + this->_totalNumInferenceRequests = numInferenceRequests; + this->_jobs = malloc(sizeof(CoreDlaJobItem) * this->_totalNumInferenceRequests); + + // Reset any previous state + this->Reset(this); +} + +static void Reset(StreamController* this) +{ + CoreDlaJobItem emptyJob = {}; + uint32_t lastIndex = this->_totalNumInferenceRequests - 1; + + // Set up the circular job buffers + for (uint32_t i = 0; i < this->_totalNumInferenceRequests; i++) + { + this->_jobs[i] = emptyJob; + this->_jobs[i]._index = i; + uint32_t previousIndex = (i == 0) ? lastIndex : i - 1; + uint32_t nextIndex = (i == lastIndex) ? 
0 : i + 1; + this->_jobs[i]._pPreviousJob = &this->_jobs[previousIndex]; + this->_jobs[i]._pNextJob = &this->_jobs[nextIndex]; + } + + this->_pNextInferenceRequestJob = &this->_jobs[0]; + this->_pFillingImageJob = &this->_jobs[0]; + this->_status = NiosStatusType_OK; + this->_statusLineNumber = 0; + this->_commandCounter = 0; + this->_numInferenceRequests = 0; + this->_numExecutedJobs = 0; + this->_numScheduledInferences = 0; + this->_lastReceiveSequenceID = 0; + this->_sendSequenceID = 0; + this->_running = false; + this->_isrCount = 0; + this->_numReceivedSourceBuffers = 0; +} + +static void WriteToDlaCsr(StreamController* this, uint32_t addr, uint32_t data) +{ + uint32_t* pRegister = (uint32_t*)(dlaBaseAddress + addr); + pRegister[0] = data; +} + +// Incrementing the ISR count here will result in NewSourceBuffer above being called +// in the event loop +static void MsgDmaIsr(void* pContext) +{ + StreamController* this = (StreamController*)pContext; + this->_isrCount++; +} + + diff --git a/python/openvino/runtime/coredla_device/stream_controller/app/stream_controller.h b/python/openvino/runtime/coredla_device/stream_controller/app/stream_controller.h new file mode 100644 index 0000000..8b19066 --- /dev/null +++ b/python/openvino/runtime/coredla_device/stream_controller/app/stream_controller.h @@ -0,0 +1,86 @@ +// Copyright 2023 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. 
+
+#pragma once
+
+#include <stddef.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+#include "altera_msgdma.h"
+#include "system.h"
+#include "stream_controller_messages.h"
+
+// One slot of the circular job ring: the job payload received from the host
+// plus flags recording whether the slot holds a captured source buffer and
+// whether it has been queued with the DLA.
+typedef struct CoreDlaJobItem
+{
+  uint32_t _index;                       // position in the _jobs array
+  bool _hasSourceBuffer;                 // a DMA'd source buffer is waiting in this slot
+  bool _scheduledWithDLA;                // this slot has been written to the DLA CSR queue
+  CoreDlaJobPayload _payload;
+  struct CoreDlaJobItem* _pPreviousJob;  // circular links, built in Reset()
+  struct CoreDlaJobItem* _pNextJob;
+} CoreDlaJobItem;
+
+// Object-style C: all behavior is reached through function pointers that
+// main() binds in stream_controller.c.
+// NOTE(review): the parameter name `this` keeps these headers C-only (it is
+// a keyword in C++) -- confirm no C++ translation unit includes them.
+typedef struct StreamController
+{
+  void (*Start)(struct StreamController* this);
+  void (*Reset)(struct StreamController* this);
+  bool (*InitializeMsgDma)(struct StreamController* this);
+  bool (*ArmDmaTransfer)(struct StreamController* this, CoreDlaJobItem* pFillJob, bool fromHPS);
+  void (*RunEventLoop)(struct StreamController* this);
+  void (*WriteToDlaCsr)(struct StreamController* this, uint32_t addr, uint32_t data);
+  void (*InitializeStreamController)(struct StreamController* this,
+                                     uint32_t sourceBufferSize,
+                                     uint32_t dropSourceBuffers,
+                                     uint32_t numInferenceRequests);
+  void (*SetStatus)(struct StreamController* this,
+                    NiosStatusType statusType, uint32_t lineNumber);
+  MessageType (*ReceiveMessage)(struct StreamController *this, volatile MessageHeader* pReceiveMessage);
+  bool (*SendMessage)(struct StreamController* this,
+                      MessageType messageType,
+                      void* pPayload,
+                      size_t payloadSize);
+  void (*NewSourceBuffer)(struct StreamController* this);
+  void (*ScheduleDlaInference)(struct StreamController* this, CoreDlaJobItem* pJob);
+  void (*NewInferenceRequestReceived)(struct StreamController* this, volatile CoreDlaJobPayload* pJob);
+
+  // Message handlers
+  bool (*GetStatusMessageHandler)(struct StreamController* this, volatile uint32_t* pPayload);
+  bool (*ScheduleItemMessageHandler)(struct StreamController* this, volatile uint32_t* pPayload);
+  bool (*PingMessageHandler)(struct StreamController* this, volatile uint32_t* pPayload);
+  bool (*InitializeStreamControllerMessageHandler)(struct StreamController* this, volatile uint32_t* pPayload);
+  bool (*ManualArmDmaTransferMessageHandler)(struct StreamController* this, volatile uint32_t* pPayload);
+  bool (*ManualScheduleDlaInferenceMessageHandler)(struct StreamController* this, volatile uint32_t* pPayload);
+
+  CoreDlaJobItem* _jobs;                     // heap array of _totalNumInferenceRequests ring slots
+  CoreDlaJobItem* _pNextInferenceRequestJob; // slot the next host request payload goes into
+  CoreDlaJobItem* _pFillingImageJob;         // slot the msgdma is currently filling
+  CoreDlaJobItem _debugJob;                  // scratch job used by the Manual* debug handlers
+  NiosStatusType _status;
+  uint32_t _statusLineNumber;                // __LINE__ where _status was last set
+  uint32_t _commandCounter;
+  uint32_t _sourceBufferSize;
+  uint32_t _dropSourceBuffers;
+  uint32_t _totalNumInferenceRequests;
+  uint32_t _numInferenceRequests;
+  uint32_t _numExecutedJobs;
+  uint32_t _numScheduledInferences;
+  uint32_t _lastReceiveSequenceID;
+  uint32_t _sendSequenceID;
+  bool _running;
+  uint32_t _numReceivedSourceBuffers;
+  volatile uint32_t _isrCount;               // bumped by MsgDmaIsr, polled by the event loop
+  alt_msgdma_dev* _pMsgDevice;
+  alt_msgdma_extended_descriptor _msgdmaDescriptor;
+} StreamController;
diff --git a/python/openvino/runtime/coredla_device/stream_controller/app/stream_controller_messages.h b/python/openvino/runtime/coredla_device/stream_controller/app/stream_controller_messages.h
new file mode 100644
index 0000000..3891326
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/stream_controller/app/stream_controller_messages.h
@@ -0,0 +1,90 @@
+// Copyright 2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#pragma once
+#include <stdint.h>
+
+// Wire protocol for the host<->Nios mailbox.
+// NOTE(review): these layouts must match the host-side sender byte-for-byte
+// -- confirm against the plugin's StreamControllerComms implementation.
+typedef enum
+{
+  MessageType_Invalid,
+  MessageType_NoOperation,
+  MessageType_GetStatus,
+  MessageType_Status,
+  MessageType_ScheduleItem,
+  MessageType_Ping,
+  MessageType_Pong,
+  MessageType_InitializeStreamController,
+  MessageType_ManualArmDmaTransfer,
+  MessageType_ManualScheduleDlaInference
+} MessageType;
+
+// Status codes reported back via StatusMessagePayload (values start at 1000,
+// presumably to distinguish them from zeroed mailbox memory -- confirm).
+typedef enum
+{
+  NiosStatusType_OK = 1000,
+  NiosStatusType_Error,
+  NiosStatusType_BadMessage,
+  NiosStatusType_BadMessageSequence,
+  NiosStatusType_BadDescriptor,
+  NiosStatusType_AsyncTransferFailed,
+  NiosStatusType_MsgDmaFailed,
+  NiosStatusType_InvalidParameter
+} NiosStatusType;
+
+typedef struct
+{
+  uint32_t _messageReadyMagicNumber; // magic while pending; overwritten with the sequence ID as the ack
+  uint32_t _messageType;             // a MessageType value
+  uint32_t _sequenceID;
+  uint32_t _payload;                 // first payload word; larger payloads extend past this field in the mailbox
+} MessageHeader;
+
+// Message payloads:
+
+// ScheduleItem / debug inference: DDR addresses/sizes for one job.
+typedef struct
+{
+  uint32_t _configurationBaseAddressDDR;
+  uint32_t _configurationSize;
+  uint32_t _inputAddressDDR;
+  uint32_t _outputAddressDDR;
+} CoreDlaJobPayload;
+
+// InitializeStreamController: stream configuration from the host.
+typedef struct
+{
+  uint32_t _sourceBufferSize;
+  uint32_t _dropSourceBuffers;
+  uint32_t _numInferenceRequests;
+} InitializeStreamControllerPayload;
+
+// Status reply: controller state and counters.
+typedef struct
+{
+  NiosStatusType _status;
+  uint32_t _statusLineNumber;
+  uint32_t _numReceivedSourceBuffers;
+  uint32_t _numScheduledInferences;
+  uint32_t _numExecutedJobs;
+} StatusMessagePayload;
+
+// ManualArmDmaTransfer debug command.
+typedef struct
+{
+  uint32_t _sourceBufferSize;
+  uint32_t _inputAddressDDR;
+  uint32_t _fromHPS;  // nonzero selects the st->mm (capture) direction
+} ManualArmDmaTransferPayload;
+
+// ManualScheduleDlaInference debug command.
+typedef struct
+{
+  uint32_t _configurationBaseAddressDDR;
+  uint32_t _configurationSize;
+  uint32_t _inputAddressDDR;
+} ManualScheduleDlaInferencePayload;
+
diff --git a/python/openvino/runtime/coredla_device/stream_controller/build.sh b/python/openvino/runtime/coredla_device/stream_controller/build.sh
new file mode 100755
index 0000000..2d22c5e
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/stream_controller/build.sh
@@ -0,0 +1,54 @@
+#!
/bin/bash +# Run in Nios V Command Shell, Quartus Prime 22.4 or later + +quartus_project=$1 +qsys_file=$2 +hex_file=$3 + +usage() +{ + echo "Usage:" + echo " build.sh <quartus_project_file> <qsys_file> <destination_hex_file>" +} + +if [ -z "$quartus_project" ]; then + usage + exit 1 +fi + +if [ -z "$qsys_file" ]; then + usage + exit 1 +fi + +if [ -z "$hex_file" ]; then + usage + exit 1 +fi + +if [ ! -f "$quartus_project" ]; then + echo Quartus project file not found "$quartus_project" + usage + exit 1 +fi + +if [ ! -f "$qsys_file" ]; then + echo qsys file not found "$qsys_file" + usage + exit 1 +fi + +# Export the bsp folder from the Quartus project, create the +# CMakeFiles.txt for the application, build the app, then +# build the stream_controller.hex binary, in the 'build' folder + +niosv-bsp -c --quartus-project=$quartus_project --qsys=$qsys_file --type=hal bsp/settings.bsp +niosv-app --bsp-dir=bsp --app-dir=app --srcs=app --elf-name=stream_controller.elf + +# cmake dependency, version 3.14.10 or later. https://cmake.org/download/ +cmake -B build -DCMAKE_BUILD_TYPE=Release app +cmake --build build +elf2hex build/stream_controller.elf -b 0x0 -w 32 -e 0x1ffff -r 4 -o build/stream_controller.hex +cp build/stream_controller.hex $hex_file + +exit 0 |
