summaryrefslogtreecommitdiff
path: root/python/openvino/runtime/coredla_device/inc
diff options
context:
space:
mode:
authorEric Dao <eric@erickhangdao.com>2025-03-10 17:54:31 -0400
committerEric Dao <eric@erickhangdao.com>2025-03-10 17:54:31 -0400
commitab224e2e6ba65f5a369ec392f99cd8845ad06c98 (patch)
treea1e757e9341863ed52b8ad4c5a1c45933aab9da4 /python/openvino/runtime/coredla_device/inc
parent40da1752f2c8639186b72f6838aa415e854d0b1d (diff)
downloadthesis-master.tar.gz
thesis-master.tar.bz2
thesis-master.zip
completed thesisHEADmaster
Diffstat (limited to 'python/openvino/runtime/coredla_device/inc')
-rw-r--r--python/openvino/runtime/coredla_device/inc/batch_job.h31
-rw-r--r--python/openvino/runtime/coredla_device/inc/coredla_batch_job.h88
-rw-r--r--python/openvino/runtime/coredla_device/inc/coredla_device.h144
-rw-r--r--python/openvino/runtime/coredla_device/inc/coredla_graph_job.h83
-rw-r--r--python/openvino/runtime/coredla_device/inc/device.h81
-rw-r--r--python/openvino/runtime/coredla_device/inc/device_memory_allocator.h61
-rw-r--r--python/openvino/runtime/coredla_device/inc/dla_dma_constants.h27
-rw-r--r--python/openvino/runtime/coredla_device/inc/graph_job.h28
-rw-r--r--python/openvino/runtime/coredla_device/inc/mmd_wrapper.h63
-rw-r--r--python/openvino/runtime/coredla_device/inc/stream_controller_comms.h69
10 files changed, 675 insertions, 0 deletions
diff --git a/python/openvino/runtime/coredla_device/inc/batch_job.h b/python/openvino/runtime/coredla_device/inc/batch_job.h
new file mode 100644
index 0000000..76fd968
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/inc/batch_job.h
@@ -0,0 +1,31 @@
+// Copyright 2020-2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#ifndef BATCH_JOB_H
+#define BATCH_JOB_H
+
+class BatchJob {  // Abstract interface for one batch job: every operation below is pure virtual
+ public:
+ // @param inputArray - ptr to CPU array containing input data to be copied to DDR
+ // blocking function
+ virtual void LoadInputFeatureToDDR(void* inputArray) = 0;
+ // @param outputArray - ptr to CPU array where the output data in DDR is copied into
+ // outputArray must be allocated by the caller (size >= output_size_ddr)
+ // blocking function
+ virtual void ReadOutputFeatureFromDDR(void* outputArray) const = 0;
+ virtual void ScheduleInputFeature() const = 0;  // const: declared as not modifying the job object
+ virtual void StartDla() = 0;  // non-const, unlike ScheduleInputFeature()
+ virtual ~BatchJob() {}  // virtual so a concrete job can be destroyed through a BatchJob pointer
+};
+
+#endif
diff --git a/python/openvino/runtime/coredla_device/inc/coredla_batch_job.h b/python/openvino/runtime/coredla_device/inc/coredla_batch_job.h
new file mode 100644
index 0000000..7d91f0e
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/inc/coredla_batch_job.h
@@ -0,0 +1,88 @@
+// Copyright 2020-2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#pragma once
+
+#include "batch_job.h" // BatchJob
+#include "mmd_wrapper.h" // MmdWrapper
+
+// TODO:integrate with dla compiler later
+// #include "dla_types.h"
+// #include "compiled_result_runtime_required_elements.h"
+
+#include <cstdint> // uint64_t
+#include <memory> // std::unique_ptr
+
+class StreamControllerComms;
+
+// BatchJob represents one batch execution
+// Contains input/output address and size in DDR for one batch
+// Contains functions to write feature data to DDR, start DLA and read output data from DDR
+class CoreDlaBatchJob : public BatchJob {
+ private:
+ // MMD access is required to handshake with CSR and transfer data between host/device memory
+ MmdWrapper* mmdWrapper_;  // raw pointer; this class declares no destructor, so it is not freed here
+ int instance_;  // presumably the index of the DLA instance this job targets - confirm in the .cpp
+ // size and address of graph config data allocated in DDR
+ uint64_t totalConfigWords_;
+ uint64_t configBaseAddrDDR_;
+ // size and address of input and output data allocated in DDR for 1 batch
+ uint64_t inputAddrDDR_;
+ uint64_t outputAddrDDR_;
+ uint64_t inputSizeDDR_;
+ uint64_t outputSizeDDR_;
+ const bool enableIstream_;  // const member: fixed at construction
+ const bool enableOstream_;  // const member: fixed at construction
+ uint64_t lastJobQueueNumber_;
+
+ std::shared_ptr<StreamControllerComms> spStreamControllerComms_;  // shared ownership with the caller of MakeUnique()
+
+ CoreDlaBatchJob(MmdWrapper* mmdWrapper,  // private constructor: instances are obtained through MakeUnique()
+ uint64_t totalConfigWords,
+ uint64_t configBaseAddrDDR,
+ uint64_t inputAddrDDR,
+ uint64_t outputAddrDDR,
+ uint64_t inputSizeDDR,
+ uint64_t outputSizeDDR,
+ const bool enableIstream,
+ const bool enableOstream,
+ int instance,
+ std::shared_ptr<StreamControllerComms> spStreamControllerComms);
+
+ public:
+ CoreDlaBatchJob(const CoreDlaBatchJob&) = delete;  // copying is disabled
+ CoreDlaBatchJob(CoreDlaBatchJob&) = delete;  // the non-const-reference copy form is disabled as well
+ CoreDlaBatchJob& operator=(const CoreDlaBatchJob&) = delete;
+ static std::unique_ptr<BatchJob> MakeUnique(MmdWrapper* mmdWrapper,  // factory; same parameter list as the private constructor
+ uint64_t totalConfigWords,
+ uint64_t configBaseAddrDDR,
+ uint64_t inputAddrDDR,
+ uint64_t outputAddrDDR,
+ uint64_t inputSizeDDR,
+ uint64_t outputSizeDDR,
+ const bool enableIstream,
+ const bool enableOstream,
+ int instance,
+ std::shared_ptr<StreamControllerComms> spStreamControllerComms);
+ // @param inputArray - ptr to CPU array containing input data to be copied to DDR
+ // blocking function
+ void LoadInputFeatureToDDR(void* inputArray) override;
+ void ScheduleInputFeature() const override;
+
+ // Starts DLA by writing to CSR in DLA DMA; the DDR addresses of graph config and input data
+ void StartDla() override;
+ // @param outputArray - ptr to CPU array where the output data in DDR is copied into
+ // outputArray must be allocated by the caller (size >= output_size_ddr)
+ // blocking function
+ void ReadOutputFeatureFromDDR(void* outputArray) const override;
+};
diff --git a/python/openvino/runtime/coredla_device/inc/coredla_device.h b/python/openvino/runtime/coredla_device/inc/coredla_device.h
new file mode 100644
index 0000000..2a04fa8
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/inc/coredla_device.h
@@ -0,0 +1,144 @@
+// Copyright 2020-2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#pragma once
+
+#include "compiled_result.h" //dla::CompiledResult
+#include "device.h" //Device
+#include "device_memory_allocator.h" //DeviceMemoryAllocator
+#include "graph_job.h" //GraphJob
+#include "mmd_wrapper.h" //MmdWrapper
+
+#include <condition_variable> //std::condition_variable
+#include <cstdint> //uint64_t
+#include <map> //std::map
+#include <memory> //std::unique_ptr
+#include <mutex> //std::mutex
+#include <vector> //std::vector
+
+class StreamControllerComms;
+
+// The interface of the interrupt service routine dictates that all the data the ISR needs must be passed in through
+// one pointer of type void *. Package it up here. WaitForDla() uses jobsWaited and jobsFinished to determine if a job
+// has already finished or still needs to be waited for. The ISR only updates jobsFinished, so jobsWaited is only a member of
+// CoreDlaDevice. The mutex and condition variable are used to synchronize between InterruptServiceRoutine() and
+// WaitForDla(). All of these are replicated per CoreDLA IP instance, hence the use of vector.
+// base_multiplier and prevCount are used to handle the jobsFinished wrap-around that could happen in the hardware CSR
+// as the CSR is only 32-bit wide but the jobsFinished is 64-bit wide
+struct InterruptServiceRoutineData {  // bundle handed to the ISR through its single void* argument
+ MmdWrapper* mmdWrapper;  // raw pointer; not freed by this struct (no destructor is declared)
+ std::vector<uint64_t> jobsFinished;  // 64-bit completed-job counts, one entry per CoreDLA instance
+ std::vector<uint32_t> base_multiplier;  // with prevCount, handles wrap-around of the 32-bit hardware CSR count
+ std::vector<uint32_t> prevCount;  // see base_multiplier
+ std::vector<uint32_t> desc_queue_diag;  // NOTE(review): purpose not documented in this header - confirm in the .cpp
+ std::vector<std::mutex> isrMutex;  // with isrCondVar, synchronizes InterruptServiceRoutine() and WaitForDla()
+ std::vector<std::condition_variable> isrCondVar;  // see isrMutex
+};
+
+/*! CoreDlaDevice class represents a DLA device mapped using the MMD + OPAE SW stack
+ * On construction, dynamically loads MMD library at runtime and initializes the state of MMD
+ * Implements functions that wrap various MMD calls to read/write to DDR/CSR and process HW interrupts
+ */
+class CoreDlaDevice : public Device {
+ public:
+ GraphJob* CreateGraphJob(const dla::CompiledResult* compiledResult,
+#ifndef USE_OLD_COREDLA_DEVICE
+ size_t numPipelines,
+#else
+ uint64_t numPipelines,
+#endif
+ int instance,
+ std::string AES_key,
+ std::string IV_key,
+ bool encryption_enabled,
+ // The export_dir parameter is unused for HW runtime. It is included because CoreDLA utilizes base pointers
+ // for both HW and SW emulator runtime. The software emulator has an output file whereas currently the
+ // HW runtime does not.
+ const std::string export_dir,
+ const std::string parameter_rom_export_dir);  // NOTE(review): not marked override, unlike the other Device virtuals here
+ // Return number of DLA jobs completed till now
+ // Used for debugging; the 64-bit counter is implicitly narrowed to int, and at() throws std::out_of_range on a bad instance
+ int GetNumInferencesCompleted(int instance) const override { return isrData_.jobsFinished.at(instance); }
+ // Must be called when there are no active jobs on DLA
+ // Returns the total time taken by DLA jobs on hardware (in milliseconds)
+ double GetActiveHWTimeMs(int instance) const override;
+ // Must be called when there are no active jobs on DLA
+ // Returns the average of time taken per job (in milliseconds)
+ // Avg Time per job < Active Time
+ double GetAvgHWTimePerJobMs(uint64_t num_jobs, int instance) const override;
+ // Must be called when there are no active jobs on DLA
+ // Returns the number of memory reads made by the input feature reader
+ uint64_t GetNumInputFeatureMemoryReads(int instance) const override;
+ // Must be called when there are no active jobs on DLA
+ // Returns the number of memory reads made by the filter reader
+ uint64_t GetNumFilterMemoryReads(int instance) const override;
+ // Must be called when there are no active jobs on DLA
+ // Returns the number of memory writes made by the output feature writer
+ uint64_t GetNumOutputFeatureMemoryWrites(int instance) const override;
+
+ private:
+ // Read one 32-bit value from the debug network, return value indicates whether read was successful. A read can fail
+ // if the module number and address have not been implemented. The debug network is fault tolerant to both read
+ // requests never being accepted as well as read responses never being produced.
+ bool ReadDebugCsr(uint32_t moduleNum, uint32_t address, int instance, uint32_t& readData, bool verbose = false) const;
+
+#ifndef USE_OLD_COREDLA_DEVICE
+ // Must be called when there are no active jobs on DLA
+ // Returns total number of clocks by DLA jobs on hardware.
+ uint64_t GetClocksActive(int instance) const;
+
+ // Must be called when there are no active jobs on DLA
+ // Returns the clocks of all jobs
+ uint64_t GetClocksAllJobs(int instance) const;
+#endif
+
+ uint64_t GetNumInputFeatureMemoryReadsTotal(int instance) const;  // private counterpart of GetNumInputFeatureMemoryReads()
+
+ uint64_t GetNumFilterMemoryReadsTotal(int instance) const;  // private counterpart of GetNumFilterMemoryReads()
+
+ uint64_t GetNumOutputFeatureMemoryWritesTotal(int instance) const;  // private counterpart of GetNumOutputFeatureMemoryWrites()
+
+ public:
+ // Modules attached to the debug network have a ROM to specify the offset and description of the registers. Traverse
+ // this ROM, then return a map of key/value pairs, where the key is a human readable string describing what kind of
+ // information the debug register contains, and the value is the data of the debug register.
+ DebugNetworkData ReadDebugNetwork(int instance) const override;
+
+ CoreDlaDevice(uint32_t waitForDlaTimeoutSeconds);  // NOTE(review): single-argument and not explicit, so a uint32_t converts implicitly
+ ~CoreDlaDevice();
+ int GetSizeCsrDescriptorQueue() const override;
+ double GetCoreDlaClockFreq() const override;
+ int GetNumInstances() const override { return numInstances_; }
+ void WaitForDla(int instance, size_t threadId = 0, std::function<bool()> isCancelled = nullptr) override; // threadId is optional and for debugging purpose only
+ std::string SchedulerGetStatus() const override;
+ bool InitializeScheduler(uint32_t sourceBufferSize, uint32_t dropSourceBuffers, uint32_t numInferenceRequests,
+ const std::string source_fifo_file="") override;
+
+ private:
+ std::unique_ptr<DeviceMemoryAllocator[]> ddrAllocator_;  // owned array of allocators (presumably one per instance - confirm)
+ std::vector<std::unique_ptr<GraphJob>> allGraphJobs_;  // graph jobs owned by the device
+ int numInstances_;  // value returned by GetNumInstances()
+ MmdWrapper mmdWrapper_;  // held by value
+ InterruptServiceRoutineData isrData_;  // jobsFinished inside it backs GetNumInferencesCompleted()
+ std::vector<uint64_t> jobsWaited_;
+#ifndef USE_OLD_COREDLA_DEVICE
+ std::vector<uint64_t> startClocksActive;
+ std::vector<uint64_t> startClockAllJobs;
+#endif
+ std::vector<uint64_t> startNumInputFeatureMemoryReads;
+ std::vector<uint64_t> startNumFilterMemoryReads;
+ std::vector<uint64_t> startNumOutputFeatureMemoryWrites;
+ std::shared_ptr<StreamControllerComms> spStreamControllerComms_;
+ bool runtimePolling_;
+ uint32_t waitForDlaTimeoutSeconds_;  // presumably stores the constructor argument of the same name - confirm in the .cpp
+};
diff --git a/python/openvino/runtime/coredla_device/inc/coredla_graph_job.h b/python/openvino/runtime/coredla_device/inc/coredla_graph_job.h
new file mode 100644
index 0000000..3dc91bc
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/inc/coredla_graph_job.h
@@ -0,0 +1,83 @@
+// Copyright 2020-2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#pragma once
+
+#include "compiled_result.h" //dla::CompiledResult
+#include "coredla_batch_job.h" //BatchJob
+#include "device.h" //DLA_LOG
+#include "device_memory_allocator.h" //DeviceMemoryAllocator
+#include "graph_job.h" //GraphJob
+#include "mmd_wrapper.h" //MmdWrapper
+
+// TODO:integrate with dla compiler later
+//#include "dla_types.h"
+//#include "compiled_result_runtime_required_elements.h"
+
+#include <cstdint> //uint64_t
+#include <memory> //std::unique_ptr
+#include <mutex> //std::mutex
+#include <vector> //std::vector
+
+/*! GraphJob is a DLA compiled graph loaded onto a device
+ * Initialized with DlaDevice object
+ * GraphJob allocates space in DDR for filter, bias, config, inputs and outputs
+ * It provides handle to "batch job" objects that are used to load input and start DLA for one batch
+ */
+
+class CoreDlaGraphJob : public GraphJob {
+ public:
+ // Function to construct and return a unique pointer GraphJob object to the runtime user
+ // TODO: Provide DLA compiled result object which will contain all the necessary runtime elements as below
+ // @param configFilterBiasBufferSizeDDR - total size of the constants - config, filter and bias
+ // @param configFilterBiasBuffer - ptr to one contiguous CPU array for config, filter and bias (obtained from DLA
+ // compiler's output)
+ // @param totalConfigWords - size of config data in words (size of 1 config word is defined in dla_device.h
+ // "CONFIG_READER_DATA_BYTES")
+ // @param intermediateBufferSizeDDR - size of the buffer space required in DDR for feature data of intermediate layers
+ // @param inputSizeDDR - size of one batch input data in DDR. Multiple images in one batch should be contiguously
+ // placed
+ // @param outputSizeDDR - size of one batch output data in DDR
+ // @param numPipelines - number of I/O buffer pairs created for CPU-FPGA pipelining of multiple batch runs
+ // @param spStreamControllerComms - optional interface to stream controller
+ static std::unique_ptr<GraphJob> MakeUnique(DeviceMemoryAllocator* ddrBufferAllocator,
+ MmdWrapper* mmdWrapper,
+ const dla::CompiledResult* compiled_result,
+ uint64_t numPipelines,
+ int instance,
+ std::shared_ptr<StreamControllerComms> spStreamControllerComms);
+ // Returns an unused batch job object
+ // If all batch jobs are used, returns null
+ // Increments batchJobsRequested_
+ // Thread safe
+ BatchJob* GetBatchJob();
+ CoreDlaGraphJob(const CoreDlaGraphJob&) = delete;  // copy constructor is disabled
+ CoreDlaGraphJob(CoreDlaGraphJob&) = delete;  // the non-const-reference copy form is disabled as well
+ CoreDlaGraphJob& operator=(const CoreDlaGraphJob&) = delete;  // copy assignment is disabled
+
+ private:
+ uint64_t configFilterBiasBufferSizeDDR_;
+ uint64_t intermediateBufferSizeDDR_;
+ DeviceMemoryAllocator* ddrBufferAllocator_;  // raw pointer; no destructor is declared here, so it is not freed by this class
+ MmdWrapper* mmdWrapper_;  // raw pointer; likewise not freed by this class
+ std::vector<std::unique_ptr<BatchJob>> batchJobs_;  // owns the batch jobs; GetBatchJob() returns raw pointers
+ unsigned int batchJobsRequested_;  // incremented by GetBatchJob()
+ unsigned int instance_;  // NOTE(review): unsigned here, but the constructor parameter is int
+ std::mutex graphJobMutex;  // presumably what makes GetBatchJob() thread safe - confirm in the .cpp
+ CoreDlaGraphJob(DeviceMemoryAllocator* ddrBufferAllocator,  // private constructor: instances are obtained through MakeUnique()
+ MmdWrapper* mmdWrapper,
+ const dla::CompiledResult* compiledResult,
+ uint64_t numPipelines,
+ int instance,
+ std::shared_ptr<StreamControllerComms> spStreamControllerComms);
+};
diff --git a/python/openvino/runtime/coredla_device/inc/device.h b/python/openvino/runtime/coredla_device/inc/device.h
new file mode 100644
index 0000000..e506578
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/inc/device.h
@@ -0,0 +1,81 @@
+// Copyright 2020-2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#ifndef DEVICE_H
+#define DEVICE_H
+
+#include <functional>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "dla_runtime_log.h"
+
+using namespace std;
+using DebugNetworkData = std::map<std::string, uint64_t>;
+
+// dla log macros: both forward to printf, and each expansion already ends in ';'
+#define DLA_LOG(fmt, ...) printf(fmt, ##__VA_ARGS__);
+#define DLA_ERROR(fmt, ...) printf(fmt, ##__VA_ARGS__);
+
+class GraphJob;
+class arch_params;
+namespace dla {
+class CompiledResult;
+}
+class Device {  // Abstract device interface: everything except the static MakeUnique() factory is pure virtual
+ public:
+ static unique_ptr<Device> MakeUnique(const arch_params* archParams, uint32_t waitForDlaTimeoutSeconds);  // unqualified unique_ptr relies on the using-directive above
+ virtual GraphJob* CreateGraphJob(const dla::CompiledResult* compiledResult,  // returns a raw GraphJob pointer
+ size_t numPipelines,
+ int instance,
+ std::string AES_key,
+ std::string IV_key,
+ bool encryption_enabled,
+ const std::string export_dir,
+ const std::string parameter_rom_export_dir) = 0;
+ // Return number of DLA jobs completed till now
+ // Used for debugging
+ virtual int GetNumInferencesCompleted(int instance) const = 0;
+ // Must be called when there are no active jobs on DLA
+ // Returns the total time taken by DLA jobs on hardware (in milliseconds)
+ virtual double GetActiveHWTimeMs(int instance) const = 0;
+ // Must be called when there are no active jobs on DLA
+ // Returns the average of time taken per job (in milliseconds)
+ // Avg Time per job < Active Time
+ virtual double GetAvgHWTimePerJobMs(uint64_t num_jobs, int instance) const = 0;
+ // Must be called when there are no active jobs on DLA
+ // Returns the number of memory reads made by the input feature reader
+ virtual uint64_t GetNumInputFeatureMemoryReads(int instance) const = 0;
+ // Must be called when there are no active jobs on DLA
+ // Returns the number of memory reads made by the filter reader
+ virtual uint64_t GetNumFilterMemoryReads(int instance) const = 0;
+ // Must be called when there are no active jobs on DLA
+ // Returns the number of memory writes made by the output feature writer
+ virtual uint64_t GetNumOutputFeatureMemoryWrites(int instance) const = 0;
+ // Waits for a job to finish on specified instance; default arguments are chosen by the static type of the call expression
+ virtual void WaitForDla(int instance, size_t threadId = 0, std::function<bool()> isCancelled = nullptr) = 0;
+ virtual int GetNumInstances() const = 0;
+ virtual double GetCoreDlaClockFreq() const = 0;
+ virtual int GetSizeCsrDescriptorQueue() const = 0;
+ virtual std::string SchedulerGetStatus() const = 0;
+ virtual bool InitializeScheduler(uint32_t sourceBufferSize,  // source_fifo_file defaults to an empty string
+ uint32_t dropSourceBuffers,
+ uint32_t numInferenceRequests,
+ const std::string source_fifo_file="") = 0;
+ virtual DebugNetworkData ReadDebugNetwork(int instance) const = 0;  // DebugNetworkData is std::map<std::string, uint64_t>
+ virtual ~Device(){}  // virtual so a concrete device can be destroyed through a Device pointer
+};
+
+#endif // DEVICE_H
diff --git a/python/openvino/runtime/coredla_device/inc/device_memory_allocator.h b/python/openvino/runtime/coredla_device/inc/device_memory_allocator.h
new file mode 100644
index 0000000..adc0a71
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/inc/device_memory_allocator.h
@@ -0,0 +1,61 @@
+// Copyright 2020 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#pragma once
+
+#include "mmd_wrapper.h" //MmdWrapper
+
+#include <cstdint> //uint64_t
+
+/*! DeviceMemoryAllocator class allocates multiple DLA graph buffers in DDR
+ * Each graph is expected to have one contiguous buffer containing all data (config, filter, bias, I/O)
+ * A graph buffer is allocated in DDR from right to left
+ * A scratchpad space is allocated in DDR to be shared across all graphs for intermediate feature data
+ * This intermediate buffer space is allocated from left to right (starting address is 0)
+ * and is expanded based on graph's requirement
+ */
+class DeviceMemoryAllocator {
+ public:
+ void Initialize(uint64_t totalSize, MmdWrapper *mmdWrapper);  // no constructor is declared; state is supplied through this call
+ ~DeviceMemoryAllocator();
+
+ // Buffers that can be shared across multiple graphs may grow in size after they are allocated. The intermediate
+ // buffer is an example of this. We have decided to allocate this at the lowest address and let it grow upwards.
+ // @param bufferSize - the size of the buffer in bytes
+ // @param instance - there can be multiple instances of DLA on FPGA, specify which DLA instance is this buffer for
+ void AllocateSharedBuffer(uint64_t bufferSize, int instance);
+
+ // Buffers that are private to one graph will not change in size after allocation. The config/filter buffer is
+ // an example of this. We have decided to allocate this at the upper address and allocate downwards from there.
+ // Hardware requires the starting address of each buffer to have some alignment, and the allocator will add
+ // as much padding as needed to ensure this. Each contiguous section in device memory should have its own call
+ // to the allocator.
+ // @param bufferSize - the size of the buffer in bytes
+ // @param bufferAlignment - specify how much address alignment is needed for this buffer, must be a power of 2
+ // @param bufferAddr - output parameter (non-const reference): the allocator indicates where it placed this buffer
+ void AllocatePrivateBuffer(uint64_t bufferSize, uint64_t bufferAlignment, uint64_t &bufferAddr);
+
+ // Clears whole DDR space including the intermediate buffer
+ void Clear();
+
+ private:
+ // total DDR size (BSP parameter)
+ uint64_t totalGlobalMemSize_;
+ // For access to MMD
+ MmdWrapper *mmdWrapper_;
+ // current starting address of allocated graph buffer region
+ // graph buffers are allocated right to left
+ uint64_t currentStartAddressGraphBufferSpace_;
+ // current maximum allocated size for intermediate data
+ uint64_t currentIntermediateMaxBufferSizeAllocated_;
+};
diff --git a/python/openvino/runtime/coredla_device/inc/dla_dma_constants.h b/python/openvino/runtime/coredla_device/inc/dla_dma_constants.h
new file mode 100644
index 0000000..13fb56b
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/inc/dla_dma_constants.h
@@ -0,0 +1,27 @@
+// Copyright 2020 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#pragma once
+
+// save a copy
+#pragma push_macro("localparam")
+
+// convert the syntax of verilog into C++, replace "localparam int MY_VAR = 123;" with "constexpr int MY_VAR = 123;"
+#undef localparam
+#define localparam constexpr
+
+// include the verilog header
+#include "dla_dma_constants.svh"
+
+// undo the syntax change
+#pragma pop_macro("localparam")
diff --git a/python/openvino/runtime/coredla_device/inc/graph_job.h b/python/openvino/runtime/coredla_device/inc/graph_job.h
new file mode 100644
index 0000000..b04dde1
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/inc/graph_job.h
@@ -0,0 +1,28 @@
+// Copyright 2020-2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#ifndef GRAPH_JOB_H
+#define GRAPH_JOB_H
+
+#include "batch_job.h"
+using namespace std;
+class GraphJob {  // Abstract interface whose only operation hands out BatchJob pointers
+ public:
+ // Returns an unused batch job object
+ // If all batch jobs are used, returns null
+ virtual BatchJob* GetBatchJob() = 0;  // raw pointer: the return type does not express ownership
+
+ virtual ~GraphJob(){}  // virtual so a concrete graph job can be destroyed through a GraphJob pointer
+};
+
+#endif
diff --git a/python/openvino/runtime/coredla_device/inc/mmd_wrapper.h b/python/openvino/runtime/coredla_device/inc/mmd_wrapper.h
new file mode 100644
index 0000000..4014454
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/inc/mmd_wrapper.h
@@ -0,0 +1,63 @@
+// Copyright 2020 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#pragma once
+
+#include <cstdint> //uint32_t
+
+using interrupt_service_routine_signature = void (*)(int handle, void *data);
+
+class MmdWrapper {
+ public:
+ MmdWrapper();
+ // Note that ~MmdWrapper() can call std::exit(1) if aocl_mmd_close()
+ // fails. Ideally we would find some way to re-order the code so that it
+ // can throw an exception (before calling the destructor) if aocl_mmd_close()
+ // fails.
+ ~MmdWrapper();
+
+ // class cannot be copied
+ MmdWrapper(const MmdWrapper &) = delete;
+ MmdWrapper &operator=(const MmdWrapper &) = delete;
+
+ // Register a function to run as the interrupt service routine; func has the signature void(int handle, void *data)
+ void RegisterISR(interrupt_service_routine_signature func, void *data) const;
+
+ // 32-bit handshake with each CSR
+ void WriteToCsr(int instance, uint32_t addr, uint32_t data) const;
+ uint32_t ReadFromCsr(int instance, uint32_t addr) const;
+
+ // Copy data between host and device memory
+ void WriteToDDR(int instance, uint64_t addr, uint64_t length, const void *data) const;
+ void ReadFromDDR(int instance, uint64_t addr, uint64_t length, void *data) const;
+
+ // If the mmd layer supports accesses to the STREAM CONTROLLER
+ bool bIsStreamControllerValid(int instance) const;
+
+ // Buffer transfers to/from the Stream Controller: 32-bit addr plus a length (presumably in bytes - confirm)
+ void WriteToStreamController(int instance, uint32_t addr, uint64_t length, const void *data) const;
+ void ReadFromStreamController(int instance, uint32_t addr, uint64_t length, void *data) const;
+
+ // Provide read-only access to board-specific constants
+ int GetMaxInstances() const { return maxInstances_; }
+ uint64_t GetDDRSizePerInstance() const { return ddrSizePerInstance_; }
+ double GetCoreDlaClockFreq() const { return coreDlaClockFreq_; }
+ double GetDDRClockFreq() const { return ddrClockFreq_; }
+
+ private:
+ int handle_;  // presumably the MMD handle that is closed in the destructor - confirm in the .cpp
+ int maxInstances_;  // returned by GetMaxInstances()
+ uint64_t ddrSizePerInstance_;  // returned by GetDDRSizePerInstance()
+ double coreDlaClockFreq_;  // returned by GetCoreDlaClockFreq()
+ double ddrClockFreq_;  // returned by GetDDRClockFreq()
+};
diff --git a/python/openvino/runtime/coredla_device/inc/stream_controller_comms.h b/python/openvino/runtime/coredla_device/inc/stream_controller_comms.h
new file mode 100644
index 0000000..e2fcdfc
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/inc/stream_controller_comms.h
@@ -0,0 +1,69 @@
+// Copyright 2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#pragma once
+#include <mutex>
+#include <string>
+#include <vector>
+#include "mmd_wrapper.h"
+#include "stream_controller_messages.h"
+
+template <class T>  // T is the message struct being wrapped
+struct Payload : public T {  // adds pointer/size accessors on top of T by public inheritance
+ void* GetPayload() { return this; }  // address of this object, as an untyped pointer
+ size_t GetSize() { return sizeof(*this); }  // sizeof(Payload<T>)
+};
+
+class BusyFlag {  // a boolean busy state stored next to a recursive mutex
+ public:
+ bool Lock();  // NOTE(review): presumably returns false when already busy - confirm in the .cpp
+ void Release();
+
+ private:
+ std::recursive_mutex _mutex;
+ bool _busy = false;  // starts out not busy
+};
+
+class BusyCheck {  // helper bound to one BusyFlag; NOTE(review): looks like a scoped lock attempt - confirm in the .cpp
+ public:
+ BusyCheck(BusyFlag& busyFlag);  // not explicit, so a BusyFlag lvalue converts implicitly
+ ~BusyCheck();
+ operator bool();  // implicit, non-const conversion; presumably reports _haveLocked - confirm
+
+ private:
+ BusyFlag& _busyFlag;  // reference member: the BusyFlag must outlive this object
+ bool _haveLocked;
+};
+
+class StreamControllerComms {  // message-level interface to the stream controller
+ public:
+ StreamControllerComms();
+ bool IsPresent();
+ Payload<StatusMessagePayload> GetStatus();  // status payload is returned by value
+ std::string GetStatusString(Payload<StatusMessagePayload>& statusPayload);  // takes the payload by non-const reference
+ bool ScheduleItems(std::vector<Payload<CoreDlaJobPayload>> items);  // the vector is passed by value
+ bool Ping();
+ bool Initialize(uint32_t sourceBufferSize, uint32_t dropSourceBuffers, uint32_t numInferenceRequests);
+
+ private:
+ bool StatusMessageHandler(uint32_t payloadOffset);
+ MessageType ReceiveMessage();
+ bool SendMessage(MessageType, void* pPayload = nullptr, size_t size = 0);  // payload is optional: defaults to none
+ MmdWrapper _mmdWrapper;  // held by value: this object has its own MmdWrapper
+ uint32_t _lastReceiveSequenceID = 0;
+ uint32_t _sendSequenceID = 0;
+ uint32_t _numBadMessages = 0;
+ const int _streamControllerInstance = 0;  // const, fixed at 0
+ Payload<StatusMessagePayload> _receivedStatusMessage;
+ BusyFlag _busyFlag;
+};