diff options
| author | Eric Dao <eric@erickhangdao.com> | 2025-03-10 17:54:31 -0400 |
|---|---|---|
| committer | Eric Dao <eric@erickhangdao.com> | 2025-03-10 17:54:31 -0400 |
| commit | ab224e2e6ba65f5a369ec392f99cd8845ad06c98 (patch) | |
| tree | a1e757e9341863ed52b8ad4c5a1c45933aab9da4 /python/openvino/runtime/coredla_device/inc | |
| parent | 40da1752f2c8639186b72f6838aa415e854d0b1d (diff) | |
| download | thesis-master.tar.gz thesis-master.tar.bz2 thesis-master.zip | |
Diffstat (limited to 'python/openvino/runtime/coredla_device/inc')
10 files changed, 675 insertions, 0 deletions
// ===== file: python/openvino/runtime/coredla_device/inc/batch_job.h (new file, mode 100644) =====
// Copyright 2020-2023 Intel Corporation.
//
// This software and the related documents are Intel copyrighted materials,
// and your use of them is governed by the express license under which they
// were provided to you ("License"). Unless the License provides otherwise,
// you may not use, modify, copy, publish, distribute, disclose or transmit
// this software or the related documents without Intel's prior written
// permission.
//
// This software and the related documents are provided as is, with no express
// or implied warranties, other than those that are expressly stated in the
// License.

#ifndef BATCH_JOB_H
#define BATCH_JOB_H

// Abstract interface representing one batch execution on the DLA device.
// A concrete implementation (e.g. CoreDlaBatchJob) holds the DDR locations for
// one batch and implements host<->device data movement plus hardware kickoff.
class BatchJob {
 public:
  // Copies one batch of input feature data from host memory into device DDR.
  // @param inputArray - ptr to CPU array containing input data to be copied to DDR
  // blocking function
  virtual void LoadInputFeatureToDDR(void* inputArray) = 0;
  // Copies this batch's output feature data from device DDR back to host memory.
  // @param outputArray - ptr to CPU array where the output data in DDR is copied into
  // outputArray must be allocated by the caller (size >= output_size_ddr)
  // blocking function
  virtual void ReadOutputFeatureFromDDR(void* outputArray) const = 0;
  // Hands this batch's input feature over for scheduling (see CoreDlaBatchJob
  // for the concrete behavior; semantics beyond the name are not visible here).
  virtual void ScheduleInputFeature() const = 0;
  // Starts DLA inference for this batch.
  virtual void StartDla() = 0;
  // Virtual destructor: BatchJob is deleted through base-class pointers
  // (GraphJob hands out BatchJob*).
  virtual ~BatchJob() {}
};

#endif
// ===== file: python/openvino/runtime/coredla_device/inc/coredla_batch_job.h (new file, mode 100644) =====
// Copyright 2020-2023 Intel Corporation.
//
// This software and the related documents are Intel copyrighted materials,
// and your use of them is governed by the express license under which they
// were provided to you ("License"). Unless the License provides otherwise,
// you may not use, modify, copy, publish, distribute, disclose or transmit
// this software or the related documents without Intel's prior written
// permission.
//
// This software and the related documents are provided as is, with no express
// or implied warranties, other than those that are expressly stated in the
// License.

#pragma once

#include "batch_job.h"    // BatchJob
#include "mmd_wrapper.h"  // MmdWrapper

// TODO:integrate with dla compiler later
// #include "dla_types.h"
// #include "compiled_result_runtime_required_elements.h"

#include <cstdint>  // uint64_t
#include <memory>   // std::unique_ptr

class StreamControllerComms;

// BatchJob represents one batch execution
// Contains input/output address and size in DDR for one batch
// Contains functions to write feature data to DDR, start DLA and read output data from DDR
class CoreDlaBatchJob : public BatchJob {
 private:
  // MMD access is required to handshake with CSR and transfer data between host/device memory
  MmdWrapper* mmdWrapper_;  // non-owning; lifetime managed by the device object
  int instance_;            // which CoreDLA IP instance this job targets
  // size and address of graph config data allocated in DDR
  uint64_t totalConfigWords_;
  uint64_t configBaseAddrDDR_;
  // size and address of input and output data allocated in DDR for 1 batch
  uint64_t inputAddrDDR_;
  uint64_t outputAddrDDR_;
  uint64_t inputSizeDDR_;
  uint64_t outputSizeDDR_;
  // stream-in / stream-out enables, fixed for the lifetime of the job
  const bool enableIstream_;
  const bool enableOstream_;
  uint64_t lastJobQueueNumber_;

  // Optional stream controller interface; may be shared with the device.
  std::shared_ptr<StreamControllerComms> spStreamControllerComms_;

  // Private: instances are created only through MakeUnique (factory pattern).
  CoreDlaBatchJob(MmdWrapper* mmdWrapper,
                  uint64_t totalConfigWords,
                  uint64_t configBaseAddrDDR,
                  uint64_t inputAddrDDR,
                  uint64_t outputAddrDDR,
                  uint64_t inputSizeDDR,
                  uint64_t outputSizeDDR,
                  const bool enableIstream,
                  const bool enableOstream,
                  int instance,
                  std::shared_ptr<StreamControllerComms> spStreamControllerComms);

 public:
  // Non-copyable (holds device addresses and a raw MmdWrapper pointer).
  CoreDlaBatchJob(const CoreDlaBatchJob&) = delete;
  CoreDlaBatchJob(CoreDlaBatchJob&) = delete;
  CoreDlaBatchJob& operator=(const CoreDlaBatchJob&) = delete;
  // Factory: constructs a CoreDlaBatchJob and returns it through the BatchJob
  // interface. Parameters mirror the private constructor above.
  static std::unique_ptr<BatchJob> MakeUnique(MmdWrapper* mmdWrapper,
                                              uint64_t totalConfigWords,
                                              uint64_t configBaseAddrDDR,
                                              uint64_t inputAddrDDR,
                                              uint64_t outputAddrDDR,
                                              uint64_t inputSizeDDR,
                                              uint64_t outputSizeDDR,
                                              const bool enableIstream,
                                              const bool enableOstream,
                                              int instance,
                                              std::shared_ptr<StreamControllerComms> spStreamControllerComms);
  // @param inputArray - ptr to CPU array containing input data to be copied to DDR
  // blocking function
  void LoadInputFeatureToDDR(void* inputArray) override;
  void ScheduleInputFeature() const override;

  // Starts DLA by writing to CSR in DLA DMA; the DDR addresses of graph config and input data
  void StartDla() override;
  // @param outputArray - ptr to CPU array where the output data in DDR is copied into
  // outputArray must be allocated by the caller (size >= output_size_ddr)
  // blocking function
  void ReadOutputFeatureFromDDR(void* outputArray) const override;
};
// ===== file: python/openvino/runtime/coredla_device/inc/coredla_device.h (new file, mode 100644) =====
// Copyright 2020-2023 Intel Corporation.
//
// This software and the related documents are Intel copyrighted materials,
// and your use of them is governed by the express license under which they
// were provided to you ("License"). Unless the License provides otherwise,
// you may not use, modify, copy, publish, distribute, disclose or transmit
// this software or the related documents without Intel's prior written
// permission.
//
// This software and the related documents are provided as is, with no express
// or implied warranties, other than those that are expressly stated in the
// License.

#pragma once

#include "compiled_result.h"          //dla::CompiledResult
#include "device.h"                   //Device
#include "device_memory_allocator.h"  //DeviceMemoryAllocator
#include "graph_job.h"                //GraphJob
#include "mmd_wrapper.h"              //MmdWrapper

#include <condition_variable>  //std::condition_variable
#include <cstdint>             //uint64_t
#include <map>                 //std::map
#include <memory>              //std::unique_ptr
#include <mutex>               //std::mutex
#include <vector>              //std::vector

class StreamControllerComms;

// The interface of the interrupt service routine dictates that all the data the ISR needs must be passed in through
// one pointer of type void *. Package it up here. WaitForDla() uses jobsWaited and jobsFinished to determine if a job
// has already finished or it still needs wait. The ISR only updates jobsFinished, so jobsWaited is only a member of
// CoreDlaDevice. The mutex and condition variable are used to synchronize between InterruptServiceRoutine() and
// WaitForDla(). All of these are replicated per CoreDLA IP instance, hence the use of vector.
// base_multiplier and prevCount are used to handle the jobsFinished wrap-around that could happen in the hardware CSR
// as the CSR is only 32-bit wide but the jobsFinished is 64-bit wide
struct InterruptServiceRoutineData {
  // Non-owning handle for CSR access from the ISR.
  MmdWrapper* mmdWrapper;
  // Per-instance count of completed jobs (64-bit; see wrap-around note above).
  std::vector<uint64_t> jobsFinished;
  // Per-instance wrap-around bookkeeping for the 32-bit hardware counter.
  std::vector<uint32_t> base_multiplier;
  std::vector<uint32_t> prevCount;
  // Per-instance descriptor-queue diagnostic value (exact semantics live in
  // the .cpp — not visible from this header).
  std::vector<uint32_t> desc_queue_diag;
  // Per-instance synchronization between the ISR and WaitForDla().
  std::vector<std::mutex> isrMutex;
  std::vector<std::condition_variable> isrCondVar;
};
/*! DlaDevice class represents a DLA device mapped using the MMD + OPAE SW stack
 * On construction, dynamically loads MMD library at runtime and initializes the state of MMD
 * Implements functions that wrap various MMD calls to read/write to DDR/CSR and process HW interrupts
 */
class CoreDlaDevice : public Device {
 public:
  // NOTE(review): this implements Device::CreateGraphJob but is not marked
  // `override`; adding it here could break the USE_OLD_COREDLA_DEVICE build if
  // size_t != uint64_t on some target — confirm before changing.
  GraphJob* CreateGraphJob(const dla::CompiledResult* compiledResult,
#ifndef USE_OLD_COREDLA_DEVICE
                           size_t numPipelines,
#else
                           uint64_t numPipelines,
#endif
                           int instance,
                           std::string AES_key,
                           std::string IV_key,
                           bool encryption_enabled,
                           // This param is unused for HW runtime! So why include it? CoreDLA utilizes base pointers
                           // for both HW and SW emulator runtime. The software emulator has output file where as currently the
                           // HW runtime does not.
                           const std::string export_dir,
                           const std::string parameter_rom_export_dir);
  // Return number of DLA jobs completed till now
  // Used for debugging
  // NOTE(review): narrows a uint64_t counter to int — fine for debug counts,
  // but will wrap for very long runs.
  int GetNumInferencesCompleted(int instance) const override { return isrData_.jobsFinished.at(instance); }
  // Must be called when there are no active jobs on DLA
  // Returns the total time taken by DLA jobs on hardware (in milliseconds)
  double GetActiveHWTimeMs(int instance) const override;
  // Must be called when there are no active jobs on DLA
  // Returns the average of time taken per job (in milliseconds)
  // Avg Time per job < Active Time
  double GetAvgHWTimePerJobMs(uint64_t num_jobs, int instance) const override;
  // Must be called when there are no active jobs on DLA
  // Returns the number of memory read made by the input feature reader
  uint64_t GetNumInputFeatureMemoryReads(int instance) const override;
  // Must be called when there are no active jobs on DLA
  // Returns the number of memory read made by the filter reader
  uint64_t GetNumFilterMemoryReads(int instance) const override;
  // Must be called when there are no active jobs on DLA
  // Returns the number of memory writes made by the output feature writer
  uint64_t GetNumOutputFeatureMemoryWrites(int instance) const override;

 private:
  // Read one 32-bit value from the debug network, return value indicates whether read was successful. A read can fail
  // if the module number and address have not been implemented. The debug network is fault tolerant to both read
  // requests never being accepted as well as read responses never being produced.
  bool ReadDebugCsr(uint32_t moduleNum, uint32_t address, int instance, uint32_t& readData, bool verbose = false) const;

#ifndef USE_OLD_COREDLA_DEVICE
  // Must be called when there are no active jobs on DLA
  // Returns total number of clocks by DLA jobs on hardware.
  uint64_t GetClocksActive(int instance) const;

  // Must be called when there are no active jobs on DLA
  // Returns the clocks of all jobs
  uint64_t GetClocksAllJobs(int instance) const;
#endif

  // Raw (since power-up) counter reads; the public Get* variants presumably
  // subtract the startNum* snapshots below — confirm in the .cpp.
  uint64_t GetNumInputFeatureMemoryReadsTotal(int instance) const;

  uint64_t GetNumFilterMemoryReadsTotal(int instance) const;

  uint64_t GetNumOutputFeatureMemoryWritesTotal(int instance) const;

 public:
  // Modules attached to the debug network have a ROM to specify the offset and description of the registers. Traverse
  // this ROM, then return a map of key/value pairs, where the key is a human readable string describing what kind of
  // information the debug register contains, and the value is the data of the debug register.
  DebugNetworkData ReadDebugNetwork(int instance) const override;

  CoreDlaDevice(uint32_t waitForDlaTimeoutSeconds);
  ~CoreDlaDevice();
  int GetSizeCsrDescriptorQueue() const override;
  double GetCoreDlaClockFreq() const override;
  int GetNumInstances() const override { return numInstances_; }
  // Blocks until a job finishes on the given instance (or isCancelled fires).
  void WaitForDla(int instance, size_t threadId = 0, std::function<bool()> isCancelled = nullptr) override;  // threadId is optional and for debugging purpose only
  std::string SchedulerGetStatus() const override;
  bool InitializeScheduler(uint32_t sourceBufferSize, uint32_t dropSourceBuffers, uint32_t numInferenceRequests,
                           const std::string source_fifo_file="") override;

 private:
  // One DDR allocator per CoreDLA instance (array indexed by instance).
  std::unique_ptr<DeviceMemoryAllocator[]> ddrAllocator_;
  // Owns every GraphJob handed out by CreateGraphJob (callers get raw ptrs).
  std::vector<std::unique_ptr<GraphJob>> allGraphJobs_;
  int numInstances_;
  MmdWrapper mmdWrapper_;
  // Shared with the interrupt service routine; see InterruptServiceRoutineData.
  InterruptServiceRoutineData isrData_;
  // Per-instance count of jobs already waited on by WaitForDla().
  std::vector<uint64_t> jobsWaited_;
#ifndef USE_OLD_COREDLA_DEVICE
  // Per-instance counter snapshots taken at a baseline, used to report deltas.
  std::vector<uint64_t> startClocksActive;
  std::vector<uint64_t> startClockAllJobs;
#endif
  std::vector<uint64_t> startNumInputFeatureMemoryReads;
  std::vector<uint64_t> startNumFilterMemoryReads;
  std::vector<uint64_t> startNumOutputFeatureMemoryWrites;
  std::shared_ptr<StreamControllerComms> spStreamControllerComms_;
  // True when completion is detected by polling instead of interrupts —
  // exact selection logic lives in the .cpp.
  bool runtimePolling_;
  uint32_t waitForDlaTimeoutSeconds_;
};
Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. + +#pragma once + +#include "compiled_result.h" //dla::CompiledResult +#include "coredla_batch_job.h" //BatchJob +#include "device.h" //DLA_LOG +#include "device_memory_allocator.h" //DeviceMemoryAllocator +#include "graph_job.h" //GraphJob +#include "mmd_wrapper.h" //MmdWrapper + +// TODO:integrate with dla compiler later +//#include "dla_types.h" +//#include "compiled_result_runtime_required_elements.h" + +#include <cstdint> //uint64_t +#include <memory> //std::unique_ptr +#include <mutex> //std::mutex +#include <vector> //std::vector + +/*! GraphJob is a DLA compiled graph loaded onto a device + * Initialized with DlaDevice object + * GraphJob allocates space in DDR for filter, bias, config, inputs and outputs + * It provides handle to "batch job" objects that are used to load input and start DLA for one batch + */ + +class CoreDlaGraphJob : public GraphJob { + public: + // Function to construct and return a unique pointer GraphJob object to the runtime user + // TODO: Provide DLA compiled result object which will contain all the necessary rutime elements as below + // @param configFilterBiasBufferSizeDDR - total size of the constants - config, filter and bias + // @param configFilterBiasBuffer - ptr to one contigous CPU array for config, filter and bias (obtained from DLA + // compiler's output) + // @param totalConfigWords - size of config data in words (size of 1 config word is defined in dla_device.h + // "CONFIG_READER_DATA_BYTES") + // @param intermediateBufferSizeDDR - size of the buffer space required in DDR for feature data of intermediate layers + // @param inputSizeDDR - size of 
one batch input data in DDR. Multiple images in one batch should be contigously + // placed + // @param outputSizeDDR - size of one batch output data in DDR + // @param numPipelines - number of I/O bufffer pairs created for CPU-FPGA pipelining of multiple batch runs + // @param spStreamControllerComms - optional interface to stream controller + static std::unique_ptr<GraphJob> MakeUnique(DeviceMemoryAllocator* ddrBufferAllocator, + MmdWrapper* mmdWrapper, + const dla::CompiledResult* compiled_result, + uint64_t numPipelines, + int instance, + std::shared_ptr<StreamControllerComms> spStreamControllerComms); + // Returns an unused batch job object + // If all batch jobs are used, returns null + // Increments batchJobsRequested_ + // Thread safe + BatchJob* GetBatchJob(); + CoreDlaGraphJob(const GraphJob&) = delete; + CoreDlaGraphJob(CoreDlaGraphJob&) = delete; + CoreDlaGraphJob& operator=(const CoreDlaGraphJob&) = delete; + + private: + uint64_t configFilterBiasBufferSizeDDR_; + uint64_t intermediateBufferSizeDDR_; + DeviceMemoryAllocator* ddrBufferAllocator_; + MmdWrapper* mmdWrapper_; + std::vector<std::unique_ptr<BatchJob>> batchJobs_; + unsigned int batchJobsRequested_; + unsigned int instance_; + std::mutex graphJobMutex; + CoreDlaGraphJob(DeviceMemoryAllocator* ddrBufferAllocator, + MmdWrapper* mmdWrapper, + const dla::CompiledResult* compiledResult, + uint64_t numPipelines, + int instance, + std::shared_ptr<StreamControllerComms> spStreamControllerComms); +}; diff --git a/python/openvino/runtime/coredla_device/inc/device.h b/python/openvino/runtime/coredla_device/inc/device.h new file mode 100644 index 0000000..e506578 --- /dev/null +++ b/python/openvino/runtime/coredla_device/inc/device.h @@ -0,0 +1,81 @@ +// Copyright 2020-2023 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). 
Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. + +#ifndef DEVICE_H +#define DEVICE_H + +#include <functional> +#include <map> +#include <memory> +#include <string> +#include <vector> + +#include "dla_runtime_log.h" + +using namespace std; +using DebugNetworkData = std::map<std::string, uint64_t>; + +// dla log macro +#define DLA_LOG(fmt, ...) printf(fmt, ##__VA_ARGS__); +#define DLA_ERROR(fmt, ...) printf(fmt, ##__VA_ARGS__); + +class GraphJob; +class arch_params; +namespace dla { +class CompiledResult; +} +class Device { + public: + static unique_ptr<Device> MakeUnique(const arch_params* archParams, uint32_t waitForDlaTimeoutSeconds); + virtual GraphJob* CreateGraphJob(const dla::CompiledResult* compiledResult, + size_t numPipelines, + int instance, + std::string AES_key, + std::string IV_key, + bool encryption_enabled, + const std::string export_dir, + const std::string parameter_rom_export_dir) = 0; + // Return number of DLA jobs completed till now + // Used for debugging + virtual int GetNumInferencesCompleted(int instance) const = 0; + // Must be called when there are no active jobs on DLA + // Returns the total time taken by DLA jobs on hardware (in milliseconds) + virtual double GetActiveHWTimeMs(int instance) const = 0; + // Must be called when there are no active jobs on DLA + // Returns the average of time taken per job (in milliseconds) + // Avg Time per job < Active Time + virtual double GetAvgHWTimePerJobMs(uint64_t num_jobs, int instance) const = 0; + // Must be called when there are no active jobs on DLA + // Returns the number of memory read made by the input feature reader + virtual uint64_t 
GetNumInputFeatureMemoryReads(int instance) const = 0; + // Must be called when there are no active jobs on DLA + // Returns the number of memory read made by the filter reader + virtual uint64_t GetNumFilterMemoryReads(int instance) const = 0; + // Must be called when there are no active jobs on DLA + // Returns the number of memory writes made by the output feature writer + virtual uint64_t GetNumOutputFeatureMemoryWrites(int instance) const = 0; + // Waits for a job to finish on specified instance + virtual void WaitForDla(int instance, size_t threadId = 0, std::function<bool()> isCancelled = nullptr) = 0; + virtual int GetNumInstances() const = 0; + virtual double GetCoreDlaClockFreq() const = 0; + virtual int GetSizeCsrDescriptorQueue() const = 0; + virtual std::string SchedulerGetStatus() const = 0; + virtual bool InitializeScheduler(uint32_t sourceBufferSize, + uint32_t dropSourceBuffers, + uint32_t numInferenceRequests, + const std::string source_fifo_file="") = 0; + virtual DebugNetworkData ReadDebugNetwork(int instance) const = 0; + virtual ~Device(){} +}; + +#endif // DEVICE_H diff --git a/python/openvino/runtime/coredla_device/inc/device_memory_allocator.h b/python/openvino/runtime/coredla_device/inc/device_memory_allocator.h new file mode 100644 index 0000000..adc0a71 --- /dev/null +++ b/python/openvino/runtime/coredla_device/inc/device_memory_allocator.h @@ -0,0 +1,61 @@ +// Copyright 2020 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. 
// ===== file: python/openvino/runtime/coredla_device/inc/device_memory_allocator.h (new file, mode 100644) =====
// Copyright 2020 Intel Corporation.
//
// This software and the related documents are Intel copyrighted materials,
// and your use of them is governed by the express license under which they
// were provided to you ("License"). Unless the License provides otherwise,
// you may not use, modify, copy, publish, distribute, disclose or transmit
// this software or the related documents without Intel's prior written
// permission.
//
// This software and the related documents are provided as is, with no express
// or implied warranties, other than those that are expressly stated in the
// License.

#pragma once

#include "mmd_wrapper.h"  //MmdWrapper

#include <cstdint>  //uint64_t

/*! DeviceMemoryAllocator class allocates multiple DLA graph buffers in DDR
 * Each graph is expected to have one contiguous buffer containing all data (config, filter, bias, I/O)
 * A graph buffer is allocated in DDR from right to left
 * A scratchpad space is allocated in DDR to be shared across all graphs for intermediate feature data
 * This intermediate buffer space is allocated from left to right (starting address is 0)
 * and is expanded based on graph's requirement
 */
// NOTE(review): user-declared destructor with no copy control declared
// (Rule of Five gap) — copying would duplicate the raw mmdWrapper_ pointer
// and allocator bookkeeping; consider deleting copy ops in a follow-up.
class DeviceMemoryAllocator {
 public:
  // Two-phase init: records the total DDR size and the MMD handle.
  // @param totalSize - total device memory managed by this allocator, in bytes
  // @param mmdWrapper - non-owning; must outlive this allocator
  void Initialize(uint64_t totalSize, MmdWrapper *mmdWrapper);
  ~DeviceMemoryAllocator();

  // Buffers that can be shared across multiple graphs may grow in size after they are allocated. The intermediate
  // buffer is an example of this. We have decided to allocate this at the lowest address and let it grow upwards.
  // @param bufferSize - the size of the buffer in bytes
  // @param instance - there can be multiple instances of DLA on FPGA, specify which DLA instance is this buffer for
  void AllocateSharedBuffer(uint64_t bufferSize, int instance);

  // Buffers that are private to one graph will not change in size after allocation. The config/filter buffer is
  // an example of this. We have decided to allocate this at the upper address and allocate downwards from there.
  // Hardware requires the starting address of each buffer to have some alignment, and the allocator will add
  // as much padding as needed to ensure this. Each contiguous section in device memory should have its own call
  // to the allocator.
  // @param bufferSize - the size of the buffer in bytes
  // @param bufferAlignment - specify how much address alignment is needed for this buffer, must be a power of 2
  // @param bufferAddr - the allocator indicates where it placed this buffer
  void AllocatePrivateBuffer(uint64_t bufferSize, uint64_t bufferAlignment, uint64_t &bufferAddr);

  // Clears whole DDR space including the intermediate buffer
  void Clear();

 private:
  // total DDR size (BSP parameter)
  uint64_t totalGlobalMemSize_;
  // For access to MMD
  MmdWrapper *mmdWrapper_;
  // current starting address of allocated graph buffer region
  // graph buffers are allocated right to left
  uint64_t currentStartAddressGraphBufferSpace_;
  // current maximum allocated size for intermediate data
  uint64_t currentIntermediateMaxBufferSizeAllocated_;
};
// ===== file: python/openvino/runtime/coredla_device/inc/dla_dma_constants.h (new file, mode 100644) =====
// (Intel copyright header as in the sibling files of this directory.)

#pragma once

// This header re-uses a SystemVerilog constants file directly from C++ so the
// hardware and the runtime share a single source of truth for DMA constants.

// save a copy
#pragma push_macro("localparam")

// convert the syntax of verilog into C++, replace "localparam int MY_VAR = 123;" with "constexpr int MY_VAR = 123;"
#undef localparam
#define localparam constexpr

// include the verilog header
#include "dla_dma_constants.svh"

// undo the syntax change
#pragma pop_macro("localparam")

// ===== file: python/openvino/runtime/coredla_device/inc/graph_job.h (new file, mode 100644) =====
// Copyright 2020-2023 Intel Corporation.
//
// This software and the related documents are Intel copyrighted materials,
// and your use of them is governed by the express license under which they
// were provided to you ("License"). Unless the License provides otherwise,
// you may not use, modify, copy, publish, distribute, disclose or transmit
// this software or the related documents without Intel's prior written
// permission.
//
// This software and the related documents are provided as is, with no express
// or implied warranties, other than those that are expressly stated in the
// License.

#ifndef GRAPH_JOB_H
#define GRAPH_JOB_H

#include "batch_job.h"
// NOTE(review): using-directive in a header leaks into every includer;
// kept because existing includers may rely on it.
using namespace std;
// Abstract interface for a compiled graph loaded onto a device; hands out
// BatchJob objects for individual batch executions.
class GraphJob {
 public:
  // Returns an unused batch job object
  // If all batch jobs are used, returns null
  virtual BatchJob* GetBatchJob() = 0;

  // Virtual destructor: graph jobs are owned and deleted through GraphJob*.
  virtual ~GraphJob(){}
};

#endif
// ===== file: python/openvino/runtime/coredla_device/inc/mmd_wrapper.h (new file, mode 100644) =====
// Copyright 2020 Intel Corporation.
//
// This software and the related documents are Intel copyrighted materials,
// and your use of them is governed by the express license under which they
// were provided to you ("License"). Unless the License provides otherwise,
// you may not use, modify, copy, publish, distribute, disclose or transmit
// this software or the related documents without Intel's prior written
// permission.
//
// This software and the related documents are provided as is, with no express
// or implied warranties, other than those that are expressly stated in the
// License.

#pragma once

#include <cstdint>  //uint32_t

// Signature required by the MMD layer for interrupt service routines: all ISR
// state is passed through the single void* data pointer.
using interrupt_service_routine_signature = void (*)(int handle, void *data);

// Thin RAII-style wrapper over the MMD (memory-mapped device) C API: opens the
// device on construction, closes it on destruction, and exposes CSR/DDR and
// stream-controller accessors.
class MmdWrapper {
 public:
  MmdWrapper();
  // Note that ~MmdWrapper() can call std::exit(1) if aocl_mmd_close()
  // fails. Ideally we would find some way to re-order the code so that it
  // can throw an exception (before calling the destructor) if aocl_mmd_close()
  // fails.
  ~MmdWrapper();

  // class cannot be copied
  MmdWrapper(const MmdWrapper &) = delete;
  MmdWrapper &operator=(const MmdWrapper &) = delete;

  // Register a function to run as the interrupt service routine
  void RegisterISR(interrupt_service_routine_signature func, void *data) const;

  // 32-bit handshake with each CSR
  void WriteToCsr(int instance, uint32_t addr, uint32_t data) const;
  uint32_t ReadFromCsr(int instance, uint32_t addr) const;

  // Copy data between host and device memory
  void WriteToDDR(int instance, uint64_t addr, uint64_t length, const void *data) const;
  void ReadFromDDR(int instance, uint64_t addr, uint64_t length, void *data) const;

  // If the mmd layer supports accesses to the STREAM CONTROLLER
  bool bIsStreamControllerValid(int instance) const;

  // 32-bit handshake with each Stream Controller CSR
  void WriteToStreamController(int instance, uint32_t addr, uint64_t length, const void *data) const;
  void ReadFromStreamController(int instance, uint32_t addr, uint64_t length, void *data) const;

  // Provide read-only access to board-specific constants
  int GetMaxInstances() const { return maxInstances_; }
  uint64_t GetDDRSizePerInstance() const { return ddrSizePerInstance_; }
  double GetCoreDlaClockFreq() const { return coreDlaClockFreq_; }
  double GetDDRClockFreq() const { return ddrClockFreq_; }

 private:
  // MMD device handle returned on open; used by all accessors above.
  int handle_;
  // Board-specific constants cached at construction time.
  int maxInstances_;
  uint64_t ddrSizePerInstance_;
  double coreDlaClockFreq_;
  double ddrClockFreq_;
};
// ===== file: python/openvino/runtime/coredla_device/inc/stream_controller_comms.h (new file, mode 100644) =====
// (Intel copyright 2023 header as in the sibling files of this directory.)

#pragma once
#include <mutex>
#include <string>
#include <vector>
#include "mmd_wrapper.h"
#include "stream_controller_messages.h"

// Adapter that pairs a raw message-payload struct T with accessors for the
// untyped send/receive path (GetPayload/GetSize).
// NOTE(review): size_t is used without a direct <cstddef> include — relies on
// a transitive include; confirm before reordering includes.
template <class T>
struct Payload : public T {
  void* GetPayload() { return this; }
  size_t GetSize() { return sizeof(*this); }
};

// Mutex-guarded busy flag; Lock() and Release() are declared here and defined
// in the .cpp (exact try-lock semantics not visible from this header).
class BusyFlag {
 public:
  bool Lock();
  void Release();

 private:
  std::recursive_mutex _mutex;
  bool _busy = false;
};

// Scope guard over a BusyFlag: acquires in the constructor, releases in the
// destructor; operator bool reports whether the lock was obtained.
// NOTE(review): single-argument constructor and operator bool are not
// explicit — implicit conversions are possible; confirm that is intended.
class BusyCheck {
 public:
  BusyCheck(BusyFlag& busyFlag);
  ~BusyCheck();
  operator bool();

 private:
  BusyFlag& _busyFlag;
  bool _haveLocked;
};

// Message-based communication channel to the stream controller, carried over
// the MMD stream-controller read/write interface (see MmdWrapper).
class StreamControllerComms {
 public:
  StreamControllerComms();
  // Whether a stream controller is reachable on this board.
  bool IsPresent();
  // Fetches the controller's current status payload.
  Payload<StatusMessagePayload> GetStatus();
  // Renders a status payload as a human-readable string.
  std::string GetStatusString(Payload<StatusMessagePayload>& statusPayload);
  // Queues a batch of job payloads on the controller; returns success.
  bool ScheduleItems(std::vector<Payload<CoreDlaJobPayload>> items);
  bool Ping();
  bool Initialize(uint32_t sourceBufferSize, uint32_t dropSourceBuffers, uint32_t numInferenceRequests);

 private:
  bool StatusMessageHandler(uint32_t payloadOffset);
  MessageType ReceiveMessage();
  bool SendMessage(MessageType, void* pPayload = nullptr, size_t size = 0);
  MmdWrapper _mmdWrapper;
  // Sequence-number bookkeeping for the message protocol.
  uint32_t _lastReceiveSequenceID = 0;
  uint32_t _sendSequenceID = 0;
  uint32_t _numBadMessages = 0;
  // The stream controller is always addressed as instance 0.
  const int _streamControllerInstance = 0;
  Payload<StatusMessagePayload> _receivedStatusMessage;
  BusyFlag _busyFlag;
};
