diff options
Diffstat (limited to 'python/openvino/runtime/coredla_device')
118 files changed, 26443 insertions, 0 deletions
diff --git a/python/openvino/runtime/coredla_device/inc/batch_job.h b/python/openvino/runtime/coredla_device/inc/batch_job.h new file mode 100644 index 0000000..76fd968 --- /dev/null +++ b/python/openvino/runtime/coredla_device/inc/batch_job.h @@ -0,0 +1,31 @@ +// Copyright 2020-2023 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. + +#ifndef BATCH_JOB_H +#define BATCH_JOB_H + +class BatchJob { + public: + // @param inputArray - ptr to CPU array containing input data to be copied to DDR + // blocking function + virtual void LoadInputFeatureToDDR(void* inputArray) = 0; + // @param outputArray - ptr to CPU array where the output data in DDR is copied into + // outputArray must be allocated by the caller (size >= output_size_ddr) + // blocking function + virtual void ReadOutputFeatureFromDDR(void* outputArray) const = 0; + virtual void ScheduleInputFeature() const = 0; + virtual void StartDla() = 0; + virtual ~BatchJob() {} +}; + +#endif diff --git a/python/openvino/runtime/coredla_device/inc/coredla_batch_job.h b/python/openvino/runtime/coredla_device/inc/coredla_batch_job.h new file mode 100644 index 0000000..7d91f0e --- /dev/null +++ b/python/openvino/runtime/coredla_device/inc/coredla_batch_job.h @@ -0,0 +1,88 @@ +// Copyright 2020-2023 Intel Corporation. 
+// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. + +#pragma once + +#include "batch_job.h" // BatchJob +#include "mmd_wrapper.h" // MmdWrapper + +// TODO:integrate with dla compiler later +// #include "dla_types.h" +// #include "compiled_result_runtime_required_elements.h" + +#include <cstdint> // uint64_t +#include <memory> // std::unique_ptr + +class StreamControllerComms; + +// BatchJob represents one batch execution +// Contains input/output address and size in DDR for one batch +// Contains functions to write feature data to DDR, start DLA and read output data from DDR +class CoreDlaBatchJob : public BatchJob { + private: + // MMD access is required to handshake with CSR and transfer data between host/device memory + MmdWrapper* mmdWrapper_; + int instance_; + // size and address of graph config data allocated in DDR + uint64_t totalConfigWords_; + uint64_t configBaseAddrDDR_; + // size and address of input and output data allocated in DDR for 1 batch + uint64_t inputAddrDDR_; + uint64_t outputAddrDDR_; + uint64_t inputSizeDDR_; + uint64_t outputSizeDDR_; + const bool enableIstream_; + const bool enableOstream_; + uint64_t lastJobQueueNumber_; + + std::shared_ptr<StreamControllerComms> spStreamControllerComms_; + + CoreDlaBatchJob(MmdWrapper* mmdWrapper, + uint64_t totalConfigWords, + uint64_t configBaseAddrDDR, + uint64_t inputAddrDDR, + uint64_t outputAddrDDR, + uint64_t inputSizeDDR, + uint64_t outputSizeDDR, + const bool enableIstream, + 
const bool enableOstream, + int instance, + std::shared_ptr<StreamControllerComms> spStreamControllerComms); + + public: + CoreDlaBatchJob(const CoreDlaBatchJob&) = delete; + CoreDlaBatchJob(CoreDlaBatchJob&) = delete; + CoreDlaBatchJob& operator=(const CoreDlaBatchJob&) = delete; + static std::unique_ptr<BatchJob> MakeUnique(MmdWrapper* mmdWrapper, + uint64_t totalConfigWords, + uint64_t configBaseAddrDDR, + uint64_t inputAddrDDR, + uint64_t outputAddrDDR, + uint64_t inputSizeDDR, + uint64_t outputSizeDDR, + const bool enableIstream, + const bool enableOstream, + int instance, + std::shared_ptr<StreamControllerComms> spStreamControllerComms); + // @param inputArray - ptr to CPU array containing input data tp be copied to DDR + // blocking function + void LoadInputFeatureToDDR(void* inputArray) override; + void ScheduleInputFeature() const override; + + // Starts DLA by writing to CSR in DLA DMA; the DDR addresses of graph config and input data + void StartDla() override; + // @param outputArray - ptr to CPU array where the output data in DDR is copied into + // outputArray must be allocated by the caller (size >= output_size_ddr) + // blocking function + void ReadOutputFeatureFromDDR(void* outputArray) const override; +}; diff --git a/python/openvino/runtime/coredla_device/inc/coredla_device.h b/python/openvino/runtime/coredla_device/inc/coredla_device.h new file mode 100644 index 0000000..2a04fa8 --- /dev/null +++ b/python/openvino/runtime/coredla_device/inc/coredla_device.h @@ -0,0 +1,144 @@ +// Copyright 2020-2023 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. 
+// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. + +#pragma once + +#include "compiled_result.h" //dla::CompiledResult +#include "device.h" //Device +#include "device_memory_allocator.h" //DeviceMemoryAllocator +#include "graph_job.h" //GraphJob +#include "mmd_wrapper.h" //MmdWrapper + +#include <condition_variable> //std::condition_variable +#include <cstdint> //uint64_t +#include <map> //std::map +#include <memory> //std::unique_ptr +#include <mutex> //std::mutex +#include <vector> //std::vector + +class StreamControllerComms; + +// The interface of the interrupt service routine dictates that all the data the ISR needs must be passed in through +// one pointer of type void *. Package it up here. WaitForDla() uses jobsWaited and jobsFinished to determine if a job +// has already finished or it still needs wait. The ISR only updates jobsFinished, so jobsWaited is only a member of +// CoreDlaDevice. The mutex and condition variable are used to synchronize between InterruptServiceRoutine() and +// WaitForDla(). All of these are replicated per CoreDLA IP instance, hence the use of vector. +// base_multiplier and prevCount are used to handle the jobsFinished wrap-around that could happen in the hardware CSR +// as the CSR is only 32-bit wide but the jobsFinished is 64-bit wide +struct InterruptServiceRoutineData { + MmdWrapper* mmdWrapper; + std::vector<uint64_t> jobsFinished; + std::vector<uint32_t> base_multiplier; + std::vector<uint32_t> prevCount; + std::vector<uint32_t> desc_queue_diag; + std::vector<std::mutex> isrMutex; + std::vector<std::condition_variable> isrCondVar; +}; + +/*! 
DlaDevice class represents a DLA device mapped using the MMD + OPAE SW stack + * On construction, dynamically loads MMD library at runtime and initialized the state of MMD + * Implememts functions that wrap various MMD calls to read/write to DDR/CSR and process HW interrupts + */ +class CoreDlaDevice : public Device { + public: + GraphJob* CreateGraphJob(const dla::CompiledResult* compiledResult, +#ifndef USE_OLD_COREDLA_DEVICE + size_t numPipelines, +#else + uint64_t numPipelines, +#endif + int instance, + std::string AES_key, + std::string IV_key, + bool encryption_enabled, + // This param is unused for HW runtime! So why inlcude it? CoreDLA utilizes base pointers + // for both HW and SW emulator runtime. The software emulator has output file where as currently the + // HW runtime does not. + const std::string export_dir, + const std::string parameter_rom_export_dir); + // Return number of DLA jobs completed till now + // Used for debugging + int GetNumInferencesCompleted(int instance) const override { return isrData_.jobsFinished.at(instance); } + // Must be called when there are no active jobs on DLA + // Returns the total time taken by DLA jobs on hardware (in milliseconds) + double GetActiveHWTimeMs(int instance) const override; + // Must be called when there are no active jobs on DLA + // Returns the average of time taken per job (in milliseconds) + // Avg Time per job < Active Time + double GetAvgHWTimePerJobMs(uint64_t num_jobs, int instance) const override; + // Must be called when there are no active jobs on DLA + // Returns the number of memory read made by the input feature reader + uint64_t GetNumInputFeatureMemoryReads(int instance) const override; + // Must be called when there are no active jobs on DLA + // Returns the number of memory read made by the filter reader + uint64_t GetNumFilterMemoryReads(int instance) const override; + // Must be called when there are no active jobs on DLA + // Returns the number of memory writes made by the output 
feature writer + uint64_t GetNumOutputFeatureMemoryWrites(int instance) const override; + + private: + // Read one 32-bit value from the debug network, return value indicates whether read was successful. A read can fail + // if the module number and address have not been implemented. The debug network is fault tolerant to both read + // requests never being accepted as well as read responses never being produced. + bool ReadDebugCsr(uint32_t moduleNum, uint32_t address, int instance, uint32_t& readData, bool verbose = false) const; + +#ifndef USE_OLD_COREDLA_DEVICE + // Must be called when there are no active jobs on DLA + // Returns total number of clocks by DLA jobs on hardware. + uint64_t GetClocksActive(int instance) const; + + // Must be called when there are no active jobs on DLA + // Returns the clocks of all jobs + uint64_t GetClocksAllJobs(int instance) const; +#endif + + uint64_t GetNumInputFeatureMemoryReadsTotal(int instance) const; + + uint64_t GetNumFilterMemoryReadsTotal(int instance) const; + + uint64_t GetNumOutputFeatureMemoryWritesTotal(int instance) const; + + public: + // Modules attached to the debug network have a ROM to specify the offset and description of the registers. Traverse + // this ROM, then return a map of key/value pairs, where the key is a human readable string describing what kind of + // information the debug register contains, and the value is the data of the debug register. 
+ DebugNetworkData ReadDebugNetwork(int instance) const override; + + CoreDlaDevice(uint32_t waitForDlaTimeoutSeconds); + ~CoreDlaDevice(); + int GetSizeCsrDescriptorQueue() const override; + double GetCoreDlaClockFreq() const override; + int GetNumInstances() const override { return numInstances_; } + void WaitForDla(int instance, size_t threadId = 0, std::function<bool()> isCancelled = nullptr) override; // threadId is optional and for debugging purpose only + std::string SchedulerGetStatus() const override; + bool InitializeScheduler(uint32_t sourceBufferSize, uint32_t dropSourceBuffers, uint32_t numInferenceRequests, + const std::string source_fifo_file="") override; + + private: + std::unique_ptr<DeviceMemoryAllocator[]> ddrAllocator_; + std::vector<std::unique_ptr<GraphJob>> allGraphJobs_; + int numInstances_; + MmdWrapper mmdWrapper_; + InterruptServiceRoutineData isrData_; + std::vector<uint64_t> jobsWaited_; +#ifndef USE_OLD_COREDLA_DEVICE + std::vector<uint64_t> startClocksActive; + std::vector<uint64_t> startClockAllJobs; +#endif + std::vector<uint64_t> startNumInputFeatureMemoryReads; + std::vector<uint64_t> startNumFilterMemoryReads; + std::vector<uint64_t> startNumOutputFeatureMemoryWrites; + std::shared_ptr<StreamControllerComms> spStreamControllerComms_; + bool runtimePolling_; + uint32_t waitForDlaTimeoutSeconds_; +}; diff --git a/python/openvino/runtime/coredla_device/inc/coredla_graph_job.h b/python/openvino/runtime/coredla_device/inc/coredla_graph_job.h new file mode 100644 index 0000000..3dc91bc --- /dev/null +++ b/python/openvino/runtime/coredla_device/inc/coredla_graph_job.h @@ -0,0 +1,83 @@ +// Copyright 2020-2023 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). 
Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. + +#pragma once + +#include "compiled_result.h" //dla::CompiledResult +#include "coredla_batch_job.h" //BatchJob +#include "device.h" //DLA_LOG +#include "device_memory_allocator.h" //DeviceMemoryAllocator +#include "graph_job.h" //GraphJob +#include "mmd_wrapper.h" //MmdWrapper + +// TODO:integrate with dla compiler later +//#include "dla_types.h" +//#include "compiled_result_runtime_required_elements.h" + +#include <cstdint> //uint64_t +#include <memory> //std::unique_ptr +#include <mutex> //std::mutex +#include <vector> //std::vector + +/*! GraphJob is a DLA compiled graph loaded onto a device + * Initialized with DlaDevice object + * GraphJob allocates space in DDR for filter, bias, config, inputs and outputs + * It provides handle to "batch job" objects that are used to load input and start DLA for one batch + */ + +class CoreDlaGraphJob : public GraphJob { + public: + // Function to construct and return a unique pointer GraphJob object to the runtime user + // TODO: Provide DLA compiled result object which will contain all the necessary rutime elements as below + // @param configFilterBiasBufferSizeDDR - total size of the constants - config, filter and bias + // @param configFilterBiasBuffer - ptr to one contigous CPU array for config, filter and bias (obtained from DLA + // compiler's output) + // @param totalConfigWords - size of config data in words (size of 1 config word is defined in dla_device.h + // "CONFIG_READER_DATA_BYTES") + // @param intermediateBufferSizeDDR - size of the buffer space required in DDR for feature data of intermediate layers + // @param inputSizeDDR - size of 
one batch input data in DDR. Multiple images in one batch should be contigously + // placed + // @param outputSizeDDR - size of one batch output data in DDR + // @param numPipelines - number of I/O bufffer pairs created for CPU-FPGA pipelining of multiple batch runs + // @param spStreamControllerComms - optional interface to stream controller + static std::unique_ptr<GraphJob> MakeUnique(DeviceMemoryAllocator* ddrBufferAllocator, + MmdWrapper* mmdWrapper, + const dla::CompiledResult* compiled_result, + uint64_t numPipelines, + int instance, + std::shared_ptr<StreamControllerComms> spStreamControllerComms); + // Returns an unused batch job object + // If all batch jobs are used, returns null + // Increments batchJobsRequested_ + // Thread safe + BatchJob* GetBatchJob(); + CoreDlaGraphJob(const GraphJob&) = delete; + CoreDlaGraphJob(CoreDlaGraphJob&) = delete; + CoreDlaGraphJob& operator=(const CoreDlaGraphJob&) = delete; + + private: + uint64_t configFilterBiasBufferSizeDDR_; + uint64_t intermediateBufferSizeDDR_; + DeviceMemoryAllocator* ddrBufferAllocator_; + MmdWrapper* mmdWrapper_; + std::vector<std::unique_ptr<BatchJob>> batchJobs_; + unsigned int batchJobsRequested_; + unsigned int instance_; + std::mutex graphJobMutex; + CoreDlaGraphJob(DeviceMemoryAllocator* ddrBufferAllocator, + MmdWrapper* mmdWrapper, + const dla::CompiledResult* compiledResult, + uint64_t numPipelines, + int instance, + std::shared_ptr<StreamControllerComms> spStreamControllerComms); +}; diff --git a/python/openvino/runtime/coredla_device/inc/device.h b/python/openvino/runtime/coredla_device/inc/device.h new file mode 100644 index 0000000..e506578 --- /dev/null +++ b/python/openvino/runtime/coredla_device/inc/device.h @@ -0,0 +1,81 @@ +// Copyright 2020-2023 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). 
Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. + +#ifndef DEVICE_H +#define DEVICE_H + +#include <functional> +#include <map> +#include <memory> +#include <string> +#include <vector> + +#include "dla_runtime_log.h" + +using namespace std; +using DebugNetworkData = std::map<std::string, uint64_t>; + +// dla log macro +#define DLA_LOG(fmt, ...) printf(fmt, ##__VA_ARGS__); +#define DLA_ERROR(fmt, ...) printf(fmt, ##__VA_ARGS__); + +class GraphJob; +class arch_params; +namespace dla { +class CompiledResult; +} +class Device { + public: + static unique_ptr<Device> MakeUnique(const arch_params* archParams, uint32_t waitForDlaTimeoutSeconds); + virtual GraphJob* CreateGraphJob(const dla::CompiledResult* compiledResult, + size_t numPipelines, + int instance, + std::string AES_key, + std::string IV_key, + bool encryption_enabled, + const std::string export_dir, + const std::string parameter_rom_export_dir) = 0; + // Return number of DLA jobs completed till now + // Used for debugging + virtual int GetNumInferencesCompleted(int instance) const = 0; + // Must be called when there are no active jobs on DLA + // Returns the total time taken by DLA jobs on hardware (in milliseconds) + virtual double GetActiveHWTimeMs(int instance) const = 0; + // Must be called when there are no active jobs on DLA + // Returns the average of time taken per job (in milliseconds) + // Avg Time per job < Active Time + virtual double GetAvgHWTimePerJobMs(uint64_t num_jobs, int instance) const = 0; + // Must be called when there are no active jobs on DLA + // Returns the number of memory read made by the input feature reader + virtual uint64_t 
GetNumInputFeatureMemoryReads(int instance) const = 0; + // Must be called when there are no active jobs on DLA + // Returns the number of memory read made by the filter reader + virtual uint64_t GetNumFilterMemoryReads(int instance) const = 0; + // Must be called when there are no active jobs on DLA + // Returns the number of memory writes made by the output feature writer + virtual uint64_t GetNumOutputFeatureMemoryWrites(int instance) const = 0; + // Waits for a job to finish on specified instance + virtual void WaitForDla(int instance, size_t threadId = 0, std::function<bool()> isCancelled = nullptr) = 0; + virtual int GetNumInstances() const = 0; + virtual double GetCoreDlaClockFreq() const = 0; + virtual int GetSizeCsrDescriptorQueue() const = 0; + virtual std::string SchedulerGetStatus() const = 0; + virtual bool InitializeScheduler(uint32_t sourceBufferSize, + uint32_t dropSourceBuffers, + uint32_t numInferenceRequests, + const std::string source_fifo_file="") = 0; + virtual DebugNetworkData ReadDebugNetwork(int instance) const = 0; + virtual ~Device(){} +}; + +#endif // DEVICE_H diff --git a/python/openvino/runtime/coredla_device/inc/device_memory_allocator.h b/python/openvino/runtime/coredla_device/inc/device_memory_allocator.h new file mode 100644 index 0000000..adc0a71 --- /dev/null +++ b/python/openvino/runtime/coredla_device/inc/device_memory_allocator.h @@ -0,0 +1,61 @@ +// Copyright 2020 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. 
+// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. + +#pragma once + +#include "mmd_wrapper.h" //MmdWrapper + +#include <cstdint> //uint64_t + +/*! DeviceMemoryAllocator class allocates multiple DLA graph buffers in DDR + * Each graph is expected to have one contigous buffer containing all data (config, filter, bias, I/O) + * A graph buffer is allocated in DDR from right to left + * A scratchpad space is allocated in DDR to be shared across all graphs for intermediate feature data + * This intermediate buffer space is allocated from left to right (starting address is 0) + * and is expanded based on graph's requirement + */ +class DeviceMemoryAllocator { + public: + void Initialize(uint64_t totalSize, MmdWrapper *mmdWrapper); + ~DeviceMemoryAllocator(); + + // Buffers that can be shared across multiple graphs may grow in size after they are allocated. The intermediate + // buffer is an example of this. We have decided to allocate this at the lowest address and let it grow upwards. + // @param bufferSize - the size of the buffer in bytes + // @param instance - there can be multiple instances of DLA on FPGA, specify which DLA instance is this buffer for + void AllocateSharedBuffer(uint64_t bufferSize, int instance); + + // Buffers that are private to one graph will not change in size after allocation. The config/filter buffer is + // an example of this. We have decided to allocate this at the upper address and allocate downwards from there. + // Hardware requires the starting address of each buffer to have some alignment, and the allocator will add + // as much padding as needed to ensure this. Each contiguous section in device memory should have its own call + // to the allocator. 
+ // @param bufferSize - the size of the buffer in bytes + // @param bufferAlignment - specify how much address alignment is needed for this buffer, must be a power of 2 + // @param bufferAddr - the allocator indicates where it placed this buffer + void AllocatePrivateBuffer(uint64_t bufferSize, uint64_t bufferAlignment, uint64_t &bufferAddr); + + // Clears whole DDR space including the intermediate buffer + void Clear(); + + private: + // total DDR size (BSP parameter) + uint64_t totalGlobalMemSize_; + // For access to MMD + MmdWrapper *mmdWrapper_; + // current starting address of allocated graph buffer region + // graph buffers are allocated right to left + uint64_t currentStartAddressGraphBufferSpace_; + // current maximum allocated size for intermediate data + uint64_t currentIntermediateMaxBufferSizeAllocated_; +}; diff --git a/python/openvino/runtime/coredla_device/inc/dla_dma_constants.h b/python/openvino/runtime/coredla_device/inc/dla_dma_constants.h new file mode 100644 index 0000000..13fb56b --- /dev/null +++ b/python/openvino/runtime/coredla_device/inc/dla_dma_constants.h @@ -0,0 +1,27 @@ +// Copyright 2020 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. 
+ +#pragma once + +// save a copy +#pragma push_macro("localparam") + +// convert the syntax of verilog into C++, replace "localparam int MY_VAR = 123;" with "constexpr int MY_VAR = 123;" +#undef localparam +#define localparam constexpr + +// include the verilog header +#include "dla_dma_constants.svh" + +// undo the syntax change +#pragma pop_macro("localparam") diff --git a/python/openvino/runtime/coredla_device/inc/graph_job.h b/python/openvino/runtime/coredla_device/inc/graph_job.h new file mode 100644 index 0000000..b04dde1 --- /dev/null +++ b/python/openvino/runtime/coredla_device/inc/graph_job.h @@ -0,0 +1,28 @@ +// Copyright 2020-2023 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. + +#ifndef GRAPH_JOB_H +#define GRAPH_JOB_H + +#include "batch_job.h" +using namespace std; +class GraphJob { + public: + // Returns an unused batch job object + // If all batch jobs are used, returns null + virtual BatchJob* GetBatchJob() = 0; + + virtual ~GraphJob(){} +}; + +#endif diff --git a/python/openvino/runtime/coredla_device/inc/mmd_wrapper.h b/python/openvino/runtime/coredla_device/inc/mmd_wrapper.h new file mode 100644 index 0000000..4014454 --- /dev/null +++ b/python/openvino/runtime/coredla_device/inc/mmd_wrapper.h @@ -0,0 +1,63 @@ +// Copyright 2020 Intel Corporation. 
+// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. + +#pragma once + +#include <cstdint> //uint32_t + +using interrupt_service_routine_signature = void (*)(int handle, void *data); + +class MmdWrapper { + public: + MmdWrapper(); + // Note that ~MmdWrapper() can call std::exit(1) if aocl_mmd_close() + // fails. Ideally we would find some way to re-order the code so that it + // can throw an exception (before calling the destructor) if aocl_mmd_close() + // fails. + ~MmdWrapper(); + + // class cannot be copied + MmdWrapper(const MmdWrapper &) = delete; + MmdWrapper &operator=(const MmdWrapper &) = delete; + + // Register a function to run as the interrupt service routine + void RegisterISR(interrupt_service_routine_signature func, void *data) const; + + // 32-bit handshake with each CSR + void WriteToCsr(int instance, uint32_t addr, uint32_t data) const; + uint32_t ReadFromCsr(int instance, uint32_t addr) const; + + // Copy data between host and device memory + void WriteToDDR(int instance, uint64_t addr, uint64_t length, const void *data) const; + void ReadFromDDR(int instance, uint64_t addr, uint64_t length, void *data) const; + + // If the mmd layer supports accesses to the STREAM CONTROLLER + bool bIsStreamControllerValid(int instance) const; + + // 32-bit handshake with each Stream Controller CSR + void WriteToStreamController(int instance, uint32_t addr, uint64_t length, const void *data) const; + void ReadFromStreamController(int instance, 
uint32_t addr, uint64_t length, void *data) const; + + // Provide read-only access to board-specific constants + int GetMaxInstances() const { return maxInstances_; } + uint64_t GetDDRSizePerInstance() const { return ddrSizePerInstance_; } + double GetCoreDlaClockFreq() const { return coreDlaClockFreq_; } + double GetDDRClockFreq() const { return ddrClockFreq_; } + + private: + int handle_; + int maxInstances_; + uint64_t ddrSizePerInstance_; + double coreDlaClockFreq_; + double ddrClockFreq_; +}; diff --git a/python/openvino/runtime/coredla_device/inc/stream_controller_comms.h b/python/openvino/runtime/coredla_device/inc/stream_controller_comms.h new file mode 100644 index 0000000..e2fcdfc --- /dev/null +++ b/python/openvino/runtime/coredla_device/inc/stream_controller_comms.h @@ -0,0 +1,69 @@ +// Copyright 2023 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. 
+ +#pragma once +#include <mutex> +#include <string> +#include <vector> +#include "mmd_wrapper.h" +#include "stream_controller_messages.h" + +template <class T> +struct Payload : public T { + void* GetPayload() { return this; } + size_t GetSize() { return sizeof(*this); } +}; + +class BusyFlag { + public: + bool Lock(); + void Release(); + + private: + std::recursive_mutex _mutex; + bool _busy = false; +}; + +class BusyCheck { + public: + BusyCheck(BusyFlag& busyFlag); + ~BusyCheck(); + operator bool(); + + private: + BusyFlag& _busyFlag; + bool _haveLocked; +}; + +class StreamControllerComms { + public: + StreamControllerComms(); + bool IsPresent(); + Payload<StatusMessagePayload> GetStatus(); + std::string GetStatusString(Payload<StatusMessagePayload>& statusPayload); + bool ScheduleItems(std::vector<Payload<CoreDlaJobPayload>> items); + bool Ping(); + bool Initialize(uint32_t sourceBufferSize, uint32_t dropSourceBuffers, uint32_t numInferenceRequests); + + private: + bool StatusMessageHandler(uint32_t payloadOffset); + MessageType ReceiveMessage(); + bool SendMessage(MessageType, void* pPayload = nullptr, size_t size = 0); + MmdWrapper _mmdWrapper; + uint32_t _lastReceiveSequenceID = 0; + uint32_t _sendSequenceID = 0; + uint32_t _numBadMessages = 0; + const int _streamControllerInstance = 0; + Payload<StatusMessagePayload> _receivedStatusMessage; + BusyFlag _busyFlag; +}; diff --git a/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/CMakeLists.txt b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/CMakeLists.txt new file mode 100644 index 0000000..445a304 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/CMakeLists.txt @@ -0,0 +1,62 @@ +# (C) 2017 Intel Corporation. All rights reserved. 
# Your use of Intel Corporation's design tools, logic functions and other
# software and tools, and its AMPP partner logic functions, and any output
# files any of the foregoing (including device programming or simulation
# files), and any associated documentation or information are expressly subject
# to the terms and conditions of the Intel Program License Subscription
# Agreement, Intel MegaCore Function License Agreement, or other applicable
# license agreement, including, without limitation, that your use is for the
# sole purpose of programming logic devices manufactured by Intel and sold by
# Intel or its authorized distributors. Please refer to the applicable
# agreement for further details.

cmake_minimum_required(VERSION 2.8.12)
project(mmd)

# AFU identifier baked into the MMD so it can locate the DLA accelerator.
add_definitions(-DI_DK_AFU_ID="11446C9D-AA42-4085-9B3D-4EEF9429A4AD")

# Make the bundled FindOPAE/FindNUMA modules discoverable.
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules")

find_package(OPAE REQUIRED)
find_package(NUMA REQUIRED)

# DLA specific modifications made to the MMD
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDLA_MMD")

enable_language(C ASM)

set(ASM_OPTIONS "-x assembler-with-cpp")
if(${CMAKE_C_COMPILER_ID} STREQUAL "Clang")
  # clang's integrated assembler does not accept these sources; fall back to
  # the system assembler.
  set(ASM_OPTIONS "${ASM_OPTIONS} -no-integrated-as")
endif()

set(CMAKE_ASM_FLAGS "${CFLAGS} ${ASM_OPTIONS}")

set(MMD_SRC
  ./host/mmd.cpp
  ./host/mmd_device.cpp
  ./host/mmd_dma.cpp
  ./host/mmd_helper.cpp
  ./host/kernel_interrupt.cpp
)

# Add a shared library target called intel_opae_mmd
# and build it from the MMD_SRC files
add_library(intel_opae_mmd SHARED ${MMD_SRC})

# Specify the include directories to be used when compiling intel_opae_mmd library
target_include_directories(intel_opae_mmd PUBLIC
  ${CMAKE_CURRENT_SOURCE_DIR}/include
  )

# Specify libraries needed when linking the intel_opae_mmd library
target_link_libraries(intel_opae_mmd
  libopae-c
  libnuma
)

# Set the installation rules for the project
install(TARGETS intel_opae_mmd
  LIBRARY DESTINATION lib
  COMPONENT intel_opae_mmd
)
diff --git a/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/cmake/modules/FindNUMA.cmake b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/cmake/modules/FindNUMA.cmake
new file mode 100755
index 0000000..c981150
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/cmake/modules/FindNUMA.cmake
@@ -0,0 +1,34 @@
# - Try to find libnuma
# Once done will define:
#
#  NUMA_FOUND - system has libnuma
#  NUMA_INCLUDE_DIRS - include directory with numa.h
#  NUMA_LIBRARIES - link with this for libnuma

find_path(NUMA_INCLUDE_DIRS
  NAMES numa.h
  PATHS
    ${LIBNUMA_ROOT}/include
    /usr/include
    /p/psg/swip/dla/resources/numactl/2.0.16/include

  )

find_library(NUMA_LIBRARIES
  NAMES numa
  PATHS
    ${LIBNUMA_ROOT}/lib
    ${LIBNUMA_ROOT}/lib64
    /usr/lib
    /usr/lib64
    /p/psg/swip/dla/resources/numactl/2.0.16/lib

  )

# BUGFIX: FindPackageHandleStandardArgs is not implicitly available inside a
# find module; this previously only worked because another module (e.g.
# FindPkgConfig via FindOPAE) happened to include it first. Include it
# explicitly so the module is self-contained.
include(FindPackageHandleStandardArgs)

FIND_PACKAGE_HANDLE_STANDARD_ARGS(NUMA
  REQUIRED_VARS NUMA_INCLUDE_DIRS NUMA_LIBRARIES)

add_library(libnuma IMPORTED SHARED)
set_target_properties(libnuma PROPERTIES
  IMPORTED_LOCATION ${NUMA_LIBRARIES}
  INTERFACE_INCLUDE_DIRECTORIES ${NUMA_INCLUDE_DIRS})
diff --git a/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/cmake/modules/FindOPAE.cmake b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/cmake/modules/FindOPAE.cmake
new file mode 100755
index 0000000..6395d7c
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/cmake/modules/FindOPAE.cmake
@@ -0,0 +1,44 @@
# - Try to find libintelfpga
# Once done, this will define
#
#  libopae-c_FOUND - system has libopae-c
#  libopae-c_INCLUDE_DIRS - the libopae-c include directories
#  libopae-c_LIBRARIES - link these to use libopae-c

find_package(PkgConfig)
pkg_check_modules(PC_OPAE QUIET opae-c)

# Use pkg-config to get hints about paths
execute_process(COMMAND pkg-config --cflags opae-c --silence-errors
  COMMAND cut -d I -f 2
  OUTPUT_VARIABLE OPAE-C_PKG_CONFIG_INCLUDE_DIRS)
set(OPAE-C_PKG_CONFIG_INCLUDE_DIRS "${OPAE-C_PKG_CONFIG_INCLUDE_DIRS}" CACHE STRING "Compiler flags for OPAE-C library")

# Include dir
find_path(libopae-c_INCLUDE_DIRS
  NAMES opae/fpga.h
  PATHS ${LIBOPAE-C_ROOT}/include
        ${OPAE-C_PKG_CONFIG_INCLUDE_DIRS}
        /usr/local/include
        /usr/include
        ${CMAKE_EXTRA_INCLUDES})

# The library itself
find_library(libopae-c_LIBRARIES
  NAMES opae-c
  PATHS ${LIBOPAE-C_ROOT}/lib
        ${LIBOPAE-C_ROOT}/lib64
        /usr/local/lib
        /usr/lib
        /lib
        /usr/lib/x86_64-linux-gnu
        ${CMAKE_EXTRA_LIBS})

# BUGFIX: include FindPackageHandleStandardArgs explicitly (same reasoning as
# in FindNUMA.cmake) instead of relying on FindPkgConfig having loaded it.
include(FindPackageHandleStandardArgs)

FIND_PACKAGE_HANDLE_STANDARD_ARGS(OPAE
  REQUIRED_VARS libopae-c_LIBRARIES libopae-c_INCLUDE_DIRS)

add_library(libopae-c IMPORTED SHARED)
set_target_properties(libopae-c PROPERTIES
  IMPORTED_LOCATION ${libopae-c_LIBRARIES}
  INTERFACE_INCLUDE_DIRECTORIES ${libopae-c_INCLUDE_DIRS})

diff --git a/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/kernel_interrupt.cpp b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/kernel_interrupt.cpp
new file mode 100644
index 0000000..97882d4
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/kernel_interrupt.cpp
@@ -0,0 +1,257 @@
// (c) 1992-2024 Intel Corporation.
// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words
// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S.
// and/or other countries. Other marks and brands may be claimed as the property
// of others.
// See Trademarks on intel.com for full list of Intel trademarks or
// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera)
// Your use of Intel Corporation's design tools, logic functions and other
// software and tools, and its AMPP partner logic functions, and any output
// files any of the foregoing (including device programming or simulation
// files), and any associated documentation or information are expressly subject
// to the terms and conditions of the Altera Program License Subscription
// Agreement, Intel MegaCore Function License Agreement, or other applicable
// license agreement, including, without limitation, that your use is for the
// sole purpose of programming logic devices manufactured by Intel and sold by
// Intel or its authorized distributors. Please refer to the applicable
// agreement for further details.

#include "kernel_interrupt.h"

#include <poll.h>
#include <sys/eventfd.h>

#include <cassert>
#include <cerrno>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <thread>

#include "mmd_device.h"

using namespace intel_opae_mmd;

static const int mmd_kernel_interrupt_line_num = 1;
static const uint32_t enable_int_mask = 0x00000001;
static const uint32_t disable_int_mask = 0x00000000;

bool KernelInterrupt::enable_thread = false;

static const int debug_log_level = 0;

// TODO: use consistent function throughout MMD for controlling debug
// messages. This debug_print function is from OFS.
+static void debug_print(std::string &err_msg, int msglog) { + if (debug_log_level >= msglog) { + std::cerr << "KernelInterrupt: " << err_msg << std::endl; + } +} + +static inline void check_result(fpga_result res, const char *err_str) { + if (res == FPGA_OK) { + return; + } + std::string opae_err_str = + std::string("KernelInterrupt: ") + std::string(err_str) + std::string(": ") + std::string(fpgaErrStr(res)); +} + +/** KernelInterrupt constructor + */ +KernelInterrupt::KernelInterrupt(fpga_handle fpga_handle_arg, int mmd_handle) + : m_work_thread_active(false), + m_eventfd(0), + m_kernel_interrupt_fn(nullptr), + m_kernel_interrupt_user_data(nullptr), + m_fpga_handle(fpga_handle_arg), + m_mmd_handle(mmd_handle), + m_event_handle(nullptr) { + if (std::getenv("MMD_ENABLE_DEBUG")) { + MMD_DEBUG("DEBUG LOG : KernelInterrupt Constructor\n"); + } + set_member_for_interrupts(); + enable_interrupts(); +} + +/** KernelInterrupt destructor + * calls disable_interrupts() + */ +KernelInterrupt::~KernelInterrupt() { + if (std::getenv("MMD_ENABLE_DEBUG")) { + MMD_DEBUG("DEBUG LOG : KernelInterrupt Destructor\n"); + } + try { + disable_interrupts(); + } catch (...) 
{ + std::string err("destructor error"); + debug_print(err, 0); + } +} + +/** disable_interrupts() function is used in KernelInterrupt destructor + * if interupt not enabled , !enable_thread + * then disable interrupt mask + * else if interrupts are used, + * call noftify_work_thread(), join the thread + * we call OPAE API fpgaUnregisterEvent() to unregister FPGA event, + * it tells driver caller is no longer interested in notification for event associated with m_event_handle + * we call OPAE API fpgaDestroyEventHandle() to free resources + */ +void KernelInterrupt::disable_interrupts() { + if (!enable_thread) { + if (std::getenv("MMD_ENABLE_DEBUG")) { + MMD_DEBUG("DEBUG LOG : KernelInterrupt disabling interrupts\n"); + } + assert(m_work_thread_active == false); + return; + } + + m_work_thread_active = false; + notify_work_thread(); + m_work_thread->join(); + + if (m_event_handle != nullptr) { + fpga_result res; + + res = fpgaUnregisterEvent(m_fpga_handle, FPGA_EVENT_INTERRUPT, m_event_handle); + check_result(res, "error fpgaUnregisterEvent"); + + res = fpgaDestroyEventHandle(&m_event_handle); + check_result(res, "error fpgaDestroyEventHandle"); + } + if (std::getenv("MMD_ENABLE_DEBUG")) { + MMD_DEBUG("DEBUG LOG : KernelInterrupt disabling interrupts\n"); + } +} + +/** notify_work_thread() function is called by disable_interrupts() function + * eventfd object created by OPAE API fpgaGetOSObjectFromEventHandle() , m_eventfd, + * can be used as an event wait/notify mechanism by user space applications and by kernel, + * to notify user space applications of events + * every time write() is performed on eventfd, + * the value of uint64_t being written is added to count and wakeup is performed. 
+ * We dont use read() below but read() will return count value to user space and reset count to 0 + */ +void KernelInterrupt::notify_work_thread() { + uint64_t val = 1; + ssize_t res = write(m_eventfd, &val, sizeof(val)); + if (res < 0) { + std::cerr << "Warning: KernelInterrupts::notify_work_thread()" + " write to eventfd failed: " + << strerror(errno) << std::endl; + } +} + +/** enable_interrupts() function is called by Kernel Interrupt constructor + * if interrupt is not enabled it will disable interrupt mask , set thread active as false and return + * if interrupt is enabled, it will use OPAE APIs to create event handle fpgaCreateEventHandle() + * OPAE event APIs provide functions for handling asynchronous events such as errors and interrupts + * Associated with every event a process has registered for is an fpga_event_handle, + * which encapsulates OS specific data structure for event objects + * On Linux fpga_event_handle can be used as file descriptor + * and passed to select(), poll() and similar functions to wait for asynchronous events + * OPAE API fpgaRegisterEvent() is used to tell driver that caller is interested in notification for event specified + * OPAE API fpgaGetOSObjectFromEventHandle() checks validity of event handle and + * gets OS object used to subscribe and unsubscribe to events + * we create a thread and call work_thread() + */ +void KernelInterrupt::enable_interrupts() { + if (!enable_thread) { + if (std::getenv("MMD_ENABLE_DEBUG")) { + MMD_DEBUG("DEBUG LOG : KernelInterrupt enabling interrupts\n"); + } + m_work_thread_active = false; + return; + } + + fpga_result res; + + res = fpgaCreateEventHandle(&m_event_handle); + check_result(res, "error creating event handle"); + + res = fpgaRegisterEvent(m_fpga_handle, FPGA_EVENT_INTERRUPT, m_event_handle, mmd_kernel_interrupt_line_num); + check_result(res, "error registering event"); + + res = fpgaGetOSObjectFromEventHandle(m_event_handle, &m_eventfd); + check_result(res, "error getting event 
file handle"); + + m_work_thread_active = true; + m_work_thread = std::unique_ptr<std::thread>(new std::thread([this] { this->work_thread(); })); + if (std::getenv("MMD_ENABLE_DEBUG")) { + MMD_DEBUG("DEBUG LOG : KernelInterrupt enabling interrupts\n"); + } +} + +/** work_thread() is called from enable_interrupts() function while creating new thread + * it calls wait_for_event(), disables interrupt mask + * creates lock_guard with m_mutex, calls kernel interrupt function and then enables interrupt mask + */ +void KernelInterrupt::work_thread() { + while (m_work_thread_active) { + wait_for_event(); + std::lock_guard<std::mutex> lock(m_mutex); + if (m_kernel_interrupt_fn != nullptr) { + m_kernel_interrupt_fn(m_mmd_handle, m_kernel_interrupt_user_data); + } + } +} + +/** wait_for_event() is called from work_thread() function + * it uses poll() function to wait for event on a file descriptor, + * the m_event_fd file descriptor which we got from fpgaOSObjectFromEventHandle() + * poll() uses pollfd struct, which inncludes + * fd - file descriptor, events - requested events, revents - returned events + * timeout argument in poll() specifies number of milliseconds, + * poll() will block waiting for file descriptor + * On success, poll() returns a nonnegative value which is the + * number of elements in the pollfds whose revents fields have been + * set to a nonzero value (indicating an event or an error). A + * return value of zero indicates that the system call timed out + * before any file descriptors became read + */ +void KernelInterrupt::wait_for_event() { + // Use timeout when polling eventfd because sometimes interrupts are missed. + // This may be caused by knonw race condition with runtime, or there may + // be occasional events lost from OPAE. 
+ + MMD_DEBUG("DEBUG LOG : KernelInterrupt waiting for event using poll()\n"); + const int timeout_ms = 250; + struct pollfd pfd = {.fd = m_eventfd, .events = POLLIN, .revents = 0}; + int num_events = poll(&pfd, 1, timeout_ms); + if (num_events <= 0) { + std::string err(num_events < 0 ? strerror(errno) : "timed out"); + std::string err_str("poll(): "); + debug_print(err_str.append(err), 1); + } else if (pfd.revents != POLLIN) { + std::string err("poll error num: ", pfd.revents); + debug_print(err, 0); + } else { + uint64_t val = 0; + ssize_t bytes_read = read(pfd.fd, &val, sizeof(val)); + if (bytes_read < 0) { + std::string err(strerror(errno)); + std::string err_str("read: "); + debug_print(err_str.append(err), 1); + } + } +} + +void KernelInterrupt::set_kernel_interrupt(aocl_mmd_interrupt_handler_fn fn, void *user_data) { + MMD_DEBUG("DEBUG LOG : KernelInterrupt setting kernel interrupt\n"); + std::lock_guard<std::mutex> lock(m_mutex); + m_kernel_interrupt_fn = fn; + m_kernel_interrupt_user_data = user_data; +} + +/** Configure interrupts + * set_member_for_interrupts() called from KernelInterrupts constructor + */ +void KernelInterrupt::set_member_for_interrupts() { + static bool initialized = false; + if (initialized) { + return; + } + // Use interrupts + MMD_DEBUG("DEBUG LOG : Using interrupts\n"); + + enable_thread = true; + initialized = true; +} diff --git a/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/kernel_interrupt.h b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/kernel_interrupt.h new file mode 100644 index 0000000..9ea6e68 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/kernel_interrupt.h @@ -0,0 +1,68 @@ +// (c) 1992-2024 Intel Corporation. +// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words +// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. +// and/or other countries. 
Other marks and brands may be claimed as the property +// of others. See Trademarks on intel.com for full list of Intel trademarks or +// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) +// Your use of Intel Corporation's design tools, logic functions and other +// software and tools, and its AMPP partner logic functions, and any output +// files any of the foregoing (including device programming or simulation +// files), and any associated documentation or information are expressly subject +// to the terms and conditions of the Altera Program License Subscription +// Agreement, Intel MegaCore Function License Agreement, or other applicable +// license agreement, including, without limitation, that your use is for the +// sole purpose of programming logic devices manufactured by Intel and sold by +// Intel or its authorized distributors. Please refer to the applicable +// agreement for further details. + +#ifndef KERNEL_INTERRUPT_H_ +#define KERNEL_INTERRUPT_H_ + +#include <opae/fpga.h> + +#include <atomic> +#include <chrono> +#include <mutex> +#include <thread> + +#include "aocl_mmd.h" + +namespace intel_opae_mmd { + +class KernelInterrupt final { + public: + KernelInterrupt(fpga_handle fpga_handle_arg, int mmd_handle); + ~KernelInterrupt(); + + void enable_interrupts(); + void disable_interrupts(); + void set_kernel_interrupt(aocl_mmd_interrupt_handler_fn fn, void *user_data); + + KernelInterrupt(const KernelInterrupt &) = delete; + KernelInterrupt &operator=(const KernelInterrupt &) = delete; + KernelInterrupt(KernelInterrupt &&) = delete; + KernelInterrupt &operator=(KernelInterrupt &&) = delete; + + private: + static void set_member_for_interrupts(); + + void notify_work_thread(); + void wait_for_event(); + void work_thread(); + + static bool enable_thread; + + std::mutex m_mutex; + std::unique_ptr<std::thread> m_work_thread; + std::atomic<bool> m_work_thread_active; + int m_eventfd; + aocl_mmd_interrupt_handler_fn 
m_kernel_interrupt_fn; + void *m_kernel_interrupt_user_data; + fpga_handle m_fpga_handle; + int m_mmd_handle; + fpga_event_handle m_event_handle; +}; + +}; // namespace intel_opae_mmd + +#endif // KERNEL_INTERRUPT_H_ diff --git a/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd.cpp b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd.cpp new file mode 100644 index 0000000..58cd8e0 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd.cpp @@ -0,0 +1,830 @@ +// (c) 1992-2024 Intel Corporation. +// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words +// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. +// and/or other countries. Other marks and brands may be claimed as the property +// of others. See Trademarks on intel.com for full list of Intel trademarks or +// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) +// Your use of Intel Corporation's design tools, logic functions and other +// software and tools, and its AMPP partner logic functions, and any output +// files any of the foregoing (including device programming or simulation +// files), and any associated documentation or information are expressly subject +// to the terms and conditions of the Altera Program License Subscription +// Agreement, Intel MegaCore Function License Agreement, or other applicable +// license agreement, including, without limitation, that your use is for the +// sole purpose of programming logic devices manufactured by Intel and sold by +// Intel or its authorized distributors. Please refer to the applicable +// agreement for further details. + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <zlib.h> + +#include <linux/mman.h> +#include <sys/mman.h> + +// On some systems MAP_HUGE_2MB is not defined. 
It should be defined for all +// platforms that DCP supports, but we also want ability to compile MMD on +// CentOS 6 systems. +#ifndef MAP_HUGE_SHIFT +#define MAP_HUGE_SHIFT 26 +#endif + +#ifndef MAP_HUGE_2MB +#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT) +#endif + +#ifndef MAP_HUGE_1GB +#define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT) +#endif + +#include <algorithm> +#include <cassert> +#include <cstdio> +#include <iomanip> +#include <iostream> +#include <map> +#include <sstream> +#include <unordered_map> +#include <vector> +#ifdef DLA_MMD +#include <chrono> +#include <thread> +#endif + +#include "aocl_mmd.h" +#include "mmd_device.h" + +bool diagnose = 0; + +/** If the MMD is loaded dynamically, destructors in the MMD will execute before + * the destructors in the runtime upon program termination. The DeviceMapManager + * guards accesses to the device/handle maps to make sure the runtime doesn't + * get to reference them after MMD destructors have been called. Destructor + * makes sure that all devices are closed at program termination regardless of + * what the runtime does. Implemented as a singleton. + */ +class DeviceMapManager final { + public: + /** C++ std map data structure to keep track of + * object id -> handle and handle -> device + */ + typedef std::map<int, Device *> t_handle_to_dev_map; + typedef std::map<uint64_t, int> t_id_to_handle_map; + + static const int SUCCESS = 0; + static const int FAILURE = -1; + + /** Returns handle and device pointer to the device with the specified name + * Creates a new entry for this device if it doesn't already exist + * Return 0 on success, -1 on failure + */ + int get_or_create_device(const char *board_name, int *handle, Device **device); + + /** Return obj id based on ASP name.*/ + uint64_t id_from_name(const char *board_name); + + /** Return MMD handle based on obj id. 
Returned value is negative if board + * doesn't exist + */ + inline int handle_from_id(uint64_t obj_id); + + /** Return pointer to device based on MMD handle. Returned value is null + * if board doesn't exist + */ + Device *device_from_handle(int handle); + + /** Closes specified device if it exists */ + void close_device_if_exists(int handle); + + /* Returns a reference to the class singleton */ + static DeviceMapManager &get_instance() { + static DeviceMapManager instance; + return instance; + } + + DeviceMapManager(DeviceMapManager const &) = delete; + void operator=(DeviceMapManager const &) = delete; + ~DeviceMapManager() { + // delete all allocated Device* entries + while (handle_to_dev_map->size() > 0) { + int handle = handle_to_dev_map->begin()->first; + aocl_mmd_close(handle); +#ifdef SIM + std::cout << "# mmd.cpp: When destroying DeviceMapManager in ASE, assume it worked.\n"; + break; +#endif + MMD_DEBUG("DEBUG LOG : In DeviceMapManager destructor, closing device with handle %d \n", handle); + } + delete handle_to_dev_map; + delete id_to_handle_map; + handle_to_dev_map = nullptr; + id_to_handle_map = nullptr; + } + + private: + DeviceMapManager() { + handle_to_dev_map = new t_handle_to_dev_map(); + id_to_handle_map = new t_id_to_handle_map(); + + MMD_DEBUG("DEBUG LOG : Constructing DeviceMapManager object\n"); + } + t_handle_to_dev_map *handle_to_dev_map = nullptr; + t_id_to_handle_map *id_to_handle_map = nullptr; +}; +static DeviceMapManager &device_manager = DeviceMapManager::get_instance(); + +/** Returns handle and device pointer to the device with the specified name + * Creates a new entry for this device if it doesn't already exist + * Return 0 on success, -1 on failure + */ +int DeviceMapManager::get_or_create_device(const char *board_name, int *handle, Device **device) { + int _handle = MMD_INVALID_PARAM; + Device *_device = nullptr; + + if (id_to_handle_map == nullptr || handle_to_dev_map == nullptr) { + MMD_DEBUG( + "DEBUG LOG : Failure in 
DeviceMapManager::get_or_create_device,id_to_handle_map or handle_to_dev_map is " + "NULL\n"); + return DeviceMapManager::FAILURE; + } + + uint64_t obj_id = id_from_name(board_name); + if (!obj_id) { + MMD_DEBUG("DEBUG LOG : Failure in DeviceMapManager::get_or_create_device. obj_id : %ld \n", obj_id); + return false; + } + if (id_to_handle_map->count(obj_id) == 0) { + try { + _device = new Device(obj_id); + _handle = _device->get_mmd_handle(); + id_to_handle_map->insert({obj_id, _handle}); + handle_to_dev_map->insert({_handle, _device}); + } catch (std::runtime_error &e) { + MMD_DEBUG("DEBUG LOG : Failure in DeviceMapManager::get_or_create_device %s\n", e.what()); + delete _device; + return DeviceMapManager::FAILURE; + } + MMD_DEBUG("DEBUG LOG : Success in creating new device object handle : %d \n", _handle); + } else { + _handle = id_to_handle_map->at(obj_id); + _device = handle_to_dev_map->at(_handle); + MMD_DEBUG("DEBUG LOG : Success in retrieving device metadata(handle , object) , handle : %d\n", _handle); + } + + (*handle) = _handle; + (*device) = _device; + + MMD_DEBUG("DEBUG LOG : Success in creating new device object , handle : %d\n", _handle); + return DeviceMapManager::SUCCESS; +} + +/** Return obj id based on ASP name.*/ +uint64_t DeviceMapManager::id_from_name(const char *board_name) { + uint64_t obj_id = 0; + if (Device::parse_board_name(board_name, obj_id)) { + MMD_DEBUG("DEBUG LOG : Success in retrieving object id from board name\n"); + return obj_id; + } else { + MMD_DEBUG("DEBUG LOG : Failed to retrieve object id from board name\n"); + return 0; + } +} + +/** Return MMD handle based on obj id. 
Returned value is negative if board + * doesn't exist + */ +inline int DeviceMapManager::handle_from_id(uint64_t obj_id) { + int handle = MMD_INVALID_PARAM; + if (id_to_handle_map) { + auto it = id_to_handle_map->find(obj_id); + if (it != id_to_handle_map->end()) { + handle = it->second; + } + MMD_DEBUG("DEBUG LOG : Success in retrieving handle from object id. handle : %d \n", handle); + } else { + MMD_DEBUG("DEBUG LOG : Failed to retrieve handle from object id \n"); + } + return handle; +} + +/** Return pointer to device based on MMD handle. Returned value is null + * if board doesn't exist + */ +Device *DeviceMapManager::device_from_handle(int handle) { + Device *dev = nullptr; + if (handle_to_dev_map) { + auto it = handle_to_dev_map->find(handle); + if (it != handle_to_dev_map->end()) { + return it->second; + } + MMD_DEBUG("DEBUG LOG : Success in retrieving device from handle. handle : %d \n", handle); + } else { + MMD_DEBUG("DEBUG LOG : Failed to retrieve device from handle\n"); + } + return dev; +} + +/** Closes specified device if it exists */ +void DeviceMapManager::close_device_if_exists(int handle) { + if (handle_to_dev_map) { + if (handle_to_dev_map->count(handle) > 0) { + Device *dev = handle_to_dev_map->at(handle); + uint64_t obj_id = dev->get_fpga_obj_id(); + delete dev; + + handle_to_dev_map->erase(handle); + id_to_handle_map->erase(obj_id); + MMD_DEBUG("DEBUG LOG : Closing device with handle : %d\n", handle); + } else { + MMD_DEBUG("DEBUG LOG : Nothing to close. 
Device with handle : %d already closed\n", handle); + } + } else { + MMD_DEBUG("DEBUG LOG : Error, no handle to device map entry found for handle : %d \n", handle); + } +} + +/** Interface for checking if AFU has ASP loaded */ +bool mmd_asp_loaded(const char *name) { + uint64_t obj_id = device_manager.id_from_name(name); + if (!obj_id) { + MMD_DEBUG("DEBUG LOG : Error, no object id found for board : %s \n", name); + return false; + } + + int handle = device_manager.handle_from_id(obj_id); + if (handle > 0) { + Device *dev = device_manager.device_from_handle(handle); + if (dev) { + MMD_DEBUG("DEBUG LOG : ASP loaded for handle : %d \n", handle); + return dev->asp_loaded(); + } else { + MMD_DEBUG("DEBUG LOG : ASP not loaded for handle : %d \n", handle); + return false; + } + } else { + bool asp_loaded = false; + try { + Device dev(obj_id); + asp_loaded = dev.asp_loaded(); + } catch (std::runtime_error &e) { + MMD_DEBUG("DEBUG LOG : ASP not loaded for handle : %d , %s\n", handle, e.what()); + return false; + } + + MMD_DEBUG("DEBUG LOG : ASP loaded : %d (0 - not loaded , 1 - loaded) for handle : %d \n", asp_loaded, handle); + return asp_loaded; + } +} + +/** Function called as part of aocl_mmd_get_offline_info() + * to determine number of baords in system + */ +static unsigned int get_offline_num_acl_boards(const char *asp_uuid) { + bool asp_only = true; + fpga_guid guid; + fpga_result res = FPGA_OK; + uint32_t num_matches = 0; + bool ret_err = false; + fpga_properties filter = NULL; + + if (uuid_parse(asp_uuid, guid) < 0) { + MMD_DEBUG("Error parsing guid '%s'\n", asp_uuid); + ret_err = true; + goto out; + } + + res = fpgaGetProperties(NULL, &filter); + if (res != FPGA_OK) { + MMD_DEBUG("Error creating properties object: %s\n", fpgaErrStr(res)); + ret_err = true; + goto out; + } + + if (asp_only) { + res = fpgaPropertiesSetGUID(filter, guid); + if (res != FPGA_OK) { + MMD_DEBUG("Error setting GUID: %s\n", fpgaErrStr(res)); + ret_err = true; + goto out; + } + } + + res 
= fpgaPropertiesSetObjectType(filter, FPGA_ACCELERATOR); + if (res != FPGA_OK) { + MMD_DEBUG("Error setting object type: %s\n", fpgaErrStr(res)); + ret_err = true; + goto out; + } + + res = fpgaEnumerate(&filter, 1, NULL, 0, &num_matches); + if (res != FPGA_OK) { + MMD_DEBUG("Error enumerating AFCs: %s\n", fpgaErrStr(res)); + ret_err = true; + goto out; + } + +out: + if (filter) fpgaDestroyProperties(&filter); + + if (ret_err) { + return MMD_AOCL_ERR; + } else { + return num_matches; + } +} + +/** Function called as part of aocl_mmd_get_offline_info() + * to determine names of boards in the system + */ +static bool get_offline_board_names(std::string &boards, bool asp_only = true) { + boards = "dla_agx7_ofs_board"; + return true; +} + +// Macros used for acol_mmd_get_offline_info and aocl_mmd_get_info +#define RESULT_INT(X) \ + { \ + *((int *)param_value) = X; \ + if (param_size_ret) *param_size_ret = sizeof(int); \ + } +#define RESULT_SIZE_T(X) \ + { \ + *((size_t *)param_value) = X; \ + if (param_size_ret) *param_size_ret = sizeof(size_t); \ + } + +#define RESULT_STR(X) \ + do { \ + unsigned Xlen = strnlen(X, 4096) + 1; \ + unsigned Xcpylen = (param_value_size <= Xlen) ? param_value_size : Xlen; \ + memcpy((void *)param_value, X, Xcpylen); \ + if (param_size_ret) *param_size_ret = Xcpylen; \ + } while (0) + +/** Get information about the board using the enum aocl_mmd_offline_info_t for + * offline info (called without a handle), and the enum aocl_mmd_info_t for + * info specific to a certain board. + * Arguments: + * + * requested_info_id - a value from the aocl_mmd_offline_info_t enum + * + * param_value_size - size of the param_value field in bytes. This should + * match the size of the return type expected as indicated in the enum + * definition. + * + * param_value - pointer to the variable that will receive the returned info + * + * param_size_ret - receives the number of bytes of data actually returned + * + * Returns: a negative value to indicate error. 
 */

// From DLA perspective, only AOCL_MMD_BOARD_NAMES info we care
int aocl_mmd_get_offline_info(aocl_mmd_offline_info_t requested_info_id,
                              size_t param_value_size,
                              void *param_value,
                              size_t *param_size_ret) {
  /** aocl_mmd_get_offline_info can be called many times by the runtime
   * and it is expensive to query the system. Only compute values first
   * time aocl_mmd_get_offline_info called future iterations use saved results
   */
  static bool initialized = false;
  static int mem_type_info;
  static unsigned int num_acl_boards;
  static std::string boards;
  static bool success;

  if (!initialized) {
    // One-time probe: count matching AFUs and fetch the (fixed) board name.
    mem_type_info = (int)AOCL_MMD_PHYSICAL_MEMORY;
    num_acl_boards = get_offline_num_acl_boards(I_DK_AFU_ID);
    success = get_offline_board_names(boards, true);
    initialized = true;
  }

  // Each case writes the answer into param_value via the RESULT_* macros.
  switch (requested_info_id) {
    case AOCL_MMD_VERSION:
      RESULT_STR(AOCL_MMD_VERSION_STRING);
      break;
    case AOCL_MMD_NUM_BOARDS: {
      RESULT_INT(num_acl_boards);
      break;
    }
    case AOCL_MMD_VENDOR_NAME:
      RESULT_STR("Intel Corp");
      break;
    case AOCL_MMD_BOARD_NAMES: {
      if (success) {
        RESULT_STR(boards.c_str());
      } else {
        return MMD_AOCL_ERR;
      }
      break;
    }
    case AOCL_MMD_VENDOR_ID:
      RESULT_INT(0);
      break;
    case AOCL_MMD_USES_YIELD:
      RESULT_INT(0);
      break;
    case AOCL_MMD_MEM_TYPES_SUPPORTED:
      RESULT_INT(mem_type_info);
      break;
  }

  return 0;
}

/** Get information about the board using the enum aocl_mmd_info_t for
 * info specific to a certain board.
 * Arguments:
 *
 *   requested_info_id - a value from the aocl_mmd_info_t enum
 *
 *   param_value_size - size of the param_value field in bytes. This should
 *   match the size of the return type expected as indicated in the enum
 *   definition. For example, the AOCL_MMD_TEMPERATURE returns a float, so
 *   the param_value_size should be set to sizeof(float) and you should
 *   expect the same number of bytes returned in param_size_ret.
 *
 *   param_value - pointer to the variable that will receive the returned info
 *
 *   param_size_ret - receives the number of bytes of data actually returned
 *
 * Returns: a negative value to indicate error.
 */
int aocl_mmd_get_info(
    int handle, aocl_mmd_info_t requested_info_id, size_t param_value_size, void *param_value, size_t *param_size_ret) {
  MMD_DEBUG("DEBUG LOG : called aocl_mmd_get_info\n");
  Device *dev = device_manager.device_from_handle(handle);
  // Unknown handle: return 0 without touching param_value (original behavior).
  if (dev == NULL) return 0;

  assert(param_value);
  switch (requested_info_id) {
    case AOCL_MMD_BOARD_NAME: {
      std::ostringstream board_name;
      board_name << "Intel OFS Platform"
                 << " (" << dev->get_dev_name() << ")";
      RESULT_STR(board_name.str().c_str());
      break;
    }
    case AOCL_MMD_NUM_KERNEL_INTERFACES:
      RESULT_INT(1);
      break;
    case AOCL_MMD_KERNEL_INTERFACES:
      RESULT_INT(AOCL_MMD_KERNEL);
      break;
#ifdef SIM
    // NOTE(review): both branches of this #ifdef are identical; the SIM/real
    // split looks historical -- confirm before collapsing it.
    case AOCL_MMD_PLL_INTERFACES:
      RESULT_INT(-1);
      break;
#else
    case AOCL_MMD_PLL_INTERFACES:
      RESULT_INT(-1);
      break;
#endif
    case AOCL_MMD_MEMORY_INTERFACE:
      RESULT_INT(AOCL_MMD_MEMORY);
      break;
    case AOCL_MMD_PCIE_INFO: {
      RESULT_STR(dev->get_bdf().c_str());
      break;
    }
    case AOCL_MMD_BOARD_UNIQUE_ID:
      RESULT_INT(0);
      break;
    case AOCL_MMD_TEMPERATURE: {
      // Only answer when the caller passed a buffer of the documented size.
      if (param_value_size == sizeof(float)) {
        float *ptr = static_cast<float *>(param_value);
        *ptr = dev->get_temperature();
        if (param_size_ret) *param_size_ret = sizeof(float);
      }
      break;
    }
    case AOCL_MMD_CONCURRENT_READS:
      RESULT_INT(1);
      break;
    case AOCL_MMD_CONCURRENT_WRITES:
      RESULT_INT(1);
      break;
    case AOCL_MMD_CONCURRENT_READS_OR_WRITES:
      RESULT_INT(2);
      break;

    case AOCL_MMD_MIN_HOST_MEMORY_ALIGNMENT:
      RESULT_SIZE_T(64);
      break;

    case AOCL_MMD_HOST_MEM_CAPABILITIES: {
      RESULT_INT(0);
      break;
    }
    case AOCL_MMD_SHARED_MEM_CAPABILITIES: {
      RESULT_INT(0);
      break;
    }

    case AOCL_MMD_DEVICE_MEM_CAPABILITIES:
      RESULT_INT(0);
      break;
    case AOCL_MMD_HOST_MEM_CONCURRENT_GRANULARITY:
      RESULT_SIZE_T(0);
      break;
    case AOCL_MMD_SHARED_MEM_CONCURRENT_GRANULARITY:
      RESULT_SIZE_T(0);
      break;
    case AOCL_MMD_DEVICE_MEM_CONCURRENT_GRANULARITY:
      RESULT_SIZE_T(0);
      break;
  }
  return 0;
}

// NOTE(review): RESULT_SIZE_T is left defined here while the other two macros
// are undefined; it may be used later in the file -- confirm before adding an
// #undef.
#undef RESULT_INT
#undef RESULT_STR

/** Set the interrupt handler for the opened device.
 * The interrupt handler is called whenever the client needs to be notified
 * of an asynchronous event signaled by the device internals.
 * For example, the kernel has completed or is stalled.
 *
 * Important: Interrupts from the kernel must be ignored until this handler is
 * set
 *
 * Arguments:
 *   fn - the callback function to invoke when a kernel interrupt occurs
 *   user_data - the data that should be passed to fn when it is called.
 *
 * Returns: 0 if successful, negative on error
 */
int AOCL_MMD_CALL aocl_mmd_set_interrupt_handler(int handle, aocl_mmd_interrupt_handler_fn fn, void *user_data) {
  Device *dev = device_manager.device_from_handle(handle);
  if (dev) {
    dev->set_kernel_interrupt(fn, user_data);
    MMD_DEBUG("DEBUG LOG : Set kernel interrupt handler for device handle : %d\n", handle);
  } else {
    MMD_DEBUG("DEBUG LOG : Error setting kernel interrupt handler for device handle : %d\n", handle);
    return MMD_AOCL_ERR;
  }
  return 0;
}

/** Set the operation status handler for the opened device.
 * The operation status handler is called with
 * status 0 when the operation has completed successfully.
 * status negative when the operation completed with errors.
 *
 * Arguments:
 *   fn - the callback function to invoke when a status update is to be
 *   performed.
 *   user_data - the data that should be passed to fn when it is called.
 *
 * Returns: 0 if successful, negative on error
 */

int AOCL_MMD_CALL aocl_mmd_set_status_handler(int handle, aocl_mmd_status_handler_fn fn, void *user_data) {
  Device *dev = device_manager.device_from_handle(handle);
  // Note: unlike the interrupt-handler setter, an unknown handle is not an
  // error here; the call is silently a no-op returning 0.
  if (dev) {
    dev->set_status_handler(fn, user_data);
    MMD_DEBUG("DEBUG LOG : Set status handler for device handle : %d\n", handle);
  }
  return 0;
}

/** Host to device-global-memory write (HOST DDR -> FPGA DDR)
 * If op is NULL
 *    - Then these calls must block until the operation is complete.
 *    - The status handler is not called for this operation.
 *
 * If op is non-NULL, then:
 *    - These may be non-blocking calls
 *    - The status handler must be called upon completion, with status 0
 *    for success, and a negative value for failure.
 *
 * Arguments:
 *   op - the operation object used to track this operations progress
 *
 *   len - the size in bytes to transfer
 *
 *   src - the host buffer being read from
 *
 *   dst - the host buffer being written to
 *
 *   mmd_interface - the handle to the interface being accessed. E.g. To
 *   access global memory this handle will be whatever is returned by
 *   aocl_mmd_get_info when called with AOCL_MMD_MEMORY_INTERFACE.
 *
 *   offset/src_offset/dst_offset - the byte offset within the interface that
 *   the transfer will begin at.
 *
 * The return value is 0 if the operation launch was successful, and
 * negative otherwise.
+ */ +int AOCL_MMD_CALL +aocl_mmd_write(int handle, aocl_mmd_op_t op, size_t len, const void *src, int mmd_interface, size_t offset) { + MMD_DEBUG( + "DEBUG LOG : aocl_mmd_write: handle : %d\t operation : %p\t len : 0x%zx\t src : %p\t mmd_interface : %d\t offset " + ": 0x%zx\n", + handle, + op, + len, + src, + mmd_interface, + offset); + Device *dev = device_manager.device_from_handle(handle); + if (dev){ + return dev->write_block(op, mmd_interface, src, offset, len); + } + else { + MMD_DEBUG("DEBUG LOG : Error in aocl_mmd_write , device not found for handle : %d\n", handle); + return -1; + } +} + +/** Host reading from device-global-memory (FPGA DDR -> HOST DDR) + * If op is NULL + * - Then these calls must block until the operation is complete. + * - The status handler is not called for this operation. + * + * If op is non-NULL, then: + * - These may be non-blocking calls + * - The status handler must be called upon completion, with status 0 + * for success, and a negative value for failure. + * + * Arguments: + * op - the operation object used to track this operations progress + * + * len - the size in bytes to transfer + * + * src - the host buffer being read from + * + * dst - the host buffer being written to + * + * mmd_interface - the handle to the interface being accessed. E.g. To + * access global memory this handle will be whatever is returned by + * aocl_mmd_get_info when called with AOCL_MMD_MEMORY_INTERFACE. + * + * offset/src_offset/dst_offset - the byte offset within the interface that + * the transfer will begin at. + * + * The return value is 0 if the operation launch was successful, and + * negative otherwise. 
+ */ + +int AOCL_MMD_CALL aocl_mmd_read(int handle, aocl_mmd_op_t op, size_t len, void *dst, int mmd_interface, size_t offset) { + MMD_DEBUG( + "DEBUG LOG : aocl_mmd_read: handle : %d\t operation : %p\t len : 0x%zx\t dst : %p\t mmd_interface : %d\t offset " + ": 0x%zx\n", + handle, + op, + len, + dst, + mmd_interface, + offset); + Device *dev = device_manager.device_from_handle(handle); + if (dev){ + return dev->read_block(op, mmd_interface, dst, offset, len); + } + else { + MMD_DEBUG("DEBUG LOG : Error in aocl_mmd_read , device not found for handle : %d\n", handle); + return -1; + } +} + +/** Open and initialize the named device. + * + * The name is typically one specified by the AOCL_MMD_BOARD_NAMES offline + * info. + * + * Arguments: + * name - open the board with this name (provided as a C-style string, + * i.e. NUL terminated ASCII.) + * + * Returns: the non-negative integer handle for the board, otherwise a + * negative value to indicate error. Upon receiving the error, the OpenCL + * runtime will proceed to open other known devices, hence the MMD mustn't + * exit the application if an open call fails. 
+ */ + +int AOCL_MMD_CALL aocl_mmd_open(const char *name) { + + MMD_DEBUG("DEBUG LOG : aocl_mmd_open, Opening device: %s\n", name); + + uint64_t obj_id = device_manager.id_from_name(name); + if (!obj_id) { + MMD_DEBUG("DEBUG LOG : Error while aocl_mmd_open, object id not found for board : %s\n", name); + return MMD_INVALID_PARAM; + } + + int handle; + Device *dev = nullptr; + if (device_manager.get_or_create_device(name, &handle, &dev) != DeviceMapManager::SUCCESS) { + if (std::getenv("MMD_PROGRAM_DEBUG") || std::getenv("MMD_DMA_DEBUG") || std::getenv("MMD_ENABLE_DEBUG")) { + MMD_DEBUG("DEBUG LOG : Error while aocl_mmd_open, device not found for board : %s\n", name); + } + return MMD_AOCL_ERR; + } + + assert(dev); + if (dev->asp_loaded()) { + if (!dev->initialize_asp()) { + MMD_DEBUG("DEBUG LOG : Error while aocl_mmd_open, Error initializing asp for board : %s\n", name); + return MMD_ASP_INIT_FAILED; + } + } else { + MMD_DEBUG("DEBUG LOG : Error while aocl_mmd_open, asp not loaded for board : %s\n", name); + return MMD_ASP_NOT_LOADED; + } + MMD_DEBUG("end of aocl_mmd_open \n"); + MMD_DEBUG("DEBUG LOG : Success aocl_mmd_open for board : %s, handle : %d \n", name, handle); + return handle; +} + +/** Close an opened device, by its handle. + * Returns: 0 on success, negative values on error. + */ +int AOCL_MMD_CALL aocl_mmd_close(int handle) { +#ifndef SIM + device_manager.close_device_if_exists(handle); +#else + std::cout << "# mmd.cpp: During simulation (ASE) we are not closing the device.\n"; +#endif + return 0; +} + +// CoreDLA modifications +// To support multiple different FPGA boards, anything board specific must be implemented in a +// board-specific MMD instead of the CoreDLA runtime layer. 
#ifdef DLA_MMD
// Query functions to get board-specific values

// Maximum number of DLA IP instances this board variant supports.
AOCL_MMD_CALL int dla_mmd_get_max_num_instances() { return 4; }

// DLA can only uses 4GB DDR as of 2024.2
AOCL_MMD_CALL uint64_t dla_mmd_get_ddr_size_per_instance() { return 1ULL << 32; }

// DDR clock frequency in MHz; board-dependent at compile time.
AOCL_MMD_CALL double dla_mmd_get_ddr_clock_freq() {
  #ifdef USE_N6001_BOARD
  return 300.0; // MHz
  #else
  return 333.333333; // MHz
  #endif
}

// Helper functions for the wrapper functions around CSR and DDR
// CSR windows start at 0x10000 and are spaced 0x800 bytes apart per instance.
uint64_t dla_get_raw_csr_address(int instance, uint64_t addr) { return 0x10000 + (0x800 * instance) + addr; }
// Per-instance DDR aperture stride: 4GB on N6001, 8GB otherwise.
// NOTE(review): the non-N6001 stride (1ULL << 33) is double the 4GB
// per-instance size reported above — presumably the address map leaves a
// gap between instances; confirm against the board's memory map.
uint64_t dla_get_raw_ddr_address(int instance, uint64_t addr) {
  #ifdef USE_N6001_BOARD
  return (1ULL << 32) * instance + addr;
  #else
  return (1ULL << 33) * instance + addr;
  #endif
}

// Wrappers around CSR and DDR reads and writes to abstract away board-specific offsets.
// All four wrappers issue blocking transfers (op == NULL) and propagate the
// aocl_mmd_read/write return code (0 on success, negative on error).
AOCL_MMD_CALL int dla_mmd_csr_write(int handle, int instance, uint64_t addr, const uint32_t *data) {
  return aocl_mmd_write(
      handle, NULL, sizeof(uint32_t), data, AOCL_MMD_DLA_CSR, dla_get_raw_csr_address(instance, addr));
}

AOCL_MMD_CALL int dla_mmd_csr_read(int handle, int instance, uint64_t addr, uint32_t *data) {
  return aocl_mmd_read(handle, NULL, sizeof(uint32_t), data, AOCL_MMD_DLA_CSR, dla_get_raw_csr_address(instance, addr));
}

AOCL_MMD_CALL int dla_mmd_ddr_write(int handle, int instance, uint64_t addr, uint64_t length, const void *data) {
  return aocl_mmd_write(handle, NULL, length, data, AOCL_MMD_MEMORY, dla_get_raw_ddr_address(instance, addr));
}

AOCL_MMD_CALL int dla_mmd_ddr_read(int handle, int instance, uint64_t addr, uint64_t length, void *data) {
  return aocl_mmd_read(handle, NULL, length, data, AOCL_MMD_MEMORY, dla_get_raw_ddr_address(instance, addr));
}

// Measure the DLA clock frequency (MHz) by running a hardware counter on
// clk_dla for ~10ms of wall-clock time and dividing ticks by elapsed time.
// NOTE(review): the aocl_mmd_write/read return codes are only checked via
// assert(), which compiles out under NDEBUG — a failed CSR access would then
// silently yield a bogus frequency; consider explicit error handling.
AOCL_MMD_CALL double dla_mmd_get_coredla_clock_freq(int handle) {
  constexpr uint64_t hw_timer_address = 0x37000;
  const uint32_t start_bit = 1;
  const uint32_t stop_bit = 2;

  // Send the start command to the hardware counter
  std::chrono::high_resolution_clock::time_point time_before = std::chrono::high_resolution_clock::now();
  int status = aocl_mmd_write(handle, NULL, sizeof(uint32_t), &start_bit, AOCL_MMD_DLA_CSR, hw_timer_address);
  assert(status == 0);

  // Unlikely to sleep for exactly 10 milliseconds, but it doesn't matter since we use a high resolution clock to
  // determine the amount of time between the start and stop commands for the hardware counter
  std::this_thread::sleep_for(std::chrono::milliseconds(10));

  // Send the stop command to the hardware counter
  std::chrono::high_resolution_clock::time_point time_after = std::chrono::high_resolution_clock::now();
  status = aocl_mmd_write(handle, NULL, sizeof(uint32_t), &stop_bit, AOCL_MMD_DLA_CSR, hw_timer_address);
  assert(status == 0);

  // Read back the value of the counter
  uint32_t counter = 0;
  status = aocl_mmd_read(handle, NULL, sizeof(uint32_t), &counter, AOCL_MMD_DLA_CSR, hw_timer_address);
  assert(status == 0);

  // Calculate the clock frequency of the counter, which is running on clk_dla
  double elapsed_seconds = std::chrono::duration_cast<std::chrono::duration<double>>(time_after - time_before).count();
  return 1.0e-6 * counter / elapsed_seconds;  // 1.0e-6 is to convert to MHz
}
#endif
diff --git a/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_device.cpp b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_device.cpp
new file mode 100644
index 0000000..dd4ca42
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_device.cpp
@@ -0,0 +1,448 @@
// (c) 1992-2024 Intel Corporation.
// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words
// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S.
// and/or other countries. Other marks and brands may be claimed as the property
// of others.
See Trademarks on intel.com for full list of Intel trademarks or +// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) +// Your use of Intel Corporation's design tools, logic functions and other +// software and tools, and its AMPP partner logic functions, and any output +// files any of the foregoing (including device programming or simulation +// files), and any associated documentation or information are expressly subject +// to the terms and conditions of the Altera Program License Subscription +// Agreement, Intel MegaCore Function License Agreement, or other applicable +// license agreement, including, without limitation, that your use is for the +// sole purpose of programming logic devices manufactured by Intel and sold by +// Intel or its authorized distributors. Please refer to the applicable +// agreement for further details. + +#include <assert.h> +#include <numa.h> + +#include <inttypes.h> +#include <string.h> +#include <unistd.h> +#include <fstream> +#include <iomanip> +#include <iostream> +#include <limits> +#include <sstream> + +#include "mmd_device.h" +#include "mmd_helper.h" + +int Device::next_mmd_handle{1}; + +/** + * The Device object is created for each device/board opened and + * it has methods to interact with fpga device. + * The entry point for Device is in DeviceMapManager Class + * which maintains mapping between device names and handles. + * Device Object is foundation for interacting with device. 
 */
Device::Device(uint64_t obj_id)
    : fpga_obj_id(obj_id),
      kernel_interrupt_thread(NULL),
      event_update(NULL),
      event_update_user_data(NULL),
      enable_set_numa(false),
      fme_sysfs_temp_initialized(false),
      bus(0),
      device(0),
      function(0),
      afu_initialized(false),
      asp_initialized(false),
      mmio_is_mapped(false),
      filter(NULL),
      mmio_token(NULL),
      mmio_handle(NULL),
      fme_token(NULL),
      guid(),
      mmd_dma(NULL) {
  // Note that this constructor is not thread-safe because next_mmd_handle
  // is shared between all class instances
  MMD_DEBUG("DEBUG LOG : Constructing Device object\n");

  // Hand out monotonically increasing handles, wrapping back to 1 at
  // INT_MAX (handle 0 and negatives are reserved for error codes).
  mmd_handle = next_mmd_handle;
  if (next_mmd_handle == std::numeric_limits<int>::max())
    next_mmd_handle = 1;
  else
    next_mmd_handle++;

  // NOTE(review): this local `filter` shadows the member `filter` (which
  // stays NULL); the destructor's fpgaDestroyProperties(&filter) branch can
  // therefore never fire. Harmless here because the local is destroyed
  // below, but the member appears to be dead — confirm and remove.
  fpga_properties filter = NULL;
  uint32_t num_matches;
  fpga_result r;

  // Set up a filter that will search for an accelerator
  fpgaGetProperties(NULL, &filter);
  fpgaPropertiesSetObjectType(filter, FPGA_ACCELERATOR);

  // Add the desired UUID to the filter
  uuid_parse(I_DK_AFU_ID, guid);
  fpgaPropertiesSetGUID(filter, guid);

  // Do the search across the available FPGA contexts
  num_matches = 1;
  fpgaEnumerate(&filter, 1, &mmio_token, 1, &num_matches);

  fpgaPropertiesGetParent(filter, &fme_token);

  // Not needed anymore so we destroy the filter
  fpgaDestroyProperties(&filter);

  if (num_matches < 1) {
    throw std::runtime_error(std::string("Cannot find accelerator"));
  }

  // Open accelerator
  r = fpgaOpen(mmio_token, &mmio_handle, 0);
  assert(FPGA_OK == r);

  // While the token is available, check whether it is for HW
  // or for ASE simulation.
  fpga_properties accel_props;
  uint16_t vendor_id, dev_id;
  fpgaGetProperties(mmio_token, &accel_props);
  fpgaPropertiesGetVendorID(accel_props, &vendor_id);
  fpgaPropertiesGetDeviceID(accel_props, &dev_id);

  afu_initialized = true;
  MMD_DEBUG("DEBUG LOG : Done constructing Device object\n");
}

/** Return true if board name parses correctly, false if it does not
 * Return the parsed object_id in obj_id as an [out] parameter
 * Expected name format: ASP_NAME ("ofs_") followed by a hex object id.
 */
bool Device::parse_board_name(const char *board_name_str, uint64_t &obj_id) {
  MMD_DEBUG("DEBUG LOG : Parsing board name\n");
  std::string prefix(ASP_NAME);
  std::string board_name(board_name_str);

  obj_id = 0;
  // NOTE(review): this rejects only names that are BOTH too short AND
  // prefix-mismatched; a long name with the wrong prefix falls through to
  // the substr/stol below. Presumably the intent was `||` — verify against
  // callers before changing.
  if (board_name.length() <= prefix.length() && board_name.compare(0, prefix.length(), prefix)) {
    MMD_DEBUG("DEBUG LOG : Error parsing device name '%s'\n", board_name_str);
    return false;
  }

  // Parse the hex object id that follows the prefix.
  std::string device_num_str = board_name.substr(prefix.length());
  obj_id = std::stol(device_num_str, 0, 16);

  // Assume that OPAE does not use 0 as a valid object ID. This is true for now
  // but relies somewhat on an implementaion dependent feature.
  assert(obj_id > 0);
  return true;
}

/** initialize_asp() function is used in aocl_mmd_open() API
 * It resets AFC and reinitializes DMA, Kernel Interrupts if in use
 * Idempotent: returns true immediately if already initialized.
 * Returns false on MMIO-map, DMA-init, or interrupt-thread failure.
 */
bool Device::initialize_asp() {
  MMD_DEBUG("DEBUG LOG : Initializing ASP ... \n");
  if (asp_initialized) {
    MMD_DEBUG("DEBUG LOG : ASP already initialized \n");
    return true;
  }

  fpga_result res = fpgaMapMMIO(mmio_handle, 0, NULL);
  if (res != FPGA_OK) {
    MMD_DEBUG("Error mapping MMIO space: %s\n", fpgaErrStr(res));
    return false;
  }
  mmio_is_mapped = true;

  // Trigger an user reset
  uint64_t reset = 1;
  fpgaWriteMMIO64(mmio_handle, 0, 0x40000, reset);

  AFU_RESET_DELAY();

  // DMA performance is heavily dependent on the memcpy operation that transfers
  // data from user allocated buffer to the pinned buffer that is used for
  // DMA. On some machines with multiple NUMA nodes it is critical for
  // performance that the pinned buffer is located on the NUMA node as the
  // threads that performs the DMA operation.
  //
  // The performance also improves slighlty if the DMA threads are on the same
  // NUMA node as the FPGA PCI device.
  //
  // This code pins memory allocation to occur from FPGA NUMA node prior to
  // initializing the DMA buffers. It also pins all threads in the process
  // to run on this same node.
  struct bitmask *mask = NULL;
  if (enable_set_numa) {
    mask = numa_parse_nodestring(fpga_numa_node.c_str());
    numa_set_membind(mask);
    int ret = numa_run_on_node_mask_all(mask);
    if (ret < 0) {
      fprintf(stderr, " Error setting NUMA node mask\n");
    }
  }

  MMD_DEBUG("DEBUG LOG : Initializing HOST -> FPGA DMA channel \n");

  // NOTE(review): on failure mmd_dma is deleted but left dangling (not
  // reset to NULL), and the destructor would double-delete it — confirm
  // callers destroy the Device immediately after a false return.
  mmd_dma = new intel_opae_mmd::mmd_dma(mmio_handle, mmd_handle);
  if (!mmd_dma->initialized()) {
    MMD_DEBUG("DEBUG LOG : Error initializing DMA channel \n");
    delete mmd_dma;
    return false;
  }

  // Turn off membind restriction in order to allow future allocation to
  // occur on different NUMA nodes if needed. Hypothesis is that only
  // the pinned buffers are performance critical for the memcpy. Other
  // allocations in the process can occur on other NUMA nodes if needed.
  if (enable_set_numa) {
    numa_set_membind(numa_nodes_ptr);
    numa_free_nodemask(mask);
  }

// Do not enable interrupt if polling mode is enabled in the DLA runtime.
#ifndef COREDLA_RUNTIME_POLLING
  try {
    kernel_interrupt_thread = new intel_opae_mmd::KernelInterrupt(mmio_handle, mmd_handle);
  } catch (const std::system_error &e) {
    std::cerr << "Error initializing kernel interrupt thread: " << e.what() << e.code() << std::endl;
    return false;
  } catch (const std::exception &e) {
    std::cerr << "Error initializing kernel interrupt thread: " << e.what() << std::endl;
    return false;
  }
#endif

  asp_initialized = true;
  MMD_DEBUG("DEBUG LOG : ASP Initialized ! \n");
  return asp_initialized;
}

/** Device Class Destructor implementation
 * Properly releasing and free-ing memory
 * part of best coding practices and help
 * with stable system performance and
 * helps reduce bugs
 * Tears down, in order: interrupt thread, DMA engine, MMIO mapping,
 * accelerator handle, token, and properties filter.
 */
Device::~Device() {
  MMD_DEBUG("DEBUG LOG : Destructing Device object \n");
  int num_errors = 0;

  if (kernel_interrupt_thread != nullptr) {
    delete kernel_interrupt_thread;
    kernel_interrupt_thread = NULL;
  }

  if (mmd_dma) {
    delete mmd_dma;
    mmd_dma = NULL;
  }

  if (mmio_is_mapped) {
    if (fpgaUnmapMMIO(mmio_handle, 0)) {
      MMD_DEBUG("DEBUG LOG : fpgaUnmapMMIO failed\n");
      num_errors++;
    }
  }

  if (mmio_handle) {
    if (fpgaClose(mmio_handle) != FPGA_OK) {
      MMD_DEBUG("DEBUG LOG : fpgaClose mmio_handle failed\n");
      num_errors++;
    }
  }

  if (mmio_token) {
    if (fpgaDestroyToken(&mmio_token) != FPGA_OK) {
      MMD_DEBUG("DEBUG LOG : fpgaDestroyToken mmio_token failed\n");
      num_errors++;
    }
  }

  if (filter) {
    if (fpgaDestroyProperties(&filter) != FPGA_OK) {
      MMD_DEBUG("DEBUG LOG : fpgaDestroyProperties filter failed\n");
      num_errors++;
    }
  }

  if (num_errors > 0) {
    MMD_DEBUG("DEBUG LOG : Error freeing resources in Device destructor\n");
  }
}

/** asp_loaded() function which checks if asp is loaded on board
 * it is used in aocl_mmd_open() API
 * Compares the AFU GUID reported by OPAE against the expected I_DK_AFU_ID.
 */
bool Device::asp_loaded() {
  fpga_guid pci_guid;
  fpga_guid afu_guid;
  fpga_properties prop;
  fpga_result res;

  if (uuid_parse(I_DK_AFU_ID, pci_guid) < 0) {
    MMD_DEBUG("DEBUG LOG : Error parsing guid\n");
    return false;
  }

  res = fpgaGetProperties(mmio_token, &prop);
  if (res != FPGA_OK) {
    MMD_DEBUG("DEBUG LOG : Error reading properties: %s \n", fpgaErrStr(res));
    fpgaDestroyProperties(&prop);
    return false;
  }

  // NOTE(review): this null check runs after mmio_token has already been
  // passed to fpgaGetProperties above — presumably it should precede it.
  if (!mmio_token) {
    fpgaDestroyProperties(&prop);
    MMD_DEBUG("DEBUG LOG : Error reading the mmio_token\n");
    return false;
  }

  res = fpgaPropertiesGetGUID(prop, &afu_guid);
  if (res != FPGA_OK) {
    MMD_DEBUG("DEBUG LOG : Error reading GUID \n");
    fpgaDestroyProperties(&prop);
    return false;
  }

  fpgaDestroyProperties(&prop);
  if (uuid_compare(pci_guid, afu_guid) == 0) {
    MMD_DEBUG("DEBUG LOG : asp loaded : true \n");
    return true;
  } else {
    MMD_DEBUG("DEBUG LOG : asp loaded : false \n");
    return false;
  }
}

/** get_bdf() function is called
 * in aocl_mmd_get_info() API
 * Formats the PCI bus/device/function members as "bb:dd.f" in hex.
 */
std::string Device::get_bdf() {
  std::ostringstream bdf;
  bdf << std::setfill('0') << std::setw(2) << std::hex << unsigned(bus) << ":" << std::setfill('0') << std::setw(2)
      << std::hex << unsigned(device) << "." << std::hex << unsigned(function);

  return bdf.str();
}

/** get_temperature() function is called
 * in aocl_mmd_get_info() API
 * We currently use hardcoded paths to retrieve temperature information
 * We will replace with OPAE APIs in future
 * Returns -999 when the hwmon sysfs node cannot be read.
 */
float Device::get_temperature() {
  if (std::getenv("MMD_ENABLE_DEBUG")) {
    MMD_DEBUG("DEBUG LOG : Reading temperature ... \n");
  }
  float temp = 0;
  fpga_object obj;
  const char *name;
  name = "dfl_dev.*/spi_master/spi*/spi*.*/*-hwmon.*.auto/hwmon/hwmon*/temp1_input";
  fpga_result res;
  res = fpgaTokenGetObject(fme_token, name, &obj, FPGA_OBJECT_GLOB);
  if (res != FPGA_OK) {
    MMD_DEBUG("DEBUG LOG : Error reading temperature monitor from BMC :");
    MMD_DEBUG(" %s \n", fpgaErrStr(res));
    temp = -999;
    return temp;
  }

  uint64_t value = 0;
  fpgaObjectRead64(obj, &value, FPGA_OBJECT_SYNC);
  fpgaDestroyObject(&obj);
  // sysfs reports millidegrees C; integer division here truncates the
  // fractional degree before the float conversion.
  temp = value / 1000;
  return temp;
}

/** set_kernel_interrupt() function is used in aocl_mmd_set_interrupt_handler() API
 * No-op when the interrupt thread was not created (polling builds).
 */
void Device::set_kernel_interrupt(aocl_mmd_interrupt_handler_fn fn, void *user_data) {
  MMD_DEBUG("DEBUG LOG : Device::set_kernel_interrupt() \n");
  if (kernel_interrupt_thread) {
    kernel_interrupt_thread->set_kernel_interrupt(fn, user_data);
  }
}

/** set_status_handler() function is used in aocl_mmd_set_status_handler() API
 * Stores the runtime's completion callback and its opaque user data.
 */
void Device::set_status_handler(aocl_mmd_status_handler_fn fn, void *user_data) {
  MMD_DEBUG("DEBUG LOG : Device::set_status_handler() \n");
  event_update = fn;
  event_update_user_data = user_data;
}

/** event_update_fn() is used in read_block(), write_block(), copy_block() functions
 * OPAE provides event API for handling asynchronous events sucj as errors and interrupts
 * under the hood those are used
 * NOTE(review): event_update is invoked unconditionally — if no status
 * handler was ever registered this calls a NULL pointer; confirm the
 * runtime always sets one before issuing non-blocking ops.
 */
void Device::event_update_fn(aocl_mmd_op_t op, int status) {
  MMD_DEBUG("DEBUG LOG : Device::event_update_fn() \n");
  event_update(mmd_handle, event_update_user_data, op, status);
}

/** read_block() is used in aocl_mmd_read() API
 * as name suggests its used for fpga->host DMA and MMIO transfers
 * Routing: AOCL_MMD_MEMORY -> DMA (serialized by m_dma_mutex),
 * AOCL_MMD_DLA_CSR -> 4-byte MMIO read, anything else -> MMIO at
 * (mmd_interface + offset).
 */
int Device::read_block(aocl_mmd_op_t op, int mmd_interface, void *host_addr, size_t offset, size_t size) {
  MMD_DEBUG("DEBUG LOG : Device::read_block()\n");
  int res;

  // The mmd_interface is defined as the base address of the MMIO write. Access
  // to memory requires special functionality. Otherwise do direct MMIO read.

  if (mmd_interface == AOCL_MMD_MEMORY) {
    std::unique_lock<std::mutex> dma_mutex_lock(m_dma_mutex);
    MMD_DEBUG("DEBUG LOG : Using DMA to read block\n");
    res = mmd_dma->fpga_to_host(host_addr, (uint64_t)offset, size);
  } else if (mmd_interface == AOCL_MMD_DLA_CSR) {
    assert(size == 4);  // DLA CSR read should be always size ==4 as of 2024.2
    MMD_DEBUG("DEBUG LOG : Using MMIO to read block in the DLA CSR space\n");
    res = read_mmio(host_addr, offset, size);
  } else {
    MMD_DEBUG("DEBUG LOG : Using MMIO to read block\n");
    res = read_mmio(host_addr, mmd_interface + offset, size);

    // NOTE(review): the status handler is only invoked on this generic MMIO
    // path — non-blocking ops routed to DMA or DLA_CSR never get a
    // completion callback; confirm callers only pass op on this path.
    if (op) {
      this->event_update_fn(op, res);
    }
  }
  return res;
}

/** write_block() is used in aocl_mmd_write() API
 * as name suggests its used for DMA and MMIO transfers
 * Mirror image of read_block(); same interface routing rules.
 */
int Device::write_block(aocl_mmd_op_t op, int mmd_interface, const void *host_addr, size_t offset, size_t size) {
  MMD_DEBUG("DEBUG LOG : Device::write_block()\n");
  int res;

  // The mmd_interface is defined as the base address of the MMIO write. Access
  // to memory requires special functionality. Otherwise do direct MMIO write
  if (mmd_interface == AOCL_MMD_MEMORY) {
    std::unique_lock<std::mutex> dma_mutex_lock(m_dma_mutex);
    MMD_DEBUG("DEBUG LOG : Using DMA to write block\n");
    res = mmd_dma->host_to_fpga(host_addr, (uint64_t)offset, size);
  } else if (mmd_interface == AOCL_MMD_DLA_CSR) {
    assert(size == 4);  // DLA CSR read should be always size ==4 as of 2024.2
    // NOTE(review): copy-pasted log text — this is the WRITE path but the
    // message says "read block"; fix the string in a behavior-changing pass.
    MMD_DEBUG("DEBUG LOG : Using MMIO to read block in the DLA CSR space\n");
    res = write_mmio(host_addr, offset, size);
  } else {
    MMD_DEBUG("DEBUG LOG : Using MMIO to write block\n");
    res = write_mmio(host_addr, mmd_interface + offset, size);
    if (op) {
      this->event_update_fn(op, res);
    }
  }

  return res;
}

/** read_mmio() is used in read_block() function
 * it uses OPAE APIs fpgaReadMMIO64() and fpgaReadMMIO32()
 */
int Device::read_mmio(void *host_addr, size_t mmio_addr, size_t size) {
  return mmd_helper::read_mmio(mmio_handle, host_addr, mmio_addr, size);
}

/** write_mmio() is used in write_block() function
 * it uses OPAE APIs fpgaWriteMMIO64() and fpgaWriteMMIO32()
 */
int Device::write_mmio(const void *host_addr, size_t mmio_addr, size_t size) {
  return mmd_helper::write_mmio(mmio_handle, host_addr, mmio_addr, size);
}
diff --git a/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_device.h b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_device.h
new file mode 100644
index 0000000..1cded83
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_device.h
@@ -0,0 +1,151 @@
// (c) 1992-2024 Intel Corporation.
// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words
// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S.
// and/or other countries. Other marks and brands may be claimed as the property
// of others.
// See Trademarks on intel.com for full list of Intel trademarks or
// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera)
// Your use of Intel Corporation's design tools, logic functions and other
// software and tools, and its AMPP partner logic functions, and any output
// files any of the foregoing (including device programming or simulation
// files), and any associated documentation or information are expressly subject
// to the terms and conditions of the Altera Program License Subscription
// Agreement, Intel MegaCore Function License Agreement, or other applicable
// license agreement, including, without limitation, that your use is for the
// sole purpose of programming logic devices manufactured by Intel and sold by
// Intel or its authorized distributors. Please refer to the applicable
// agreement for further details.

#ifndef MMD_DEVICE_H
#define MMD_DEVICE_H

#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <string>

#include <opae/fpga.h>

#include <uuid/uuid.h>

#include "aocl_mmd.h"
#include "mmd_dma.h"
#include "mmd_helper.h"

#include "kernel_interrupt.h"

// Tune delay for simulation or HW. Eventually delay
// should be removed for HW, may still be needed for ASE simulation
#ifdef SIM
#define DELAY_MULTIPLIER 100
#else
#define DELAY_MULTIPLIER 1
#endif

// Most AOCL_MMD_CALL functions return negative number in case of error,
// MMD_AOCL_ERR is used to indicate an error from the MMD that is being
// returned to the runtime. Simply set to -2 for now since neither interface
// defines a meaning to return codes for errors.
#define MMD_AOCL_ERR -1

// NOTE: some of the code relies on invalid handle returning -1
// future TODO eliminate dependency on specific error values
#define MMD_INVALID_PARAM -1

// Our diagnostic script relies on handle values < -1 to determine when
// a valid device is present but a functioning ASP is not loaded.
#define MMD_ASP_NOT_LOADED -2
#define MMD_ASP_INIT_FAILED -3

// Delay settings
// MMIO_DELAY intentionally expands to nothing on current hardware.
#define MMIO_DELAY()
#define YIELD_DELAY() usleep(1 * DELAY_MULTIPLIER)
#define OPENCL_SW_RESET_DELAY() usleep(5000 * DELAY_MULTIPLIER)
#define AFU_RESET_DELAY() usleep(20000 * DELAY_MULTIPLIER)

#define KERNEL_SW_RESET_BASE (AOCL_MMD_KERNEL + 0x30)

// Board-name prefix expected by Device::parse_board_name ("ofs_<hex-id>").
#define ASP_NAME "ofs_"

#define SVM_MMD_MPF 0x24000

#define SVM_DDR_OFFSET 0x1000000000000
#define PCI_DDR_OFFSET 0

enum {
  // IRQ offsets no longer exist in DLA hardware (removed from board.qsys)
  AOCL_IRQ_POLLING_BASE = 0x0100,  // CSR to polling interrupt status
  AOCL_IRQ_MASKING_BASE = 0x0108,  // CSR to set/unset interrupt mask
  // mmd_interface selectors used by read_block()/write_block() routing.
  AOCL_MMD_KERNEL = 0,
  AOCL_MMD_MEMORY = 1,
  AOCL_MMD_DLA_CSR = 2,
};

enum AfuStatu { MMD_INVALID_ID = 0, MMD_ASP, MMD_AFU };

// One Device per opened board; owns the OPAE handles, the DMA engine and
// the optional kernel-interrupt thread. Non-copyable; created/destroyed
// via DeviceMapManager. NOTE(review): std::mutex is used below but <mutex>
// is not included here directly — presumably pulled in via mmd_dma.h;
// confirm, or include it explicitly.
class Device final {
 public:
  Device(uint64_t);
  Device(const Device &) = delete;
  Device &operator=(const Device &) = delete;
  ~Device();

  static bool parse_board_name(const char *board_name, uint64_t &obj_id);

  int get_mmd_handle() { return mmd_handle; }
  uint64_t get_fpga_obj_id() { return fpga_obj_id; }
  std::string get_dev_name() { return mmd_dev_name; }
  std::string get_bdf();
  float get_temperature();

  bool initialize_asp();
  void set_kernel_interrupt(aocl_mmd_interrupt_handler_fn fn, void *user_data);
  void set_status_handler(aocl_mmd_status_handler_fn fn, void *user_data);
  void event_update_fn(aocl_mmd_op_t op, int status);
  bool asp_loaded();

  int read_block(aocl_mmd_op_t op, int mmd_interface, void *host_addr, size_t dev_addr, size_t size);
  int write_block(aocl_mmd_op_t op, int mmd_interface, const void *host_addr, size_t dev_addr, size_t size);

 private:
  // Shared across all instances; see constructor note about thread-safety.
  static int next_mmd_handle;

  int mmd_handle;
  uint64_t fpga_obj_id;
  std::string mmd_dev_name;
  intel_opae_mmd::KernelInterrupt *kernel_interrupt_thread;  // NULL in polling builds
  aocl_mmd_status_handler_fn event_update;
  void *event_update_user_data;

  // NUMA pinning state used during DMA buffer initialization.
  std::string fpga_numa_node;
  bool enable_set_numa;
  bool fme_sysfs_temp_initialized;
  void initialize_fme_sysfs();
  void initialize_local_cpus_sysfs();
  bool find_dma_dfh_offsets();

  uint8_t bus;
  uint8_t device;
  uint8_t function;

  bool afu_initialized;
  bool asp_initialized;
  bool mmio_is_mapped;

  fpga_properties filter;
  fpga_token mmio_token;
  fpga_handle mmio_handle;
  fpga_token fme_token;
  fpga_guid guid;
  intel_opae_mmd::mmd_dma *mmd_dma;
  std::mutex m_dma_mutex;  // serializes DMA transfers in read_block/write_block

  // Helper functions
  int read_mmio(void *host_addr, size_t dev_addr, size_t size);
  int write_mmio(const void *host_addr, size_t dev_addr, size_t size);
};

#endif  // MMD_DEVICE_H
diff --git a/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_dma.cpp b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_dma.cpp
new file mode 100644
index 0000000..6a4e13c
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_dma.cpp
@@ -0,0 +1,573 @@
// (c) 1992-2024 Intel Corporation.
// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words
// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S.
// and/or other countries. Other marks and brands may be claimed as the property
// of others.
See Trademarks on intel.com for full list of Intel trademarks or +// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) +// Your use of Intel Corporation's design tools, logic functions and other +// software and tools, and its AMPP partner logic functions, and any output +// files any of the foregoing (including device programming or simulation +// files), and any associated documentation or information are expressly subject +// to the terms and conditions of the Altera Program License Subscription +// Agreement, Intel MegaCore Function License Agreement, or other applicable +// license agreement, including, without limitation, that your use is for the +// sole purpose of programming logic devices manufactured by Intel and sold by +// Intel or its authorized distributors. Please refer to the applicable +// agreement for further details. + +#include <memory.h> +#include <sys/mman.h> +#include <cassert> +#include <chrono> +#include <cstdlib> +#include <cstring> +#include <iostream> +#include <unordered_map> + +#include <inttypes.h> +#include <sstream> + +#include "mmd_device.h" +#include "mmd_dma.h" +#include "mmd_helper.h" + +namespace intel_opae_mmd { + +/** mmd_dma class constructor + */ +mmd_dma::mmd_dma(fpga_handle fpga_handle_arg, int mmd_handle) : m_initialized(false), m_fpga_handle(fpga_handle_arg) { + MMD_DEBUG("DEBUG LOG : Constructing DMA \n"); + // Initialize shared buffer + auto res = fpgaPrepareBuffer(m_fpga_handle, DMA_BUFFER_SIZE, (void **)&dma_buf_ptr, &dma_buf_wsid, 0); + + assert(FPGA_OK == res && "Allocating DMA Buffer failed"); + + memset((void *)dma_buf_ptr, 0x0, DMA_BUFFER_SIZE); + + // Store virtual address of IO registers + res = fpgaGetIOAddress(m_fpga_handle, dma_buf_wsid, &dma_buf_iova); + assert(FPGA_OK == res && "getting dma DMA_BUF_IOVA failed"); + + m_initialized = true; +} + +/** mmd_dma destructor + * free-ing , releasing various resources created during object construction is a good idea + * it 
helps with system stability and reduces code bugs
+ */
+mmd_dma::~mmd_dma() {
+  MMD_DEBUG("DEBUG LOG : Destructing DMA \n");
+  // Release the pinned shared buffer allocated with fpgaPrepareBuffer in the
+  // constructor.
+  // NOTE(review): assert() is compiled out under NDEBUG, so a failing
+  // fpgaReleaseBuffer would go unnoticed in release builds - confirm intended.
+  auto res = fpgaReleaseBuffer(m_fpga_handle, dma_buf_wsid);
+  assert(FPGA_OK == res && "Release DMA Buffer failed");
+  m_initialized = false;
+}
+
+// Called in dma_transfer() to send DMA descriptor
+// Writes the four descriptor words (src_address, dest_address, len, control)
+// to four consecutive 8-byte MMIO slots starting at mmio_dst.
+// @param mmio_dst - MMIO offset of the first descriptor slot (8B aligned)
+// @param desc - descriptor to push to the DMA engine
+// @return always 0 (MMIO write results are not checked here)
+int mmd_dma::send_descriptor(uint64_t mmio_dst, dma_descriptor_t desc) {
+  // mmio requires 8 byte alignment
+  assert(mmio_dst % 8 == 0);
+
+  fpgaWriteMMIO64(m_fpga_handle, 0, mmio_dst, desc.src_address);
+  MMD_DEBUG("Writing %lX to address %lX\n", desc.src_address, mmio_dst);
+  mmio_dst += 8;
+  fpgaWriteMMIO64(m_fpga_handle, 0, mmio_dst, desc.dest_address);
+  MMD_DEBUG("Writing %lX to address %lX\n", desc.dest_address, mmio_dst);
+  mmio_dst += 8;
+  fpgaWriteMMIO64(m_fpga_handle, 0, mmio_dst, desc.len);
+  MMD_DEBUG("Writing %X to address %lX\n", desc.len, mmio_dst);
+  mmio_dst += 8;
+  fpgaWriteMMIO64(m_fpga_handle, 0, mmio_dst, desc.control);
+  MMD_DEBUG("Writing %X to address %lX\n", desc.control, mmio_dst);
+
+  return 0;
+}
+
+// Use ASE to handle unaligned transfer and DMA to do aligned transfer.
+// Copy `size` bytes from device DDR address `dev_src` into host memory at
+// `host_addr`.  Strategy (mirrors host_to_fpga):
+//   1. If dev_src is not 64B aligned, read the leading fragment via the ASE
+//      MMIO window (_ase_fpga_to_host) until the device address is aligned.
+//   2. Move full DMA_BUFFER_SIZE chunks, then the largest 64B-multiple tail,
+//      through the pinned bounce buffer (dma_buf_ptr) with dma_transfer().
+//   3. Read any remaining <64B tail via ASE again.
+// @return 0 on success, -1 if the final ASE read fails.
+// NOTE(review): earlier failures are checked only with assert(), which is a
+// no-op under NDEBUG; dma_transfer's return value is ignored.
+int mmd_dma::fpga_to_host(void *host_addr, uint64_t dev_src, size_t size) {
+  fpga_result res = FPGA_OK;
+  uint64_t count_left = size;
+  uint64_t aligned_addr = 0;
+  uint64_t align_bytes = 0;
+  uint64_t curr_dev_src = dev_src;
+  void *curr_host_addr = host_addr;
+
+  if (dev_src % 64 != 0) {
+    // We use ASE to handle unaligned DMA transfer
+    MMD_DEBUG("DEBUG LOG : mmd_dma::fpga_to_host dev_src is non 64B aligned\n");
+    if (count_left < 64) {
+      MMD_DEBUG("DEBUG LOG : mmd_dma::fpga_to_host dev_src count < 64\n");
+      res = _ase_fpga_to_host(curr_dev_src, curr_host_addr, count_left);
+      assert(FPGA_OK == res && "_ase_fpga_to_host failed");
+      return res;
+    } else {
+      // Round the device address up to the next 64B boundary and pull the
+      // fragment in between via ASE.
+      aligned_addr = ((curr_dev_src / 64) + 1) * 64;
+      align_bytes = aligned_addr - curr_dev_src;
+      res = _ase_fpga_to_host(curr_dev_src, curr_host_addr, align_bytes);
+      assert(FPGA_OK == res && "_ase_fpga_to_host failed");
+
+      // Update the processed data
+      count_left -= align_bytes;
+      curr_dev_src += align_bytes;
+      curr_host_addr = (void *)(static_cast<char *>(curr_host_addr) + align_bytes);
+    }
+  }
+
+  if (count_left) {
+    uint64_t dma_chunks = count_left / DMA_BUFFER_SIZE;
+    for (uint64_t i = 0; i < dma_chunks; i++) {
+      // constant size transfer
+
+      // Destination is the host bounce buffer (IOVA with the host bit set).
+      uint64_t dev_dest = dma_buf_iova | DMA_HOST_MASK;
+      int len = ((DMA_BUFFER_SIZE - 1) / DMA_LINE_SIZE) + 1;  // Ceiling of test_buffer_size / DMA_LINE_SIZE
+
+      dma_transfer(curr_dev_src, dev_dest, len, ddr_to_host);
+
+      // Copy data from shared buffer to host addr
+      memcpy(curr_host_addr, (void *)dma_buf_ptr, DMA_BUFFER_SIZE);
+
+      memset((void *)dma_buf_ptr, 0x0, DMA_BUFFER_SIZE);
+
+      // Update the curr source and dest
+      curr_host_addr = (void *)(static_cast<char *>(curr_host_addr) + DMA_BUFFER_SIZE);
+      curr_dev_src += DMA_BUFFER_SIZE;
+    }
+
+    // Updated the count_left for the for loop
+    count_left -= (dma_chunks * DMA_BUFFER_SIZE);
+
+    if (count_left) {
+      // Largest 64B multiple that still fits in one bounce-buffer pass.
+      uint64_t dma_tx_bytes = (count_left / 64) * 64;
+      if (dma_tx_bytes != 0) {
+        assert(dma_tx_bytes <= DMA_BUFFER_SIZE && "Illegal transfer size\n");
+
+        uint64_t dev_dest = dma_buf_iova | DMA_HOST_MASK;
+        int len = ((dma_tx_bytes - 1) / DMA_LINE_SIZE) + 1;  // Ceiling of test_buffer_size / DMA_LINE_SIZE
+
+        dma_transfer(curr_dev_src, dev_dest, len, ddr_to_host);
+
+        // Copy data from shared buffer to host addr
+        memcpy(curr_host_addr, (void *)dma_buf_ptr, dma_tx_bytes);
+
+        memset((void *)dma_buf_ptr, 0x0, DMA_BUFFER_SIZE);
+
+        // Update the address
+        curr_host_addr = (void *)(static_cast<char *>(curr_host_addr) + dma_tx_bytes);
+        curr_dev_src += dma_tx_bytes;
+        count_left -= dma_tx_bytes;
+      }
+      if (count_left) {
+        MMD_DEBUG("DEBUG LOG : mmd_dma::fpga_to_host count_left after DMA transfer is ");
+        MMD_DEBUG("%" PRIu64 "\n", count_left);
+        // Handle the rest unaligned transfer using ASE
+        res = _ase_fpga_to_host(curr_dev_src, curr_host_addr, count_left);
+        if (FPGA_OK != res) {
+          MMD_DEBUG("DEBUG LOG : mmd_dma::_ase_fpga_to_host failed\n");
+          return -1;
+        }
+        count_left = 0;
+
+        // No need to update address as the transaction is done.
+      }
+    }
+  }
+  assert(count_left==0 && "fpga_to_host failed");
+  return 0;
+}
+
+// Use ASE to handle unaligned transfer and DMA to do aligned transfer.
+int mmd_dma::host_to_fpga(const void *host_addr, uint64_t dev_dest, size_t size) { + fpga_result res = FPGA_OK; + uint64_t count_left = size; + uint64_t aligned_addr = 0; + uint64_t align_bytes = 0; + uint64_t curr_dest = dev_dest; + const void *curr_host_addr = host_addr; + + if (dev_dest % 64 != 0) { + // We use ASE to handle unaligned DMA transfer + MMD_DEBUG("DEBUG LOG : mmd_dma::host_to_fpga dev_dest is non 64B aligned\n"); + if (count_left < 64) { + res = _ase_host_to_fpga(dev_dest, host_addr, count_left); + assert(FPGA_OK == res && "_ase_host_to_fpga failed"); + return res; + } else { + aligned_addr = ((dev_dest / 64) + 1) * 64; + align_bytes = aligned_addr - dev_dest; + res = _ase_host_to_fpga(dev_dest, host_addr, align_bytes); + assert(FPGA_OK == res && "_ase_host_to_fpga failed"); + + // Update the processed data + count_left -= align_bytes; + curr_dest += align_bytes; + curr_host_addr = (const void *)(static_cast<const char *>(curr_host_addr) + align_bytes); + } + } + + if (count_left) { + uint64_t dma_chunks = count_left / DMA_BUFFER_SIZE; + for (uint64_t i = 0; i < dma_chunks; i++) { + // constant size transfer + // Copy host_src value to the shared buffer + memcpy((void *)dma_buf_ptr, curr_host_addr, DMA_BUFFER_SIZE); + uint64_t dev_src = dma_buf_iova | DMA_HOST_MASK; + + int len = ((DMA_BUFFER_SIZE - 1) / DMA_LINE_SIZE) + 1; // Ceiling of test_buffer_size / DMA_LINE_SIZE + + dma_transfer(dev_src, curr_dest, len, host_to_ddr); + + memset((void *)dma_buf_ptr, 0x0, DMA_BUFFER_SIZE); + + // Update the curr source and dest + curr_host_addr = (const void *)(static_cast<const char *>(curr_host_addr) + DMA_BUFFER_SIZE); + curr_dest += DMA_BUFFER_SIZE; + } + + // Updated the count_left for the for loop + count_left -= (dma_chunks * DMA_BUFFER_SIZE); + + if (count_left) { + uint64_t dma_tx_bytes = (count_left / 64) * 64; + if (dma_tx_bytes != 0) { + assert(dma_tx_bytes <= DMA_BUFFER_SIZE && "Illegal transfer size\n"); + + // Copy host_src value to the shared 
buffer + memcpy((void *)dma_buf_ptr, curr_host_addr, dma_tx_bytes); + uint64_t dev_src = dma_buf_iova | DMA_HOST_MASK; + + int len = ((dma_tx_bytes - 1) / DMA_LINE_SIZE) + 1; // Ceiling of dma_tx_bytes / DMA_LINE_SIZE + dma_transfer(dev_src, curr_dest, len, host_to_ddr); + + memset((void *)dma_buf_ptr, 0x0, DMA_BUFFER_SIZE); + } + + // Update the address + curr_host_addr = (const void *)(static_cast<const char *>(curr_host_addr) + dma_tx_bytes); + curr_dest += dma_tx_bytes; + count_left -= dma_tx_bytes; + + if (count_left) { + MMD_DEBUG("DEBUG LOG : mmd_dma::host_to_fpga count_left after DMA transfer is "); + MMD_DEBUG("%" PRIu64 "\n", count_left); + // Handle the rest unaligned transfer using ASE + res = _ase_host_to_fpga(curr_dest, curr_host_addr, count_left); + assert(FPGA_OK == res && "_ase_host_to_fpga failed"); + count_left = 0; + } + } + } + assert(count_left==0 && "host_to_fpga failed"); + return 0; +} + +int mmd_dma::dma_transfer(uint64_t dev_src, uint64_t dev_dest, int len, dma_mode descriptor_mode) { + + // Get debug information for thread id + std::stringstream ss; + ss << std::this_thread::get_id(); + uint64_t id = std::stoull(ss.str()); + MMD_DEBUG("dma_transfer start current thread_id is %04lX\n", id); + + // Native DMA transfer requires 64 byte alignment + assert(dev_src % 64 == 0); + assert(dev_dest % 64 == 0); + + const uint64_t MASK_FOR_35BIT_ADDR = 0x7FFFFFFFF; + + dma_descriptor_t desc; + + MMD_DEBUG("DEBUG LOG : mmd_dma::dma_transfer starts\n"); + MMD_DEBUG("DEBUG LOG dev_dest = %04lX\n", dev_dest); + + desc.src_address = dev_src & MASK_FOR_35BIT_ADDR; + desc.dest_address = dev_dest & MASK_FOR_35BIT_ADDR; + desc.len = len; + desc.control = 0x80000000 | (descriptor_mode << MODE_SHIFT); + + const uint64_t DMA_DESC_BASE = 8 * DMA_CSR_IDX_SRC_ADDR; + const uint64_t DMA_STATUS_BASE = 8 * DMA_CSR_IDX_STATUS; + uint64_t mmio_data = 0; + + int desc_size = sizeof(desc); + + MMD_DEBUG("Descriptor size = %d\n", desc_size); + MMD_DEBUG("desc.src_address = 
%04lX\n", desc.src_address); + MMD_DEBUG("desc.dest_address = %04lX\n", desc.dest_address); + MMD_DEBUG("desc.len = %d\n", desc.len); + MMD_DEBUG("desc.control = %04X\n", desc.control); + MMD_DEBUG("descriptor_mode = %04X\n", descriptor_mode); + + // send descriptor + send_descriptor(DMA_DESC_BASE, desc); + + fpga_result r; + r = fpgaReadMMIO64(m_fpga_handle, 0, DMA_STATUS_BASE, &mmio_data); + MMD_DEBUG("DMA_STATUS_BASE before = %04lX\n", mmio_data); + if (FPGA_OK != r) return -1; + + // If the busy bit is empty, then we are done. + while ((mmio_data & 0x1) == 0x1) { + r = fpgaReadMMIO64(m_fpga_handle, 0, DMA_STATUS_BASE, &mmio_data); + assert(FPGA_OK == r); + } + MMD_DEBUG("dma_transfer end current thread_id is %04lX\n", id); + return 0; +} + +// Transfer "count" bytes from HOST to FPGA using Address span expander(ASE)- will internally make +// calls to handle unaligned and aligned MMIO writes. +fpga_result mmd_dma::_ase_host_to_fpga(uint64_t dev_dest, const void *src_ptr, uint64_t count) { + MMD_DEBUG("DEBUG LOG: _ase_host_to_fpga is being called\n "); + + MMD_DEBUG("DEBUG LOG : dev_dest is "); + MMD_DEBUG("%" PRIu64 "\n", dev_dest); + + assert(count < 64); // DLA only uses ASE transfer with less than 64 Byte transfer. + + fpga_result res = FPGA_OK; + uint64_t count_left = count; + uint64_t unaligned_size = 0; + + // For ASE window + uint64_t ase_window; + uint64_t ase_addr; + uint64_t dev_addr; + + const void *curr_src_ptr = src_ptr; + + if (count == 0) return res; + + if (dev_dest % 8 == 0) { + while (count > 0) { + ase_window = dev_dest & ~(0xfff); + ase_addr = (dev_dest & 0xfff); // only keep the lower 12 bits. 
+ + uint64_t mmio_base_control = ASE_MMIO_BASE + ASE_MMIO_CTRL; + + MMD_DEBUG("DEBUG LOG : ase_window is "); + MMD_DEBUG("%" PRIu64 "\n", ase_window); + + // Write to ASE control + res = fpgaWriteMMIO64(m_fpga_handle, 0, mmio_base_control, ase_window); + assert(res == FPGA_OK && "Write to ASE control failed"); + + // Set final dev_addr + // dev_addr will be 8 byte aligned as long as dev_dest is 8 byte aligned. + dev_addr = ASE_MMIO_BASE + ASE_MMIO_WINDOW + ase_addr; + + assert(dev_addr % 8 == 0); + + MMD_DEBUG("DEBUG LOG : _ase_host_to_fpga count is "); + MMD_DEBUG("%" PRIu64 "\n", count); + + MMD_DEBUG("DEBUG LOG : dev addr is "); + MMD_DEBUG("%" PRIu64 "\n", dev_addr); + + size_t size = (count > 8) ? 8 : count; + mmd_helper::write_mmio(m_fpga_handle, curr_src_ptr, dev_addr, size); + + count -= size; + dev_dest += size; + curr_src_ptr = (const void *)(static_cast<const char *>(curr_src_ptr) + size); + } + + assert(count == 0); + + } else { + // First we need to handle the non byte aligned transfer + + MMD_DEBUG("DEBUG LOG : _ase_host_to_fpga count is "); + MMD_DEBUG("%" PRIu64 "\n", count); + + // Aligns address to 8 byte using dst masking method + unaligned_size = 8 - (dev_dest % 8); + if (unaligned_size > count_left) unaligned_size = count_left; + + // Write to the unaligned address + assert(unaligned_size < 8); + uint64_t shift = dev_dest % 8; + + // Write to ASE control to switch page. 
+ ase_window = dev_dest & ~(0xfff); + + MMD_DEBUG("DEBUG LOG : ase_window in non-aligned is "); + MMD_DEBUG("%" PRIu64 "\n", ase_window); + + fpgaWriteMMIO64(m_fpga_handle, 0, ASE_MMIO_BASE + ASE_MMIO_CTRL, ase_window); + + // Get aligned dest address + uint64_t dev_aligned_addr = dev_dest - shift; + assert(dev_aligned_addr % 8 == 0); + + // read data from device memory with aligned dev dest + ase_addr = (dev_aligned_addr & 0xfff); + dev_addr = ASE_MMIO_BASE + ASE_MMIO_WINDOW + ase_addr; + uint64_t read_tmp = 0; + fpgaReadMMIO64(m_fpga_handle, 0, dev_addr, &read_tmp); + + // overlay our data, check if the shift is correct here + memcpy((reinterpret_cast<char *>(&read_tmp) + shift), src_ptr, unaligned_size); + + // Write back data to the device + fpgaWriteMMIO64(m_fpga_handle, 0, dev_addr, read_tmp); + + count_left -= unaligned_size; + + // Check if there is any byte left + if (count_left == 0) { + return res; + } + + // Now the dest address should be byte aligned now + // Start the regular ASE transfer + + const void *curr_src_ptr = (const void *)(static_cast<const char *>(src_ptr) + unaligned_size); + uint64_t next_dev_dest = dev_dest + unaligned_size; + + while (count_left > 0) { + ase_window = next_dev_dest & ~(0xfff); + ase_addr = (next_dev_dest & 0xfff); // only keep the lower 12 bits. + + MMD_DEBUG("DEBUG LOG : ase_window in non-aligned loop is "); + MMD_DEBUG("%" PRIu64 "\n", ase_window); + + // Write to ASE control + fpgaWriteMMIO64(m_fpga_handle, 0, ASE_MMIO_BASE + ASE_MMIO_CTRL, ase_window); + + // Set final dev_addr + dev_addr = ASE_MMIO_BASE + ASE_MMIO_WINDOW + ase_addr; + + assert(dev_addr % 8 == 0); + + size_t size = (count_left > 8) ? 
8 : count_left; + mmd_helper::write_mmio(m_fpga_handle, + curr_src_ptr, + dev_addr, + size); + + count_left -= size; + next_dev_dest += size; + curr_src_ptr = (const void *)(static_cast<const char *>(curr_src_ptr) + size); + } + assert(count_left == 0); + } + + return FPGA_OK; +} + +// Transfer "count" bytes from FPGA to HOST using Address span expander(ASE)- will internally make +// calls to handle unaligned and aligned MMIO reads. +fpga_result mmd_dma::_ase_fpga_to_host(uint64_t dev_dest, void *host_ptr, uint64_t count) { + MMD_DEBUG("DEBUG LOG : _ase_fpga_to_host is being called\n "); + + assert(count < 64); + + fpga_result res = FPGA_OK; + uint64_t count_left = count; + uint64_t unaligned_size = 0; + + // For ASE window + + uint64_t ase_window; + uint64_t ase_addr; + uint64_t dev_addr; + + if (count == 0) return res; + + void *curr_host_ptr = host_ptr; + + if (dev_dest % 8 == 0) { + while (count > 0) { + ase_window = dev_dest & ~(0xfff); + ase_addr = (dev_dest & 0xfff); // only keep the lower 12 bits. + + MMD_DEBUG("DEBUG LOG : ase_window is "); + MMD_DEBUG("%" PRIu64 "\n", ase_window); + + // Write to ASE control to switch page. + fpgaWriteMMIO64(m_fpga_handle, 0, ASE_MMIO_BASE + ASE_MMIO_CTRL, ase_window); + + // Set final dev_addr + // dev_addr will be 8 byte aligned as long as dev_dest is 8 byte aligned. + dev_addr = ASE_MMIO_BASE + ASE_MMIO_WINDOW + ase_addr; + + assert(dev_addr % 8 == 0); + + size_t size = (count > 8) ? 
8 : count; + + mmd_helper::read_mmio(m_fpga_handle, curr_host_ptr, dev_addr, size); + + count -= size; + dev_dest += size; + curr_host_ptr = (void *)(static_cast<char *>(curr_host_ptr) + size); + } + + } else { + // First we need to handle the non byte aligned transfer + + // Aligns address to 8 byte using dst masking method + unaligned_size = 8 - (dev_dest % 8); + if (unaligned_size > count_left) unaligned_size = count_left; + + // Write to the unaligned address + assert(unaligned_size < 8); + uint64_t shift = dev_dest % 8; + + // Write to ASE control to switch page. + ase_window = dev_dest & ~(0xfff); + + MMD_DEBUG("DEBUG LOG : ase_window is "); + MMD_DEBUG("%" PRIu64 "\n", ase_window); + + fpgaWriteMMIO64(m_fpga_handle, 0, ASE_MMIO_BASE + ASE_MMIO_CTRL, ase_window); + + // Get aligned dest address + uint64_t dev_aligned_addr = dev_dest - shift; + assert(dev_aligned_addr % 8 == 0); + + // read data from device memory with aligned dev dest + ase_addr = (dev_aligned_addr & 0xfff); + dev_addr = ASE_MMIO_BASE + ASE_MMIO_WINDOW + ase_addr; + + uint64_t read_tmp = 0; + fpgaReadMMIO64(m_fpga_handle, 0, dev_addr, &read_tmp); + + // overlay our data + memcpy(host_ptr, (reinterpret_cast<char *>(&read_tmp) + shift), unaligned_size); + + count_left -= unaligned_size; + + // Check if there is any byte left + if (count_left == 0) { + return res; + } + + // Now the dest address should be byte aligned now + // Start the regular ASE transfer + curr_host_ptr = (void *)(static_cast<char *>(host_ptr) + unaligned_size); + uint64_t next_dev_dest = dev_dest + unaligned_size; + + while (count_left > 0) { + ase_window = next_dev_dest & ~(0xfff); + ase_addr = (next_dev_dest & 0xfff); // only keep the lower 12 bits. + + // Write to ASE control to switch page. + fpgaWriteMMIO64(m_fpga_handle, 0, ASE_MMIO_BASE + ASE_MMIO_CTRL, ase_window); + + // Set final dev_addr + dev_addr = ASE_MMIO_BASE + ASE_MMIO_WINDOW + ase_addr; + + assert(dev_addr % 8 == 0); + + size_t size = (count_left > 8) ? 
8 : count_left; + mmd_helper::read_mmio(m_fpga_handle, curr_host_ptr, dev_addr, size); + + count_left -= size; + next_dev_dest += size; + curr_host_ptr = (void *)(static_cast<char *>(curr_host_ptr) + size); + } + + assert(count_left == 0); + } + return FPGA_OK; +} +} // namespace intel_opae_mmd diff --git a/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_dma.h b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_dma.h new file mode 100644 index 0000000..a2841b1 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_dma.h @@ -0,0 +1,89 @@ +// (c) 1992-2024 Intel Corporation. +// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words +// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. +// and/or other countries. Other marks and brands may be claimed as the property +// of others. See Trademarks on intel.com for full list of Intel trademarks or +// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) +// Your use of Intel Corporation's design tools, logic functions and other +// software and tools, and its AMPP partner logic functions, and any output +// files any of the foregoing (including device programming or simulation +// files), and any associated documentation or information are expressly subject +// to the terms and conditions of the Altera Program License Subscription +// Agreement, Intel MegaCore Function License Agreement, or other applicable +// license agreement, including, without limitation, that your use is for the +// sole purpose of programming logic devices manufactured by Intel and sold by +// Intel or its authorized distributors. Please refer to the applicable +// agreement for further details. 
+#ifndef MMD_DMA_H_ +#define MMD_DMA_H_ + +#include <opae/fpga.h> +#include <poll.h> + +#include <atomic> +#include <chrono> +#include <condition_variable> +#include <mutex> +#include <queue> +#include <thread> +#include <unordered_map> + +#include "aocl_mmd.h" +#include "mmd_helper.h" + +#define DMA_CSR_IDX_SRC_ADDR 0x5 +#define DMA_CSR_IDX_STATUS 0x9 +#define MODE_SHIFT 26 +// For now limits to 16K to avoid DMA transfer hang in hw, further testing required to increase the value. +#define DMA_BUFFER_SIZE (1024 * 16) +#define DMA_LINE_SIZE 64 +#define DMA_HOST_MASK 0x2000000000000 + +#define ASE_MMIO_BASE 0x20000 +#define ASE_MMIO_CTRL 0x200 +#define ASE_MMIO_WINDOW 0x1000 + +namespace intel_opae_mmd { + +enum dma_mode { stand_by = 0x0, host_to_ddr = 0x1, ddr_to_host = 0x2, ddr_to_ddr = 0x3 }; + +struct dma_descriptor_t { + uint64_t src_address; + uint64_t dest_address; + uint32_t len; + uint32_t control; +}; + +class mmd_dma final { + public: + mmd_dma(fpga_handle fpga_handle_arg, int mmd_handle); + ~mmd_dma(); + + bool initialized() { return m_initialized; } + + int fpga_to_host(void *host_addr, uint64_t dev_src, size_t size); + int host_to_fpga(const void *host_addr, uint64_t dev_dest, size_t size); + int dma_transfer(uint64_t dev_src, uint64_t dev_dest, int len, dma_mode descriptor_mode); + fpga_result _ase_host_to_fpga(uint64_t dev_dest, const void *src_ptr, uint64_t count); + fpga_result _ase_fpga_to_host(uint64_t dev_dest, void *host_ptr, uint64_t count); + mmd_dma(mmd_dma &other) = delete; + mmd_dma &operator=(const mmd_dma &other) = delete; + + private: + // Helper functions + int send_descriptor(uint64_t mmio_dst, dma_descriptor_t desc); + // Member variables + bool m_initialized; + fpga_handle m_fpga_handle; + + // Shared buffer in host memory + uint64_t *dma_buf_ptr = NULL; + // Workspace ID used by OPAE to identify buffer + uint64_t dma_buf_wsid; + // IO virtual address + uint64_t dma_buf_iova; +}; + +}; // namespace intel_opae_mmd + +#endif // 
MMD_DMA_H_ diff --git a/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_helper.cpp b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_helper.cpp new file mode 100644 index 0000000..4af482a --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_helper.cpp @@ -0,0 +1,163 @@ +// (c) 1992-2024 Intel Corporation. +// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words +// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. +// and/or other countries. Other marks and brands may be claimed as the property +// of others. See Trademarks on intel.com for full list of Intel trademarks or +// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) +// Your use of Intel Corporation's design tools, logic functions and other +// software and tools, and its AMPP partner logic functions, and any output +// files any of the foregoing (including device programming or simulation +// files), and any associated documentation or information are expressly subject +// to the terms and conditions of the Altera Program License Subscription +// Agreement, Intel MegaCore Function License Agreement, or other applicable +// license agreement, including, without limitation, that your use is for the +// sole purpose of programming logic devices manufactured by Intel and sold by +// Intel or its authorized distributors. Please refer to the applicable +// agreement for further details. 
+
+#include "mmd_helper.h"
+#include <inttypes.h>
+
+namespace mmd_helper {
+
+// Read `size` bytes of MMIO starting at `mmio_addr` into `host_addr`.
+// Uses 64-bit reads while possible, then 32-bit reads, then one final 32-bit
+// read that is partially memcpy'd for a 1-3 byte tail.  mmio_addr must be
+// 4-byte aligned (OPAE restriction).  Returns 0 on success, -1 on failure.
+int read_mmio(fpga_handle mmio_handle, void *host_addr, size_t mmio_addr, size_t size) {
+  fpga_result res = FPGA_OK;
+
+  MMD_DEBUG("DEBUG LOG : Device::read_mmio start: host_addr : %p\t mmio_addr : 0x%zx\t size : 0x%zx\n",
+            host_addr,
+            mmio_addr,
+            size);
+
+  if (mmio_addr % 4 != 0) {
+    // Fixed message typo: "ead_mmio" -> "read_mmio"
+    MMD_DEBUG("DEBUG LOG : read_mmio function doesn't support non 4 Byte aligned mmio_addr due to OPAE\n");
+    return -1;
+  }
+
+  uint64_t *host_addr64 = static_cast<uint64_t *>(host_addr);
+
+  // 8-byte chunks first.
+  while (size >= 8) {
+    MMD_DEBUG("DEBUG LOG : Using fpgaReadMMIO64() host_addr : %p\t mmio_addr : 0x%zx\t size : 0x8\n",
+              host_addr64,
+              mmio_addr);
+    res = fpgaReadMMIO64(mmio_handle, 0, mmio_addr, host_addr64);
+    if (res != FPGA_OK) {
+      MMD_DEBUG(
+          "DEBUG LOG : Error in read_mmio() host_addr : %p\t mmio_addr : 0x%zx\t size : 0x8\n", host_addr64, mmio_addr);
+      return -1;
+    }
+    MMD_DEBUG("DEBUG LOG : the host_addr64 value is ");
+    MMD_DEBUG("%" PRIu64 "\n", *host_addr64);
+    host_addr64 += 1;
+    mmio_addr += 8;
+    size -= 8;
+  }
+
+  // Then 4-byte chunks.
+  uint32_t *host_addr32 = reinterpret_cast<uint32_t *>(host_addr64);
+  while (size >= 4) {
+    MMD_DEBUG("DEBUG LOG : Using fpgaReadMMIO32() host_addr : %p\t mmio_addr : 0x%zx\t size : 0x4\n",
+              host_addr32,
+              mmio_addr);
+    res = fpgaReadMMIO32(mmio_handle, 0, mmio_addr, host_addr32);
+    if (res != FPGA_OK) {
+      MMD_DEBUG(
+          "DEBUG LOG : Error in read_mmio() host_addr : %p\t mmio_addr : 0x%zx\t size : 0x4\n", host_addr32, mmio_addr);
+      return -1;
+    }
+    host_addr32 += 1;
+    mmio_addr += 4;
+    size -= 4;
+  }
+
+  // 1-3 byte tail: read a full 32-bit word and copy only `size` bytes out.
+  if (size > 0) {
+    uint32_t read_data;
+    MMD_DEBUG("DEBUG LOG : Using fpgaReadMMIO32() host_addr : %p\t mmio_addr : 0x%zx\t size : 0x%zx\n",
+              host_addr,
+              mmio_addr,
+              size);
+    res = fpgaReadMMIO32(mmio_handle, 0, mmio_addr, &read_data);
+    if (res != FPGA_OK) {
+      MMD_DEBUG("DEBUG LOG : Error in read_mmio() host_addr : %p\t mmio_addr : 0x%zx\t size : 0x%zx\n",
+                host_addr,
+                mmio_addr,
+                size);
+      MMD_DEBUG("result is %d \n", res);
+      return -1;
+    }
+
+    memcpy(host_addr32, &read_data, size);
+  }
+
+  return res;
+}
+
+// Write `size` bytes from `host_addr` to MMIO starting at `mmio_addr`.
+// Mirrors read_mmio: 64-bit writes, then 32-bit writes, then a
+// read-modify-write of the final partial 32-bit word.
+// Returns 0 on success, -1 on failure.
+int write_mmio(fpga_handle mmio_handle, const void *host_addr, size_t mmio_addr, size_t size) {
+  fpga_result res = FPGA_OK;
+
+  MMD_DEBUG("DEBUG LOG : Device::write_mmio start: host_addr : %p\t mmio_addr : 0x%zx\t size : 0x%zx\n",
+            host_addr,
+            mmio_addr,
+            size);
+
+  const uint64_t *host_addr64 = static_cast<const uint64_t *>(host_addr);
+  while (size >= 8) {
+    MMD_DEBUG("DEBUG LOG : Using fpgaWriteMMIO64() host_addr : %p\t mmio_addr : 0x%zx\t \n",
+              host_addr64,
+              mmio_addr);
+    res = fpgaWriteMMIO64(mmio_handle, 0, mmio_addr, *host_addr64);
+    if (res != FPGA_OK) {
+      MMD_DEBUG("DEBUG LOG : Error in write_mmio() host_addr : %p\t mmio_addr : 0x%zx\t \n",
+                host_addr64,
+                mmio_addr);
+      return -1;
+    }
+    host_addr64 += 1;
+    mmio_addr += 8;
+    size -= 8;
+  }
+
+  const uint32_t *host_addr32 = reinterpret_cast<const uint32_t *>(host_addr64);
+
+  while (size >= 4) {
+    MMD_DEBUG("DEBUG LOG : Using fpgaWriteMMIO32() host_addr : %p\t mmio_addr : 0x%zx\t \n",
+              host_addr32,
+              mmio_addr);
+    res = fpgaWriteMMIO32(mmio_handle, 0, mmio_addr, *host_addr32);
+    if (res != FPGA_OK) {
+      MMD_DEBUG("DEBUG LOG : Error in write_mmio() host_addr : %p\t mmio_addr : 0x%zx\t\n",
+                host_addr32,
+                mmio_addr);
+      return -1;
+    }
+    host_addr32 += 1;
+    mmio_addr += 4;
+    size -= 4;
+  }
+
+  while (size > 0) {
+    MMD_DEBUG("DEBUG LOG : Using fpgaWriteMMIO32() host_addr : %p\t mmio_addr : 0x%zx\t size : 0x%zx\n",
+              host_addr32,
+              mmio_addr,
+              size);
+    uint32_t tmp_data32 = 0;
+    // NOTE(review): this read-back result is not checked - confirm intended.
+    fpgaReadMMIO32(mmio_handle, 0, mmio_addr, &tmp_data32);  // First read the data back
+    size_t chunk_size = (size >= 4) ?
4 : size; + + memcpy(&tmp_data32, host_addr32, chunk_size); // Apply our data overlay + + res = fpgaWriteMMIO32(mmio_handle, 0, mmio_addr, tmp_data32); + if (res != FPGA_OK) { + MMD_DEBUG("DEBUG LOG : Error in write_mmio() host_addr : %p\t mmio_addr : 0x%zx\t size : 0x%zx\n", + host_addr32, + mmio_addr, + size); + return -1; + } + host_addr32 += 1; + mmio_addr += chunk_size; + size -= chunk_size; + } + + return 0; +} + +}; // namespace mmd_helper diff --git a/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_helper.h b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_helper.h new file mode 100644 index 0000000..b7e2667 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/host/mmd_helper.h @@ -0,0 +1,41 @@ +// (c) 1992-2024 Intel Corporation. +// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words +// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. +// and/or other countries. Other marks and brands may be claimed as the property +// of others. See Trademarks on intel.com for full list of Intel trademarks or +// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) +// Your use of Intel Corporation's design tools, logic functions and other +// software and tools, and its AMPP partner logic functions, and any output +// files any of the foregoing (including device programming or simulation +// files), and any associated documentation or information are expressly subject +// to the terms and conditions of the Altera Program License Subscription +// Agreement, Intel MegaCore Function License Agreement, or other applicable +// license agreement, including, without limitation, that your use is for the +// sole purpose of programming logic devices manufactured by Intel and sold by +// Intel or its authorized distributors. Please refer to the applicable +// agreement for further details. 
+ +#ifndef MMD_HELPER_H +#define MMD_HELPER_H + +#include <opae/fpga.h> +#include <stdarg.h> + +inline void MMD_DEBUG(const char *format, ...) { + if (std::getenv("MMD_ENABLE_DEBUG")) { + va_list arglist; + va_start(arglist, format); + vprintf(format, arglist); + va_end(arglist); + fflush(stdout); + } +} + +namespace mmd_helper { + +int read_mmio(fpga_handle mmio_handle, void *host_addr, size_t mmio_addr, size_t size); +int write_mmio(fpga_handle mmio_handle, const void *host_addr, size_t mmio_addr, size_t size); + +}; // namespace mmd_helper + +#endif // MMD_HELPER_H diff --git a/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/include/aocl_mmd.h b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/include/aocl_mmd.h new file mode 100644 index 0000000..16992da --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/agx7_ofs_pcie/include/aocl_mmd.h @@ -0,0 +1,377 @@ +// Copyright 2022 Intel Corporation +// SPDX-License-Identifier: MIT + +#ifndef AOCL_MMD_H +#define AOCL_MMD_H + +/* TODO: this file comes from OpenCL SDK and should be formatted there first */ +/* clang-format off */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* Support for memory mapped ACL devices. + * + * Typical API lifecycle, from the perspective of the caller. + * + * 1. aocl_mmd_open must be called first, to provide a handle for further + * operations. + * + * 2. The interrupt and status handlers must be set. + * + * 3. Read and write operations are performed. + * + * 4. aocl_mmd_close may be called to shut down the device. No further + * operations are permitted until a subsequent aocl_mmd_open call. + * + * aocl_mmd_get_offline_info can be called anytime including before + * open. aocl_mmd_get_info can be called anytime between open and close. 
+ */ + +// #ifndef AOCL_MMD_CALL +// #if defined(_WIN32) +// #define AOCL_MMD_CALL __declspec(dllimport) +// #else +// #define AOCL_MMD_CALL +// #endif +// #endif + +#ifndef AOCL_MMD_CALL +#if defined(_WIN32) +#define AOCL_MMD_CALL __declspec(dllimport) +#else +#define AOCL_MMD_CALL __attribute__((visibility ("default"))) +#endif +#endif + +#ifndef WEAK +#if defined(_WIN32) +#define WEAK +#else +#define WEAK __attribute__((weak)) +#endif +#endif + +#ifdef __cplusplus +#include <cstddef> //size_t +#else +#include <stddef.h> //size_t +#endif + +/* The MMD API's version - the runtime expects this string when + * AOCL_MMD_VERSION is queried. This changes only if the API has changed */ +#define AOCL_MMD_VERSION_STRING "20.3" + +/* Memory types that can be supported - bitfield. Other than physical memory + * these types closely align with the OpenCL SVM types. + * + * AOCL_MMD_PHYSICAL_MEMORY - The vendor interface includes IP to communicate + * directly with physical memory such as DDR, QDR, etc. + * + * AOCL_MMD_SVM_COARSE_GRAIN_BUFFER - The vendor interface includes support for + * caching SVM pointer data and requires explicit function calls from the user + * to synchronize the cache between the host processor and the FPGA. This level + * of SVM is not currently supported by Altera except as a subset of + * SVM_FINE_GAIN_SYSTEM support. + * + * AOCL_MMD_SVM_FINE_GRAIN_BUFFER - The vendor interface includes support for + * caching SVM pointer data and requires additional information from the user + * and/or host runtime that can be collected during pointer allocation in order + * to synchronize the cache between the host processor and the FPGA. Once this + * additional data is provided for an SVM pointer, the vendor interface handles + * cache synchronization between the host processor & the FPGA automatically. + * This level of SVM is not currently supported by Altera except as a subset + * of SVM_FINE_GRAIN_SYSTEM support. 
+ * + * AOCL_MMD_SVM_FINE_GRAIN_SYSTEM - The vendor interface includes support for + * caching SVM pointer data and does not require any additional information to + * synchronize the cache between the host processor and the FPGA. The vendor + * interface handles cache synchronization between the host processor & the + * FPGA automatically for all SVM pointers. This level of SVM support is + * currently under development by Altera and some features may not be fully + * supported. + */ +#define AOCL_MMD_PHYSICAL_MEMORY (1 << 0) +#define AOCL_MMD_SVM_COARSE_GRAIN_BUFFER (1 << 1) +#define AOCL_MMD_SVM_FINE_GRAIN_BUFFER (1 << 2) +#define AOCL_MMD_SVM_FINE_GRAIN_SYSTEM (1 << 3) + +/* program modes - bitfield + * + * AOCL_MMD_PROGRAM_PRESERVE_GLOBAL_MEM - preserve contents of global memory + * when this bit is set to 1. If programming can't occur without preserving + * global memory contents, the program function must fail, in which case the + * runtime may re-invoke program with this bit set to 0, allowing programming + * to occur even if doing so destroys global memory contents. + * + * more modes are reserved for stacking on in the future + */ +#define AOCL_MMD_PROGRAM_PRESERVE_GLOBAL_MEM (1 << 0) +typedef int aocl_mmd_program_mode_t; + + +typedef void* aocl_mmd_op_t; + +typedef struct { + unsigned lo; /* 32 least significant bits of time value. */ + unsigned hi; /* 32 most significant bits of time value. */ +} aocl_mmd_timestamp_t; + + +/* Defines the set of characteristics that can be probed about the board before + * opening a device. The type of data returned by each is specified in + * parentheses in the adjacent comment. + * + * AOCL_MMD_NUM_BOARDS and AOCL_MMD_BOARD_NAMES + * These two fields can be used to implement multi-device support. The MMD + * layer may have a list of devices it is capable of interacting with, each + * identified with a unique name. 
The length of the list should be returned + * in AOCL_MMD_NUM_BOARDS, and the names of these devices returned in + * AOCL_MMD_BOARD_NAMES. The OpenCL runtime will try to call aocl_mmd_open + * for each board name returned in AOCL_MMD_BOARD_NAMES. + */ +typedef enum { + AOCL_MMD_VERSION = 0, /* Version of MMD (char*)*/ + AOCL_MMD_NUM_BOARDS = 1, /* Number of candidate boards (int)*/ + AOCL_MMD_BOARD_NAMES = 2, /* Names of boards available delimiter=; (char*)*/ + AOCL_MMD_VENDOR_NAME = 3, /* Name of vendor (char*) */ + AOCL_MMD_VENDOR_ID = 4, /* An integer ID for the vendor (int) */ + AOCL_MMD_USES_YIELD = 5, /* 1 if yield must be called to poll hw (int) */ + /* The following can be combined in a bit field: + * AOCL_MMD_PHYSICAL_MEMORY, AOCL_MMD_SVM_COARSE_GRAIN_BUFFER, AOCL_MMD_SVM_FINE_GRAIN_BUFFER, AOCL_MMD_SVM_FINE_GRAIN_SYSTEM. + * Prior to 14.1, all existing devices supported physical memory and no types of SVM memory, so this + * is the default when this operation returns '0' for board MMDs with a version prior to 14.1 + */ + AOCL_MMD_MEM_TYPES_SUPPORTED = 6, +} aocl_mmd_offline_info_t; + + +/** Possible capabilities to return from AOCL_MMD_*_MEM_CAPABILITIES query */ +/** + * If not set allocation function is not supported, even if other capabilities are set. + */ +#define AOCL_MMD_MEM_CAPABILITY_SUPPORTED (1 << 0) +/** + * Supports atomic access to the memory by either the host or device. + */ +#define AOCL_MMD_MEM_CAPABILITY_ATOMIC (1 << 1) +/** + * Supports concurrent access to the memory either by host or device if the + * accesses are not on the same block. Block granularity is defined by + * AOCL_MMD_*_MEM_CONCURRENT_GRANULARITY., blocks are aligned to this + * granularity + */ +#define AOCL_MMD_MEM_CAPABILITY_CONCURRENT (1 << 2) +/** + * Memory can be accessed by multiple devices at the same time. + */ +#define AOCL_MMD_MEM_CAPABILITY_P2P (1 << 3) + + +/* Defines the set of characteristics that can be probed about the board after + * opening a device. 
This can involve communication to the device + * + * AOCL_MMD_NUM_KERNEL_INTERFACES - The number of kernel interfaces, usually 1 + * + * AOCL_MMD_KERNEL_INTERFACES - the handle for each kernel interface. + * param_value will have size AOCL_MMD_NUM_KERNEL_INTERFACES * sizeof int + * + * AOCL_MMD_PLL_INTERFACES - the handle for each pll associated with each + * kernel interface. If a kernel interface is not clocked by acl_kernel_clk + * then return -1 + * + * */ +typedef enum { + AOCL_MMD_NUM_KERNEL_INTERFACES = 1, /* Number of Kernel interfaces (int) */ + AOCL_MMD_KERNEL_INTERFACES = 2, /* Kernel interface (int*) */ + AOCL_MMD_PLL_INTERFACES = 3, /* Kernel clk handles (int*) */ + AOCL_MMD_MEMORY_INTERFACE = 4, /* Global memory handle (int) */ + AOCL_MMD_TEMPERATURE = 5, /* Temperature measurement (float) */ + AOCL_MMD_PCIE_INFO = 6, /* PCIe information (char*) */ + AOCL_MMD_BOARD_NAME = 7, /* Name of board (char*) */ + AOCL_MMD_BOARD_UNIQUE_ID = 8, /* Unique ID of board (int) */ + AOCL_MMD_CONCURRENT_READS = 9, /* # of parallel reads; 1 is serial*/ + AOCL_MMD_CONCURRENT_WRITES = 10, /* # of parallel writes; 1 is serial*/ + AOCL_MMD_CONCURRENT_READS_OR_WRITES = 11, /* total # of concurrent operations read + writes*/ + AOCL_MMD_MIN_HOST_MEMORY_ALIGNMENT = 12, /* Min alignment that the ASP supports for host allocations (size_t) */ + AOCL_MMD_HOST_MEM_CAPABILITIES = 13, /* Capabilities of aocl_mmd_host_alloc() (unsigned int)*/ + AOCL_MMD_SHARED_MEM_CAPABILITIES = 14, /* Capabilities of aocl_mmd_shared_alloc (unsigned int)*/ + AOCL_MMD_DEVICE_MEM_CAPABILITIES = 15, /* Capabilities of aocl_mmd_device_alloc (unsigned int)*/ + AOCL_MMD_HOST_MEM_CONCURRENT_GRANULARITY = 16, /*(size_t)*/ + AOCL_MMD_SHARED_MEM_CONCURRENT_GRANULARITY = 17, /*(size_t)*/ + AOCL_MMD_DEVICE_MEM_CONCURRENT_GRANULARITY = 18, /*(size_t)*/ +} aocl_mmd_info_t; + +typedef struct { + unsigned long long int exception_type; + void *user_private_info; + size_t user_cb; +}aocl_mmd_interrupt_info; + +typedef 
void (*aocl_mmd_interrupt_handler_fn)( int handle, void* user_data ); +typedef void (*aocl_mmd_device_interrupt_handler_fn)( int handle, aocl_mmd_interrupt_info* data_in, void* user_data ); +typedef void (*aocl_mmd_status_handler_fn)( int handle, void* user_data, aocl_mmd_op_t op, int status ); + + +/* Get information about the board using the enum aocl_mmd_offline_info_t for + * offline info (called without a handle), and the enum aocl_mmd_info_t for + * info specific to a certain board. + * Arguments: + * + * requested_info_id - a value from the aocl_mmd_offline_info_t enum + * + * param_value_size - size of the param_value field in bytes. This should + * match the size of the return type expected as indicated in the enum + * definition. For example, the AOCL_MMD_TEMPERATURE returns a float, so + * the param_value_size should be set to sizeof(float) and you should + * expect the same number of bytes returned in param_size_ret. + * + * param_value - pointer to the variable that will receive the returned info + * + * param_size_ret - receives the number of bytes of data actually returned + * + * Returns: a negative value to indicate error. + */ +AOCL_MMD_CALL int aocl_mmd_get_offline_info( + aocl_mmd_offline_info_t requested_info_id, + size_t param_value_size, + void* param_value, + size_t* param_size_ret ) WEAK; + +AOCL_MMD_CALL int aocl_mmd_get_info( + int handle, + aocl_mmd_info_t requested_info_id, + size_t param_value_size, + void* param_value, + size_t* param_size_ret ) WEAK; + +/* Open and initialize the named device. + * + * The name is typically one specified by the AOCL_MMD_BOARD_NAMES offline + * info. + * + * Arguments: + * name - open the board with this name (provided as a C-style string, + * i.e. NUL terminated ASCII.) + * + * Returns: the non-negative integer handle for the board, otherwise a + * negative value to indicate error. 
Upon receiving the error, the OpenCL + * runtime will proceed to open other known devices, hence the MMD mustn't + * exit the application if an open call fails. + */ +AOCL_MMD_CALL int aocl_mmd_open(const char *name) WEAK; + +/* Close an opened device, by its handle. + * Returns: 0 on success, negative values on error. + */ +AOCL_MMD_CALL int aocl_mmd_close(int handle) WEAK; + +/* Set the interrupt handler for the opened device. + * The interrupt handler is called whenever the client needs to be notified + * of an asynchronous event signaled by the device internals. + * For example, the kernel has completed or is stalled. + * + * Important: Interrupts from the kernel must be ignored until this handler is + * set + * + * Arguments: + * fn - the callback function to invoke when a kernel interrupt occurs + * user_data - the data that should be passed to fn when it is called. + * + * Returns: 0 if successful, negative on error + */ +AOCL_MMD_CALL int aocl_mmd_set_interrupt_handler( int handle, aocl_mmd_interrupt_handler_fn fn, void* user_data ) WEAK; + +/* Set the operation status handler for the opened device. + * The operation status handler is called with + * status 0 when the operation has completed successfully. + * status negative when the operation completed with errors. + * + * Arguments: + * fn - the callback function to invoke when a status update is to be + * performed. + * user_data - the data that should be passed to fn when it is called. + * + * Returns: 0 if successful, negative on error + */ +AOCL_MMD_CALL int aocl_mmd_set_status_handler( int handle, aocl_mmd_status_handler_fn fn, void* user_data ) WEAK; + +/* Read, write and copy operations on a single interface. + * If op is NULL + * - Then these calls must block until the operation is complete. + * - The status handler is not called for this operation. 
+ * + * If op is non-NULL, then: + * - These may be non-blocking calls + * - The status handler must be called upon completion, with status 0 + * for success, and a negative value for failure. + * + * Arguments: + * op - the operation object used to track this operations progress + * + * len - the size in bytes to transfer + * + * src - the host buffer being read from + * + * dst - the host buffer being written to + * + * mmd_interface - the handle to the interface being accessed. E.g. To + * access global memory this handle will be whatever is returned by + * aocl_mmd_get_info when called with AOCL_MMD_MEMORY_INTERFACE. + * + * offset/src_offset/dst_offset - the byte offset within the interface that + * the transfer will begin at. + * + * The return value is 0 if the operation launch was successful, and + * negative otherwise. + */ +AOCL_MMD_CALL int aocl_mmd_read( + int handle, + aocl_mmd_op_t op, + size_t len, + void* dst, + int mmd_interface, size_t offset) WEAK; +AOCL_MMD_CALL int aocl_mmd_write( + int handle, + aocl_mmd_op_t op, + size_t len, + const void* src, + int mmd_interface, size_t offset ) WEAK; + +/** Error values*/ +#define AOCL_MMD_ERROR_SUCCESS 0 +#define AOCL_MMD_ERROR_INVALID_HANDLE -1 +#define AOCL_MMD_ERROR_OUT_OF_MEMORY -2 +#define AOCL_MMD_ERROR_UNSUPPORTED_ALIGNMENT -3 +#define AOCL_MMD_ERROR_UNSUPPORTED_PROPERTY -4 +#define AOCL_MMD_ERROR_INVALID_POINTER -5 +#define AOCL_MMD_ERROR_INVALID_MIGRATION_SIZE -6 + +// CoreDLA modifications +// To support multiple different FPGA boards, anything board specific must be implemented in a +// board-specific MMD instead of the CoreDLA runtime layer. 
+#ifdef DLA_MMD +#include <cstdint> +// Query functions to get board-specific values +AOCL_MMD_CALL int dla_mmd_get_max_num_instances() WEAK; +AOCL_MMD_CALL uint64_t dla_mmd_get_ddr_size_per_instance() WEAK; +AOCL_MMD_CALL double dla_mmd_get_ddr_clock_freq() WEAK; + +// Wrappers around CSR and DDR reads and writes to abstract away board-specific offsets +AOCL_MMD_CALL int dla_mmd_csr_write(int handle, int instance, uint64_t addr, const uint32_t* data) WEAK; +AOCL_MMD_CALL int dla_mmd_csr_read(int handle, int instance, uint64_t addr, uint32_t* data) WEAK; +AOCL_MMD_CALL int dla_mmd_ddr_write(int handle, int instance, uint64_t addr, uint64_t length, const void* data) WEAK; +AOCL_MMD_CALL int dla_mmd_ddr_read(int handle, int instance, uint64_t addr, uint64_t length, void* data) WEAK; + +// Get the clk_dla PLL clock frequency in MHz, returns a negative value if there is an error +AOCL_MMD_CALL double dla_mmd_get_coredla_clock_freq(int handle) WEAK; + +#endif + +#ifdef __cplusplus +} +#endif + +/* clang-format on */ +#endif diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/.gitignore b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/.gitignore new file mode 100644 index 0000000..66e06bf --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/.gitignore @@ -0,0 +1,18 @@ +*~ +*# +*.marks +release_build/ +build/ +example_designs/mem_bandwidth/bin/ +example_designs/mem_bandwidth/simulation.tar.gz +example_designs/mem_bandwidth/temp_simulation/ +linux64/lib/ +linux64/libexec/diagnose +linux64/libexec/program +ase/mpf_src +*.pyc +*.swp +*.kwlp +*.kwps +temp_simulation/ +simulation.tar.gz diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/CMakeLists.txt b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/CMakeLists.txt new file mode 100644 index 0000000..28dcfa4 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/CMakeLists.txt @@ -0,0 +1,63 @@ +# (C) 2017 Intel Corporation. 
All rights reserved. +# Your use of Intel Corporation's design tools, logic functions and other +# software and tools, and its AMPP partner logic functions, and any output +# files any of the foregoing (including device programming or simulation +# files), and any associated documentation or information are expressly subject +# to the terms and conditions of the Intel Program License Subscription +# Agreement, Intel MegaCore Function License Agreement, or other applicable +# license agreement, including, without limitation, that your use is for the +# sole purpose of programming logic devices manufactured by Intel and sold by +# Intel or its authorized distributors. Please refer to the applicable +# agreement for further details. + +cmake_minimum_required(VERSION 2.8.12) +project(mmd) + +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules") + +find_package(OPAE REQUIRED) +find_package(NUMA REQUIRED) + +# DLA specific modifications made to the MMD +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDLA_MMD") + +enable_language(C ASM) + +set(ASM_OPTIONS "-x assembler-with-cpp") +if(${CMAKE_C_COMPILER_ID} STREQUAL "Clang") + set(ASM_OPTIONS "${ASM_OPTIONS} -no-integrated-as") +endif() + +set(CMAKE_ASM_FLAGS "${CFLAGS} ${ASM_OPTIONS}") + +set(MMD_SRC + ./host/ccip_mmd.cpp + ./host/ccip_mmd_device.cpp + ./host/dma_work_thread.cpp + ./host/fpga_dma.c + ./host/kernel_interrupt.cpp + ./host/mmd_dma.cpp + ./host/memcpy_s_fast.c + ./host/x86-sse2.S +) + +# Add a shared library target called intel_opae_mmd +# and build it from the MMD_SRC files +add_library(intel_opae_mmd SHARED ${MMD_SRC}) + +# Specify the include directories to be used when compiling intel_opae_mmd library +target_include_directories(intel_opae_mmd PUBLIC + ${CMAKE_CURRENT_SOURCE_DIR}/include + ) + +# Specify libraries needed when liking the intel_opae_mmd library +target_link_libraries(intel_opae_mmd + libopae-c + libnuma +) + +# Set the installation rules for the project +install(TARGETS 
intel_opae_mmd + LIBRARY DESTINATION lib + COMPONENT intel_opae_mmd +) diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/cmake/modules/FindNUMA.cmake b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/cmake/modules/FindNUMA.cmake new file mode 100644 index 0000000..c981150 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/cmake/modules/FindNUMA.cmake @@ -0,0 +1,34 @@ +# - Try to find libnuma +# Once done will define: +# +# NUMA_FOUND - system has libnuma +# NUMA_INCLUDE_DIRS - include directory with numa.h +# NUMA_LIBRARIES - link with this for libnuma + +find_path(NUMA_INCLUDE_DIRS + NAMES numa.h + PATHS + ${LIBNUMA_ROOT}/include + /usr/include + /p/psg/swip/dla/resources/numactl/2.0.16/include + + ) + +find_library(NUMA_LIBRARIES + NAMES numa + PATHS + ${LIBNUMA_ROOT}/lib + ${LIBNUMA_ROOT}/lib64 + /usr/lib + /usr/lib64 + /p/psg/swip/dla/resources/numactl/2.0.16/lib + + ) + +FIND_PACKAGE_HANDLE_STANDARD_ARGS(NUMA + REQUIRED_VARS NUMA_INCLUDE_DIRS NUMA_LIBRARIES) + +add_library(libnuma IMPORTED SHARED) +set_target_properties(libnuma PROPERTIES + IMPORTED_LOCATION ${NUMA_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES ${NUMA_INCLUDE_DIRS}) diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/cmake/modules/FindOPAE.cmake b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/cmake/modules/FindOPAE.cmake new file mode 100644 index 0000000..6395d7c --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/cmake/modules/FindOPAE.cmake @@ -0,0 +1,44 @@ +# - Try to find libintelfpga +# Once done, this will define +# +# libopae-c_FOUND - system has libopae-c +# libopae-c_INCLUDE_DIRS - the libopae-c include directories +# libopae-c_LIBRARIES - link these to use libopae-c + +find_package(PkgConfig) +pkg_check_modules(PC_OPAE QUIET opae-c) + +# Use pkg-config to get hints about paths +execute_process(COMMAND pkg-config --cflags opae-c --silence-errors + COMMAND cut -d I -f 2 + OUTPUT_VARIABLE 
OPAE-C_PKG_CONFIG_INCLUDE_DIRS) +set(OPAE-C_PKG_CONFIG_INCLUDE_DIRS "${OPAE-C_PKG_CONFIG_INCLUDE_DIRS}" CACHE STRING "Compiler flags for OPAE-C library") + +# Include dir +find_path(libopae-c_INCLUDE_DIRS + NAMES opae/fpga.h + PATHS ${LIBOPAE-C_ROOT}/include + ${OPAE-C_PKG_CONFIG_INCLUDE_DIRS} + /usr/local/include + /usr/include + ${CMAKE_EXTRA_INCLUDES}) + +# The library itself +find_library(libopae-c_LIBRARIES + NAMES opae-c + PATHS ${LIBOPAE-C_ROOT}/lib + ${LIBOPAE-C_ROOT}/lib64 + /usr/local/lib + /usr/lib + /lib + /usr/lib/x86_64-linux-gnu + ${CMAKE_EXTRA_LIBS}) + +FIND_PACKAGE_HANDLE_STANDARD_ARGS(OPAE + REQUIRED_VARS libopae-c_LIBRARIES libopae-c_INCLUDE_DIRS) + +add_library(libopae-c IMPORTED SHARED) +set_target_properties(libopae-c PROPERTIES + IMPORTED_LOCATION ${libopae-c_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES ${libopae-c_INCLUDE_DIRS}) + diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/.gitignore b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/.gitignore new file mode 100644 index 0000000..1530978 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/.gitignore @@ -0,0 +1 @@ +*.o
\ No newline at end of file diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/afu_bbb_util.h b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/afu_bbb_util.h new file mode 100644 index 0000000..6d8f9fa --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/afu_bbb_util.h @@ -0,0 +1,123 @@ +/* (C) 1992-2017 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. */ + +/** + * \fpga_dma.h + * \brief FPGA DMA BBB API Header + * + * Known Limitations + * - Driver does not support Address Span Extender + * - Implementation is not optimized for performance. 
+ * User buffer data is copied into a DMA-able buffer before the transfer + * - Supports only synchronous (blocking) transfers + */ + +#ifndef AFU_BBB_UTIL_H__ +#define AFU_BBB_UTIL_H__ + +#include <assert.h> +#include <opae/fpga.h> +#include <uuid/uuid.h> + +#define DFH_FEATURE_EOL(dfh) (((dfh >> 40) & 1) == 1) +#define DFH_FEATURE(dfh) ((dfh >> 60) & 0xf) +#define DFH_FEATURE_IS_PRIVATE(dfh) (DFH_FEATURE(dfh) == 3) +#define DFH_FEATURE_IS_BBB(dfh) (DFH_FEATURE(dfh) == 2) +#define DFH_FEATURE_IS_AFU(dfh) (DFH_FEATURE(dfh) == 1) +#define DFH_FEATURE_NEXT(dfh) ((dfh >> 16) & 0xffffff) + +static bool find_dfh_by_guid(fpga_handle afc_handle, + uint64_t find_id_l, + uint64_t find_id_h, + uint64_t *result_offset = NULL, + uint64_t *result_next_offset = NULL) { + assert(find_id_l); + assert(find_id_h); + + uint64_t offset = 0; + if (result_offset) { + offset = *result_offset; + } + uint64_t dfh = 0; + + // Limit the maximum number of DFH search iterations to avoid getting stuck + // in an infinte loop in case the DFH_FEATURE_EOL is not found. Limit of + // 5000 is very conservaitve. In practice search should terminate in 3 or + // fewer iterations. 
+ int MAX_DFH_SEARCHES = 5000; + int dfh_search_iterations = 0; + + do { + fpgaReadMMIO64(afc_handle, 0, offset, &dfh); + + int is_bbb = DFH_FEATURE_IS_BBB(dfh); + int is_afu = DFH_FEATURE_IS_AFU(dfh); + + if (is_afu || is_bbb) { + uint64_t id_l = 0; + uint64_t id_h = 0; + fpgaReadMMIO64(afc_handle, 0, offset + 8, &id_l); + fpgaReadMMIO64(afc_handle, 0, offset + 16, &id_h); + + if (find_id_l == id_l && find_id_h == id_h) { + if (result_offset) *result_offset = offset; + if (result_next_offset) *result_next_offset = DFH_FEATURE_NEXT(dfh); + return true; + } + } + offset += DFH_FEATURE_NEXT(dfh); + + dfh_search_iterations++; + if (dfh_search_iterations > MAX_DFH_SEARCHES) { + return false; + } + } while (!DFH_FEATURE_EOL(dfh)); + + return false; +} + +static bool find_dfh_by_guid(fpga_handle afc_handle, + const char *guid_str, + uint64_t *result_offset = NULL, + uint64_t *result_next_offset = NULL) { + fpga_guid guid; + + if (uuid_parse(guid_str, guid) < 0) return 0; + + uint32_t i; + uint32_t s; + + uint64_t find_id_l = 0; + uint64_t find_id_h = 0; + + // The API expects the MSB of the GUID at [0] and the LSB at [15]. + s = 64; + for (i = 0; i < 8; ++i) { + s -= 8; + find_id_h = ((find_id_h << 8) | (0xff & guid[i])); + } + + s = 64; + for (i = 0; i < 8; ++i) { + s -= 8; + find_id_l = ((find_id_l << 8) | (0xff & guid[8 + i])); + } + + return find_dfh_by_guid(afc_handle, find_id_l, find_id_h, result_offset, result_next_offset); +} + +#endif // AFU_BBB_UTIL_H__ diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/ccip_mmd.cpp b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/ccip_mmd.cpp new file mode 100644 index 0000000..b7cd06a --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/ccip_mmd.cpp @@ -0,0 +1,655 @@ +/* (C) 1992-2017 Intel Corporation. 
*/ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. */ + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <zlib.h> + +#include <cassert> +#include <iomanip> +#include <iostream> +#include <map> +#include <sstream> + +#ifdef DLA_MMD +#include <chrono> +#include <thread> +#endif + +#include <safe_string/safe_string.h> +#include "memcpy_s_fast.h" + +#include "aocl_mmd.h" +#include "ccip_mmd_device.h" + +using namespace intel_opae_mmd; + +#define ACL_DCP_ERROR_IF(COND, NEXT, ...) \ + do { \ + if (COND) { \ + printf("\nMMD ERROR: " __VA_ARGS__); \ + fflush(stdout); \ + NEXT; \ + } \ + } while (0) + +#define ACL_PKG_SECTION_DCP_GBS_GZ ".acl.gbs.gz" + +// If the MMD is loaded dynamically, destructors in the MMD will execute before the destructors in the runtime +// upon program termination. 
The DeviceMapManager guards accesses to the device/handle maps to make sure +// the runtime doesn't get to reference them after MMD destructors have been called. +// Destructor makes sure that all devices are closed at program termination regardless of what the runtime does. +// Implemented as a singleton. +class DeviceMapManager final { + public: + typedef std::map<int, CcipDevice*> t_handle_to_dev_map; + typedef std::map<uint64_t, int> t_id_to_handle_map; + + static const int SUCCESS = 0; + static const int FAILURE = -1; + + // Returns handle and device pointer to the device with the specified name + // Creates a new entry for this device if it doesn't already exist + // Return 0 on success, -1 on failure + int get_or_create_device(const char* board_name, int* handle, CcipDevice** device); + + // Return obj id based on BSP name. + uint64_t id_from_name(const char* board_name); + + // Return MMD handle based on obj id. Returned value is negative if board doesn't exist + inline int handle_from_id(uint64_t obj_id); + + // Return pointer to CCIP device based on MMD handle. 
Returned value is null if board doesn't exist + CcipDevice* device_from_handle(int handle); + + // Closes specified device if it exists + void close_device_if_exists(int handle); + + // Returns a reference to the class singleton + static DeviceMapManager& get_instance() { + static DeviceMapManager instance; + return instance; + } + + DeviceMapManager(DeviceMapManager const&) = delete; + void operator=(DeviceMapManager const&) = delete; + ~DeviceMapManager() { + // delete all allocated CcipDevice* entries + while (handle_to_dev_map->size() > 0) { + int handle = handle_to_dev_map->begin()->first; + aocl_mmd_close(handle); + } + delete handle_to_dev_map; + delete id_to_handle_map; + handle_to_dev_map = nullptr; + id_to_handle_map = nullptr; + } + + private: + DeviceMapManager() { + handle_to_dev_map = new t_handle_to_dev_map(); + id_to_handle_map = new t_id_to_handle_map(); + } + t_handle_to_dev_map* handle_to_dev_map = nullptr; + t_id_to_handle_map* id_to_handle_map = nullptr; +}; +static DeviceMapManager& device_manager = DeviceMapManager::get_instance(); + +int DeviceMapManager::get_or_create_device(const char* board_name, int* handle, CcipDevice** device) { + int _handle = CCIP_MMD_INVALID_PARAM; + CcipDevice* _device = nullptr; + + if (id_to_handle_map == nullptr || handle_to_dev_map == nullptr) { + return DeviceMapManager::FAILURE; + } + + uint64_t obj_id = id_from_name(board_name); + if (id_to_handle_map->count(obj_id) == 0) { + try { + _device = new CcipDevice(obj_id); + _handle = _device->get_mmd_handle(); + id_to_handle_map->insert({obj_id, _handle}); + handle_to_dev_map->insert({_handle, _device}); + } catch (std::runtime_error& e) { + LOG_ERR("%s\n", e.what()); + delete _device; + return DeviceMapManager::FAILURE; + } + } else { + _handle = id_to_handle_map->at(obj_id); + _device = handle_to_dev_map->at(_handle); + } + + (*handle) = _handle; + (*device) = _device; + return DeviceMapManager::SUCCESS; +} + +uint64_t DeviceMapManager::id_from_name(const char* 
board_name) { + uint64_t obj_id = 0; + if (CcipDevice::parse_board_name(board_name, obj_id)) { + return obj_id; + } else { + // TODO: add error hanlding for DeviceMapManager (make sure 0 is marked as invalid device) + return 0; + } +} + +inline int DeviceMapManager::handle_from_id(uint64_t obj_id) { + int handle = CCIP_MMD_INVALID_PARAM; + if (id_to_handle_map) { + auto it = id_to_handle_map->find(obj_id); + if (it != id_to_handle_map->end()) { + handle = it->second; + } + } + return handle; +} + +CcipDevice* DeviceMapManager::device_from_handle(int handle) { + CcipDevice* dev = nullptr; + if (handle_to_dev_map) { + auto it = handle_to_dev_map->find(handle); + if (it != handle_to_dev_map->end()) { + return it->second; + } + } + return dev; +} + +void DeviceMapManager::close_device_if_exists(int handle) { + if (handle_to_dev_map) { + if (handle_to_dev_map->count(handle) > 0) { + CcipDevice* dev = handle_to_dev_map->at(handle); + uint64_t obj_id = dev->get_fpga_obj_id(); + delete dev; + handle_to_dev_map->erase(handle); + id_to_handle_map->erase(obj_id); + } + } +} + +// Interface for checking if AFU has BSP loaded +bool ccip_mmd_bsp_loaded(const char* name) { + uint64_t obj_id = device_manager.id_from_name(name); + if (!obj_id) { + return false; + } + + int handle = device_manager.handle_from_id(obj_id); + if (handle > 0) { + CcipDevice* dev = device_manager.device_from_handle(handle); + if (dev) + return dev->bsp_loaded(); + else + return false; + } else { + bool bsp_loaded = false; + try { + CcipDevice dev(obj_id); + bsp_loaded = dev.bsp_loaded(); + } catch (std::runtime_error& e) { + LOG_ERR("%s\n", e.what()); + return false; + } + return bsp_loaded; + } +} + +static int get_offline_num_acl_boards(bool bsp_only = true) { + fpga_guid dcp_guid; + fpga_result res = FPGA_OK; + uint32_t num_matches = 0; + bool ret_err = false; + fpga_properties filter = NULL; + + if (uuid_parse(DCP_OPENCL_BSP_AFU_ID, dcp_guid) < 0) { + LOG_ERR("Error parsing guid '%s'\n", 
DCP_OPENCL_BSP_AFU_ID); + ret_err = true; + goto out; + } + + res = fpgaGetProperties(NULL, &filter); + if (res != FPGA_OK) { + LOG_ERR("Error creating properties object: %s\n", fpgaErrStr(res)); + ret_err = true; + goto out; + } + + if (bsp_only) { + res = fpgaPropertiesSetGUID(filter, dcp_guid); + if (res != FPGA_OK) { + LOG_ERR("Error setting GUID: %s\n", fpgaErrStr(res)); + ret_err = true; + goto out; + } + } + + res = fpgaPropertiesSetObjectType(filter, FPGA_ACCELERATOR); + if (res != FPGA_OK) { + LOG_ERR("Error setting object type: %s\n", fpgaErrStr(res)); + ret_err = true; + goto out; + } + + res = fpgaEnumerate(&filter, 1, NULL, 0, &num_matches); + if (res != FPGA_OK) { + LOG_ERR("Error enumerating AFCs: %s\n", fpgaErrStr(res)); + ret_err = true; + goto out; + } + +out: + if (filter) fpgaDestroyProperties(&filter); + + if (ret_err) { + return CCIP_MMD_AOCL_ERR; + } else { + return num_matches; + } +} + +bool static get_offline_board_names(std::string& boards, bool bsp_only = true) { + fpga_guid dcp_guid; + fpga_result res = FPGA_OK; + uint32_t num_matches = 0; + fpga_properties filter = nullptr; + fpga_properties prop = nullptr; + std::ostringstream board_name; + fpga_token* toks = nullptr; + uint64_t obj_id; + bool success = true; + + if (uuid_parse(DCP_OPENCL_BSP_AFU_ID, dcp_guid) < 0) { + LOG_ERR("Error parsing guid '%s'\n", DCP_OPENCL_BSP_AFU_ID); + success = false; + goto cleanup; + } + + res = fpgaGetProperties(NULL, &filter); + if (res != FPGA_OK) { + LOG_ERR("Error creating properties object: %s\n", fpgaErrStr(res)); + success = false; + goto cleanup; + } + + res = fpgaPropertiesSetObjectType(filter, FPGA_ACCELERATOR); + if (res != FPGA_OK) { + LOG_ERR("Error setting object type: %s\n", fpgaErrStr(res)); + success = false; + goto cleanup; + } + + if (bsp_only) { + res = fpgaPropertiesSetGUID(filter, dcp_guid); + if (res != FPGA_OK) { + LOG_ERR("Error setting GUID: %s\n", fpgaErrStr(res)); + success = false; + goto cleanup; + } + } + res = 
fpgaEnumerate(&filter, 1, NULL, 0, &num_matches); + if (res != FPGA_OK) { + LOG_ERR("Error enumerating AFCs: %s\n", fpgaErrStr(res)); + success = false; + goto cleanup; + } + + toks = static_cast<fpga_token*>(calloc(num_matches, sizeof(fpga_token))); + if (toks == NULL) { + LOG_ERR("Error allocating memory\n"); + success = false; + goto cleanup; + } + + res = fpgaEnumerate(&filter, 1, toks, num_matches, &num_matches); + if (res != FPGA_OK) { + LOG_ERR("Error enumerating AFCs: %s\n", fpgaErrStr(res)); + success = false; + goto cleanup; + } + + for (unsigned int i = 0; i < num_matches; i++) { + if (prop) fpgaDestroyProperties(&prop); + res = fpgaGetProperties(toks[i], &prop); + if (res == FPGA_OK) { + res = fpgaPropertiesGetObjectID(prop, &obj_id); + if (res != FPGA_OK) { + LOG_ERR("Error reading object ID: %s\n", fpgaErrStr(res)); + success = false; + break; + } + boards.append(CcipDevice::get_board_name(BSP_NAME, obj_id)); + if (i < num_matches - 1) boards.append(";"); + } else { + success = false; + LOG_ERR("Error reading properties: %s\n", fpgaErrStr(res)); + } + } + +cleanup: + if (prop) { + fpgaDestroyProperties(&prop); + } + if (filter) { + fpgaDestroyProperties(&filter); + } + if (toks) { + for (unsigned i = 0; i < num_matches; i++) { + if (toks[i]) { + fpgaDestroyToken(&toks[i]); + } + } + free(toks); + } + + return success; +} + +int aocl_mmd_yield(int handle) { + DEBUG_PRINT("* Called: aocl_mmd_yield\n"); + YIELD_DELAY(); + + CcipDevice* dev = device_manager.device_from_handle(handle); + assert(dev); + if (dev) { + return dev->yield(); + } + + return 0; +} + +// Macros used for acol_mmd_get_offline_info and aocl_mmd_get_info +#define RESULT_INT(X) \ + { \ + *((int*)param_value) = X; \ + if (param_size_ret) *param_size_ret = sizeof(int); \ + } +#define RESULT_STR(X) \ + do { \ + unsigned Xlen = strlen(X) + 1; \ + unsigned Xcpylen = (param_value_size <= Xlen) ? 
param_value_size : Xlen; \ + memcpy_s_fast((void*)param_value, param_value_size, X, Xcpylen); \ + if (param_size_ret) *param_size_ret = Xcpylen; \ + } while (0) + +int aocl_mmd_get_offline_info(aocl_mmd_offline_info_t requested_info_id, + size_t param_value_size, + void* param_value, + size_t* param_size_ret) { + // aocl_mmd_get_offline_info can be called many times by the runtime + // and it is expensive to query the system. Only compute values first + // time aocl_mmd_get_offline_info called future iterations use saved results + static bool initialized = false; + static int mem_type_info; + static int num_acl_boards; + static std::string boards; + static bool success; + + if (!initialized) { + mem_type_info = (int)AOCL_MMD_PHYSICAL_MEMORY; + num_acl_boards = get_offline_num_acl_boards(); + success = get_offline_board_names(boards, true); + initialized = true; + } + + switch (requested_info_id) { + case AOCL_MMD_VERSION: + RESULT_STR(AOCL_MMD_VERSION_STRING); + break; + case AOCL_MMD_NUM_BOARDS: { + if (num_acl_boards >= 0) { + RESULT_INT(num_acl_boards); + } else { + return CCIP_MMD_AOCL_ERR; + } + break; + } + case AOCL_MMD_VENDOR_NAME: + RESULT_STR("Intel Corp"); + break; + case AOCL_MMD_BOARD_NAMES: { + if (success) { + RESULT_STR(boards.c_str()); + } else { + return CCIP_MMD_AOCL_ERR; + } + break; + } + case AOCL_MMD_VENDOR_ID: + RESULT_INT(0); + break; + case AOCL_MMD_USES_YIELD: + RESULT_INT(KernelInterrupt::yield_is_enabled()); + break; + case AOCL_MMD_MEM_TYPES_SUPPORTED: + RESULT_INT(mem_type_info); + break; + } + + return 0; +} + +int ccip_mmd_get_offline_board_names(size_t param_value_size, void* param_value, size_t* param_size_ret) { + std::string boards; + bool success = get_offline_board_names(boards, false); + if (success) { + RESULT_STR(boards.c_str()); + } else { + RESULT_INT(-1); + } + + return 0; +} + +int aocl_mmd_get_info( + int handle, aocl_mmd_info_t requested_info_id, size_t param_value_size, void* param_value, size_t* param_size_ret) { + 
DEBUG_PRINT("called aocl_mmd_get_info\n"); + CcipDevice* dev = device_manager.device_from_handle(handle); + if (dev == NULL) return 0; + + assert(param_value); + switch (requested_info_id) { + case AOCL_MMD_BOARD_NAME: { + std::ostringstream board_name; + board_name << "Intel PAC Platform" + << " (" << dev->get_dev_name() << ")"; + RESULT_STR(board_name.str().c_str()); + break; + } + case AOCL_MMD_NUM_KERNEL_INTERFACES: + RESULT_INT(1); + break; + case AOCL_MMD_KERNEL_INTERFACES: + RESULT_INT(AOCL_MMD_KERNEL); + break; +#ifdef SIM + case AOCL_MMD_PLL_INTERFACES: + RESULT_INT(-1); + break; +#else + case AOCL_MMD_PLL_INTERFACES: + RESULT_INT(-1); + break; +#endif + case AOCL_MMD_MEMORY_INTERFACE: + RESULT_INT(AOCL_MMD_MEMORY); + break; + case AOCL_MMD_PCIE_INFO: { + RESULT_STR(dev->get_bdf().c_str()); + break; + } + case AOCL_MMD_BOARD_UNIQUE_ID: + RESULT_INT(0); + break; + case AOCL_MMD_TEMPERATURE: { + if (param_value_size == sizeof(float)) { + float* ptr = static_cast<float*>(param_value); + *ptr = dev->get_temperature(); + if (param_size_ret) *param_size_ret = sizeof(float); + } + break; + } + case AOCL_MMD_CONCURRENT_READS: + RESULT_INT(1); + break; + case AOCL_MMD_CONCURRENT_WRITES: + RESULT_INT(1); + break; + case AOCL_MMD_CONCURRENT_READS_OR_WRITES: + RESULT_INT(2); + break; + } + return 0; +} + +#undef RESULT_INT +#undef RESULT_STR + +int aocl_mmd_set_interrupt_handler(int handle, aocl_mmd_interrupt_handler_fn fn, void* user_data) { + CcipDevice* dev = device_manager.device_from_handle(handle); + if (dev) { + dev->set_kernel_interrupt(fn, user_data); + } else { + return CCIP_MMD_AOCL_ERR; + } + return 0; +} + +int aocl_mmd_set_status_handler(int handle, aocl_mmd_status_handler_fn fn, void* user_data) { + CcipDevice* dev = device_manager.device_from_handle(handle); + if (dev) dev->set_status_handler(fn, user_data); + // TODO: handle error condition if dev null + return 0; +} + +// Host to device-global-memory write +int aocl_mmd_write(int handle, 
aocl_mmd_op_t op, size_t len, const void* src, int mmd_interface, size_t offset) { + DCP_DEBUG_MEM("\n- aocl_mmd_write: %d\t %p\t %lu\t %p\t %d\t %lu\n", handle, op, len, src, mmd_interface, offset); + CcipDevice* dev = device_manager.device_from_handle(handle); + if (dev) + return dev->write_block(op, mmd_interface, src, offset, len); + else + return -1; + // TODO: handle error condition if dev null +} + +int aocl_mmd_read(int handle, aocl_mmd_op_t op, size_t len, void* dst, int mmd_interface, size_t offset) { + DCP_DEBUG_MEM("\n+ aocl_mmd_read: %d\t %p\t %lu\t %p\t %d\t %lu\n", handle, op, len, dst, mmd_interface, offset); + CcipDevice* dev = device_manager.device_from_handle(handle); + if (dev) + return dev->read_block(op, mmd_interface, dst, offset, len); + else + return -1; + // TODO: handle error condition if dev null +} + +int aocl_mmd_open(const char* name) { + DEBUG_PRINT("Opening device: %s\n", name); + + uint64_t obj_id = device_manager.id_from_name(name); + if (!obj_id) { + return CCIP_MMD_INVALID_PARAM; + } + + int handle; + CcipDevice* dev = nullptr; + if (device_manager.get_or_create_device(name, &handle, &dev) != DeviceMapManager::SUCCESS) { + delete dev; + return CCIP_MMD_AOCL_ERR; + } + + assert(dev); + if (dev->bsp_loaded()) { + if (!dev->initialize_bsp()) { + LOG_ERR("Error initializing bsp\n"); + return CCIP_MMD_BSP_INIT_FAILED; + } + } else { + return CCIP_MMD_BSP_NOT_LOADED; + } + + return handle; +} + +int aocl_mmd_close(int handle) { + device_manager.close_device_if_exists(handle); + + return 0; +} + +// CoreDLA modifications +// To support multiple different FPGA boards, anything board specific must be implemented in a +// board-specific MMD instead of the CoreDLA runtime layer. 
+#ifdef DLA_MMD +// Query functions to get board-specific values +AOCL_MMD_CALL int dla_mmd_get_max_num_instances() { return 2; } +AOCL_MMD_CALL uint64_t dla_mmd_get_ddr_size_per_instance() { return 1ULL << 32; } +AOCL_MMD_CALL double dla_mmd_get_ddr_clock_freq() { return 266.666667; } // MHz + +// Helper functions for the wrapper functions around CSR and DDR +uint64_t dla_get_raw_csr_address(int instance, uint64_t addr) { return 0x38000 + (0x1000 * instance) + addr; } +uint64_t dla_get_raw_ddr_address(int instance, uint64_t addr) { return (1ULL << 32) * instance + addr; } + +// Wrappers around CSR and DDR reads and writes to abstract away board-specific offsets +AOCL_MMD_CALL int dla_mmd_csr_write(int handle, int instance, uint64_t addr, const uint32_t* data) { + return aocl_mmd_write(handle, NULL, sizeof(uint32_t), data, AOCL_MMD_KERNEL, dla_get_raw_csr_address(instance, addr)); +} +AOCL_MMD_CALL int dla_mmd_csr_read(int handle, int instance, uint64_t addr, uint32_t* data) { + return aocl_mmd_read(handle, NULL, sizeof(uint32_t), data, AOCL_MMD_KERNEL, dla_get_raw_csr_address(instance, addr)); +} +AOCL_MMD_CALL int dla_mmd_ddr_write(int handle, int instance, uint64_t addr, uint64_t length, const void* data) { + return aocl_mmd_write(handle, NULL, length, data, AOCL_MMD_MEMORY, dla_get_raw_ddr_address(instance, addr)); +} +AOCL_MMD_CALL int dla_mmd_ddr_read(int handle, int instance, uint64_t addr, uint64_t length, void* data) { + return aocl_mmd_read(handle, NULL, length, data, AOCL_MMD_MEMORY, dla_get_raw_ddr_address(instance, addr)); +} + +// Get the PLL clock frequency in MHz, returns a negative value if there is an error +AOCL_MMD_CALL double dla_mmd_get_coredla_clock_freq(int handle) { + constexpr uint64_t hw_timer_address = 0x37000; + const uint32_t start_bit = 1; + const uint32_t stop_bit = 2; + + // Send the start command to the hardware counter + std::chrono::high_resolution_clock::time_point time_before = std::chrono::high_resolution_clock::now(); + int 
status = aocl_mmd_write(handle, NULL, sizeof(uint32_t), &start_bit, AOCL_MMD_KERNEL, hw_timer_address); + assert(status == 0); + + // Unlikely to sleep for exactly 10 milliseconds, but it doesn't matter since we use a high resolution clock to + // determine the amount of time between the start and stop commands for the hardware counter + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + + // Send the stop command to the hardware counter + std::chrono::high_resolution_clock::time_point time_after = std::chrono::high_resolution_clock::now(); + status = aocl_mmd_write(handle, NULL, sizeof(uint32_t), &stop_bit, AOCL_MMD_KERNEL, hw_timer_address); + assert(status == 0); + + // Read back the value of the counter + uint32_t counter = 0; + status = aocl_mmd_read(handle, NULL, sizeof(uint32_t), &counter, AOCL_MMD_KERNEL, hw_timer_address); + assert(status == 0); + + // Calculate the clock frequency of the counter, which is running on clk_dla + double elapsed_seconds = std::chrono::duration_cast<std::chrono::duration<double>>(time_after - time_before).count(); + return 1.0e-6 * counter / elapsed_seconds; // 1.0e-6 is to convert to MHz +} + +#endif diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/ccip_mmd_device.cpp b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/ccip_mmd_device.cpp new file mode 100644 index 0000000..9bc055a --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/ccip_mmd_device.cpp @@ -0,0 +1,579 @@ +/* (C) 1992-2017 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. 
See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. */ + +#include <assert.h> +#include <numa.h> + +#include <unistd.h> +#include <fstream> +#include <iomanip> +#include <iostream> +#include <limits> +#include <sstream> + +#include <safe_string/safe_string.h> +#include "memcpy_s_fast.h" + +#include "ccip_mmd_device.h" + +// TODO: better encapsulation of afu_bbb_util functions +#include "afu_bbb_util.h" + +#define MMD_COPY_BUFFER_SIZE (1024 * 1024) + +#define MEM_WINDOW_BBB_GUID "72347537-7821-4125-442a-472d4b615064" +#define MEM_WINDOW_BBB_SIZE 8192 + +#define MSGDMA_BBB_GUID "ef82def7-f6ec-40fc-a914-9a35bace01ea" +#define MSGDMA_BBB_SIZE 256 + +#define NULL_DFH_BBB_GUID "da1182b1-b344-4e23-90fe-6aab12a0132f" +#define BSP_AFU_GUID "96ef4230-dafa-cb5f-18b7-9ffa2ee54aa0" + +using namespace intel_opae_mmd; + +int CcipDevice::next_mmd_handle{1}; + +std::string CcipDevice::get_board_name(std::string prefix, uint64_t obj_id) { + std::ostringstream stream; + stream << prefix << std::setbase(16) << obj_id; + return stream.str(); +} + +CcipDevice::CcipDevice(uint64_t obj_id) + : fpga_obj_id(obj_id), + 
kernel_interrupt_thread(NULL), + event_update(NULL), + event_update_user_data(NULL), + enable_set_numa(false), + fme_sysfs_temp_initialized(false), + bus(0), + device(0), + function(0), + afu_initialized(false), + bsp_initialized(false), + mmio_is_mapped(false), + afc_handle(NULL), + filter(NULL), + afc_token(NULL), + dma_ch0_dfh_offset(0), + dma_ch1_dfh_offset(0), + dma_ase_dfh_offset(0), + dma_host_to_fpga(NULL), + dma_fpga_to_host(NULL), + mmd_copy_buffer(NULL) { + // Note that this constructor is not thread-safe because next_mmd_handle + // is shared between all class instances + mmd_handle = next_mmd_handle; + if (next_mmd_handle == std::numeric_limits<int>::max()) + next_mmd_handle = 1; + else + next_mmd_handle++; + + mmd_copy_buffer = (char *)malloc(MMD_COPY_BUFFER_SIZE); + if (mmd_copy_buffer == NULL) { + throw std::runtime_error(std::string("malloc failed for mmd_copy_buffer")); + } + + fpga_result res = FPGA_OK; + uint32_t num_matches; + + res = fpgaGetProperties(NULL, &filter); + if (res != FPGA_OK) { + throw std::runtime_error(std::string("Error creating properties object: ") + std::string(fpgaErrStr(res))); + } + + res = fpgaPropertiesSetObjectType(filter, FPGA_ACCELERATOR); + if (res != FPGA_OK) { + throw std::runtime_error(std::string("Error setting object type: ") + std::string(fpgaErrStr(res))); + } + + res = fpgaPropertiesSetObjectID(filter, obj_id); + if (res != FPGA_OK) { + throw std::runtime_error(std::string("Error setting object ID: ") + std::string(fpgaErrStr(res))); + } + + res = fpgaEnumerate(&filter, 1, &afc_token, 1, &num_matches); + if (res != FPGA_OK) { + throw std::runtime_error(std::string("Error enumerating AFCs: ") + std::string(fpgaErrStr(res))); + } + + if (num_matches < 1) { + res = fpgaDestroyProperties(&filter); + throw std::runtime_error("AFC not found"); + } + + res = fpgaOpen(afc_token, &afc_handle, 0); + if (res != FPGA_OK) { + throw std::runtime_error(std::string("Error opening AFC: ") + std::string(fpgaErrStr(res))); + } 
+ + fpga_properties prop = nullptr; + res = fpgaGetProperties(afc_token, &prop); + if (res != FPGA_OK) { + throw std::runtime_error(std::string("Error reading properties: ") + std::string(fpgaErrStr(res))); + } + + if (prop) { + res = fpgaPropertiesGetBus(prop, &bus); + if (res != FPGA_OK) { + throw std::runtime_error(std::string("Error reading bus: ") + std::string(fpgaErrStr(res))); + } + res = fpgaPropertiesGetDevice(prop, &device); + if (res != FPGA_OK) { + throw std::runtime_error(std::string("Error reading device: ") + std::string(fpgaErrStr(res))); + } + res = fpgaPropertiesGetFunction(prop, &function); + if (res != FPGA_OK) { + throw std::runtime_error(std::string("Error reading function: ") + std::string(fpgaErrStr(res))); + } + fpgaDestroyProperties(&prop); + } + + initialize_fme_sysfs(); + + mmd_dev_name = get_board_name(BSP_NAME, obj_id); + afu_initialized = true; +} + +// Return true if board name parses correctly, false if it does not +// Return the parsed object_id in obj_id as an [out] parameter +bool CcipDevice::parse_board_name(const char *board_name_str, uint64_t &obj_id) { + std::string prefix(BSP_NAME); + std::string board_name(board_name_str); + + obj_id = 0; + if (board_name.length() <= prefix.length() && board_name.compare(0, prefix.length(), prefix)) { + LOG_ERR("Error parsing device name '%s'\n", board_name_str); + return false; + } + + std::string device_num_str = board_name.substr(prefix.length()); + obj_id = std::stol(device_num_str, 0, 16); + + // Assume that OPAE does not use 0 as a valid object ID. This is true for now + // but relies somewhat on an implementaion dependent feature. + assert(obj_id > 0); + return true; +} + +// Read information directly from sysfs. This is non-portable and relies on +// paths set in driver (will not interoperate between DFH driver in up-stream +// kernel and Intel driver distributed with PAC cards). 
In the future hopefully +// OPAE can provide SDK to read this information +void CcipDevice::initialize_fme_sysfs() { + const int MAX_LEN = 250; + char temp_fmepath[MAX_LEN]; + char numa_path[MAX_LEN]; + + // HACK: currently ObjectID is constructed using its lower 20 bits + // as the device minor number. The device minor number also matches + // the device ID in sysfs. This is a simple way to construct a path + // to the device FME using information that is already available (object_id). + // Eventually this code should be replaced with a direct call to OPAE C API, + // but API does not currently expose the device temperature. + int dev_num = 0xFFFFF & fpga_obj_id; + + // Path to temperature value + snprintf(temp_fmepath, + MAX_LEN, + "/sys/class/fpga/intel-fpga-dev.%d/intel-fpga-fme.%d/thermal_mgmt/temperature", + dev_num, + dev_num); + // Path to NUMA node + snprintf(numa_path, MAX_LEN, "/sys/class/fpga/intel-fpga-dev.%d/device/numa_node", dev_num); + + // Try to open the sysfs file. If open succeeds then set as initialized + // to be able to read temperature in future. If open fails then not + // initalized and skip attempt to read temperature in future. + FILE *tmp; + tmp = fopen(temp_fmepath, "r"); + if (tmp) { + fme_sysfs_temp_path = std::string(temp_fmepath); + fme_sysfs_temp_initialized = true; + fclose(tmp); + } + + // Read NUMA node and set value for future use. 
If not available set to -1 + // and disable use of NUMA setting + std::ifstream sysfs_numa_node(numa_path, std::ifstream::in); + if (sysfs_numa_node.is_open()) { + sysfs_numa_node >> fpga_numa_node; + sysfs_numa_node.close(); + if (std::stoi(fpga_numa_node) >= 0) { + enable_set_numa = true; + } else { + enable_set_numa = false; + } + } else { + enable_set_numa = false; + fpga_numa_node = "-1"; + } +} + +bool CcipDevice::find_dma_dfh_offsets() { + uint64_t dfh_offset = 0; + uint64_t next_dfh_offset = 0; + if (find_dfh_by_guid(afc_handle, MSGDMA_BBB_GUID, &dfh_offset, &next_dfh_offset)) { + dma_ch0_dfh_offset = dfh_offset; + DEBUG_PRINT("DMA CH1 offset: 0x%lX\t GUID: %s\n", dma_ch0_dfh_offset, MSGDMA_BBB_GUID); + } else { + fprintf(stderr, "Error initalizing DMA: Cannot find DMA channel 0 DFH offset\n"); + return false; + } + + dfh_offset += next_dfh_offset; + if (find_dfh_by_guid(afc_handle, MSGDMA_BBB_GUID, &dfh_offset, &next_dfh_offset)) { + dma_ch1_dfh_offset = dfh_offset; + DEBUG_PRINT("DMA CH2 offset: 0x%lX\t GUID: %s\n", dma_ch1_dfh_offset, MSGDMA_BBB_GUID); + } else { + fprintf(stderr, "Error initalizing DMA. Cannot find DMA channel 2 DFH offset\n"); + return false; + } + + dfh_offset = 0; + if (find_dfh_by_guid(afc_handle, MEM_WINDOW_BBB_GUID, &dfh_offset, &next_dfh_offset)) { + dma_ase_dfh_offset = dfh_offset; + DEBUG_PRINT("DMA ASE offset: 0x%lX\t GUID: %s\n", dma_ase_dfh_offset, MEM_WINDOW_BBB_GUID); + } else { + fprintf(stderr, "Error initalizing DMA. 
Cannot find ASE DFH offset\n"); + return false; + } + + assert(dma_ch0_dfh_offset != 0); + assert(dma_ch1_dfh_offset != 0); + assert(dma_ase_dfh_offset != 0); + assert(dma_ch0_dfh_offset != dma_ch1_dfh_offset); + + return true; +} + +bool CcipDevice::initialize_bsp() { + if (bsp_initialized) { + return true; + } + + fpga_result res = fpgaMapMMIO(afc_handle, 0, NULL); + if (res != FPGA_OK) { + LOG_ERR("Error mapping MMIO space: %s\n", fpgaErrStr(res)); + return false; + } + mmio_is_mapped = true; + + /* Reset AFC */ + res = fpgaReset(afc_handle); + if (res != FPGA_OK) { + LOG_ERR("Error resetting AFC: %s\n", fpgaErrStr(res)); + return false; + } + AFU_RESET_DELAY(); + + // DMA performance is heavily dependent on the memcpy operation that transfers + // data from user allocated buffer to the pinned buffer that is used for + // DMA. On some machines with multiple NUMA nodes it is critical for performance + // that the pinned buffer is located on the NUMA node as the threads that + // performs the DMA operation. + // + // The performance also improves slighlty if the DMA threads are on the same + // NUMA node as the FPGA PCI device. + // + // This code pins memory allocation to occur from FPGA NUMA node prior to + // initializing the DMA buffers. It also pins all threads in the process + // to run on this same node. 
+ struct bitmask *mask = NULL; + if (enable_set_numa) { + mask = numa_parse_nodestring(fpga_numa_node.c_str()); + numa_set_membind(mask); + int ret = numa_run_on_node_mask_all(mask); + if (ret < 0) { + fprintf(stderr, " Error setting NUMA node mask\n"); + } + } + + find_dma_dfh_offsets(); + + const int dma_ch0_interrupt_num = 0; // DMA channel 0 hardcoded to interrupt 0 + dma_host_to_fpga = new mmd_dma(afc_handle, mmd_handle, dma_ch0_dfh_offset, dma_ase_dfh_offset, dma_ch0_interrupt_num); + if (!dma_host_to_fpga->initialized()) { + LOG_ERR("Error initializing mmd dma\n"); + delete dma_host_to_fpga; + return false; + } + + const int dma_ch1_interrupt_num = 2; // DMA channel 1 hardcoded to interrupt 2 + dma_fpga_to_host = new mmd_dma(afc_handle, mmd_handle, dma_ch1_dfh_offset, dma_ase_dfh_offset, dma_ch1_interrupt_num); + if (!dma_fpga_to_host->initialized()) { + fprintf(stderr, "Error initializing mmd dma\n"); + return false; + } + + // Turn off membind restriction in order to allow future allocation to + // occur on different NUMA nodes if needed. Hypothesis is that only + // the pinned buffers are performance critical for the memcpy. Other + // allocations in the process can occur on other NUMA nodes if needed. 
+ if (enable_set_numa) { + numa_set_membind(numa_nodes_ptr); + numa_free_nodemask(mask); + } + + kernel_interrupt_thread = new KernelInterrupt(afc_handle, mmd_handle); + + if (!kernel_interrupt_thread->initialized()) { + LOG_ERR("Error initializing kernel interrupts\n"); + delete kernel_interrupt_thread; + return false; + } + + bsp_initialized = true; + return bsp_initialized; +} + +CcipDevice::~CcipDevice() { + int num_errors = 0; + if (mmd_copy_buffer) { + free(mmd_copy_buffer); + mmd_copy_buffer = NULL; + } + + if (kernel_interrupt_thread) { + delete kernel_interrupt_thread; + kernel_interrupt_thread = NULL; + } + + if (dma_host_to_fpga) { + delete dma_host_to_fpga; + dma_host_to_fpga = NULL; + } + + if (dma_fpga_to_host) { + delete dma_fpga_to_host; + dma_fpga_to_host = NULL; + } + + if (mmio_is_mapped) { + if (fpgaUnmapMMIO(afc_handle, 0)) num_errors++; + } + + if (afc_handle) { + if (fpgaClose(afc_handle) != FPGA_OK) num_errors++; + } + + if (afc_token) { + if (fpgaDestroyToken(&afc_token) != FPGA_OK) num_errors++; + } + + if (filter) { + if (fpgaDestroyProperties(&filter) != FPGA_OK) num_errors++; + } + + if (num_errors > 0) { + DEBUG_PRINT("Error freeing resources in destructor\n"); + } +} + +int CcipDevice::yield() { + if (kernel_interrupt_thread) kernel_interrupt_thread->yield(); + return 0; +} + +bool CcipDevice::bsp_loaded() { + fpga_guid dcp_guid; + fpga_guid afu_guid; + fpga_properties prop; + fpga_result res; + + if (uuid_parse(DCP_OPENCL_BSP_AFU_ID, dcp_guid) < 0) { + LOG_ERR("Error parsing guid '%s'\n", DCP_OPENCL_BSP_AFU_ID); + return false; + } + + res = fpgaGetProperties(afc_token, &prop); + if (res != FPGA_OK) { + LOG_ERR("Error reading properties: %s\n", fpgaErrStr(res)); + fpgaDestroyProperties(&prop); + return false; + } + + res = fpgaPropertiesGetGUID(prop, &afu_guid); + if (res != FPGA_OK) { + LOG_ERR("Error reading GUID\n"); + fpgaDestroyProperties(&prop); + return false; + } + + fpgaDestroyProperties(&prop); + if (uuid_compare(dcp_guid, 
afu_guid) == 0) { + return true; + } else { + return false; + } +} + +std::string CcipDevice::get_bdf() { + std::ostringstream bdf; + bdf << std::setfill('0') << std::setw(2) << unsigned(bus) << ":" << std::setfill('0') << std::setw(2) + << unsigned(device) << "." << unsigned(function); + + return bdf.str(); +} + +float CcipDevice::get_temperature() { + float temp = 0; + if (fme_sysfs_temp_initialized) { + std::ifstream sysfs_temp(fme_sysfs_temp_path, std::ifstream::in); + sysfs_temp >> temp; + sysfs_temp.close(); + } + return temp; +} + +void CcipDevice::set_kernel_interrupt(aocl_mmd_interrupt_handler_fn fn, void *user_data) { + if (kernel_interrupt_thread) { + kernel_interrupt_thread->set_kernel_interrupt(fn, user_data); + } +} + +void CcipDevice::set_status_handler(aocl_mmd_status_handler_fn fn, void *user_data) { + event_update = fn; + event_update_user_data = user_data; + dma_host_to_fpga->set_status_handler(fn, user_data); + dma_fpga_to_host->set_status_handler(fn, user_data); +} + +void CcipDevice::event_update_fn(aocl_mmd_op_t op, int status) { + event_update(mmd_handle, event_update_user_data, op, status); +} + +int CcipDevice::read_block(aocl_mmd_op_t op, int mmd_interface, void *host_addr, size_t offset, size_t size) { + fpga_result res; + + // The mmd_interface is defined as the base address of the MMIO write. Access + // to memory requires special functionality. 
Otherwise do direct MMIO read of + // base address + offset + if (mmd_interface == AOCL_MMD_MEMORY) { + res = dma_fpga_to_host->read_memory(op, static_cast<uint64_t *>(host_addr), offset, size); + } else { + res = read_mmio(host_addr, mmd_interface + offset, size); + + if (op) { + // TODO: check what status value should really be instead of just using 0 + // Also handle case when op is NULL + this->event_update_fn(op, 0); + } + } + + if (res != FPGA_OK) { + LOG_ERR("fpgaReadMMIO error: %s\n", fpgaErrStr(res)); + return -1; + } else { + return 0; + } +} + +int CcipDevice::write_block(aocl_mmd_op_t op, int mmd_interface, const void *host_addr, size_t offset, size_t size) { + fpga_result res; + + // The mmd_interface is defined as the base address of the MMIO write. Access + // to memory requires special functionality. Otherwise do direct MMIO write + if (mmd_interface == AOCL_MMD_MEMORY) { + res = dma_host_to_fpga->write_memory(op, static_cast<const uint64_t *>(host_addr), offset, size); + } else { + res = write_mmio(host_addr, mmd_interface + offset, size); + + if (op) { + // TODO: check what 'status' value should really be. Right now just + // using 0 as was done in previous CCIP MMD. 
Also handle case if op is NULL + this->event_update_fn(op, 0); + } + } + + // TODO: check what status values aocl wants and also parse the result + if (res != FPGA_OK) { + LOG_ERR("fpgaWriteMMIO error: %s\n", fpgaErrStr(res)); + return -1; + } else { + return 0; + } +} + +fpga_result CcipDevice::read_mmio(void *host_addr, size_t mmio_addr, size_t size) { + fpga_result res = FPGA_OK; + + DCP_DEBUG_MEM("read_mmio start: %p\t %lx\t %lu\n", host_addr, mmio_addr, size); + + // HACK: need extra delay for opencl sw reset + if (mmio_addr == KERNEL_SW_RESET_BASE) OPENCL_SW_RESET_DELAY(); + + uint64_t *host_addr64 = static_cast<uint64_t *>(host_addr); + while (size >= 8) { + res = fpgaReadMMIO64(afc_handle, 0, mmio_addr, host_addr64); + if (res != FPGA_OK) return res; + host_addr64 += 1; + mmio_addr += 8; + size -= 8; + } + + uint32_t *host_addr32 = reinterpret_cast<uint32_t *>(host_addr64); + while (size >= 4) { + res = fpgaReadMMIO32(afc_handle, 0, mmio_addr, host_addr32); + if (res != FPGA_OK) return res; + host_addr32 += 1; + mmio_addr += 4; + size -= 4; + } + + if (size > 0) { + uint32_t read_data; + res = fpgaReadMMIO32(afc_handle, 0, mmio_addr, &read_data); + if (res != FPGA_OK) return res; + memcpy_s_fast(host_addr32, size, &read_data, size); + } + + return res; +} + +fpga_result CcipDevice::write_mmio(const void *host_addr, size_t mmio_addr, size_t size) { + fpga_result res = FPGA_OK; + + DEBUG_PRINT("write_mmio\n"); + + // HACK: need extra delay for opencl sw reset + if (mmio_addr == KERNEL_SW_RESET_BASE) OPENCL_SW_RESET_DELAY(); + + const uint64_t *host_addr64 = static_cast<const uint64_t *>(host_addr); + while (size >= 8) { + res = fpgaWriteMMIO64(afc_handle, 0, mmio_addr, *host_addr64); + if (res != FPGA_OK) return res; + host_addr64 += 1; + mmio_addr += 8; + size -= 8; + } + + const uint32_t *host_addr32 = reinterpret_cast<const uint32_t *>(host_addr64); + while (size > 0) { + uint32_t tmp_data32 = 0; + size_t chunk_size = (size >= 4) ? 
4 : size; + memcpy_s_fast(&tmp_data32, sizeof(tmp_data32), host_addr32, chunk_size); + res = fpgaWriteMMIO32(afc_handle, 0, mmio_addr, tmp_data32); + if (res != FPGA_OK) return res; + host_addr32 += 1; + mmio_addr += chunk_size; + size -= chunk_size; + } + + return res; +} diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/ccip_mmd_device.h b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/ccip_mmd_device.h new file mode 100644 index 0000000..f8088ac --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/ccip_mmd_device.h @@ -0,0 +1,187 @@ +/* (C) 1992-2017 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. 
*/ + +#ifndef _CCIP_MMD_DEVICE_H +#define _CCIP_MMD_DEVICE_H + +#include <limits.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include <string> + +#pragma push_macro("_GNU_SOURCE") +#undef _GNU_SOURCE +#define _GNU_SOURCE +#include <sched.h> +#pragma pop_macro("_GNU_SOURCE") + +#include <opae/fpga.h> +#include <uuid/uuid.h> + +#include "aocl_mmd.h" +#include "kernel_interrupt.h" +#include "mmd_dma.h" + +// Tune delay for simulation or HW. Eventually delay +// should be removed for HW, may still be needed for ASE simulation +#ifdef SIM +#define DELAY_MULTIPLIER 100 +#else +#define DELAY_MULTIPLIER 1 +#endif + +// Most AOCL_MMD_CALL functions return negative number in case of error, +// CCIP_MMD_AOCL_ERR is used to indicate an error from the MMD that is being +// returned to the runtime. Simply set to -2 for now since neither interface +// defines a meaning to return codes for errors. +#define CCIP_MMD_AOCL_ERR -1 + +// NOTE: some of the code relies on invalid handle returning -1 +// future TODO eliminate dependency on specific error values +#define CCIP_MMD_INVALID_PARAM -1 + +// Our diagnostic script relies on handle values < -1 to determine when +// a valid device is present but a functioning BSP is not loaded. +#define CCIP_MMD_BSP_NOT_LOADED -2 +#define CCIP_MMD_BSP_INIT_FAILED -3 + +// Delay settings +// TODO: Figure out why these delays are needed and +// have requirement removed (at least for HW) +#define MMIO_DELAY() +#define YIELD_DELAY() usleep(1 * DELAY_MULTIPLIER) +#define OPENCL_SW_RESET_DELAY() usleep(5000 * DELAY_MULTIPLIER) +#define AFU_RESET_DELAY() usleep(20000 * DELAY_MULTIPLIER) + +#define KERNEL_SW_RESET_BASE (AOCL_MMD_KERNEL + 0x30) + +#define DCP_OPENCL_BSP_AFU_ID "63B3779B-8BDD-4F03-9CEB-0301181D6AEF" + +#define BSP_NAME "pac_" + +// LOG ERRORS +#define CCIP_MMD_ERR_LOGGING 1 +#ifdef CCIP_MMD_ERR_LOGGING +#define LOG_ERR(...) 
fprintf(stderr, __VA_ARGS__) +#else +#define LOG_ERR(...) +#endif + +// debugging +#ifdef DEBUG +#define DEBUG_PRINT(...) fprintf(stderr, __VA_ARGS__) +#else +#define DEBUG_PRINT(...) +#endif + +#ifdef DEBUG_MEM +#define DCP_DEBUG_MEM(...) fprintf(stderr, __VA_ARGS__) +#else +#define DCP_DEBUG_MEM(...) +#endif + +enum { +#ifndef DLA_MMD // IRQ offsets no longer exist in DLA hardware (removed from board.qsys) + AOCL_IRQ_POLLING_BASE = 0x0100, // CSR to polling interrupt status + AOCL_IRQ_MASKING_BASE = 0x0108, // CSR to set/unset interrupt mask + AOCL_MMD_KERNEL = 0x4000, /* Control interface into kernel interface */ +#else + AOCL_MMD_KERNEL = 0, // CoreDLA completely removes the Opencl kernel interface, repurposed for CSRs +#endif + AOCL_MMD_MEMORY = 0x100000 /* Data interface to device memory */ +}; + +enum AfuStatu { CCIP_MMD_INVALID_ID = 0, CCIP_MMD_BSP, CCIP_MMD_AFU }; + +class CcipDevice final { + public: + CcipDevice(uint64_t); + CcipDevice(const CcipDevice &) = delete; + CcipDevice &operator=(const CcipDevice &) = delete; + ~CcipDevice(); + + static std::string get_board_name(std::string prefix, uint64_t obj_id); + static bool parse_board_name(const char *board_name, uint64_t &obj_id); + + int get_mmd_handle() { return mmd_handle; } + uint64_t get_fpga_obj_id() { return fpga_obj_id; } + std::string get_dev_name() { return mmd_dev_name; } + std::string get_bdf(); + float get_temperature(); + bool initialize_bsp(); + void set_kernel_interrupt(aocl_mmd_interrupt_handler_fn fn, void *user_data); + void set_status_handler(aocl_mmd_status_handler_fn fn, void *user_data); + int yield(); + void event_update_fn(aocl_mmd_op_t op, int status); + bool bsp_loaded(); + + int read_block(aocl_mmd_op_t op, int mmd_interface, void *host_addr, size_t dev_addr, size_t size); + + int write_block(aocl_mmd_op_t op, int mmd_interface, const void *host_addr, size_t dev_addr, size_t size); + + private: + static int next_mmd_handle; + + int mmd_handle; + uint64_t fpga_obj_id; + 
std::string mmd_dev_name; + intel_opae_mmd::KernelInterrupt *kernel_interrupt_thread; + aocl_mmd_status_handler_fn event_update; + void *event_update_user_data; + + // HACK: use the sysfs path to read temperature value and NUMA node + // this should be replaced with OPAE call once that is + // available + std::string fme_sysfs_temp_path; + std::string fpga_numa_node; + bool enable_set_numa; + bool fme_sysfs_temp_initialized; + void initialize_fme_sysfs(); + + void initialize_local_cpus_sysfs(); + + bool find_dma_dfh_offsets(); + + uint8_t bus; + uint8_t device; + uint8_t function; + + bool afu_initialized; + bool bsp_initialized; + bool mmio_is_mapped; + + fpga_handle afc_handle; + fpga_properties filter; + fpga_token afc_token; + uint64_t dma_ch0_dfh_offset; + uint64_t dma_ch1_dfh_offset; + uint64_t dma_ase_dfh_offset; + intel_opae_mmd::mmd_dma *dma_host_to_fpga; + intel_opae_mmd::mmd_dma *dma_fpga_to_host; + + char *mmd_copy_buffer; + + // Helper functions + fpga_result read_mmio(void *host_addr, size_t dev_addr, size_t size); + fpga_result write_mmio(const void *host_addr, size_t dev_addr, size_t size); +}; + +#endif // _CCIP_MMD_DEVICE_H diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/dma_work_thread.cpp b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/dma_work_thread.cpp new file mode 100644 index 0000000..30113eb --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/dma_work_thread.cpp @@ -0,0 +1,151 @@ +/* (C) 1992-2017 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. 
See Trademarks on intel.com for full list of Intel trademarks or */
+/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */
+/* Your use of Intel Corporation's design tools, logic functions and other */
+/* software and tools, and its AMPP partner logic functions, and any output */
+/* files any of the foregoing (including device programming or simulation */
+/* files), and any associated documentation or information are expressly subject */
+/* to the terms and conditions of the Altera Program License Subscription */
+/* Agreement, Intel MegaCore Function License Agreement, or other applicable */
+/* license agreement, including, without limitation, that your use is for the */
+/* sole purpose of programming logic devices manufactured by Intel and sold by */
+/* Intel or its authorized distributors. Please refer to the applicable */
+/* agreement for further details. */
+
+#include "dma_work_thread.h"
+#include <assert.h>
+#include <poll.h>
+#include <stdlib.h>
+#include <string.h>
+#include <cstdint>
+#include <iostream>
+#include <thread>
+#include "ccip_mmd_device.h"
+#include "eventfd_wrapper.h"
+#include "mmd_dma.h"
+
+using namespace intel_opae_mmd;
+
+// Construct the DMA work-thread object: create the eventfd used to wake the
+// worker, then start the worker thread running work_thread().
+// If eventfd creation fails, the object is left with m_initialized == false
+// and no thread is started (callers must check initialized()).
+dma_work_thread::dma_work_thread(mmd_dma &mmd_dma_arg)
+    : m_initialized(false),
+      m_thread_wake_event(NULL),
+      m_thread(NULL),
+      m_work_queue_mutex(),
+      m_work_queue(),
+      m_mmd_dma(mmd_dma_arg) {
+  m_thread_wake_event = new eventfd_wrapper();
+  if (!m_thread_wake_event->initialized()) return;
+
+  m_thread = new std::thread(work_thread, std::ref(*this));
+
+  m_initialized = true;
+}
+
+// Stop and join the worker thread, then release the eventfd wrapper.
+// UINT64_MAX - 1 is the shutdown sentinel recognized by work_thread().
+dma_work_thread::~dma_work_thread() {
+  // kill the thread
+  if (m_thread) {
+    // send message to thread to end it
+    m_thread_wake_event->notify(UINT64_MAX - 1);
+
+    // join with thread until it ends
+    m_thread->join();
+
+    delete m_thread;
+    m_thread = NULL;
+  }
+
+  if (m_thread_wake_event) {
+    delete m_thread_wake_event;
+    m_thread_wake_event = NULL;
+  }
+
+  m_initialized = false;
+}
+
+void dma_work_thread::work_thread(dma_work_thread &obj) { + int res; + + // get eventfd handle + int thread_signal_fd = obj.m_thread_wake_event->get_fd(); + + struct pollfd pollfd_setup; + while (1) { + pollfd_setup.fd = thread_signal_fd; + pollfd_setup.events = POLLIN; + pollfd_setup.revents = 0; + res = poll(&pollfd_setup, 1, -1); + if (res < 0) { + fprintf(stderr, "Poll error errno = %s\n", strerror(errno)); + } else if (res > 0 && pollfd_setup.revents == POLLIN) { + uint64_t count_work_items = 0; + ssize_t bytes_read = read(thread_signal_fd, &count_work_items, sizeof(count_work_items)); + if (bytes_read > 0) { + DEBUG_PRINT("Poll success. Return=%d count=%lu\n", res, count); + } else { + // TODO: the MMD should not exit. But I have a different branch + // I'm working on that will change synchronization to use + // condition variable instead of eventfd in synchronization + // within the same process. Will remove this exit() call at + // when PR for that change is submitted. + fprintf(stderr, "Error: poll failed: %s\n", bytes_read < 0 ? strerror(errno) : "zero bytes read"); + exit(-1); + } + + // Ensure count is in proper range + const unsigned long MAX_WORK_ITEMS = 1000000000; + if (count_work_items > MAX_WORK_ITEMS && count_work_items != (UINT64_MAX - 1)) { + fprintf(stderr, "Error: poll value is out of range"); + exit(-1); + } + + obj.m_work_queue_mutex.lock(); + if (obj.m_work_queue.empty() && count_work_items == UINT64_MAX - 1) { + // The maximum value of count is set when there is no work left + // The work queue must also be empty + // This thread can break out of the loop + obj.m_work_queue_mutex.unlock(); + break; + } + + std::queue<dma_work_item> items; + for (uint64_t i = 0; i < count_work_items; i++) { + // Check if there are enough jobs in the work queue as requested (count) + if (obj.m_work_queue.empty()) { + fprintf(stderr, "Poll error. 
Not enough tasks in queue."); + exit(-1); + } + dma_work_item item = obj.m_work_queue.front(); + items.push(item); + obj.m_work_queue.pop(); + } + obj.m_work_queue_mutex.unlock(); + + while (!items.empty()) { + dma_work_item item = items.front(); + obj.do_dma(item); + items.pop(); + } + } + } +} + +int dma_work_thread::enqueue_dma(dma_work_item &item) { + if (item.op) { + m_work_queue_mutex.lock(); + m_work_queue.push(item); + m_work_queue_mutex.unlock(); + // send message to thread to wake it + // setting count to 1 as only 1 job is pushed to the work queue + m_thread_wake_event->notify(1); + return 0; + } else { + // if op is not specified, it is a blocking operation and we don't use + // the thread + return do_dma(item); + } +} + +int dma_work_thread::do_dma(dma_work_item &item) { return m_mmd_dma.do_dma(item); } diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/dma_work_thread.h b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/dma_work_thread.h new file mode 100644 index 0000000..0afb036 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/dma_work_thread.h @@ -0,0 +1,73 @@ +/* (C) 1992-2017 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. 
See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. */ + +#ifndef _DMA_WORK_THREAD_H +#define _DMA_WORK_THREAD_H + +#include <opae/fpga.h> + +#include <mutex> +#include <queue> +#include <thread> + +#include "aocl_mmd.h" + +namespace intel_opae_mmd { + +// forward class definitions +class eventfd_wrapper; +class mmd_dma; + +class dma_work_item { + public: + aocl_mmd_op_t op; + uint64_t *rd_host_addr; + const uint64_t *wr_host_addr; + size_t dev_addr; + size_t size; +}; + +class dma_work_thread final { + public: + dma_work_thread(mmd_dma &mmd_dma_arg); + ~dma_work_thread(); + + bool initialized() { return m_initialized; } + + int enqueue_dma(dma_work_item &item); + int do_dma(dma_work_item &item); + + private: + static void work_thread(dma_work_thread &obj); + + bool m_initialized; + + eventfd_wrapper *m_thread_wake_event; + std::thread *m_thread; + std::mutex m_work_queue_mutex; + std::queue<dma_work_item> m_work_queue; + + mmd_dma &m_mmd_dma; + + // not used and not implemented + dma_work_thread(dma_work_thread &other); + dma_work_thread &operator=(const dma_work_thread &other); +}; // class dma_work_thread + 
+}; // namespace intel_opae_mmd + +#endif // _DMA_WORK_THREAD_H diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/eventfd_wrapper.h b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/eventfd_wrapper.h new file mode 100644 index 0000000..2de3f74 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/eventfd_wrapper.h @@ -0,0 +1,74 @@ +/* (C) 1992-2017 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. 
*/ + +#ifndef _EVENTFD_WRAPPER_H +#define _EVENTFD_WRAPPER_H + +#include <sys/eventfd.h> +#include <unistd.h> + +namespace intel_opae_mmd { + +// simple wrapper class for managing eventfd objects +class eventfd_wrapper final { + public: + eventfd_wrapper() { + m_initialized = false; + // Note: EFD_SEMAPHORE and EFD_NONBLOCK are not set + // The implementation of functions using eventfd assumes that + m_fd = eventfd(0, 0); + if (m_fd < 0) { + fprintf(stderr, "eventfd : %s", strerror(errno)); + return; + } + + m_initialized = true; + } + + ~eventfd_wrapper() { + if (m_initialized) { + if (close(m_fd) < 0) { + fprintf(stderr, "eventfd : %s", strerror(errno)); + } + } + } + + bool notify(uint64_t count) { + ssize_t res = write(m_fd, &count, sizeof(count)); + if (res < 0) { + fprintf(stderr, "eventfd : %s", strerror(errno)); + return false; + } + return true; + } + + int get_fd() { return m_fd; } + bool initialized() { return m_initialized; } + + private: + // not used and not implemented + eventfd_wrapper(eventfd_wrapper& other); + eventfd_wrapper& operator=(const eventfd_wrapper& other); + + // member varaibles + int m_fd; + int m_initialized; +}; // class eventfd_wrapper + +}; // namespace intel_opae_mmd + +#endif // _EVENTFD_WRAPPER_H diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/fpga_dma.c b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/fpga_dma.c new file mode 100644 index 0000000..6c8df30 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/fpga_dma.c @@ -0,0 +1,1313 @@ +// Copyright 2018-2020 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). 
Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. + +// This is derived from OPAE + OpenCL PAC BSP + +/** + * \fpga_dma.c + * \brief FPGA DMA User-mode driver + */ + +#include "fpga_dma.h" +#include <assert.h> +#include <errno.h> +#include <opae/fpga.h> +#include <poll.h> +#include <safe_string/safe_string.h> +#include <stdbool.h> +#include <stddef.h> +#include <stdlib.h> +#include <string.h> +#include <sys/mman.h> +#include <unistd.h> +#include "fpga_dma_internal.h" +#include "memcpy_s_fast.h" + +#ifdef SIM +#define USE_ASE +#else +// TODO: Need this until we can adequately sync MMIO R/W with pointer accesses. +// Causes module to use fpgaMMIORead32() instead of foo = *ptr; +#define USE_ASE +#endif + +#ifdef FPGA_DMA_DEBUG +static int err_cnt = 0; +#endif + +#ifdef CHECK_DELAYS +double poll_wait_count = 0; +double buf_full_count = 0; +#endif + +/* + * macro for checking return codes + */ +#define ON_ERR_GOTO(res, label, desc) \ + do { \ + if ((res) != FPGA_OK) { \ + error_print("Error %s: %s\n", (desc), fpgaErrStr(res)); \ + goto label; \ + } \ + } while (0) + +#define ON_ERR_RETURN(res, desc) \ + do { \ + if ((res) != FPGA_OK) { \ + error_print("Error %s: %s\n", (desc), fpgaErrStr(res)); \ + return (res); \ + } \ + } while (0) + +// Internal Functions + +/** + * MMIOWrite64Blk + * + * @brief Writes a block of 64-bit values to FPGA MMIO space + * @param[in] dma Handle to the FPGA DMA object + * @param[in] device FPGA address + * @param[in] host Host buffer address + * @param[in] count Size in bytes + * @return fpga_result FPGA_OK on success, return code otherwise + * + */ +static fpga_result MMIOWrite64Blk(fpga_dma_handle dma_h, uint64_t 
device, uint64_t host, uint64_t bytes) { + assert(IS_ALIGNED_QWORD(device)); + assert(IS_ALIGNED_QWORD(bytes)); + + uint64_t *haddr = (uint64_t *)host; + uint64_t i; + fpga_result res = FPGA_OK; + +#ifndef USE_ASE + volatile uint64_t *dev_addr = HOST_MMIO_64_ADDR(dma_h, device); +#endif + + debug_print("copying %lld bytes from 0x%p to 0x%p\n", (long long int)bytes, haddr, (void *)device); + for (i = 0; i < bytes / sizeof(uint64_t); i++) { +#ifdef USE_ASE + res = fpgaWriteMMIO64(dma_h->fpga_h, dma_h->mmio_num, device, *haddr); + ON_ERR_RETURN(res, "fpgaWriteMMIO64"); + haddr++; + device += sizeof(uint64_t); +#else + *dev_addr++ = *haddr++; +#endif + } + return res; +} + +/** + * MMIOWrite32Blk + * + * @brief Writes a block of 32-bit values to FPGA MMIO space + * @param[in] dma Handle to the FPGA DMA object + * @param[in] device FPGA address + * @param[in] host Host buffer address + * @param[in] count Size in bytes + * @return fpga_result FPGA_OK on success, return code otherwise + * + */ +static fpga_result MMIOWrite32Blk(fpga_dma_handle dma_h, uint64_t device, uint64_t host, uint64_t bytes) { + assert(IS_ALIGNED_DWORD(device)); + assert(IS_ALIGNED_DWORD(bytes)); + + uint32_t *haddr = (uint32_t *)host; + uint64_t i; + fpga_result res = FPGA_OK; + +#ifndef USE_ASE + volatile uint32_t *dev_addr = HOST_MMIO_32_ADDR(dma_h, device); +#endif + + debug_print("copying %lld bytes from 0x%p to 0x%p\n", (long long int)bytes, haddr, (void *)device); + for (i = 0; i < bytes / sizeof(uint32_t); i++) { +#ifdef USE_ASE + res = fpgaWriteMMIO32(dma_h->fpga_h, dma_h->mmio_num, device, *haddr); + ON_ERR_RETURN(res, "fpgaWriteMMIO32"); + haddr++; + device += sizeof(uint32_t); +#else + *dev_addr++ = *haddr++; +#endif + } + return res; +} + +/** + * MMIORead64Blk + * + * @brief Reads a block of 64-bit values from FPGA MMIO space + * @param[in] dma Handle to the FPGA DMA object + * @param[in] device FPGA address + * @param[in] host Host buffer address + * @param[in] count Size in bytes + * 
@return fpga_result FPGA_OK on success, return code otherwise + * + */ +static fpga_result MMIORead64Blk(fpga_dma_handle dma_h, uint64_t device, uint64_t host, uint64_t bytes) { + assert(IS_ALIGNED_QWORD(device)); + assert(IS_ALIGNED_QWORD(bytes)); + + uint64_t *haddr = (uint64_t *)host; + uint64_t i; + fpga_result res = FPGA_OK; + +#ifndef USE_ASE + volatile uint64_t *dev_addr = HOST_MMIO_64_ADDR(dma_h, device); +#endif + + debug_print("copying %lld bytes from 0x%p to 0x%p\n", (long long int)bytes, (void *)device, haddr); + for (i = 0; i < bytes / sizeof(uint64_t); i++) { +#ifdef USE_ASE + res = fpgaReadMMIO64(dma_h->fpga_h, dma_h->mmio_num, device, haddr); + ON_ERR_RETURN(res, "fpgaReadMMIO64"); + haddr++; + device += sizeof(uint64_t); +#else + *haddr++ = *dev_addr++; +#endif + } + return res; +} + +/** + * MMIORead32Blk + * + * @brief Reads a block of 32-bit values from FPGA MMIO space + * @param[in] dma Handle to the FPGA DMA object + * @param[in] device FPGA address + * @param[in] host Host buffer address + * @param[in] count Size in bytes + * @return fpga_result FPGA_OK on success, return code otherwise + * + */ +static fpga_result MMIORead32Blk(fpga_dma_handle dma_h, uint64_t device, uint64_t host, uint64_t bytes) { + assert(IS_ALIGNED_DWORD(device)); + assert(IS_ALIGNED_DWORD(bytes)); + + uint32_t *haddr = (uint32_t *)host; + uint64_t i; + fpga_result res = FPGA_OK; + +#ifndef USE_ASE + volatile uint32_t *dev_addr = HOST_MMIO_32_ADDR(dma_h, device); +#endif + + debug_print("copying %lld bytes from 0x%p to 0x%p\n", (long long int)bytes, (void *)device, haddr); + for (i = 0; i < bytes / sizeof(uint32_t); i++) { +#ifdef USE_ASE + res = fpgaReadMMIO32(dma_h->fpga_h, dma_h->mmio_num, device, haddr); + ON_ERR_RETURN(res, "fpgaReadMMIO32"); + haddr++; + device += sizeof(uint32_t); +#else + *haddr++ = *dev_addr++; +#endif + } + return res; +} + +// Feature type is BBB +static inline bool fpga_dma_feature_is_bbb(uint64_t dfh) { + // BBB is type 2 + return ((dfh >> 
AFU_DFH_TYPE_OFFSET) & 0xf) == FPGA_DMA_BBB; +} + +/** + * _switch_to_ase_page + * + * @brief Updates the current page of ASE to the address given + * @param[in] dma_h Handle to the FPGA DMA object + * @param[in] addr Address to which the ASE page should be switched + * @return Nothing. Side-effect is to update the current page in the DMA handle. + * + */ +static inline void _switch_to_ase_page(fpga_dma_handle dma_h, uint64_t addr) { + uint64_t requested_page = addr & ~DMA_ADDR_SPAN_EXT_WINDOW_MASK; + + if (requested_page != dma_h->cur_ase_page) { + MMIOWrite64Blk(dma_h, ASE_CNTL_BASE(dma_h), (uint64_t)&requested_page, sizeof(requested_page)); + dma_h->cur_ase_page = requested_page; + } +} + +/** + * _send_descriptor + * + * @brief Queues a DMA descriptor to the FPGA + * @param[in] dma_h Handle to the FPGA DMA object + * @param[in] desc Pointer to a descriptor structure to send + * @return fpga_result FPGA_OK on success, return code otherwise + * + */ +static fpga_result _send_descriptor(fpga_dma_handle dma_h, msgdma_ext_desc_t *desc) { + fpga_result res = FPGA_OK; + msgdma_status_t status = {0}; + + debug_print("desc.rd_address = %x\n", desc->rd_address); + debug_print("desc.wr_address = %x\n", desc->wr_address); + debug_print("desc.len = %x\n", desc->len); + debug_print("desc.wr_burst_count = %x\n", desc->wr_burst_count); + debug_print("desc.rd_burst_count = %x\n", desc->rd_burst_count); + debug_print("desc.wr_stride %x\n", desc->wr_stride); + debug_print("desc.rd_stride %x\n", desc->rd_stride); + debug_print("desc.rd_address_ext %x\n", desc->rd_address_ext); + debug_print("desc.wr_address_ext %x\n", desc->wr_address_ext); + + debug_print("SGDMA_CSR_BASE = %lx SGDMA_DESC_BASE=%lx\n", dma_h->dma_csr_base, dma_h->dma_desc_base); + +#ifdef CHECK_DELAYS + bool first = true; +#endif + do { + res = MMIORead32Blk(dma_h, CSR_STATUS(dma_h), (uint64_t)&status.reg, sizeof(status.reg)); + ON_ERR_GOTO(res, out, "MMIORead32Blk"); +#ifdef CHECK_DELAYS + if (first && 
status.st.desc_buf_full) { + buf_full_count++; + first = false; + } +#endif + } while (status.st.desc_buf_full); + + res = MMIOWrite64Blk(dma_h, dma_h->dma_desc_base, (uint64_t)desc, sizeof(*desc)); + ON_ERR_GOTO(res, out, "MMIOWrite64Blk"); + +out: + return res; +} + +/** + * _do_dma + * + * @brief Performs a DMA transaction with the FPGA + * @param[in] dma_h Handle to the FPGA DMA object + * @param[in] dst Pointer to a host or FPGA buffer to send or retrieve + * @param[in] src Pointer to a host or FPGA buffer to send or retrieve + * @param[in] count Number of bytes + * @param[in] is_last_desc True if this is the last buffer of a batch + * @param[in] type Direction of transfer + * @param[in] intr_en True means to ask for an interrupt from the FPGA + * @return fpga_result FPGA_OK on success, return code otherwise + * + */ +static fpga_result _do_dma(fpga_dma_handle dma_h, + uint64_t dst, + uint64_t src, + int count, + int is_last_desc, + fpga_dma_transfer_t type, + bool intr_en) { + msgdma_ext_desc_t desc = {0}; + fpga_result res = FPGA_OK; + int alignment_offset = 0; + int segment_size = 0; + + // src, dst and count must be 64-byte aligned + if (dst % FPGA_DMA_ALIGN_BYTES != 0 || src % FPGA_DMA_ALIGN_BYTES != 0 || count % FPGA_DMA_ALIGN_BYTES != 0) { + return FPGA_INVALID_PARAM; + } + // these fields are fixed for all DMA transfers + desc.seq_num = 0; + desc.wr_stride = 1; + desc.rd_stride = 1; + + desc.control.go = 1; + if (intr_en) + desc.control.transfer_irq_en = 1; + else + desc.control.transfer_irq_en = 0; + + // Enable "earlyreaddone" in the control field of the descriptor except the last. + // Setting early done causes the read logic to move to the next descriptor + // before the previous descriptor completes. + // This elminates a few hundred clock cycles of waiting between transfers. 
+ if (!is_last_desc) + desc.control.early_done_en = 1; + else + desc.control.early_done_en = 0; + + if (type == FPGA_TO_FPGA_MM) { + desc.rd_address = src & FPGA_DMA_MASK_32_BIT; + desc.wr_address = dst & FPGA_DMA_MASK_32_BIT; + desc.len = count; + desc.wr_burst_count = 4; + desc.rd_burst_count = 4; + desc.rd_address_ext = (src >> 32) & FPGA_DMA_MASK_32_BIT; + desc.wr_address_ext = (dst >> 32) & FPGA_DMA_MASK_32_BIT; + + res = _send_descriptor(dma_h, &desc); + ON_ERR_GOTO(res, out, "_send_descriptor"); + } + // either FPGA to Host or Host to FPGA transfer so we need to make sure the DMA transaction is aligned to the burst + // size (CCIP restriction) + else { + // need to determine if the CCIP (host) address is aligned to 4CL (256B). When 0 the CCIP address is aligned. + alignment_offset = + (type == HOST_TO_FPGA_MM) ? (src % (4 * FPGA_DMA_ALIGN_BYTES)) : (dst % (4 * FPGA_DMA_ALIGN_BYTES)); + + // not aligned to 4CL so performing a short transfer to get aligned + if (alignment_offset != 0) { + desc.rd_address = src & FPGA_DMA_MASK_32_BIT; + desc.wr_address = dst & FPGA_DMA_MASK_32_BIT; + desc.wr_burst_count = 1; + desc.rd_burst_count = 1; + desc.rd_address_ext = (src >> 32) & FPGA_DMA_MASK_32_BIT; + desc.wr_address_ext = (dst >> 32) & FPGA_DMA_MASK_32_BIT; + + // count isn't large enough to hit next 4CL boundary + if (((4 * FPGA_DMA_ALIGN_BYTES) - alignment_offset) >= count) { + segment_size = count; + count = 0; // only had to transfer count amount of data to reach the end of the provided buffer + } else { + segment_size = (4 * FPGA_DMA_ALIGN_BYTES) - alignment_offset; + src += segment_size; + dst += segment_size; + count -= segment_size; // subtract the segment size from count since the transfer below will bring us into 4CL + // alignment + desc.control.transfer_irq_en = 0; + } + + // will post short transfer to align to a 4CL (256 byte) boundary + desc.len = segment_size; + + res = _send_descriptor(dma_h, &desc); + ON_ERR_GOTO(res, out, "_send_descriptor"); + } 
+ // at this point we are 4CL (256 byte) aligned + // if there is at least 4CL (256 bytes) of data to transfer, post bursts of 4 + if (count >= (4 * FPGA_DMA_ALIGN_BYTES)) { + desc.rd_address = src & FPGA_DMA_MASK_32_BIT; + desc.wr_address = dst & FPGA_DMA_MASK_32_BIT; + desc.wr_burst_count = 4; + desc.rd_burst_count = 4; + desc.rd_address_ext = (src >> 32) & FPGA_DMA_MASK_32_BIT; + desc.wr_address_ext = (dst >> 32) & FPGA_DMA_MASK_32_BIT; + + // buffer ends on 4CL boundary + if ((count % (4 * FPGA_DMA_ALIGN_BYTES)) == 0) { + segment_size = count; + count = 0; // transfer below will move the remainder of the buffer + } + // buffers do not end on 4CL boundary so transfer only up to the last 4CL boundary leaving a segment at the end to + // finish later + else { + segment_size = count - (count % (4 * FPGA_DMA_ALIGN_BYTES)); // round count down to the nearest multiple of 4CL + src += segment_size; + dst += segment_size; + count -= segment_size; + desc.control.transfer_irq_en = 0; + } + + desc.len = segment_size; + + res = _send_descriptor(dma_h, &desc); + ON_ERR_GOTO(res, out, "_send_descriptor"); + } + // at this point we have posted all the bursts of length 4 we can but there might be 64, 128, or 192 bytes of data + // to transfer still if buffer did not end on 4CL (256 byte) boundary post short transfer to handle the remainder + if (count > 0) { + desc.rd_address = src & FPGA_DMA_MASK_32_BIT; + desc.wr_address = dst & FPGA_DMA_MASK_32_BIT; + desc.len = count; + desc.wr_burst_count = 1; + desc.rd_burst_count = 1; + desc.rd_address_ext = (src >> 32) & FPGA_DMA_MASK_32_BIT; + desc.wr_address_ext = (dst >> 32) & FPGA_DMA_MASK_32_BIT; + if (intr_en) desc.control.transfer_irq_en = 1; + // will post short transfer to move the remainder of the buffer + res = _send_descriptor(dma_h, &desc); + ON_ERR_GOTO(res, out, "_send_descriptor"); + } + + } // end of FPGA --> Host or Host --> FPGA transfer + +out: + return res; +} + +fpga_result fpgaDmaChannelOpen(fpga_handle fpga, 
uint64_t dfh_offset, int interrupt_num, fpga_dma_handle *dma_p) { + fpga_result res = FPGA_OK; + fpga_dma_handle dma_h = NULL; + int i = 0; + if (!fpga) { + return FPGA_INVALID_PARAM; + } + if (!dma_p) { + return FPGA_INVALID_PARAM; + } + // init the dma handle + dma_h = (fpga_dma_handle)malloc(sizeof(struct _dma_handle_t)); + if (!dma_h) { + return FPGA_NO_MEMORY; + } + dma_h->fpga_h = fpga; + for (i = 0; i < FPGA_DMA_MAX_BUF; i++) dma_h->dma_buf_ptr[i] = NULL; + dma_h->mmio_num = 0; + dma_h->cur_ase_page = 0xffffffffffffffffUll; + + // Discover DMA BBB by traversing the device feature list + bool dma_found = false; + +#ifndef USE_ASE + res = fpgaMapMMIO(dma_h->fpga_h, 0, (uint64_t **)&dma_h->mmio_va); + ON_ERR_GOTO(res, out, "fpgaMapMMIO"); +#endif + + dfh_feature_t dfh = {0}; + res = MMIORead64Blk(dma_h, dfh_offset, (uint64_t)&dfh, sizeof(dfh)); + ON_ERR_GOTO(res, out, "MMIORead64Blk"); + + if (fpga_dma_feature_is_bbb(dfh.dfh) && (dfh.feature_uuid_lo == FPGA_DMA_UUID_L) && + (dfh.feature_uuid_hi == FPGA_DMA_UUID_H)) { + dma_h->dma_base = dfh_offset; + dma_h->dma_csr_base = dma_h->dma_base + FPGA_DMA_CSR; + dma_h->dma_desc_base = dma_h->dma_base + FPGA_DMA_DESC; + dma_h->dma_ase_cntl_base = dma_h->dma_base + FPGA_DMA_ADDR_SPAN_EXT_CNTL; + dma_h->dma_ase_data_base = dma_h->dma_base + FPGA_DMA_ADDR_SPAN_EXT_DATA; + dma_found = true; + *dma_p = dma_h; + res = FPGA_OK; + } else { + *dma_p = NULL; + res = FPGA_NOT_FOUND; + goto out; + } + + // Buffer size must be page aligned for prepareBuffer + for (i = 0; i < FPGA_DMA_MAX_BUF; i++) { + res = fpgaPrepareBuffer( + dma_h->fpga_h, FPGA_DMA_BUF_SIZE, (void **)&(dma_h->dma_buf_ptr[i]), &dma_h->dma_buf_wsid[i], 0); + ON_ERR_GOTO(res, out, "fpgaPrepareBuffer"); + + // Make sure it's actually allocated + dma_h->dma_buf_ptr[i][0] = 0xff; + madvise((void *)dma_h->dma_buf_ptr[i], FPGA_DMA_BUF_SIZE, MADV_SEQUENTIAL); + + res = fpgaGetIOAddress(dma_h->fpga_h, dma_h->dma_buf_wsid[i], &dma_h->dma_buf_iova[i]); + ON_ERR_GOTO(res, 
rel_buf, "fpgaGetIOAddress"); + } + + // Allocate magic number buffer + res = fpgaPrepareBuffer(dma_h->fpga_h, FPGA_DMA_ALIGN_BYTES, (void **)&(dma_h->magic_buf), &dma_h->magic_wsid, 0); + ON_ERR_GOTO(res, out, "fpgaPrepareBuffer"); + + dma_h->magic_buf[0] = 0xff; + + res = fpgaGetIOAddress(dma_h->fpga_h, dma_h->magic_wsid, &dma_h->magic_iova); + ON_ERR_GOTO(res, rel_buf, "fpgaGetIOAddress"); + memset((void *)dma_h->magic_buf, 0, FPGA_DMA_ALIGN_BYTES); + + // turn on global interrupts + msgdma_ctrl_t ctrl = {0}; + ctrl.ct.global_intr_en_mask = 1; + res = MMIOWrite32Blk(dma_h, CSR_CONTROL(dma_h), (uint64_t)&ctrl.reg, sizeof(ctrl.reg)); + ON_ERR_GOTO(res, rel_buf, "MMIOWrite32Blk"); + + // register interrupt event handle + res = fpgaCreateEventHandle(&dma_h->eh); + ON_ERR_GOTO(res, rel_buf, "fpgaCreateEventHandle"); + + res = fpgaRegisterEvent(dma_h->fpga_h, FPGA_EVENT_INTERRUPT, dma_h->eh, interrupt_num /*vector id */); + ON_ERR_GOTO(res, destroy_eh, "fpgaRegisterEvent"); + + return FPGA_OK; + +destroy_eh: + res = fpgaDestroyEventHandle(&dma_h->eh); + ON_ERR_GOTO(res, rel_buf, "fpgaDestroyEventHandle"); + +rel_buf: + for (i = 0; i < FPGA_DMA_MAX_BUF; i++) { + res = fpgaReleaseBuffer(dma_h->fpga_h, dma_h->dma_buf_wsid[i]); + ON_ERR_GOTO(res, out, "fpgaReleaseBuffer"); + } +out: + if (!dma_found) { + free(dma_h); + } + return res; +} + +/** + * _read_memory_mmio_unaligned + * + * @brief Performs a unaligned read(address not 4/8/64 byte aligned) from FPGA address(device address). + * @param[in] dma Handle to the FPGA DMA object + * @param[in] dev_addr FPGA address + * @param[in] host_addr Host buffer address + * @param[in] count Size in bytes, always less than 8bytes. 
+ * @return fpga_result FPGA_OK on success, return code otherwise + * + */ +static fpga_result _read_memory_mmio_unaligned(fpga_dma_handle dma_h, + uint64_t dev_addr, + uint64_t host_addr, + uint64_t count) { + fpga_result res = FPGA_OK; + + assert(count < QWORD_BYTES); + + if (0 == count) return res; + + uint64_t shift = dev_addr % QWORD_BYTES; + debug_print("shift = %08lx , count = %08lx \n", shift, count); + + _switch_to_ase_page(dma_h, dev_addr); + uint64_t dev_aligned_addr = (dev_addr - shift) & DMA_ADDR_SPAN_EXT_WINDOW_MASK; + + // read data from device memory + uint64_t read_tmp = 0; + res = MMIORead64Blk(dma_h, ASE_DATA_BASE(dma_h) + dev_aligned_addr, (uint64_t)&read_tmp, sizeof(read_tmp)); + if (res != FPGA_OK) return res; + + // overlay our data + memcpy_s_fast((void *)host_addr, count, ((char *)(&read_tmp)) + shift, count); + + return res; +} + +/** + * _write_memory_mmio_unaligned + * + * @brief Performs an unaligned write(address not 4/8/64 byte aligned) to FPGA address(device address). + * @param[in] dma Handle to the FPGA DMA object + * @param[in] dev_addr FPGA address + * @param[in] host_addr Host buffer address + * @param[in] count Size in bytes, always less than 8bytes. 
+ * @return fpga_result FPGA_OK on success, return code otherwise + * + */ +static fpga_result _write_memory_mmio_unaligned(fpga_dma_handle dma_h, + uint64_t dev_addr, + uint64_t host_addr, + uint64_t count) { + fpga_result res = FPGA_OK; + + assert(count < QWORD_BYTES); + + if (0 == count) return res; + + uint64_t shift = dev_addr % QWORD_BYTES; + debug_print("shift = %08lx , count = %08lx \n", shift, count); + + _switch_to_ase_page(dma_h, dev_addr); + uint64_t dev_aligned_addr = (dev_addr - (dev_addr % QWORD_BYTES)) & DMA_ADDR_SPAN_EXT_WINDOW_MASK; + + // read data from device memory + uint64_t read_tmp = 0; + res = MMIORead64Blk(dma_h, ASE_DATA_BASE(dma_h) + dev_aligned_addr, (uint64_t)&read_tmp, sizeof(read_tmp)); + if (res != FPGA_OK) return res; + + // overlay our data + memcpy_s_fast(((char *)(&read_tmp)) + shift, count, (void *)host_addr, count); + + // write back to device + res = MMIOWrite64Blk(dma_h, ASE_DATA_BASE(dma_h) + dev_aligned_addr, (uint64_t)&read_tmp, sizeof(read_tmp)); + if (res != FPGA_OK) return res; + + return res; +} + +/** + * _write_memory_mmio + * + * @brief Writes to a DWORD/QWORD aligned memory address(FPGA address). + * @param[in] dma Handle to the FPGA DMA object + * @param[in/out] dst_ptr Pointer to the FPGA address + * @param[in/out] src_ptr Pointer to the Host buffer address + * @param[in/out] count Pointer to the Size in bytes + * @return fpga_result FPGA_OK on success, return code otherwise. 
Updates src, dst, and count + * + */ +static fpga_result _write_memory_mmio(fpga_dma_handle dma_h, uint64_t *dst_ptr, uint64_t *src_ptr, uint64_t *count) { + fpga_result res = FPGA_OK; + + if (*count < DWORD_BYTES) return res; + + assert(*count >= DWORD_BYTES); + assert(IS_ALIGNED_DWORD(*dst_ptr)); + if (!IS_ALIGNED_DWORD(*dst_ptr)) // If QWORD aligned, this will be true + return FPGA_EXCEPTION; + + uint64_t src = *src_ptr; + uint64_t dst = *dst_ptr; + uint64_t align_bytes = *count; + uint64_t offset = 0; + + if (!IS_ALIGNED_QWORD(dst)) { + // Write out a single DWORD to get QWORD aligned + _switch_to_ase_page(dma_h, dst); + offset = dst & DMA_ADDR_SPAN_EXT_WINDOW_MASK; + res = MMIOWrite32Blk(dma_h, ASE_DATA_BASE(dma_h) + offset, (uint64_t)src, DWORD_BYTES); + ON_ERR_RETURN(res, "MMIOWrite32Blk"); + src += DWORD_BYTES; + dst += DWORD_BYTES; + align_bytes -= DWORD_BYTES; + } + + if (0 == align_bytes) return res; + + assert(IS_ALIGNED_QWORD(dst)); + + // Write out blocks of 64-bit values + while (align_bytes >= QWORD_BYTES) { + uint64_t left_in_page = DMA_ADDR_SPAN_EXT_WINDOW; + left_in_page -= dst & DMA_ADDR_SPAN_EXT_WINDOW_MASK; + uint64_t size_to_copy = min(left_in_page, (align_bytes & ~(QWORD_BYTES - 1))); + if (size_to_copy < QWORD_BYTES) break; + _switch_to_ase_page(dma_h, dst); + offset = dst & DMA_ADDR_SPAN_EXT_WINDOW_MASK; + res = MMIOWrite64Blk(dma_h, ASE_DATA_BASE(dma_h) + offset, (uint64_t)src, size_to_copy); + ON_ERR_RETURN(res, "MMIOWrite64Blk"); + src += size_to_copy; + dst += size_to_copy; + align_bytes -= size_to_copy; + } + + if (align_bytes >= DWORD_BYTES) { + // Write out remaining DWORD + _switch_to_ase_page(dma_h, dst); + offset = dst & DMA_ADDR_SPAN_EXT_WINDOW_MASK; + res = MMIOWrite32Blk(dma_h, ASE_DATA_BASE(dma_h) + offset, (uint64_t)src, DWORD_BYTES); + ON_ERR_RETURN(res, "MMIOWrite32Blk"); + src += DWORD_BYTES; + dst += DWORD_BYTES; + align_bytes -= DWORD_BYTES; + } + + assert(align_bytes < DWORD_BYTES); + + *src_ptr = src; + *dst_ptr = 
dst; + *count = align_bytes; + return res; +} + +/** + * _ase_host_to_fpga + * + * @brief Tx "count" bytes from HOST to FPGA using Address span expander(ASE)- will internally make + * calls to handle unaligned and aligned MMIO writes. + * @param[in] dma Handle to the FPGA DMA object + * @param[in/out] dst_ptr Pointer to the FPGA address + * @param[in/out] src_ptr Pointer to the Host buffer address + * @param[in] count Size in bytes + * @return fpga_result FPGA_OK on success, return code otherwise. Updates src and dst + * + */ +static fpga_result _ase_host_to_fpga(fpga_dma_handle dma_h, uint64_t *dst_ptr, uint64_t *src_ptr, uint64_t count) { + fpga_result res = FPGA_OK; + uint64_t dst = *dst_ptr; + uint64_t src = *src_ptr; + uint64_t count_left = count; + uint64_t unaligned_size = 0; + + debug_print("dst_ptr = %08lx , count = %08lx, src = %08lx \n", *dst_ptr, count, *src_ptr); + + // Aligns address to 8 byte using dst masking method + if (!IS_ALIGNED_DWORD(dst) && !IS_ALIGNED_QWORD(dst)) { + unaligned_size = QWORD_BYTES - (dst % QWORD_BYTES); + if (unaligned_size > count_left) unaligned_size = count_left; + res = _write_memory_mmio_unaligned(dma_h, dst, src, unaligned_size); + if (res != FPGA_OK) return res; + count_left -= unaligned_size; + src += unaligned_size; + dst += unaligned_size; + } + // Handles 8/4 byte MMIO transfer + res = _write_memory_mmio(dma_h, &dst, &src, &count_left); + if (res != FPGA_OK) return res; + + // Left over unaligned count bytes are transfered using dst masking method + unaligned_size = QWORD_BYTES - (dst % QWORD_BYTES); + if (unaligned_size > count_left) unaligned_size = count_left; + + res = _write_memory_mmio_unaligned(dma_h, dst, src, unaligned_size); + if (res != FPGA_OK) return res; + + count_left -= unaligned_size; + + *dst_ptr = dst + unaligned_size; + *src_ptr = src + unaligned_size; + + return FPGA_OK; +} + +/** + * _read_memory_mmio + * + * @brief Reads a DWORD/QWORD aligned memory address(FPGA address). 
+ * @param[in] dma Handle to the FPGA DMA object + * @param[in/out] dst_ptr Pointer to the Host Buffer Address + * @param[in/out] src_ptr Pointer to the FPGA address + * @param[in/out] count Pointer to the size in bytes + * @return fpga_result FPGA_OK on success, return code otherwise. Updates src, dst, and count + * + */ +static fpga_result _read_memory_mmio(fpga_dma_handle dma_h, uint64_t *src_ptr, uint64_t *dst_ptr, uint64_t *count) { + fpga_result res = FPGA_OK; + + if (*count < DWORD_BYTES) return res; + + assert(*count >= DWORD_BYTES); + assert(IS_ALIGNED_DWORD(*src_ptr)); + if (!IS_ALIGNED_DWORD(*src_ptr)) // If QWORD aligned, this will be true + return FPGA_EXCEPTION; + + uint64_t src = *src_ptr; + uint64_t dst = *dst_ptr; + uint64_t align_bytes = *count; + uint64_t offset = 0; + + if (!IS_ALIGNED_QWORD(src)) { + // Read a single DWORD to get QWORD aligned + _switch_to_ase_page(dma_h, src); + offset = src & DMA_ADDR_SPAN_EXT_WINDOW_MASK; + res = MMIORead32Blk(dma_h, ASE_DATA_BASE(dma_h) + offset, (uint64_t)dst, DWORD_BYTES); + ON_ERR_RETURN(res, "MMIORead32Blk"); + src += DWORD_BYTES; + dst += DWORD_BYTES; + align_bytes -= DWORD_BYTES; + } + + if (0 == align_bytes) return res; + + assert(IS_ALIGNED_QWORD(src)); + + // Read blocks of 64-bit values + while (align_bytes >= QWORD_BYTES) { + uint64_t left_in_page = DMA_ADDR_SPAN_EXT_WINDOW; + left_in_page -= src & DMA_ADDR_SPAN_EXT_WINDOW_MASK; + uint64_t size_to_copy = min(left_in_page, (align_bytes & ~(QWORD_BYTES - 1))); + if (size_to_copy < QWORD_BYTES) break; + _switch_to_ase_page(dma_h, src); + offset = src & DMA_ADDR_SPAN_EXT_WINDOW_MASK; + res = MMIORead64Blk(dma_h, ASE_DATA_BASE(dma_h) + offset, (uint64_t)dst, size_to_copy); + ON_ERR_RETURN(res, "MMIORead64Blk"); + src += size_to_copy; + dst += size_to_copy; + align_bytes -= size_to_copy; + } + + if (align_bytes >= DWORD_BYTES) { + // Read remaining DWORD + _switch_to_ase_page(dma_h, src); + offset = src & DMA_ADDR_SPAN_EXT_WINDOW_MASK; + res = 
MMIORead32Blk(dma_h, ASE_DATA_BASE(dma_h) + offset, (uint64_t)dst, DWORD_BYTES); + ON_ERR_RETURN(res, "MMIORead32Blk"); + src += DWORD_BYTES; + dst += DWORD_BYTES; + align_bytes -= DWORD_BYTES; + } + + assert(align_bytes < DWORD_BYTES); + + *src_ptr = src; + *dst_ptr = dst; + *count = align_bytes; + return res; +} + +/** + * _ase_fpga_to_host + * + * @brief Tx "count" bytes from FPGA to HOST using Address span expander(ASE)- will internally make + * calls to handle unaligned and aligned MMIO writes. + * @param[in] dma Handle to the FPGA DMA object + * @param[in/out] dst_ptr Pointer to the Host Buffer Address + * @param[in/out] src_ptr Pointer to the FPGA address + * @param[in/out] count Size in bytes + * @return fpga_result FPGA_OK on success, return code otherwise. Updates src and dst + * + */ +static fpga_result _ase_fpga_to_host(fpga_dma_handle dma_h, uint64_t *src_ptr, uint64_t *dst_ptr, uint64_t count) { + fpga_result res = FPGA_OK; + uint64_t src = *src_ptr; + uint64_t dst = *dst_ptr; + uint64_t count_left = count; + uint64_t unaligned_size = 0; + + debug_print("dst_ptr = %08lx , count = %08lx, src = %08lx \n", *dst_ptr, count, *src_ptr); + + // Aligns address to 8 byte using src masking method + if (!IS_ALIGNED_DWORD(src) && !IS_ALIGNED_QWORD(src)) { + unaligned_size = QWORD_BYTES - (src % QWORD_BYTES); + if (unaligned_size > count_left) unaligned_size = count_left; + res = _read_memory_mmio_unaligned(dma_h, src, dst, unaligned_size); + if (res != FPGA_OK) return res; + count_left -= unaligned_size; + dst += unaligned_size; + src += unaligned_size; + } + // Handles 8/4 byte MMIO transfer + res = _read_memory_mmio(dma_h, &src, &dst, &count_left); + if (res != FPGA_OK) return res; + + // Left over unaligned count bytes are transfered using src masking method + unaligned_size = QWORD_BYTES - (src % QWORD_BYTES); + if (unaligned_size > count_left) unaligned_size = count_left; + + res = _read_memory_mmio_unaligned(dma_h, src, dst, unaligned_size); + if (res != 
FPGA_OK) return res; + + count_left -= unaligned_size; + + *dst_ptr = dst + unaligned_size; + *src_ptr = src + unaligned_size; + + return FPGA_OK; +} + +static fpga_result clear_interrupt(fpga_dma_handle dma_h) { + // clear interrupt by writing 1 to IRQ bit in status register + msgdma_status_t status = {0}; + status.st.irq = 1; + + return MMIOWrite32Blk(dma_h, CSR_STATUS(dma_h), (uint64_t)&status.reg, sizeof(status.reg)); +} + +static fpga_result poll_interrupt(fpga_dma_handle dma_h) { + struct pollfd pfd = {0}; + msgdma_status_t status = { 0 }; + fpga_result res = FPGA_OK; + int poll_res; + + res = fpgaGetOSObjectFromEventHandle(dma_h->eh, &pfd.fd); + ON_ERR_GOTO(res, out, "fpgaGetOSObjectFromEventHandle failed\n"); + + pfd.events = POLLIN; + +#ifdef CHECK_DELAYS + if (0 == poll(&pfd, 1, 0)) poll_wait_count++; +#endif + poll_res = poll(&pfd, 1, FPGA_DMA_TIMEOUT_MSEC); + MMIORead32Blk(dma_h, CSR_STATUS(dma_h), (uint64_t)& status.reg, sizeof(status.reg)); + if (poll_res < 0) { + fprintf(stderr, "Poll error errno = %s DMA status reg: 0x%x\n", strerror(errno), status.reg); + res = FPGA_EXCEPTION; + goto out; + } else if (poll_res == 0) { + fprintf(stderr, "Poll(interrupt) timeout DMA status reg: 0x%x\n", status.reg); + res = FPGA_EXCEPTION; + } else { + uint64_t count = 0; + ssize_t bytes_read = read(pfd.fd, &count, sizeof(count)); + if (bytes_read > 0) { + debug_print("Poll success. 
Return = %d, count = %d\n", poll_res, (int)count); + res = FPGA_OK; + } else { + fprintf(stderr, "Error: poll failed read: zero bytes read"); + res = FPGA_EXCEPTION; + } + } + +out: + clear_interrupt(dma_h); + return res; +} + +static fpga_result _issue_magic(fpga_dma_handle dma_h) { + fpga_result res = FPGA_OK; + *(dma_h->magic_buf) = 0x0ULL; + + res = _do_dma(dma_h, + dma_h->magic_iova | FPGA_DMA_WF_HOST_MASK, + FPGA_DMA_WF_ROM_MAGIC_NO_MASK, + 64, + 1, + FPGA_TO_HOST_MM, + FPGA2HOST_IRQ_REQ /*intr_en */); + return res; +} + +static void _wait_magic(fpga_dma_handle dma_h) { +#ifndef SKIP_FPGA2HOST_IRQ + poll_interrupt(dma_h); +#endif + while (*(dma_h->magic_buf) != FPGA_DMA_WF_MAGIC_NO) + ; + *(dma_h->magic_buf) = 0x0ULL; +} + +fpga_result transferHostToFpga( + fpga_dma_handle dma_h, uint64_t dst, uint64_t src, size_t count, fpga_dma_transfer_t type) { + fpga_result res = FPGA_OK; + uint64_t i = 0; + uint64_t count_left = count; + uint64_t aligned_addr = 0; + uint64_t align_bytes = 0; + int issued_intr = 0; + debug_print("Host To Fpga ----------- src = %08lx, dst = %08lx \n", src, dst); + if (!IS_DMA_ALIGNED(dst)) { + if (count_left < FPGA_DMA_ALIGN_BYTES) { + res = _ase_host_to_fpga(dma_h, &dst, &src, count_left); + ON_ERR_GOTO(res, out, "HOST_TO_FPGA_MM Transfer failed\n"); + return res; + } else { + aligned_addr = ((dst / FPGA_DMA_ALIGN_BYTES) + 1) * FPGA_DMA_ALIGN_BYTES; + align_bytes = aligned_addr - dst; + res = _ase_host_to_fpga(dma_h, &dst, &src, align_bytes); + ON_ERR_GOTO(res, out, "HOST_TO_FPGA_MM Transfer failed\n"); + count_left = count_left - align_bytes; + } + } + if (count_left) { + uint32_t dma_chunks = count_left / FPGA_DMA_BUF_SIZE; + count_left -= (dma_chunks * FPGA_DMA_BUF_SIZE); + debug_print( + "DMA TX : dma chuncks = %d, count_left = %08lx, dst = %08lx, src = %08lx \n", dma_chunks, count_left, dst, src); + + for (i = 0; i < dma_chunks; i++) { + // constant size transfer, no length check required for memcpy + 
memcpy_s_fast(dma_h->dma_buf_ptr[i % FPGA_DMA_MAX_BUF], + FPGA_DMA_BUF_SIZE, + (void *)(src + i * FPGA_DMA_BUF_SIZE), + FPGA_DMA_BUF_SIZE); + // The value of FPGA_DMA_MAX_BUF is 2. Thus FPGA_DMA_MAX_BUF/2 -- 1, so the comparison + // is always i % 1 == 0, which will always be true. This means that the i == (dma_chunks -1) + // portion of the conditional will never be reached. However, for clarity and in case + // FPGA_DMA_MAX_BUF changes, I will leave the conditional as is and apply a coverity supression + // coverity[deadcode:FALSE] + if ((i % (FPGA_DMA_MAX_BUF / 2) == (FPGA_DMA_MAX_BUF / 2) - 1) || i == (dma_chunks - 1) /*last descriptor */) { + if (i == (FPGA_DMA_MAX_BUF / 2) - 1) { + res = _do_dma(dma_h, + (dst + i * FPGA_DMA_BUF_SIZE), + dma_h->dma_buf_iova[i % FPGA_DMA_MAX_BUF] | FPGA_DMA_HOST_MASK, + FPGA_DMA_BUF_SIZE, + 0, + type, + true); + } else { + if (issued_intr) poll_interrupt(dma_h); + res = _do_dma(dma_h, + (dst + i * FPGA_DMA_BUF_SIZE), + dma_h->dma_buf_iova[i % FPGA_DMA_MAX_BUF] | FPGA_DMA_HOST_MASK, + FPGA_DMA_BUF_SIZE, + 0, + type, + true /*intr_en */); + } + issued_intr = 1; + } else { + res = _do_dma(dma_h, + (dst + i * FPGA_DMA_BUF_SIZE), + dma_h->dma_buf_iova[i % FPGA_DMA_MAX_BUF] | FPGA_DMA_HOST_MASK, + FPGA_DMA_BUF_SIZE, + 0, + type, + false /*intr_en */); + } + } + if (issued_intr) { + poll_interrupt(dma_h); + issued_intr = 0; + } + if (count_left) { + uint64_t dma_tx_bytes = (count_left / FPGA_DMA_ALIGN_BYTES) * FPGA_DMA_ALIGN_BYTES; + if (dma_tx_bytes != 0) { + debug_print("dma_tx_bytes = %08lx was transfered using DMA\n", dma_tx_bytes); + if (dma_tx_bytes > FPGA_DMA_BUF_SIZE) { + res = FPGA_NO_MEMORY; + ON_ERR_GOTO(res, out, "Illegal transfer size\n"); + } + + memcpy_s_fast( + dma_h->dma_buf_ptr[0], dma_tx_bytes, (void *)(src + dma_chunks * FPGA_DMA_BUF_SIZE), dma_tx_bytes); + res = _do_dma(dma_h, + (dst + dma_chunks * FPGA_DMA_BUF_SIZE), + dma_h->dma_buf_iova[0] | FPGA_DMA_HOST_MASK, + dma_tx_bytes, + 1, + type, + true /*intr_en */); 
+ ON_ERR_GOTO(res, out, "HOST_TO_FPGA_MM Transfer failed\n"); + poll_interrupt(dma_h); + } + count_left -= dma_tx_bytes; + if (count_left) { + dst = dst + dma_chunks * FPGA_DMA_BUF_SIZE + dma_tx_bytes; + src = src + dma_chunks * FPGA_DMA_BUF_SIZE + dma_tx_bytes; + res = _ase_host_to_fpga(dma_h, &dst, &src, count_left); + ON_ERR_GOTO(res, out, "HOST_TO_FPGA_MM Transfer failed\n"); + } + } + } +out: + return res; +} + +fpga_result transferFpgaToHost( + fpga_dma_handle dma_h, uint64_t dst, uint64_t src, size_t count, fpga_dma_transfer_t type) { + fpga_result res = FPGA_OK; + uint64_t i = 0; + uint64_t j = 0; + uint64_t count_left = count; + uint64_t aligned_addr = 0; + uint64_t align_bytes = 0; + int wf_issued = 0; + + debug_print("FPGA To Host ----------- src = %08lx, dst = %08lx \n", src, dst); + if (!IS_DMA_ALIGNED(src)) { + if (count_left < FPGA_DMA_ALIGN_BYTES) { + res = _ase_fpga_to_host(dma_h, &src, &dst, count_left); + ON_ERR_GOTO(res, out, "FPGA_TO_HOST_MM Transfer failed"); + return res; + } else { + aligned_addr = ((src / FPGA_DMA_ALIGN_BYTES) + 1) * FPGA_DMA_ALIGN_BYTES; + align_bytes = aligned_addr - src; + res = _ase_fpga_to_host(dma_h, &src, &dst, align_bytes); + ON_ERR_GOTO(res, out, "FPGA_TO_HOST_MM Transfer failed"); + count_left = count_left - align_bytes; + } + } + if (count_left) { + uint32_t dma_chunks = count_left / FPGA_DMA_BUF_SIZE; + count_left -= (dma_chunks * FPGA_DMA_BUF_SIZE); + debug_print( + "DMA TX : dma chunks = %d, count_left = %08lx, dst = %08lx, src = %08lx \n", dma_chunks, count_left, dst, src); + uint64_t pending_buf = 0; + for (i = 0; i < dma_chunks; i++) { + res = _do_dma(dma_h, + dma_h->dma_buf_iova[i % (FPGA_DMA_MAX_BUF)] | FPGA_DMA_HOST_MASK, + (src + i * FPGA_DMA_BUF_SIZE), + FPGA_DMA_BUF_SIZE, + 1, + type, + false /*intr_en */); + ON_ERR_GOTO(res, out, "FPGA_TO_HOST_MM Transfer failed"); + + const int num_pending = i - pending_buf + 1; + if (num_pending == (FPGA_DMA_MAX_BUF / 2)) { // Enters this loop only once,after first 
batch of descriptors. + res = _issue_magic(dma_h); + ON_ERR_GOTO(res, out, "Magic number issue failed"); + wf_issued = 1; + } + if (num_pending > (FPGA_DMA_MAX_BUF - 1) || i == (dma_chunks - 1) /*last descriptor */) { + if (wf_issued) { + _wait_magic(dma_h); + for (j = 0; j < (FPGA_DMA_MAX_BUF / 2); j++) { + // constant size transfer; no length check required + memcpy_s_fast((void *)(dst + pending_buf * FPGA_DMA_BUF_SIZE), + FPGA_DMA_BUF_SIZE, + dma_h->dma_buf_ptr[pending_buf % (FPGA_DMA_MAX_BUF)], + FPGA_DMA_BUF_SIZE); + pending_buf++; + } + wf_issued = 0; + } + res = _issue_magic(dma_h); + ON_ERR_GOTO(res, out, "Magic number issue failed"); + wf_issued = 1; + } + } + + if (wf_issued) _wait_magic(dma_h); + + // clear out final dma memcpy operations + while (pending_buf < dma_chunks) { + // constant size transfer; no length check required + memcpy_s_fast((void *)(dst + pending_buf * FPGA_DMA_BUF_SIZE), + FPGA_DMA_BUF_SIZE, + dma_h->dma_buf_ptr[pending_buf % (FPGA_DMA_MAX_BUF)], + FPGA_DMA_BUF_SIZE); + pending_buf++; + } + if (count_left > 0) { + uint64_t dma_tx_bytes = (count_left / FPGA_DMA_ALIGN_BYTES) * FPGA_DMA_ALIGN_BYTES; + if (dma_tx_bytes != 0) { + debug_print("dma_tx_bytes = %08lx was transfered using DMA\n", dma_tx_bytes); + res = _do_dma(dma_h, + dma_h->dma_buf_iova[0] | FPGA_DMA_HOST_MASK, + (src + dma_chunks * FPGA_DMA_BUF_SIZE), + dma_tx_bytes, + 1, + type, + false /*intr_en */); + ON_ERR_GOTO(res, out, "FPGA_TO_HOST_MM Transfer failed"); + res = _issue_magic(dma_h); + ON_ERR_GOTO(res, out, "Magic number issue failed"); + _wait_magic(dma_h); + if (dma_tx_bytes > FPGA_DMA_BUF_SIZE) { + res = FPGA_NO_MEMORY; + ON_ERR_GOTO(res, out, "Illegal transfer size\n"); + } + memcpy_s_fast( + (void *)(dst + dma_chunks * FPGA_DMA_BUF_SIZE), dma_tx_bytes, dma_h->dma_buf_ptr[0], dma_tx_bytes); + } + count_left -= dma_tx_bytes; + if (count_left) { + dst = dst + dma_chunks * FPGA_DMA_BUF_SIZE + dma_tx_bytes; + src = src + dma_chunks * FPGA_DMA_BUF_SIZE + dma_tx_bytes; 
+ res = _ase_fpga_to_host(dma_h, &src, &dst, count_left); + ON_ERR_GOTO(res, out, "FPGA_TO_HOST_MM Transfer failed"); + } + } + } +out: + return res; +} + +fpga_result transferFpgaToFpga( + fpga_dma_handle dma_h, uint64_t dst, uint64_t src, size_t count, fpga_dma_transfer_t type) { + fpga_result res = FPGA_OK; + uint64_t i = 0; + uint64_t count_left = count; + uint64_t *tmp_buf = NULL; + if (IS_DMA_ALIGNED(dst) && IS_DMA_ALIGNED(src) && IS_DMA_ALIGNED(count_left)) { + uint32_t dma_chunks = count_left / FPGA_DMA_BUF_SIZE; + count_left -= (dma_chunks * FPGA_DMA_BUF_SIZE); + debug_print("!!!FPGA to FPGA!!! TX :dma chunks = %d, count = %08lx, dst = %08lx, src = %08lx \n", + dma_chunks, + count_left, + dst, + src); + + for (i = 0; i < dma_chunks; i++) { + res = _do_dma(dma_h, + (dst + i * FPGA_DMA_BUF_SIZE), + (src + i * FPGA_DMA_BUF_SIZE), + FPGA_DMA_BUF_SIZE, + 0, + type, + false /*intr_en */); + ON_ERR_GOTO(res, out, "FPGA_TO_FPGA_MM Transfer failed"); + if ((i + 1) % FPGA_DMA_MAX_BUF == 0 || i == (dma_chunks - 1) /*last descriptor */) { + res = _issue_magic(dma_h); + ON_ERR_GOTO(res, out, "Magic number issue failed"); + _wait_magic(dma_h); + } + } + if (count_left > 0) { + debug_print("Count_left = %08lx was transfered using DMA\n", count_left); + res = _do_dma(dma_h, + (dst + dma_chunks * FPGA_DMA_BUF_SIZE), + (src + dma_chunks * FPGA_DMA_BUF_SIZE), + count_left, + 1, + type, + false /*intr_en */); + ON_ERR_GOTO(res, out, "FPGA_TO_FPGA_MM Transfer failed"); + res = _issue_magic(dma_h); + ON_ERR_GOTO(res, out, "Magic number issue failed"); + _wait_magic(dma_h); + } + } else { + if ((src < dst) && (src + count_left >= dst)) { + debug_print("Overlapping addresses, Provide correct dst address\n"); + return FPGA_NOT_SUPPORTED; + } + uint32_t tx_chunks = count_left / FPGA_DMA_BUF_ALIGN_SIZE; + count_left -= (tx_chunks * FPGA_DMA_BUF_ALIGN_SIZE); + debug_print("!!!FPGA to FPGA TX!!! 
: tx chunks = %d, count = %08lx, dst = %08lx, src = %08lx \n", + tx_chunks, + count_left, + dst, + src); + tmp_buf = (uint64_t *)malloc(FPGA_DMA_BUF_ALIGN_SIZE); + for (i = 0; i < tx_chunks; i++) { + res = transferFpgaToHost( + dma_h, (uint64_t)tmp_buf, (src + i * FPGA_DMA_BUF_ALIGN_SIZE), FPGA_DMA_BUF_ALIGN_SIZE, FPGA_TO_HOST_MM); + ON_ERR_GOTO(res, out_spl, "FPGA_TO_FPGA_MM Transfer failed"); + res = transferHostToFpga( + dma_h, (dst + i * FPGA_DMA_BUF_ALIGN_SIZE), (uint64_t)tmp_buf, FPGA_DMA_BUF_ALIGN_SIZE, HOST_TO_FPGA_MM); + ON_ERR_GOTO(res, out_spl, "FPGA_TO_FPGA_MM Transfer failed"); + } + if (count_left > 0) { + res = transferFpgaToHost( + dma_h, (uint64_t)tmp_buf, (src + tx_chunks * FPGA_DMA_BUF_ALIGN_SIZE), count_left, FPGA_TO_HOST_MM); + ON_ERR_GOTO(res, out_spl, "FPGA_TO_FPGA_MM Transfer failed"); + res = transferHostToFpga( + dma_h, (dst + tx_chunks * FPGA_DMA_BUF_ALIGN_SIZE), (uint64_t)tmp_buf, count_left, HOST_TO_FPGA_MM); + ON_ERR_GOTO(res, out_spl, "FPGA_TO_FPGA_MM Transfer failed"); + } + free(tmp_buf); + } +out: + return res; +out_spl: + free(tmp_buf); + return res; +} + +fpga_result fpgaDmaTransferSync( + fpga_dma_handle dma_h, uint64_t dst, uint64_t src, size_t count, fpga_dma_transfer_t type) { + fpga_result res = FPGA_OK; + + if (!dma_h) return FPGA_INVALID_PARAM; + + if (type >= FPGA_MAX_TRANSFER_TYPE) return FPGA_INVALID_PARAM; + + if (!dma_h->fpga_h) return FPGA_INVALID_PARAM; + + if (type == HOST_TO_FPGA_MM) { + res = transferHostToFpga(dma_h, dst, src, count, HOST_TO_FPGA_MM); + } else if (type == FPGA_TO_HOST_MM) { + res = transferFpgaToHost(dma_h, dst, src, count, FPGA_TO_HOST_MM); + } else if (type == FPGA_TO_FPGA_MM) { + res = transferFpgaToFpga(dma_h, dst, src, count, FPGA_TO_FPGA_MM); + } else { + // Should not be possible, since we have handled all fpga_dma_transfer_t types + assert(0); + } + + return res; +} + +fpga_result fpgaDmaTransferAsync(fpga_dma_handle dma, + uint64_t dst, + uint64_t src, + size_t count, + 
fpga_dma_transfer_t type, + fpga_dma_transfer_cb cb, + void *context) { + // TODO + return FPGA_NOT_SUPPORTED; +} + +fpga_result fpgaDmaClose(fpga_dma_handle dma_h) { + fpga_result res = FPGA_OK; + int i = 0; + if (!dma_h) { + res = FPGA_INVALID_PARAM; + goto out; + } + + if (!dma_h->fpga_h) { + res = FPGA_INVALID_PARAM; + goto out; + } + + for (i = 0; i < FPGA_DMA_MAX_BUF; i++) { + res = fpgaReleaseBuffer(dma_h->fpga_h, dma_h->dma_buf_wsid[i]); + ON_ERR_GOTO(res, out, "fpgaReleaseBuffer failed"); + } + + res = fpgaReleaseBuffer(dma_h->fpga_h, dma_h->magic_wsid); + ON_ERR_GOTO(res, out, "fpgaReleaseBuffer"); + + fpgaUnregisterEvent(dma_h->fpga_h, FPGA_EVENT_INTERRUPT, dma_h->eh); + fpgaDestroyEventHandle(&dma_h->eh); + + // turn off global interrupts + msgdma_ctrl_t ctrl = {0}; + ctrl.ct.global_intr_en_mask = 0; + res = MMIOWrite32Blk(dma_h, CSR_CONTROL(dma_h), (uint64_t)&ctrl.reg, sizeof(ctrl.reg)); + ON_ERR_GOTO(res, out, "MMIOWrite32Blk"); + +out: + free((void *)dma_h); + return res; +} diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/fpga_dma.h b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/fpga_dma.h new file mode 100644 index 0000000..e382696 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/fpga_dma.h @@ -0,0 +1,141 @@ +// Copyright 2017-2020 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. 
+ +// This is derived from OPAE + OpenCL PAC BSP + +/** + * \fpga_dma.h + * \brief FPGA DMA BBB API Header + * + * Known Limitations + * - Supports only synchronous (blocking) transfers + */ + +#ifndef __FPGA_DMA_H__ +#define __FPGA_DMA_H__ + +#include <opae/fpga.h> + +//#define DEBUG_MEM 1 +//#define FPGA_DMA_DEBUG 1 +#define SKIP_FPGA2HOST_IRQ 1 +#ifdef SKIP_FPGA2HOST_IRQ +#define FPGA2HOST_IRQ_REQ false +#else +#define FPGA2HOST_IRQ_REQ true +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * The DMA driver supports host to FPGA, FPGA to host and FPGA + * to FPGA transfers. The FPGA interface can be streaming + * or memory-mapped. Streaming interfaces are not currently + * supported. + */ +typedef enum { + HOST_TO_FPGA_MM = 0, // Memory mapped FPGA interface + FPGA_TO_HOST_MM, // Memory mapped FPGA interface + FPGA_TO_FPGA_MM, // Memory mapped FPGA interface + FPGA_MAX_TRANSFER_TYPE, +} fpga_dma_transfer_t; + +typedef struct _dma_handle_t *fpga_dma_handle; + +// Callback for asynchronous DMA transfers +typedef void (*fpga_dma_transfer_cb)(void *context); + +/** + * fpgaDmaOpen + * + * @brief Open a handle to DMA BBB. + * Scans the device feature chain looking for a DMA BBB. + * + * @param[in] fpga Handle to the FPGA AFU object obtained via fpgaOpen() + * @param[in] dma_base to DMA channel DFH + * @param[in] interrupt_num interrupt number assigned to DMA channel + * @param[out] dma DMA object handle + * @returns FPGA_OK on success, return code otherwise + */ +fpga_result fpgaDmaChannelOpen(fpga_handle fpga, uint64_t dma_base, int interrupt_num, fpga_dma_handle *dma); + +/** + * fpgaDmaTransferSync + * + * @brief Perform a blocking copy of 'count' bytes from memory area pointed + * by src to memory area pointed by dst where fpga_dma_transfer_t specifies the + * type of memory transfer. 
+ * @param[in] dma Handle to the FPGA DMA object + * @param[in] dst Address of the destination buffer + * @param[in] src Address of the source buffer + * @param[in] count Size in bytes + * @param[in] type Must be one of the following values: + * HOST_TO_FPGA_MM - Copy data from host memory to memory mapped FPGA interface. + * User must specify valid src and dst. + * FPGA_TO_HOST_MM - Copy data from memory mapped FPGA interface to host memory + * User must specify valid src and dst. + * FPGA_TO_FPGA_MM - Copy data between memory mapped FPGA interfaces + * User must specify valid src and dst. + * @return fpga_result FPGA_OK on success, return code otherwise + * + */ +fpga_result fpgaDmaTransferSync( + fpga_dma_handle dma, uint64_t dst, uint64_t src, size_t count, fpga_dma_transfer_t type); + +/** + * fpgaDmaTransferAsync (Not supported) + * + * @brief Perform a non-blocking copy of 'count' bytes from memory area pointed + * by src to memory area pointed by dst where fpga_dma_transfer_t specifies the + * type of memory transfer. + * @param[in] dma Handle to the FPGA DMA object + * @param[in] dst Address of the destination buffer + * @param[in] src Address of the source buffer + * @param[in] count Size in bytes + * @param[in] type Must be one of the following values: + * HOST_TO_FPGA_MM - Copy data from host memory to memory mapped FPGA interface. + * User must specify valid src and dst. + * FPGA_TO_HOST_MM - Copy data from memory mapped FPGA interface to host memory + * User must specify valid src and dst. + * FPGA_TO_FPGA_MM - Copy data between memory mapped FPGA interfaces + * User must specify valid src and dst. 
+ * @param[in] cb Callback to invoke when DMA transfer is complete + * @param[in] context Pointer to define user-defined context + * @return fpga_result FPGA_OK on success, return code otherwise + * + */ +fpga_result fpgaDmaTransferAsync(fpga_dma_handle dma, + uint64_t dst, + uint64_t src, + size_t count, + fpga_dma_transfer_t type, + fpga_dma_transfer_cb cb, + void *context); + +/** + * fpgaDmaClose + * + * @brief Close the DMA BBB handle. + * + * @param[in] dma DMA object handle + * @returns FPGA_OK on success, return code otherwise + */ +fpga_result fpgaDmaClose(fpga_dma_handle dma); + +#ifdef __cplusplus +} +#endif + +#endif // __FPGA_DMA_H__ diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/fpga_dma_internal.h b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/fpga_dma_internal.h new file mode 100644 index 0000000..e4c8373 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/fpga_dma_internal.h @@ -0,0 +1,289 @@ +// Copyright 2018-2020 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. + +// This is derived from OPAE + OpenCL PAC BSP + +/** + * \fpga_dma_internal.h + * \brief FPGA DMA BBB Internal Header + */ + +#ifndef __FPGA_DMA_INT_H__ +#define __FPGA_DMA_INT_H__ + +#include <opae/fpga.h> +#include "x86-sse2.h" + +#ifdef CHECK_DELAYS +#pragma message "Compiled with -DCHECK_DELAYS. 
Not to be used in production" +#endif + +#ifdef FPGA_DMA_DEBUG +#pragma message "Compiled with -DFPGA_DMA_DEBUG. Not to be used in production" +#endif + +#ifndef max +#define max(a, b) \ + ({ \ + __typeof__(a) _a = (a); \ + __typeof__(b) _b = (b); \ + _a > _b ? _a : _b; \ + }) +#endif + +#ifndef min +#define min(a, b) \ + ({ \ + __typeof__(a) _a = (a); \ + __typeof__(b) _b = (b); \ + _a < _b ? _a : _b; \ + }) +#endif + +#define FPGA_DMA_TIMEOUT_MSEC (5000) + +#define QWORD_BYTES 8 +#define DWORD_BYTES 4 +#define IS_ALIGNED_DWORD(addr) (addr % 4 == 0) +#define IS_ALIGNED_QWORD(addr) (addr % 8 == 0) + +#define FPGA_DMA_UUID_H 0xef82def7f6ec40fc +#define FPGA_DMA_UUID_L 0xa9149a35bace01ea +#define FPGA_DMA_WF_MAGIC_NO 0x5772745F53796E63ULL +#define FPGA_DMA_HOST_MASK 0x2000000000000 +#define FPGA_DMA_WF_HOST_MASK 0x3000000000000 +#define FPGA_DMA_WF_ROM_MAGIC_NO_MASK 0x1000000000000 + +#define AFU_DFH_REG 0x0 +#define AFU_DFH_NEXT_OFFSET 16 +#define AFU_DFH_EOL_OFFSET 40 +#define AFU_DFH_TYPE_OFFSET 60 + +// BBB Feature ID (refer CCI-P spec) +#define FPGA_DMA_BBB 0x2 + +// Feature ID for DMA BBB +#define FPGA_DMA_BBB_FEATURE_ID 0x765 + +// DMA Register offsets from base +#define FPGA_DMA_CSR 0x40 +#define FPGA_DMA_DESC 0x60 +#define FPGA_DMA_ADDR_SPAN_EXT_CNTL 0x200 +#define FPGA_DMA_ADDR_SPAN_EXT_DATA 0x1000 + +#define DMA_ADDR_SPAN_EXT_WINDOW (4 * 1024) +#define DMA_ADDR_SPAN_EXT_WINDOW_MASK ((uint64_t)(DMA_ADDR_SPAN_EXT_WINDOW - 1)) + +#define FPGA_DMA_MASK_32_BIT 0xFFFFFFFF + +#define FPGA_DMA_CSR_BUSY (1 << 0) +#define FPGA_DMA_DESC_BUFFER_EMPTY 0x2 +#define FPGA_DMA_DESC_BUFFER_FULL 0x4 + +#define FPGA_DMA_ALIGN_BYTES 64 +#define IS_DMA_ALIGNED(addr) (addr % FPGA_DMA_ALIGN_BYTES == 0) + +#define CSR_BASE(dma_handle) ((uint64_t)dma_handle->dma_csr_base) +#define ASE_DATA_BASE(dma_handle) ((uint64_t)dma_handle->dma_ase_data_base) +#define ASE_CNTL_BASE(dma_handle) ((uint64_t)dma_handle->dma_ase_cntl_base) +#define HOST_MMIO_32_ADDR(dma_handle, offset) \ + 
((volatile uint32_t *)((uint64_t)(dma_handle)->mmio_va + (uint64_t)(offset))) +#define HOST_MMIO_64_ADDR(dma_handle, offset) \ + ((volatile uint64_t *)((uint64_t)(dma_handle)->mmio_va + (uint64_t)(offset))) +#define HOST_MMIO_32(dma_handle, offset) (*HOST_MMIO_32_ADDR(dma_handle, offset)) +#define HOST_MMIO_64(dma_handle, offset) (*HOST_MMIO_64_ADDR(dma_handle, offset)) + +#define CSR_STATUS(dma_h) (CSR_BASE(dma_h) + offsetof(msgdma_csr_t, status)) +#define CSR_CONTROL(dma_h) (CSR_BASE(dma_h) + offsetof(msgdma_csr_t, ctrl)) + +// Granularity of DMA transfer (maximum bytes that can be packed +// in a single descriptor).This value must match configuration of +// the DMA IP. Larger transfers will be broken down into smaller +// transactions. +#define FPGA_DMA_BUF_SIZE (1024 * 1024 * 2UL) +#define FPGA_DMA_BUF_ALIGN_SIZE FPGA_DMA_BUF_SIZE + +// Convenience macros + +#ifdef FPGA_DMA_DEBUG +#define debug_print(fmt, ...) \ + do { \ + if (FPGA_DMA_DEBUG) { \ + fprintf(stderr, "%s (%d) : ", __FUNCTION__, __LINE__); \ + fprintf(stderr, fmt, ##__VA_ARGS__); \ + } \ + } while (0) +#define error_print(fmt, ...) \ + do { \ + fprintf(stderr, "%s (%d) : ", __FUNCTION__, __LINE__); \ + fprintf(stderr, fmt, ##__VA_ARGS__); \ + err_cnt++; \ + } while (0) +#else +#define debug_print(...) +#define error_print(...) 
+#endif + +#define FPGA_DMA_MAX_BUF 2 + +typedef struct __attribute__((__packed__)) { + uint64_t dfh; + uint64_t feature_uuid_lo; + uint64_t feature_uuid_hi; +} dfh_feature_t; + +typedef union { + uint64_t reg; + struct { + uint64_t feature_type : 4; + uint64_t reserved_8 : 8; + uint64_t afu_minor : 4; + uint64_t reserved_7 : 7; + uint64_t end_dfh : 1; + uint64_t next_dfh : 24; + uint64_t afu_major : 4; + uint64_t feature_id : 12; + } bits; +} dfh_reg_t; + +struct _dma_handle_t { + fpga_handle fpga_h; + uint32_t mmio_num; + uint64_t mmio_va; + uint64_t cur_ase_page; + uint64_t dma_base; + uint64_t dma_offset; + uint64_t dma_csr_base; + uint64_t dma_desc_base; + uint64_t dma_ase_cntl_base; + uint64_t dma_ase_data_base; + // Interrupt event handle + fpga_event_handle eh; + // magic number buffer + volatile uint64_t *magic_buf; + uint64_t magic_iova; + uint64_t magic_wsid; + uint64_t *dma_buf_ptr[FPGA_DMA_MAX_BUF]; + uint64_t dma_buf_wsid[FPGA_DMA_MAX_BUF]; + uint64_t dma_buf_iova[FPGA_DMA_MAX_BUF]; +}; + +typedef union { + uint32_t reg; + struct { + uint32_t tx_channel : 8; + uint32_t generate_sop : 1; + uint32_t generate_eop : 1; + uint32_t park_reads : 1; + uint32_t park_writes : 1; + uint32_t end_on_eop : 1; + uint32_t reserved_1 : 1; + uint32_t transfer_irq_en : 1; + uint32_t early_term_irq_en : 1; + uint32_t trans_error_irq_en : 8; + uint32_t early_done_en : 1; + uint32_t reserved_2 : 6; + uint32_t go : 1; + }; +} msgdma_desc_ctrl_t; + +typedef struct __attribute__((__packed__)) { + // 0x0 + uint32_t rd_address; + // 0x4 + uint32_t wr_address; + // 0x8 + uint32_t len; + // 0xC + uint16_t seq_num; + uint8_t rd_burst_count; + uint8_t wr_burst_count; + // 0x10 + uint16_t rd_stride; + uint16_t wr_stride; + // 0x14 + uint32_t rd_address_ext; + // 0x18 + uint32_t wr_address_ext; + // 0x1c + msgdma_desc_ctrl_t control; +} msgdma_ext_desc_t; + +typedef union { + uint32_t reg; + struct { + uint32_t busy : 1; + uint32_t desc_buf_empty : 1; + uint32_t desc_buf_full : 1; + 
uint32_t rsp_buf_empty : 1; + uint32_t rsp_buf_full : 1; + uint32_t stopped : 1; + uint32_t resetting : 1; + uint32_t stopped_on_errror : 1; + uint32_t stopped_on_early_term : 1; + uint32_t irq : 1; + uint32_t reserved : 22; + } st; +} msgdma_status_t; + +typedef union { + uint32_t reg; + struct { + uint32_t stop_dispatcher : 1; + uint32_t reset_dispatcher : 1; + uint32_t stop_on_error : 1; + uint32_t stopped_on_early_term : 1; + uint32_t global_intr_en_mask : 1; + uint32_t stop_descriptors : 1; + uint32_t rsvd : 22; + } ct; +} msgdma_ctrl_t; + +typedef union { + uint32_t reg; + struct { + uint32_t rd_fill_level : 16; + uint32_t wr_fill_level : 16; + } fl; +} msgdma_fill_level_t; + +typedef union { + uint32_t reg; + struct { + uint32_t rsp_fill_level : 16; + uint32_t rsvd : 16; + } rsp; +} msgdma_rsp_level_t; + +typedef union { + uint32_t reg; + struct { + uint32_t rd_seq_num : 16; + uint32_t wr_seq_num : 16; + } seq; +} msgdma_seq_num_t; + +typedef struct __attribute__((__packed__)) { + // 0x0 + msgdma_status_t status; + // 0x4 + msgdma_ctrl_t ctrl; + // 0x8 + msgdma_fill_level_t fill_level; + // 0xc + msgdma_rsp_level_t rsp; + // 0x10 + msgdma_seq_num_t seq_num; +} msgdma_csr_t; + +#endif // __FPGA_DMA_INT_H__ diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/kernel_interrupt.cpp b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/kernel_interrupt.cpp new file mode 100644 index 0000000..206b98a --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/kernel_interrupt.cpp @@ -0,0 +1,278 @@ +/* (C) 1992-2017 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. 
See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. */ + +#include <poll.h> +#include <stdlib.h> + +#include <thread> + +#include "ccip_mmd_device.h" +#include "eventfd_wrapper.h" +#include "kernel_interrupt.h" + +using namespace intel_opae_mmd; + +// if ENABLE_OPENCL_KERNEL_INTERRUPTS is set at compile time, interrupts will +// be enabled. 
+#define ENABLE_OPENCL_KERNEL_INTERRUPTS + +// if ENABLE_OPENCL_KERNEL_POLLING_THREAD is set at compile time, a thread will +// replace yield and the thread will call runtime call back + +// DLA runtime assumes interrupt service routing will run on its own (instead of runtime yielding to MMD) when hardware +// interrupts +#ifdef DLA_MMD +#define ENABLE_OPENCL_KERNEL_POLLING_THREAD +#endif + +// ccip interrupt line that is used for kernel +#define MMD_KERNEL_INTERRUPT_LINE_NUM 1 + +KernelInterrupt::KernelInterrupt(fpga_handle fpga_handle_arg, int mmd_handle) + : m_initialized(false), + m_eventfd_wrapper(NULL), + m_thread(NULL), + m_kernel_interrupt_fn(NULL), + m_kernel_interrupt_user_data(NULL), + m_fpga_handle(fpga_handle_arg), + m_mmd_handle(mmd_handle), + m_event_handle(0) { + enable_interrupts(); +} + +KernelInterrupt::~KernelInterrupt() { disable_interrupts(); } + +void KernelInterrupt::disable_interrupts() { + // kill the thread + if (m_thread) { + // send message to thread to end it + m_eventfd_wrapper->notify(1); + + // join with thread until it ends + m_thread->join(); + + delete m_thread; + m_thread = NULL; + } + + if (m_eventfd_wrapper) { + delete m_eventfd_wrapper; + m_eventfd_wrapper = NULL; + } + + if (m_event_handle) { + fpga_result res; +#ifdef ENABLE_OPENCL_KERNEL_INTERRUPTS + res = fpgaUnregisterEvent(m_fpga_handle, FPGA_EVENT_INTERRUPT, m_event_handle); + if (res != FPGA_OK) { + fprintf(stderr, "error fpgaUnregisterEvent"); + } +#endif + + res = fpgaDestroyEventHandle(&m_event_handle); + if (res != FPGA_OK) { + fprintf(stderr, "error fpgaDestroyEventHandle"); + } + } + + // disable opencl kernel interrupts +#ifndef DLA_MMD // IRQ offsets no longer exist in DLA hardware (removed from board.qsys) + set_interrupt_mask(0x00000000); +#endif + + m_initialized = false; +} + +void KernelInterrupt::enable_interrupts() { + m_eventfd_wrapper = new eventfd_wrapper(); + if (!m_eventfd_wrapper->initialized()) return; + +#ifdef 
ENABLE_OPENCL_KERNEL_POLLING_THREAD + m_thread = new std::thread(interrupt_polling_thread, std::ref(*this)); +#endif + + fpga_result res; + // Create event + res = fpgaCreateEventHandle(&m_event_handle); + if (res != FPGA_OK) { + fprintf(stderr, "error creating event handle"); + return; + } + +#ifdef ENABLE_OPENCL_KERNEL_INTERRUPTS + // Register user interrupt with event handle + res = fpgaRegisterEvent(m_fpga_handle, FPGA_EVENT_INTERRUPT, m_event_handle, MMD_KERNEL_INTERRUPT_LINE_NUM); + if (res != FPGA_OK) { + fprintf(stderr, "error registering event"); + res = fpgaDestroyEventHandle(&m_event_handle); + return; + } + + // enable opencl kernel interrupts +#ifndef DLA_MMD // IRQ offsets no longer exist in DLA hardware (removed from board.qsys) + set_interrupt_mask(0x00000001); +#endif +#endif + + m_initialized = true; +} + +#ifndef DLA_MMD // IRQ offsets no longer exist in DLA hardware (removed from board.qsys) +void KernelInterrupt::set_interrupt_mask(uint32_t intr_mask) { + fpga_result res; + res = fpgaWriteMMIO32(m_fpga_handle, 0, AOCL_IRQ_MASKING_BASE, intr_mask); + if (res != FPGA_OK) { + fprintf(stderr, "Error fpgaWriteMMIO32: %d\n", res); + return; + } +} +#endif + +void KernelInterrupt::interrupt_polling_thread(KernelInterrupt& obj) { + bool thread_is_active = true; + while (thread_is_active) { +#ifdef ENABLE_OPENCL_KERNEL_INTERRUPTS + const int timeout = -1; +#else + const int timeout = 0; + usleep(100); +#endif + thread_is_active = obj.poll_interrupt(timeout); + } +} + +bool KernelInterrupt::poll_interrupt(int poll_timeout_arg) { + fpga_result fpga_res; + + int res; + // get eventfd handles + int intr_fd; + fpga_res = fpgaGetOSObjectFromEventHandle(m_event_handle, &intr_fd); + if (fpga_res != FPGA_OK) { + fprintf(stderr, "error getting event file handle"); + return false; + } + int thread_signal_fd = m_eventfd_wrapper->get_fd(); + + struct pollfd pollfd_arr[2]; + pollfd_arr[0].fd = intr_fd; + pollfd_arr[0].events = POLLIN; + pollfd_arr[0].revents = 0; + 
pollfd_arr[1].fd = thread_signal_fd; + pollfd_arr[1].events = POLLIN; + pollfd_arr[1].revents = 0; + res = poll(pollfd_arr, 2, poll_timeout_arg); + if (res < 0) { + fprintf(stderr, "Poll error errno = %s\n", strerror(errno)); + return false; + } else if (res > 0 && pollfd_arr[0].revents == POLLIN) { + uint64_t count; + ssize_t bytes_read = read(intr_fd, &count, sizeof(count)); + if (bytes_read > 0) { + DEBUG_PRINT("Poll success. Return=%d count=%lu\n", res, count); + } else { + fprintf(stderr, "Error: poll failed: %s\n", bytes_read < 0 ? strerror(errno) : "zero bytes read"); + // TODO: remove exit call. Revist this when fixing kernel interrupts + exit(-1); + } + } else if (res > 0 && pollfd_arr[1].revents == POLLIN) { + uint64_t count; + ssize_t bytes_read = read(thread_signal_fd, &count, sizeof(count)); + if (bytes_read > 0) { + DEBUG_PRINT("Poll success. Return=%d count=%lu\n", res, count); + } else { + fprintf(stderr, "Error: poll failed: %s\n", bytes_read < 0 ? strerror(errno) : "zero bytes read"); + // TODO: remove exit call. 
Revist this when fixing kernel interrupts + exit(-1); + } + return false; + } else { + // no event fd event happened +#ifdef ENABLE_OPENCL_KERNEL_INTERRUPTS + return false; +#endif + } + +#ifdef DLA_MMD + run_kernel_interrupt_fn(); +#else // IRQ offsets no longer exist in DLA hardware (removed from board.qsys) + + // probobly not required for interrupt polling but we poll the interrupt + // csr line to make sure an interrupt was actually triggered + uint32_t irqval = 0; + fpga_res = fpgaReadMMIO32(m_fpga_handle, 0, AOCL_IRQ_POLLING_BASE, &irqval); + if (fpga_res != FPGA_OK) { + fprintf(stderr, "Error fpgaReadMMIO32: %d\n", fpga_res); + return false; + } + + DEBUG_PRINT("irqval: %u\n", irqval); + if (irqval) run_kernel_interrupt_fn(); + +#ifdef ENABLE_OPENCL_KERNEL_INTERRUPTS + // workaround for fb:530016 + // check if irq line is still high and generate another interrupt event + fpga_res = fpgaReadMMIO32(m_fpga_handle, 0, AOCL_IRQ_POLLING_BASE, &irqval); + if (fpga_res != FPGA_OK) { + fprintf(stderr, "Error fpgaReadMMIO32: %d\n", fpga_res); + return false; + } + + // signal intr event fd + if (irqval) { + DEBUG_PRINT("CRITICAL WARNING: irqval has not been cleared by aocl runtime\n"); + uint64_t count = 1; + ssize_t res = write(intr_fd, &count, sizeof(count)); + if (res < 0) { + fprintf(stderr, "eventfd : %s", strerror(errno)); + return false; + } + } +#endif +#endif + + return true; +} + +bool KernelInterrupt::yield_is_enabled() { +#ifdef ENABLE_OPENCL_KERNEL_POLLING_THREAD + return false; +#else + return true; +#endif +} + +void KernelInterrupt::yield() { +#ifdef ENABLE_OPENCL_KERNEL_POLLING_THREAD + usleep(0); +#else + poll_interrupt(0); +#endif +} + +void KernelInterrupt::run_kernel_interrupt_fn() { + if (m_kernel_interrupt_fn) { + m_kernel_interrupt_fn(m_mmd_handle, m_kernel_interrupt_user_data); + } else { + fprintf(stderr, "m_kernel_interrupt_fn is NULL. 
No interrupt handler set!\n"); + } +} + +void KernelInterrupt::set_kernel_interrupt(aocl_mmd_interrupt_handler_fn fn, void* user_data) { + m_kernel_interrupt_fn = fn; + m_kernel_interrupt_user_data = user_data; +} diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/kernel_interrupt.h b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/kernel_interrupt.h new file mode 100644 index 0000000..44e9b50 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/kernel_interrupt.h @@ -0,0 +1,75 @@ +/* (C) 1992-2017 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. 
*/ + +#ifndef _KERNEL_INTERRUPT_H +#define _KERNEL_INTERRUPT_H + +#include <opae/fpga.h> + +#include <atomic> +#include <thread> + +#include "aocl_mmd.h" + +namespace intel_opae_mmd { + +class eventfd_wrapper; + +class KernelInterrupt final { + public: + KernelInterrupt(fpga_handle fpga_handle_arg, int mmd_handle); + ~KernelInterrupt(); + + bool initialized() { return m_initialized; } + + void set_kernel_interrupt(aocl_mmd_interrupt_handler_fn fn, void* user_data); + void yield(); + static bool yield_is_enabled(); + + void enable_interrupts(); + void disable_interrupts(); + + private: +#ifndef DLA_MMD // IRQ offsets no longer exist in DLA hardware (removed from board.qsys) + void set_interrupt_mask(uint32_t intr_mask); +#endif + void run_kernel_interrupt_fn(); + bool poll_interrupt(int poll_timeout_arg); + + static void interrupt_polling_thread(KernelInterrupt& obj); + + bool m_initialized; + eventfd_wrapper* m_eventfd_wrapper; + + std::thread* m_thread; + + aocl_mmd_interrupt_handler_fn m_kernel_interrupt_fn; + void* m_kernel_interrupt_user_data; + + fpga_handle m_fpga_handle; + int m_mmd_handle; + + fpga_event_handle m_event_handle; + + // not used and not implemented + KernelInterrupt(KernelInterrupt& other); + KernelInterrupt& operator=(const KernelInterrupt& other); +}; // class KernelInterrupt + +}; // namespace intel_opae_mmd + +#endif // _KERNEL_INTERRUPT_H diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/memcpy_s_fast.c b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/memcpy_s_fast.c new file mode 100644 index 0000000..65d7f1a --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/memcpy_s_fast.c @@ -0,0 +1,133 @@ +// Copyright 2018-2020 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). 
Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. + +// This is derived from OPAE + OpenCL PAC BSP + +#pragma push_macro("_GNU_SOURCE") +#undef _GNU_SOURCE +#define _GNU_SOURCE + +#include <assert.h> +#include <safe_string/safe_string.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> + +#include "memcpy_s_fast.h" +#include "x86-sse2.h" + +#pragma pop_macro("_GNU_SOURCE") + +static void *memcpy_setup(void *dst, size_t max, const void *src, size_t n); + +memcpy_fn_t p_memcpy = memcpy_setup; // Initial value points to setup routine + +/** + * SSE2_memcpy + * + * @brief memcpy using SSE2 or REP MOVSB + * @param[in] dst Pointer to the destination memory + * @param[in] max Size in bytes of destination + * @param[in] src Pointer to the source memory + * @param[in] n Size in bytes to copy + * @return dst + * + */ +static void *SSE2_memcpy(void *dst, size_t max, const void *src, size_t n) { + assert(n <= max); + + void *ldst = dst; + void *lsrc = (void *)src; + if (IS_CL_ALIGNED(src) && IS_CL_ALIGNED(dst)) // 64-byte aligned + { + if (n >= MIN_SSE2_SIZE) // Arbitrary crossover performance point + { + debug_print("copying 0x%lx bytes with SSE2\n", (uint64_t)ALIGN_TO_CL(n)); + aligned_block_copy_sse2((int64_t * __restrict) dst, (int64_t * __restrict) src, ALIGN_TO_CL(n)); + ldst = (void *)((uint64_t)dst + ALIGN_TO_CL(n)); + lsrc = (void *)((uint64_t)src + ALIGN_TO_CL(n)); + n -= ALIGN_TO_CL(n); + } + } else { + if (n >= MIN_SSE2_SIZE) // Arbitrary crossover performance point + { + debug_print("copying 0x%lx bytes (unaligned) with SSE2\n", (uint64_t)ALIGN_TO_CL(n)); + unaligned_block_copy_sse2((int64_t * __restrict) dst, (int64_t * 
__restrict) src, ALIGN_TO_CL(n)); + ldst = (void *)((uint64_t)dst + ALIGN_TO_CL(n)); + lsrc = (void *)((uint64_t)src + ALIGN_TO_CL(n)); + n -= ALIGN_TO_CL(n); + } + } + + if (n) { + register unsigned long int dummy; + debug_print("copying 0x%lx bytes with REP MOVSB\n", n); + __asm__ __volatile__("rep movsb\n" + : "=&D"(ldst), "=&S"(lsrc), "=&c"(dummy) + : "0"(ldst), "1"(lsrc), "2"(n) + : "memory"); + } + + return dst; +} + +/** + * memcpy_wrap + * + * @brief Trampoline for memcpy + * @param[in] dst Pointer to the destination memory + * @param[in] max Size in bytes of destination + * @param[in] src Pointer to the source memory + * @param[in] n Size in bytes to copy + * @return dst + * + */ + +#ifdef ENABLE_MEMCPY_ENV_VAR_CHECK +static void *memcpy_wrap(void *dst, size_t max, const void *src, size_t n) { return memcpy(dst, src, n); } +#endif // ENABLE_MEMCPY_ENV_VAR_CHECK + +/** + * memcpy_setup + * Will be called on the first memcpy_s_fast invocation only. + * + * @brief Set up which memcpy routine will be used at runtime + * @param[in] dst Pointer to the destination memory + * @param[in] max Size in bytes of destination + * @param[in] src Pointer to the source memory + * @param[in] n Size in bytes to copy + * @return dst + * + */ + +static void *memcpy_setup(void *dst, size_t max, const void *src, size_t n) { + // Default to SSE2_memcpy + p_memcpy = SSE2_memcpy; + +// +#ifdef ENABLE_MEMCPY_ENV_VAR_CHECK + char *pmemcpy = getenv(USE_MEMCPY_ENV); + + if (pmemcpy) { + if (!strcasecmp(pmemcpy, "libc")) { + p_memcpy = memcpy_wrap; + } else if (!strcasecmp(pmemcpy, "sse2")) { + p_memcpy = SSE2_memcpy; + } else if (!strcasecmp(pmemcpy, "memcpy_s")) { + p_memcpy = (memcpy_fn_t)memcpy_s; + } + } +#endif // #ifdef ENABLE_MEMCPY_ENV_VAR_CHECK + + return p_memcpy(dst, max, src, n); +} diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/memcpy_s_fast.h b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/memcpy_s_fast.h new file mode 100644 index 
0000000..08056d3 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/memcpy_s_fast.h @@ -0,0 +1,69 @@ +/* (C) 1992-2017 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. */ + +#ifndef MEMCPY_S_FAST_H_ +#define MEMCPY_S_FAST_H_ + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// Constants needed in memcpy routines +// Arbitrary crossover point for using SSE2 over rep movsb +#define MIN_SSE2_SIZE 4096 + +// TODO: hidden environment variables to experiment with performance +// in production software are not a good idea in my opinion. Commenting out +// for now but hopefully can remove this code completely in the long term. 
+//#define USE_MEMCPY_ENV "PAC_MEMCPY" + +#define CACHE_LINE_SIZE 64 +#define ALIGN_TO_CL(x) ((uint64_t)(x) & ~(CACHE_LINE_SIZE - 1)) +#define IS_CL_ALIGNED(x) (((uint64_t)(x) & (CACHE_LINE_SIZE - 1)) == 0) + +// Convenience macros +#ifdef DEBUG_MEM +#define debug_print(fmt, ...) \ + do { \ + if (FPGA_DMA_DEBUG) { \ + fprintf(stderr, "%s (%d) : ", __FUNCTION__, __LINE__); \ + fprintf(stderr, fmt, ##__VA_ARGS__); \ + } \ + } while (0) + +#define error_print(fmt, ...) \ + do { \ + fprintf(stderr, "%s (%d) : ", __FUNCTION__, __LINE__); \ + fprintf(stderr, fmt, ##__VA_ARGS__); \ + err_cnt++; \ + } while (0) +#else +#define debug_print(...) +#define error_print(...) +#endif + +typedef void *(*memcpy_fn_t)(void *dst, size_t max, const void *src, size_t len); + +extern memcpy_fn_t p_memcpy; + +#define memcpy_s_fast(a, b, c, d) p_memcpy(a, b, c, d) + +#ifdef __cplusplus +} +#endif // __cplusplus + +#endif // MEMCPY_S_FAST_H diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/mmd_dma.cpp b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/mmd_dma.cpp new file mode 100644 index 0000000..92337a3 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/mmd_dma.cpp @@ -0,0 +1,434 @@ +/* (C) 1992-2017 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. 
See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. */ + +#include <assert.h> +#include <stdlib.h> +#include <string.h> +#include <sys/mman.h> + +#include <safe_string/safe_string.h> +#include "memcpy_s_fast.h" + +#include "ccip_mmd_device.h" +#include "mmd_dma.h" + +using namespace intel_opae_mmd; + +// disable dma and only use mmio. this is very slow. +//#define DISABLE_DMA + +// Each MSGDMA_BBB DFH is now 0x100 instead of 0x2_0000 (it needed to be 0x2_0000 previously because +// the ASE component was within the msgdma_bbb.qsys). +// Original addressing: +// board_afu_dfh: 0x0-0x3f. +// msgdma_bbb_csr: 0x2_0000-0x2_1fff. +// Original range at board.ddr_board.msgdma_bbb: 0x2_0000- 0x2_1fff. +// DFH : 0x0-0x3f. +// ASE.cntl : 0x200-0x207. +// ASE.windowed_slave : 0x1000-0x1fff. 
+// Current addressing (with ASE removed from the msgdma_bbb and now living on its own in ddr_board.qsys): +// From top-level board.qsys (base address 0x0): +// board | dfh : 0x0_0000 - 0x0_003f +// board | ddr_board.ase : 0x1_0000 - 0x1_1fff +// board | ddr_board.msgdma_bbb_0 : 0x2_0000 - 0x2_007f +// board | ddr_board.msgdma_bbb_1 : 0x2_0100 - 0x2_017f +// board | ddr_board.null_dfh : 0x2_0200 - 0x2_023f +// From ase.qsys (base address: 0x1_0000): +// board.ddr_board.ase.dfh_csr : 0x0-0x3f +// board.ddr_board.ase.ASE.cntl : 0x200-0x207 +// board.ddr_board.ase.ASE.windowed_slave : 0x1000-0x1fff +// From msgdma_bbb.qsys inst0 (base address: 0x2_0000) +// board.ddr_board.msgdma_bbb_inst_0.dfh_csr : 0x0-0x3f +// board.ddr_board.msgdma_bbb_inst_0.modular_sgdma_dispatcher.CSR : 0x40-0x5f +// board.ddr_board.msgdma_bbb_inst_0.modular_sgdma_dispatcher.Descriptor_slave : 0x60-0x7f +// From msgdma_bbb.qsys inst1 (base address: 0x2_0100) +// board.ddr_board.msgdma_bbb_inst_1.dfh_csr : 0x0-0x3f +// board.ddr_board.msgdma_bbb_inst_1.modular_sgdma_dispatcher.CSR : 0x40-0x5f +// board.ddr_board.msgdma_bbb_inst_1.modular_sgdma_dispatcher.Descriptor_slave : 0x60-0x7f + +#define MEM_WINDOW_CRTL 0x200 +#define MEM_WINDOW_MEM 0x1000 +#define MEM_WINDOW_SPAN (4 * 1024) +#define MEM_WINDOW_SPAN_MASK ((long)(MEM_WINDOW_SPAN - 1)) +#define MINIMUM_DMA_SIZE 256 +#define DMA_ALIGNMENT 256 + +#ifdef DEBUG_MEM +#define DCP_DEBUG_DMA(...) fprintf(stderr, __VA_ARGS__) +#else +#define DCP_DEBUG_DMA(...) 
+#endif + +mmd_dma::mmd_dma(fpga_handle fpga_handle_arg, + int mmd_handle, + uint64_t dfh_offset_arg, + uint64_t ase_bbb_addr_arg, + int interrupt_num_arg) + : m_initialized(false), + m_dma_op_mutex(), + m_status_handler_fn(NULL), + m_status_handler_user_data(NULL), + m_fpga_handle(fpga_handle_arg), + m_mmd_handle(mmd_handle), + dfh_offset(dfh_offset_arg), + interrupt_num(interrupt_num_arg), + dma_h(NULL), + msgdma_bbb_base_addr(0), + ase_bbb_base_addr(ase_bbb_addr_arg) { +#ifndef DISABLE_DMA + + fpga_result res; + res = fpgaDmaChannelOpen(m_fpga_handle, dfh_offset, interrupt_num, &dma_h); + if (res != FPGA_OK) { + m_dma_work_thread = NULL; + fprintf(stderr, "Error initializing DMA: %s\n", fpgaErrStr(res)); + return; + } +#endif // DISABLE_DMA + + m_dma_work_thread = new dma_work_thread(*this); + if (!m_dma_work_thread->initialized()) { + return; + } + + m_initialized = true; +} + +mmd_dma::~mmd_dma() { + // kill the thread + if (m_dma_work_thread) { + delete m_dma_work_thread; + m_dma_work_thread = NULL; + } + + if (dma_h) { + if (fpgaDmaClose(dma_h) != FPGA_OK) fprintf(stderr, "Error closing DMA\n"); + } + m_initialized = false; +} + +void mmd_dma::reinit_dma() { + if (!m_initialized) return; + + if (dma_h) { + m_initialized = false; + + fpga_result res; + res = fpgaDmaClose(dma_h); + dma_h = NULL; + if (res != FPGA_OK) { + fprintf(stderr, "Error closing DMA\n"); + return; + } + + res = fpgaDmaChannelOpen(m_fpga_handle, dfh_offset, interrupt_num, &dma_h); + if (res != FPGA_OK) { + fprintf(stderr, "Error initializing DMA: %s\n", fpgaErrStr(res)); + return; + } + + m_initialized = true; + } +} + +void mmd_dma::set_status_handler(aocl_mmd_status_handler_fn fn, void *user_data) { + m_status_handler_fn = fn; + m_status_handler_user_data = user_data; +} + +void mmd_dma::event_update_fn(aocl_mmd_op_t op, int status) { + m_status_handler_fn(m_mmd_handle, m_status_handler_user_data, op, status); +} + +fpga_result mmd_dma::do_dma(dma_work_item &item) { + // main dma 
function needs to be thread safe because dma csr operations + // are not thread safe + std::lock_guard<std::mutex> lock(m_dma_op_mutex); + + fpga_result res = FPGA_OK; + assert(item.rd_host_addr != NULL || item.wr_host_addr != NULL); + + // Tell the kernel we'll need these and they're sequential + uint64_t addr = item.rd_host_addr ? (uint64_t)item.rd_host_addr : (uint64_t)item.wr_host_addr; + addr = addr & ~((uint64_t)getpagesize() - 1); // Align to page boundary + size_t remainder = ((size_t)getpagesize() - (addr & getpagesize())) & ~(getpagesize() - 1); + madvise((void *)addr, item.size + remainder, MADV_SEQUENTIAL); + + if (item.rd_host_addr) { + res = read_memory(item.rd_host_addr, item.dev_addr, item.size); + } else { + assert(item.wr_host_addr); + res = write_memory(item.wr_host_addr, item.dev_addr, item.size); + } + + if (item.op) { + // TODO: check what 'status' value should really be. Right now just + // using 0 as was done in previous CCIP MMD. Also handle case if op is NULL + event_update_fn(item.op, 0); + } + + return res; +} + +fpga_result mmd_dma::enqueue_dma(dma_work_item &item) { + return static_cast<fpga_result>(m_dma_work_thread->enqueue_dma(item)); +} + +fpga_result mmd_dma::read_memory(aocl_mmd_op_t op, uint64_t *host_addr, size_t dev_addr, size_t size) { + assert(host_addr); + dma_work_item item; + item.op = op; + item.rd_host_addr = host_addr; + item.wr_host_addr = NULL; + item.dev_addr = dev_addr; + item.size = size; + + return enqueue_dma(item); +} + +fpga_result mmd_dma::write_memory(aocl_mmd_op_t op, const uint64_t *host_addr, size_t dev_addr, size_t size) { + assert(host_addr); + dma_work_item item; + item.op = op; + item.rd_host_addr = NULL; + item.wr_host_addr = host_addr; + item.dev_addr = dev_addr; + item.size = size; + + return enqueue_dma(item); +} + +fpga_result mmd_dma::read_memory(uint64_t *host_addr, size_t dev_addr, size_t size) { + DCP_DEBUG_DMA("DCP DEBUG: read_memory %p %lx %ld\n", host_addr, dev_addr, size); + fpga_result 
res = FPGA_OK; + + // check for alignment + if (dev_addr % DMA_ALIGNMENT != 0) { + // check for mmio alignment + uint64_t mmio_shift = dev_addr % 8; + if (mmio_shift != 0) { + size_t unaligned_size = 8 - mmio_shift; + if (unaligned_size > size) unaligned_size = size; + + read_memory_mmio_unaligned(host_addr, dev_addr, unaligned_size); + + if (size > unaligned_size) + res = read_memory( + (uint64_t *)(((char *)host_addr) + unaligned_size), dev_addr + unaligned_size, size - unaligned_size); + return res; + } + + // TODO: need to do a shift here + return read_memory_mmio(host_addr, dev_addr, size); + } + + // check size + if (size < MINIMUM_DMA_SIZE) return read_memory_mmio(host_addr, dev_addr, size); + + size_t remainder = (size % DMA_ALIGNMENT); + size_t dma_size = size - remainder; + +#ifdef DISABLE_DMA + res = read_memory_mmio(host_addr, dev_addr, dma_size); +#else + res = fpgaDmaTransferSync(dma_h, (uint64_t)host_addr /*dst*/, dev_addr /*src*/, dma_size, FPGA_TO_HOST_MM); +#endif + if (res != FPGA_OK) return res; + + if (remainder) res = read_memory_mmio(host_addr + dma_size / 8, dev_addr + dma_size, remainder); + + if (res != FPGA_OK) return res; + + DCP_DEBUG_DMA("DCP DEBUG: host_addr=%p, dev_addr=%lx, size=%ld\n", host_addr, dev_addr, size); + DCP_DEBUG_DMA("DCP DEBUG: remainder=%ld, dma_size=%ld, size=%ld\n", remainder, dma_size, size); + + DCP_DEBUG_DMA("DCP DEBUG: mmd_dma::read_memory done!\n"); + return FPGA_OK; +} + +fpga_result mmd_dma::read_memory_mmio_unaligned(void *host_addr, size_t dev_addr, size_t size) { + DCP_DEBUG_DMA("DCP DEBUG: read_memory_mmio_unaligned %p %lx %ld\n", host_addr, dev_addr, size); + fpga_result res = FPGA_OK; + + uint64_t shift = dev_addr % 8; + + assert(size + shift <= 8); + + uint64_t cur_mem_page = dev_addr & ~MEM_WINDOW_SPAN_MASK; + res = fpgaWriteMMIO64(m_fpga_handle, 0, ase_bbb_base_addr + MEM_WINDOW_CRTL, cur_mem_page); + if (res != FPGA_OK) return res; + + uint64_t dev_aligned_addr = dev_addr - shift; + + // read data 
from device memory + uint64_t read_tmp; + res = fpgaReadMMIO64( + m_fpga_handle, 0, (ase_bbb_base_addr + MEM_WINDOW_MEM) + ((dev_aligned_addr)&MEM_WINDOW_SPAN_MASK), &read_tmp); + if (res != FPGA_OK) return res; + // overlay our data + memcpy_s_fast(host_addr, size, ((char *)(&read_tmp)) + shift, size); + + return FPGA_OK; +} + +fpga_result mmd_dma::read_memory_mmio(uint64_t *host_addr, size_t dev_addr, size_t size) { + DCP_DEBUG_DMA("DCP DEBUG: read_memory_mmio %p %lx %ld\n", host_addr, dev_addr, size); + + fpga_result res = FPGA_OK; + uint64_t cur_mem_page = dev_addr & ~MEM_WINDOW_SPAN_MASK; + res = fpgaWriteMMIO64(m_fpga_handle, 0, ase_bbb_base_addr + MEM_WINDOW_CRTL, cur_mem_page); + if (res != FPGA_OK) return res; + DCP_DEBUG_DMA("DCP DEBUG: set page %08lx\n", cur_mem_page); + for (size_t i = 0; i < size / 8; i++) { + uint64_t mem_page = dev_addr & ~MEM_WINDOW_SPAN_MASK; + if (mem_page != cur_mem_page) { + cur_mem_page = mem_page; + res = fpgaWriteMMIO64(m_fpga_handle, 0, ase_bbb_base_addr + MEM_WINDOW_CRTL, cur_mem_page); + if (res != FPGA_OK) return res; + DCP_DEBUG_DMA("DCP DEBUG: set page %08lx\n", cur_mem_page); + } + DCP_DEBUG_DMA("DCP DEBUG: read data %8p %08lx %16p\n", host_addr, dev_addr, host_addr); + res = fpgaReadMMIO64( + m_fpga_handle, 0, (ase_bbb_base_addr + MEM_WINDOW_MEM) + (dev_addr & MEM_WINDOW_SPAN_MASK), host_addr); + if (res != FPGA_OK) return res; + + host_addr += 1; + dev_addr += 8; + } + + if (size % 8 != 0) { + res = read_memory_mmio_unaligned(host_addr, dev_addr, size % 8); + if (res != FPGA_OK) return res; + } + + DCP_DEBUG_DMA("DCP DEBUG: mmd_dma::read_memory_mmio done!\n"); + return FPGA_OK; +} + +fpga_result mmd_dma::write_memory(const uint64_t *host_addr, size_t dev_addr, size_t size) { + DCP_DEBUG_DMA("DCP DEBUG: write_memory %p %lx %ld\n", host_addr, dev_addr, size); + fpga_result res = FPGA_OK; + + // check for alignment + if (dev_addr % DMA_ALIGNMENT != 0) { + // check for mmio alignment + uint64_t mmio_shift = dev_addr % 8; 
+ if (mmio_shift != 0) { + size_t unaligned_size = 8 - mmio_shift; + if (unaligned_size > size) unaligned_size = size; + + DCP_DEBUG_DMA("DCP DEBUG: write_memory %ld %ld %ld\n", mmio_shift, unaligned_size, size); + write_memory_mmio_unaligned(host_addr, dev_addr, unaligned_size); + + if (size > unaligned_size) + res = write_memory( + (uint64_t *)(((char *)host_addr) + unaligned_size), dev_addr + unaligned_size, size - unaligned_size); + return res; + } + + // TODO: need to do a shift here + return write_memory_mmio(host_addr, dev_addr, size); + } + + // check size + if (size < MINIMUM_DMA_SIZE) return write_memory_mmio(host_addr, dev_addr, size); + + size_t remainder = (size % DMA_ALIGNMENT); + size_t dma_size = size - remainder; + +// TODO: make switch for MMIO +#ifdef DISABLE_DMA + res = write_memory_mmio(host_addr, dev_addr, dma_size); +#else + res = fpgaDmaTransferSync(dma_h, dev_addr /*dst*/, (uint64_t)host_addr /*src*/, dma_size, HOST_TO_FPGA_MM); +#endif + if (res != FPGA_OK) return res; + + if (remainder) res = write_memory(host_addr + dma_size / 8, dev_addr + dma_size, remainder); + + if (res != FPGA_OK) return res; + + DCP_DEBUG_DMA("DCP DEBUG: host_addr=%p, dev_addr=%lx, size=%ld\n", host_addr, dev_addr, size); + DCP_DEBUG_DMA("DCP DEBUG: remainder=%ld, dma_size=%ld, size=%ld\n", remainder, dma_size, size); + + DCP_DEBUG_DMA("DCP DEBUG: mmd_dma::write_memory done!\n"); + return FPGA_OK; +} + +fpga_result mmd_dma::write_memory_mmio_unaligned(const uint64_t *host_addr, size_t dev_addr, size_t size) { + DCP_DEBUG_DMA("DCP DEBUG: write_memory_mmio_unaligned %p %lx %ld\n", host_addr, dev_addr, size); + fpga_result res = FPGA_OK; + + uint64_t shift = dev_addr % 8; + + assert(size + shift <= 8); + + uint64_t cur_mem_page = dev_addr & ~MEM_WINDOW_SPAN_MASK; + res = fpgaWriteMMIO64(m_fpga_handle, 0, ase_bbb_base_addr + MEM_WINDOW_CRTL, cur_mem_page); + if (res != FPGA_OK) return res; + + uint64_t dev_aligned_addr = dev_addr - shift; + + // read data from device 
memory + uint64_t read_tmp; + res = fpgaReadMMIO64( + m_fpga_handle, 0, (ase_bbb_base_addr + MEM_WINDOW_MEM) + ((dev_aligned_addr)&MEM_WINDOW_SPAN_MASK), &read_tmp); + if (res != FPGA_OK) return res; + // overlay our data + memcpy_s_fast(((char *)(&read_tmp)) + shift, size, host_addr, size); + + // write back to device + res = fpgaWriteMMIO64( + m_fpga_handle, 0, (ase_bbb_base_addr + MEM_WINDOW_MEM) + (dev_aligned_addr & MEM_WINDOW_SPAN_MASK), read_tmp); + if (res != FPGA_OK) return res; + + return FPGA_OK; +} + +fpga_result mmd_dma::write_memory_mmio(const uint64_t *host_addr, size_t dev_addr, size_t size) { + DCP_DEBUG_DMA("DCP DEBUG: write_memory_mmio %p %lx %ld\n", host_addr, dev_addr, size); + + fpga_result res = FPGA_OK; + uint64_t cur_mem_page = dev_addr & ~MEM_WINDOW_SPAN_MASK; + res = fpgaWriteMMIO64(m_fpga_handle, 0, ase_bbb_base_addr + MEM_WINDOW_CRTL, cur_mem_page); + if (res != FPGA_OK) return res; + DCP_DEBUG_DMA("DCP DEBUG: set page %08lx\n", cur_mem_page); + for (size_t i = 0; i < size / 8; i++) { + uint64_t mem_page = dev_addr & ~MEM_WINDOW_SPAN_MASK; + if (mem_page != cur_mem_page) { + cur_mem_page = mem_page; + res = fpgaWriteMMIO64(m_fpga_handle, 0, ase_bbb_base_addr + MEM_WINDOW_CRTL, cur_mem_page); + if (res != FPGA_OK) return res; + DCP_DEBUG_DMA("DCP DEBUG: set page %08lx\n", cur_mem_page); + } + DCP_DEBUG_DMA("DCP DEBUG: write data %8p %08lx %016lx\n", host_addr, dev_addr, *host_addr); + res = fpgaWriteMMIO64( + m_fpga_handle, 0, (ase_bbb_base_addr + MEM_WINDOW_MEM) + (dev_addr & MEM_WINDOW_SPAN_MASK), *host_addr); + if (res != FPGA_OK) return res; + + host_addr += 1; + dev_addr += 8; + } + + if (size % 8 != 0) { + res = write_memory_mmio_unaligned(host_addr, dev_addr, size % 8); + if (res != FPGA_OK) return res; + } + + DCP_DEBUG_DMA("DCP DEBUG: aocl_mmd_write done!\n"); + return FPGA_OK; +} diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/mmd_dma.h 
b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/mmd_dma.h new file mode 100644 index 0000000..ff33aed --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/mmd_dma.h @@ -0,0 +1,97 @@ +/* (C) 1992-2017 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. 
*/ + +#ifndef _MMD_DMA_H +#define _MMD_DMA_H + +#pragma push_macro("_GNU_SOURCE") +#undef _GNU_SOURCE +#define _GNU_SOURCE +#include <sched.h> +#pragma pop_macro("_GNU_SOURCE") + +#include <opae/fpga.h> + +#include <mutex> + +#include "aocl_mmd.h" +#include "dma_work_thread.h" +#include "fpga_dma.h" + +namespace intel_opae_mmd { + +class eventfd_wrapper; + +class mmd_dma final { + public: + mmd_dma(fpga_handle fpga_handle_arg, + int mmd_handle, + uint64_t dfh_offset_arg, + uint64_t ase_bbb_addr_arg, + int interrupt_num_arg); + ~mmd_dma(); + + bool initialized() { return m_initialized; } + + fpga_result read_memory(aocl_mmd_op_t op, uint64_t *host_addr, size_t dev_addr, size_t size); + fpga_result write_memory(aocl_mmd_op_t op, const uint64_t *host_addr, size_t dev_addr, size_t size); + fpga_result do_dma(dma_work_item &item); + + void set_status_handler(aocl_mmd_status_handler_fn fn, void *user_data); + + // used after reconfigation + void reinit_dma(); + + void bind_to_node(void); + + private: + // Helper functions + fpga_result enqueue_dma(dma_work_item &item); + fpga_result read_memory(uint64_t *host_addr, size_t dev_addr, size_t size); + fpga_result write_memory(const uint64_t *host_addr, size_t dev_addr, size_t size); + fpga_result read_memory_mmio(uint64_t *host_addr, size_t dev_addr, size_t size); + fpga_result write_memory_mmio(const uint64_t *host_addr, size_t dev_addr, size_t size); + fpga_result write_memory_mmio_unaligned(const uint64_t *host_addr, size_t dev_addr, size_t size); + fpga_result read_memory_mmio_unaligned(void *host_addr, size_t dev_addr, size_t size); + + void event_update_fn(aocl_mmd_op_t op, int status); + + bool m_initialized; + + dma_work_thread *m_dma_work_thread; + std::mutex m_dma_op_mutex; + + aocl_mmd_status_handler_fn m_status_handler_fn; + void *m_status_handler_user_data; + + fpga_handle m_fpga_handle; + int m_mmd_handle; + + uint64_t dfh_offset; + int interrupt_num; + fpga_dma_handle dma_h; + uint64_t msgdma_bbb_base_addr; + 
uint64_t ase_bbb_base_addr; + + // not used and not implemented + mmd_dma(mmd_dma &other); + mmd_dma &operator=(const mmd_dma &other); +}; // class mmd_dma + +}; // namespace intel_opae_mmd + +#endif // _MMD_DMA_H diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/x86-sse2.S b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/x86-sse2.S new file mode 100644 index 0000000..e1fb5d3 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/x86-sse2.S @@ -0,0 +1,269 @@ +// From TinyMembench v0.4, with slight modifications for Windows. +/* + * Copyright © 2011 Siarhei Siamashka <siarhei.siamashka@gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#if defined(__i386__) || defined(__amd64__) + +.intel_syntax noprefix +.text + +#define PREFETCH_DISTANCE 256 + +.macro asm_function_helper function_name + .global \function_name +.func \function_name +\function_name: +#ifdef __amd64__ + #ifdef _WIN64 + .set DST, rcx + .set SRC, rdx + .set SIZE, r8 + #else + .set DST, rdi + .set SRC, rsi + .set SIZE, rdx + #endif +#else + mov eax, [esp + 4] + mov ecx, [esp + 8] + mov edx, [esp + 12] + .set DST, eax + .set SRC, ecx + .set SIZE, edx +#endif +.endm + +.macro asm_function function_name +#if defined(_WIN32) && !defined(_WIN64) + asm_function_helper _\function_name +#else + asm_function_helper \function_name +#endif +.endm + +.macro push3 a, b, c + push \a + push \b + push \c +.endm + +.macro pop3 a, b, c + pop \c + pop \b + pop \a +.endm + +/*****************************************************************************/ + +asm_function aligned_block_copy_movsb +0: +#ifdef __amd64__ + push3 rdi rsi rcx + push3 DST SRC SIZE + pop3 rdi rsi rcx + rep movsb + pop3 rdi rsi rcx +#else + push3 edi esi ecx + push3 DST SRC SIZE + pop3 edi esi ecx + rep movsb + pop3 edi esi ecx +#endif + ret +.endfunc + +asm_function aligned_block_copy_movsd +0: +#ifdef __amd64__ + push3 rdi rsi rcx + push3 DST SRC SIZE + pop3 rdi rsi rcx + sar rcx, 2 + rep movsd + pop3 rdi rsi rcx +#else + push3 edi esi ecx + push3 DST SRC SIZE + pop3 edi esi ecx + sar ecx, 2 + rep movsd + pop3 edi esi ecx +#endif + ret +.endfunc + +asm_function unaligned_block_copy_sse2 +0: + movdqu xmm0, [SRC + 0] + movdqu xmm1, [SRC + 16] + movdqu xmm2, [SRC + 32] + movdqu xmm3, [SRC + 48] + movdqu [DST + 0], xmm0 + movdqu [DST + 16], xmm1 + movdqu [DST + 32], xmm2 + movdqu [DST + 48], xmm3 + add SRC, 64 + add DST, 64 + sub SIZE, 64 + jg 0b + ret +.endfunc + +asm_function aligned_block_copy_sse2 +0: + movdqa xmm0, [SRC + 0] + movdqa xmm1, [SRC + 16] + movdqa xmm2, [SRC + 32] + movdqa xmm3, [SRC + 48] + movdqa [DST + 0], xmm0 + movdqa [DST + 16], xmm1 + movdqa [DST + 
32], xmm2 + movdqa [DST + 48], xmm3 + add SRC, 64 + add DST, 64 + sub SIZE, 64 + jg 0b + ret +.endfunc + +asm_function aligned_block_copy_nt_sse2 +0: + movdqa xmm0, [SRC + 0] + movdqa xmm1, [SRC + 16] + movdqa xmm2, [SRC + 32] + movdqa xmm3, [SRC + 48] + movntdq [DST + 0], xmm0 + movntdq [DST + 16], xmm1 + movntdq [DST + 32], xmm2 + movntdq [DST + 48], xmm3 + add SRC, 64 + add DST, 64 + sub SIZE, 64 + jg 0b + ret +.endfunc + +asm_function aligned_block_copy_pf32_sse2 +0: + prefetchnta [SRC + PREFETCH_DISTANCE] + prefetchnta [SRC + PREFETCH_DISTANCE + 32] + movdqa xmm0, [SRC + 0] + movdqa xmm1, [SRC + 16] + movdqa xmm2, [SRC + 32] + movdqa xmm3, [SRC + 48] + movdqa [DST + 0], xmm0 + movdqa [DST + 16], xmm1 + movdqa [DST + 32], xmm2 + movdqa [DST + 48], xmm3 + add SRC, 64 + add DST, 64 + sub SIZE, 64 + jg 0b + ret +.endfunc + +asm_function aligned_block_copy_nt_pf32_sse2 +0: + prefetchnta [SRC + PREFETCH_DISTANCE] + prefetchnta [SRC + PREFETCH_DISTANCE + 32] + movdqa xmm0, [SRC + 0] + movdqa xmm1, [SRC + 16] + movdqa xmm2, [SRC + 32] + movdqa xmm3, [SRC + 48] + movntdq [DST + 0], xmm0 + movntdq [DST + 16], xmm1 + movntdq [DST + 32], xmm2 + movntdq [DST + 48], xmm3 + add SRC, 64 + add DST, 64 + sub SIZE, 64 + jg 0b + ret +.endfunc + +asm_function aligned_block_copy_pf64_sse2 +0: + prefetchnta [SRC + PREFETCH_DISTANCE] + movdqa xmm0, [SRC + 0] + movdqa xmm1, [SRC + 16] + movdqa xmm2, [SRC + 32] + movdqa xmm3, [SRC + 48] + movdqa [DST + 0], xmm0 + movdqa [DST + 16], xmm1 + movdqa [DST + 32], xmm2 + movdqa [DST + 48], xmm3 + add SRC, 64 + add DST, 64 + sub SIZE, 64 + jg 0b + ret +.endfunc + +asm_function aligned_block_copy_nt_pf64_sse2 +0: + prefetchnta [SRC + PREFETCH_DISTANCE] + movdqa xmm0, [SRC + 0] + movdqa xmm1, [SRC + 16] + movdqa xmm2, [SRC + 32] + movdqa xmm3, [SRC + 48] + movntdq [DST + 0], xmm0 + movntdq [DST + 16], xmm1 + movntdq [DST + 32], xmm2 + movntdq [DST + 48], xmm3 + add SRC, 64 + add DST, 64 + sub SIZE, 64 + jg 0b + ret +.endfunc + +asm_function 
aligned_block_fill_sse2 + movdqa xmm0, [SRC + 0] +0: + movdqa [DST + 0], xmm0 + movdqa [DST + 16], xmm0 + movdqa [DST + 32], xmm0 + movdqa [DST + 48], xmm0 + add DST, 64 + sub SIZE, 64 + jg 0b + ret +.endfunc + +asm_function aligned_block_fill_nt_sse2 + movdqa xmm0, [SRC + 0] +0: + movntdq [DST + 0], xmm0 + movntdq [DST + 16], xmm0 + movntdq [DST + 32], xmm0 + movntdq [DST + 48], xmm0 + add DST, 64 + sub SIZE, 64 + jg 0b + ret +.endfunc + +/*****************************************************************************/ + +#endif diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/x86-sse2.h b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/x86-sse2.h new file mode 100644 index 0000000..6ebe2ef --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/x86-sse2.h @@ -0,0 +1,54 @@ +// From TinyMembench v0.4, with slight modifications for Windows. +/* + * Copyright © 2011 Siarhei Siamashka <siarhei.siamashka@gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#ifndef __X86_SSE2_H__ +#define __X86_SSE2_H__ + +#include <stdint.h> + +#ifdef __cplusplus +extern "C" { +#endif + +void aligned_block_copy_movsb(int64_t* __restrict dst, int64_t* __restrict src, int size); +void aligned_block_copy_movsd(int64_t* __restrict dst, int64_t* __restrict src, int size); + +void aligned_block_copy_sse2(int64_t* __restrict dst, int64_t* __restrict src, int size); +void unaligned_block_copy_sse2(int64_t* __restrict dst, int64_t* __restrict src, int size); +void aligned_block_copy_nt_sse2(int64_t* __restrict dst, int64_t* __restrict src, int size); + +void aligned_block_copy_pf32_sse2(int64_t* __restrict dst, int64_t* __restrict src, int size); +void aligned_block_copy_pf64_sse2(int64_t* __restrict dst, int64_t* __restrict src, int size); + +void aligned_block_copy_nt_pf32_sse2(int64_t* __restrict dst, int64_t* __restrict src, int size); +void aligned_block_copy_nt_pf64_sse2(int64_t* __restrict dst, int64_t* __restrict src, int size); + +void aligned_block_fill_sse2(int64_t* __restrict dst, int64_t* __restrict src, int size); + +void aligned_block_fill_nt_sse2(int64_t* __restrict dst, int64_t* __restrict src, int size); + +#ifdef __cplusplus +} +#endif +#endif diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/include/aocl_mmd.h b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/include/aocl_mmd.h new file mode 100644 index 0000000..edb46c7 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/include/aocl_mmd.h @@ -0,0 +1,489 @@ +#ifndef AOCL_MMD_H +#define AOCL_MMD_H + +/* (C) 1992-2019 Intel Corporation. 
*/ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* Support for memory mapped ACL devices. + * + * Typical API lifecycle, from the perspective of the caller. + * + * 1. aocl_mmd_open must be called first, to provide a handle for further + * operations. + * + * 2. The interrupt and status handlers must be set. + * + * 3. Read and write operations are performed. + * + * 4. aocl_mmd_close may be called to shut down the device. No further + * operations are permitted until a subsequent aocl_mmd_open call. + * + * aocl_mmd_get_offline_info can be called anytime including before + * open. aocl_mmd_get_info can be called anytime between open and close. 
+ */ + +#ifndef AOCL_MMD_CALL +#if defined(_WIN32) +#define AOCL_MMD_CALL __declspec(dllimport) +#else +#define AOCL_MMD_CALL __attribute__((visibility ("default"))) +#endif +#endif + +#ifndef WEAK +#if defined(_WIN32) +#define WEAK +#else +/* This normally comes with "__attribute__((weak))" but for reasons not presently + * understood, the shared library is not properly loaded on Ubuntu18 when the functions + * are weak. + */ +#define WEAK +#endif +#endif + +#include <cstddef> //size_t + +/* The MMD API's version - the runtime expects this string when + * AOCL_MMD_VERSION is queried. This changes only if the API has changed */ +#define AOCL_MMD_VERSION_STRING "18.1" + +/* Memory types that can be supported - bitfield. Other than physical memory + * these types closely align with the OpenCL SVM types. + * + * AOCL_MMD_PHYSICAL_MEMORY - The vendor interface includes IP to communicate + * directly with physical memory such as DDR, QDR, etc. + * + * AOCL_MMD_SVM_COARSE_GRAIN_BUFFER - The vendor interface includes support for + * caching SVM pointer data andy requires explicit function calls from the user + * to sychronize the cache between the host processor and the FPGA. This level + * of SVM is not currently supported by Altera except as a subset of + * SVM_FINE_GAIN_SYSTEM support. + * + * AOCL_MMD_SVM_FINE_GRAIN_BUFFER - The vendor interface includes support for + * caching SVM pointer data and requires additional information from the user + * and/or host runtime that can be collected during pointer allocation in order + * to sychronize the cache between the host processor and the FPGA. Once this + * additional data is provided for an SVM pointer, the vendor interface handles + * cache synchronization between the host processor & the FPGA automatically. + * This level of SVM is not currently supported by Altera except as a subset + * of SVM_FINE_GRAIN_SYSTEM support. 
+ * + * AOCL_MMD_SVM_FINE_GRAIN_SYSTEM - The vendor interface includes support for + * caching SVM pointer data and does not require any additional information to + * sychronize the cache between the host processor and the FPGA. The vendor + * interface handles cache synchronization between the host processor & the + * FPGA automatically for all SVM pointers. This level of SVM support is + * currently under development by Altera and some features may not be fully + * supported. + */ +#define AOCL_MMD_PHYSICAL_MEMORY (1 << 0) +#define AOCL_MMD_SVM_COARSE_GRAIN_BUFFER (1 << 1) +#define AOCL_MMD_SVM_FINE_GRAIN_BUFFER (1 << 2) +#define AOCL_MMD_SVM_FINE_GRAIN_SYSTEM (1 << 3) + +/* program modes - bitfield + * + * AOCL_MMD_PROGRAM_PRESERVE_GLOBAL_MEM - preserve contents of global memory + * when this bit is is set to 1. If programming can't occur without preserving + * global memory contents, the program function must fail, in which case the + * runtime may re-invoke program with this bit set to 0, allowing programming + * to occur even if doing so destroys global memory contents. + * + * more modes are reserved for stacking on in the future + */ +#define AOCL_MMD_PROGRAM_PRESERVE_GLOBAL_MEM (1 << 0) +typedef int aocl_mmd_program_mode_t; + +typedef void* aocl_mmd_op_t; + +typedef struct { + unsigned lo; /* 32 least significant bits of time value. */ + unsigned hi; /* 32 most significant bits of time value. */ +} aocl_mmd_timestamp_t; + +/* Defines the set of characteristics that can be probed about the board before + * opening a device. The type of data returned by each is specified in + * parentheses in the adjacent comment. + * + * AOCL_MMD_NUM_BOARDS and AOCL_MMD_BOARD_NAMES + * These two fields can be used to implement multi-device support. The MMD + * layer may have a list of devices it is capable of interacting with, each + * identified with a unique name. 
The length of the list should be returned + * in AOCL_MMD_NUM_BOARDS, and the names of these devices returned in + * AOCL_MMD_BOARD_NAMES. The OpenCL runtime will try to call aocl_mmd_open + * for each board name returned in AOCL_MMD_BOARD_NAMES. + * + * */ +typedef enum { + AOCL_MMD_VERSION = 0, /* Version of MMD (char*)*/ + AOCL_MMD_NUM_BOARDS = 1, /* Number of candidate boards (int)*/ + AOCL_MMD_BOARD_NAMES = 2, /* Names of boards available delimiter=; (char*)*/ + AOCL_MMD_VENDOR_NAME = 3, /* Name of vendor (char*) */ + AOCL_MMD_VENDOR_ID = 4, /* An integer ID for the vendor (int) */ + AOCL_MMD_USES_YIELD = 5, /* 1 if yield must be called to poll hw (int) */ + /* The following can be combined in a bit field: + * AOCL_MMD_PHYSICAL_MEMORY, AOCL_MMD_SVM_COARSE_GRAIN_BUFFER, AOCL_MMD_SVM_FINE_GRAIN_BUFFER, + * AOCL_MMD_SVM_FINE_GRAIN_SYSTEM. Prior to 14.1, all existing devices supported physical memory and no types of SVM + * memory, so this is the default when this operation returns '0' for board MMDs with a version prior to 14.1 + */ + AOCL_MMD_MEM_TYPES_SUPPORTED = 6, +} aocl_mmd_offline_info_t; + +/* Defines the set of characteristics that can be probed about the board after + * opening a device. This can involve communication to the device + * + * AOCL_MMD_NUM_KERNEL_INTERFACES - The number of kernel interfaces, usually 1 + * + * AOCL_MMD_KERNEL_INTERFACES - the handle for each kernel interface. + * param_value will have size AOCL_MMD_NUM_KERNEL_INTERFACES * sizeof int + * + * AOCL_MMD_PLL_INTERFACES - the handle for each pll associated with each + * kernel interface. 
If a kernel interface is not clocked by acl_kernel_clk + * then return -1 + * + * */ +typedef enum { + AOCL_MMD_NUM_KERNEL_INTERFACES = 1, /* Number of Kernel interfaces (int) */ + AOCL_MMD_KERNEL_INTERFACES = 2, /* Kernel interface (int*) */ + AOCL_MMD_PLL_INTERFACES = 3, /* Kernel clk handles (int*) */ + AOCL_MMD_MEMORY_INTERFACE = 4, /* Global memory handle (int) */ + AOCL_MMD_TEMPERATURE = 5, /* Temperature measurement (float) */ + AOCL_MMD_PCIE_INFO = 6, /* PCIe information (char*) */ + AOCL_MMD_BOARD_NAME = 7, /* Name of board (char*) */ + AOCL_MMD_BOARD_UNIQUE_ID = 8, /* Unique ID of board (int) */ + AOCL_MMD_CONCURRENT_READS = 9, /* # of parallel reads; 1 is serial*/ + AOCL_MMD_CONCURRENT_WRITES = 10, /* # of parallel writes; 1 is serial*/ + AOCL_MMD_CONCURRENT_READS_OR_WRITES = 11 /* total # of concurent operations read + writes*/ +} aocl_mmd_info_t; + +typedef struct { + unsigned long long int exception_type; + void* user_private_info; + size_t user_cb; +} aocl_mmd_interrupt_info; + +typedef void (*aocl_mmd_interrupt_handler_fn)(int handle, void* user_data); +typedef void (*aocl_mmd_device_interrupt_handler_fn)(int handle, aocl_mmd_interrupt_info* data_in, void* user_data); +typedef void (*aocl_mmd_status_handler_fn)(int handle, void* user_data, aocl_mmd_op_t op, int status); + +/* Get information about the board using the enum aocl_mmd_offline_info_t for + * offline info (called without a handle), and the enum aocl_mmd_info_t for + * info specific to a certain board. + * Arguments: + * + * requested_info_id - a value from the aocl_mmd_offline_info_t enum + * + * param_value_size - size of the param_value field in bytes. This should + * match the size of the return type expected as indicated in the enum + * definition. For example, the AOCL_MMD_TEMPERATURE returns a float, so + * the param_value_size should be set to sizeof(float) and you should + * expect the same number of bytes returned in param_size_ret. 
+ * + * param_value - pointer to the variable that will receive the returned info + * + * param_size_ret - receives the number of bytes of data actually returned + * + * Returns: a negative value to indicate error. + */ +AOCL_MMD_CALL int aocl_mmd_get_offline_info(aocl_mmd_offline_info_t requested_info_id, + size_t param_value_size, + void* param_value, + size_t* param_size_ret) WEAK; + +AOCL_MMD_CALL int aocl_mmd_get_info(int handle, + aocl_mmd_info_t requested_info_id, + size_t param_value_size, + void* param_value, + size_t* param_size_ret) WEAK; + +/* Open and initialize the named device. + * + * The name is typically one specified by the AOCL_MMD_BOARD_NAMES offline + * info. + * + * Arguments: + * name - open the board with this name (provided as a C-style string, + * i.e. NUL terminated ASCII.) + * + * Returns: the non-negative integer handle for the board, otherwise a + * negative value to indicate error. Upon receiving the error, the OpenCL + * runtime will proceed to open other known devices, hence the MMD mustn't + * exit the application if an open call fails. + */ +AOCL_MMD_CALL int aocl_mmd_open(const char* name) WEAK; + +/* Close an opened device, by its handle. + * Returns: 0 on success, negative values on error. + */ +AOCL_MMD_CALL int aocl_mmd_close(int handle) WEAK; + +/* Set the interrupt handler for the opened device. + * The interrupt handler is called whenever the client needs to be notified + * of an asynchronous event signalled by the device internals. + * For example, the kernel has completed or is stalled. + * + * Important: Interrupts from the kernel must be ignored until this handler is + * set + * + * Arguments: + * fn - the callback function to invoke when a kernel interrupt occurs + * user_data - the data that should be passed to fn when it is called. 
+ * + * Returns: 0 if successful, negative on error + */ +AOCL_MMD_CALL int aocl_mmd_set_interrupt_handler(int handle, aocl_mmd_interrupt_handler_fn fn, void* user_data) WEAK; + +/* Set the device interrupt handler for the opened device. + * The device interrupt handler is called whenever the client needs to be notified + * of a device event signalled by the device internals. + * For example, an ECC error has been reported. + * + * Important: Interrupts from the device must be ignored until this handler is + * set + * + * Arguments: + * fn - the callback function to invoke when a device interrupt occurs + * user_data - the data that should be passed to fn when it is called. + * + * Returns: 0 if successful, negative on error + */ +AOCL_MMD_CALL int aocl_mmd_set_device_interrupt_handler(int handle, + aocl_mmd_device_interrupt_handler_fn fn, + void* user_data) WEAK; + +/* Set the operation status handler for the opened device. + * The operation status handler is called with + * status 0 when the operation has completed successfully. + * status negative when the operation completed with errors. + * + * Arguments: + * fn - the callback function to invoke when a status update is to be + * performed. + * user_data - the data that should be passed to fn when it is called. + * + * Returns: 0 if successful, negative on error + */ +AOCL_MMD_CALL int aocl_mmd_set_status_handler(int handle, aocl_mmd_status_handler_fn fn, void* user_data) WEAK; + +/* If AOCL_MMD_USES_YIELD is 1, this function is called when the host is idle + * and hence possibly waiting for events to be processed by the device. + * If AOCL_MMD_USES_YIELD is 0, this function is never called and the MMD is + * assumed to provide status/event updates via some other execution thread + * such as through an interrupt handler. 
+ * + * Returns: non-zero if the yield function performed useful work such as + * processing DMA transactions, 0 if there is no useful work to be performed + * + * NOTE: yield may be called continuously as long as it reports that it has useful work + */ +AOCL_MMD_CALL int aocl_mmd_yield(int handle) WEAK; + +/* Read, write and copy operations on a single interface. + * If op is NULL + * - Then these calls must block until the operation is complete. + * - The status handler is not called for this operation. + * + * If op is non-NULL, then: + * - These may be non-blocking calls + * - The status handler must be called upon completion, with status 0 + * for success, and a negative value for failure. + * + * Arguments: + * op - the operation object used to track this operations progress + * + * len - the size in bytes to transfer + * + * src - the host buffer being read from + * + * dst - the host buffer being written to + * + * mmd_interface - the handle to the interface being accessed. E.g. To + * access global memory this handle will be whatever is returned by + * aocl_mmd_get_info when called with AOCL_MMD_MEMORY_INTERFACE. + * + * offset/src_offset/dst_offset - the byte offset within the interface that + * the transfer will begin at. + * + * The return value is 0 if the operation launch was successful, and + * negative otherwise. + */ +AOCL_MMD_CALL int aocl_mmd_read( + int handle, aocl_mmd_op_t op, size_t len, void* dst, int mmd_interface, size_t offset) WEAK; +AOCL_MMD_CALL int aocl_mmd_write( + int handle, aocl_mmd_op_t op, size_t len, const void* src, int mmd_interface, size_t offset) WEAK; +AOCL_MMD_CALL int aocl_mmd_copy( + int handle, aocl_mmd_op_t op, size_t len, int mmd_interface, size_t src_offset, size_t dst_offset) WEAK; + +/* Host Channel create operation + * Opens channel between host and kernel. + * + * Arguments: + * channel_name - name of channel to initialize. 
Same name as used in board_spec.xml + * + * queue_depth - the size in bytes of pinned memory queue in system memory + * + * direction - the direction of the channel + * + * The return value is negative if initialization was unsuccessful, and + * positive otherwise. Positive return value is handle to the channel to be used for + * subsequent calls for the channel. + */ +AOCL_MMD_CALL int aocl_mmd_hostchannel_create(int handle, char* channel_name, size_t queue_depth, int direction) WEAK; + +/* Host Channel destroy operation + * Closes channel between host and kernel. + * + * Arguments: + * channel - the handle to the channel to close, that was obtained with + * create channel + * + * The return value is 0 if the destroy was successful, and negative + * otherwise. + */ +AOCL_MMD_CALL int aocl_mmd_hostchannel_destroy(int handle, int channel) WEAK; + +/* Host Channel get buffer operation + * Provide host with pointer to buffer they can access to to write or + * read from kernel, along with space or data available in the buffer + * in bytes. + * + * Arguments: + * channel - the handle to the channel to get the buffer for + * + * buffer_size - the address that this call will write the amount of + * space or data that's available in the buffer, + * depending on direction of the channel, in bytes + * + * status - the address that this call will write to for result of this + * call. Value will be 0 for success, and negative otherwise + * + * The return value is the pointer to the buffer that host can write + * to or read from. NULL if the status is negative. + */ +AOCL_MMD_CALL void* aocl_mmd_hostchannel_get_buffer(int handle, int channel, size_t* buffer_size, int* status) WEAK; + +/* Host Channel acknowledge buffer operation + * Acknowledge to the channel that the user has written or read data from + * it. This will make the data or additional buffer space available to + * write to or read from kernel. 
+ * + * Arguments: + * channel - the handle to the channel that user is acknowledging + * + * send_size - the size in bytes that the user is acknowledging + * + * status - the address that this call will write to for result of this + * call. Value will be 0 for success, and negative otherwise + * + * The return value is equal to send_size if send_size was less than or + * equal to the buffer_size from get buffer call. If send_size was + * greater, then return value is the amount that was actually sent. + */ +AOCL_MMD_CALL size_t aocl_mmd_hostchannel_ack_buffer(int handle, int channel, size_t send_size, int* status) WEAK; + +/* Program the device + * + * The host will guarantee that no operations are currently executing on the + * device. That means the kernels will be idle and no read/write/copy + * commands are active. Interrupts should be disabled and the FPGA should + * be reprogrammed with the data from user_data which has size size. The host + * will then call aocl_mmd_set_status_handler and aocl_mmd_set_interrupt_handler + * again. At this point interrupts can be enabled. + * + * The new handle to the board after reprogram does not have to be the same as + * the one before. + * + * Arguments: + * user_data - The binary contents of the fpga.bin file created during + * Quartus II compilation. + * size - the size in bytes of user_data + * program_mode - bit field for programming attributes. See + * aocl_mmd_program_mode_t definition + * + * Returns: the new non-negative integer handle for the board, otherwise a + * negative value to indicate error. + */ +AOCL_MMD_CALL int aocl_mmd_program(int handle, void* user_data, size_t size, aocl_mmd_program_mode_t program_mode) WEAK; + +/* Shared memory allocator + * Allocates memory that is shared between the host and the FPGA. The + * host will access this memory using the pointer returned by + * aocl_mmd_shared_mem_alloc, while the FPGA will access the shared memory + * using device_ptr_out. 
If shared memory is not supported this should return + * NULL. + * + * Shared memory survives FPGA reprogramming if the CPU is not rebooted. + * + * Arguments: + * size - the size of the shared memory to allocate + * device_ptr_out - will receive the pointer value used by the FPGA (the device) + * to access the shared memory. Cannot be NULL. The type is + * unsigned long long to handle the case where the host has a + * smaller pointer size than the device. + * + * Returns: The pointer value to be used by the host to access the shared + * memory if successful, otherwise NULL. + */ +AOCL_MMD_CALL void* aocl_mmd_shared_mem_alloc(int handle, size_t size, unsigned long long* device_ptr_out) WEAK; + +/* Shared memory de-allocator + * Frees previously allocated shared memory. If shared memory is not supported, + * this function should do nothing. + * + * Arguments: + * host_ptr - the host pointer that points to the shared memory, as returned by + * aocl_mmd_shared_mem_alloc + * size - the size of the shared memory to free. Must match the size + * originally passed to aocl_mmd_shared_mem_alloc + */ +AOCL_MMD_CALL void aocl_mmd_shared_mem_free(int handle, void* host_ptr, size_t size) WEAK; + +/* DEPRECATED. Use aocl_mmd_program instead + * This reprogram API is only for mmd versions prior to 18.1 + */ +AOCL_MMD_CALL int aocl_mmd_reprogram(int handle, void* user_data, size_t size) WEAK; + +// CoreDLA modifications +// To support multiple different FPGA boards, anything board specific must be implemented in a +// board-specific MMD instead of the CoreDLA runtime layer. 
+#ifdef DLA_MMD +#include <cstdint> +// Query functions to get board-specific values +AOCL_MMD_CALL int dla_mmd_get_max_num_instances() WEAK; +AOCL_MMD_CALL uint64_t dla_mmd_get_ddr_size_per_instance() WEAK; +AOCL_MMD_CALL double dla_mmd_get_ddr_clock_freq() WEAK; + +// Wrappers around CSR and DDR reads and writes to abstract away board-specific offsets +AOCL_MMD_CALL int dla_mmd_csr_write(int handle, int instance, uint64_t addr, const uint32_t* data) WEAK; +AOCL_MMD_CALL int dla_mmd_csr_read(int handle, int instance, uint64_t addr, uint32_t* data) WEAK; +AOCL_MMD_CALL int dla_mmd_ddr_write(int handle, int instance, uint64_t addr, uint64_t length, const void* data) WEAK; +AOCL_MMD_CALL int dla_mmd_ddr_read(int handle, int instance, uint64_t addr, uint64_t length, void* data) WEAK; + +// Get the clk_dla PLL clock frequency in MHz, returns a negative value if there is an error +AOCL_MMD_CALL double dla_mmd_get_coredla_clock_freq(int handle) WEAK; +#endif + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/.gitignore b/python/openvino/runtime/coredla_device/mmd/de10_agilex/.gitignore new file mode 100644 index 0000000..66e06bf --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/.gitignore @@ -0,0 +1,18 @@ +*~ +*# +*.marks +release_build/ +build/ +example_designs/mem_bandwidth/bin/ +example_designs/mem_bandwidth/simulation.tar.gz +example_designs/mem_bandwidth/temp_simulation/ +linux64/lib/ +linux64/libexec/diagnose +linux64/libexec/program +ase/mpf_src +*.pyc +*.swp +*.kwlp +*.kwps +temp_simulation/ +simulation.tar.gz diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/.sync_master b/python/openvino/runtime/coredla_device/mmd/de10_agilex/.sync_master new file mode 100644 index 0000000..835c7e0 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/.sync_master @@ -0,0 +1 @@ +sc diff --git 
a/python/openvino/runtime/coredla_device/mmd/de10_agilex/CMakeLists.txt b/python/openvino/runtime/coredla_device/mmd/de10_agilex/CMakeLists.txt new file mode 100644 index 0000000..e7e4584 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/CMakeLists.txt @@ -0,0 +1,144 @@ +# (C) 2017 Intel Corporation. All rights reserved. +# Your use of Intel Corporation's design tools, logic functions and other +# software and tools, and its AMPP partner logic functions, and any output +# files any of the foregoing (including device programming or simulation +# files), and any associated documentation or information are expressly subject +# to the terms and conditions of the Intel Program License Subscription +# Agreement, Intel MegaCore Function License Agreement, or other applicable +# license agreement, including, without limitation, that your use is for the +# sole purpose of programming logic devices manufactured by Intel and sold by +# Intel or its authorized distributors. Please refer to the applicable +# agreement for further details. 
+ +cmake_minimum_required(VERSION 2.8.12) +project(mmd) + +set(CMAKE_POSITION_INDEPENDENT_CODE ON) + +# DLA specific modifications made to the MMD +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDLA_MMD") + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DACL_MAX_DEVICE=128") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DOPTION3=1 -DACL_USE_DMA=1") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DACL_HAS_STDLIB_STDIO") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DACL_OPENCL_HOST_BIT=64 -DACL_TARGET_BIT=64") + +# Select PCIE Gen3 x16 +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGEN3_x16") + +if (WIN32) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /DAOCL_MMD_CALL=__declspec(dllexport)") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DACL_COMPILER_IS_MSVC=1 -DACL_HOST_RUNTIME_IS_STATIC=1") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DACL_OPENCL_HOST_SYS=windows -DACL_TARGET_SYS=windows -DWINDOWS") +endif() + +# from the opencl makefile +if (NOT WIN32) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DKERNEL_64BIT -O3 -DACL_COMPILER_IS_MSVC=0 -DACL_HOST_RUNTIME_IS_STATIC=0") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wno-unknown-pragmas -fstack-protector -Wformat -Wformat-security -D_GLIBCXX_USE_CXX11_ABI=0 -O2") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DACL_OPENCL_HOST_SYS=linux -DACL_TARGET_SYS=linux -DLINUX") + # Release build only + set (CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -D_FORTIFY_SOURCE=2") +endif() + +enable_language(C ASM) + +set(ASM_OPTIONS "-x assembler-with-cpp") +if(${CMAKE_C_COMPILER_ID} STREQUAL "Clang") + set(ASM_OPTIONS "${ASM_OPTIONS} -no-integrated-as") +endif() + +set(CMAKE_ASM_FLAGS "${CFLAGS} ${ASM_OPTIONS}") + +set(MMD_SRC + ./host/acl_pcie_config.cpp + ./host/acl_pcie.cpp + ./host/acl_pcie_debug.cpp + ./host/acl_pcie_device.cpp + ./host/acl_pcie_dma_linux.cpp + ./host/acl_pcie_dma_windows.cpp + ./host/acl_pcie_hostch.cpp + ./host/acl_pcie_mm_io.cpp + ./host/acl_pcie_timer.cpp +) + +add_library(de10_agilex_mmd SHARED ${MMD_SRC}) + 
+target_include_directories(de10_agilex_mmd PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include) +if (WIN32) + # Terrasic production BSP Linux kernel space driver header files + set(TERASIC_KERNEL_HEADER_DIR $ENV{AOCL_BOARD_PACKAGE_ROOT}/linux64/driver) + set(TERASIC_KERNEL_HEADER_FILES + fpga_cmd_guids.h + hw_host_channel.h + hw_pcie_constants.h + hw_pcie_dma.h + ) + if (EXISTS ${TERASIC_KERNEL_HEADER_DIR}) + foreach(header ${TERASIC_KERNEL_HEADER_FILES}) + if (EXISTS ${TERASIC_KERNEL_HEADER_DIR}/${header}) + file(COPY ${TERASIC_KERNEL_HEADER_DIR}/${header} DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/include) + else() + message(WARNING "Header file ${header} does not exist in ${TERASIC_KERNEL_HEADER_DIR}") + endif() + endforeach() + else() + message(FATAL_ERROR "Source directory ${TERASIC_KERNEL_HEADER_DIR} does not exist.") + endif() + + set(HW_PCI_DMA_H ${CMAKE_CURRENT_SOURCE_DIR}/include/hw_pcie_dma.h) + file(READ ${HW_PCI_DMA_H} HW_PCI_DMA_H_CONTENT) + # Remove any end-of-line whitespace from the file content (spaces and tabs) + string(REGEX REPLACE "[ \t]+(\r?\n)" "\\1" HW_PCI_DMA_H_CONTENT "${HW_PCI_DMA_H_CONTENT}") + set(OLD_CODE_BLOCK +"PACK( +struct DMA_DESC_ENTRY { + UINT32 src_addr_ldw; + UINT32 src_addr_udw; + UINT32 dest_addr_ldw; + UINT32 dest_addr_udw; + UINT32 ctl_dma_len; + UINT32 reserved[3]; +});") + set(NEW_CODE_BLOCK +"#if defined(GEN3_x8) +PACK( +struct DMA_DESC_ENTRY { + UINT32 src_addr_ldw; + UINT32 src_addr_udw; + UINT32 dest_addr_ldw; + UINT32 dest_addr_udw; + UINT32 ctl_dma_len; + UINT32 reserved[3]; +}); +#elif defined(GEN3_x16) +PACK( +struct DMA_DESC_ENTRY { + UINT64 src_addr; + UINT64 dst_addr; + UINT32 ctrl; + UINT32 reserved[3]; +}); +#endif") + string(REPLACE "${OLD_CODE_BLOCK}" "${NEW_CODE_BLOCK}" HW_PCI_DMA_H_CONTENT "${HW_PCI_DMA_H_CONTENT}") + file(WRITE ${HW_PCI_DMA_H} "${HW_PCI_DMA_H_CONTENT}") + + set_target_properties(de10_agilex_mmd PROPERTIES LINK_FLAGS "-subsystem:console -nologo -fixed:no -incremental:no -opt:noref -ignore:4089 
/NXCOMPAT /DYNAMICBASE") + + find_library(ACL_CHECK_SYS_CMD_LIB + acl_check_sys_cmd + PATHS ${CMAKE_CURRENT_SOURCE_DIR}/lib/win64) + find_library(FPGA_LIB + FpgaLib + PATHS ${CMAKE_CURRENT_SOURCE_DIR}/lib/win64) + + target_link_libraries(de10_agilex_mmd ${ACL_CHECK_SYS_CMD_LIB} ${FPGA_LIB}) +else() + target_link_libraries(de10_agilex_mmd) +endif() + +install(TARGETS de10_agilex_mmd + RUNTIME DESTINATION "dla/runtime/bin" COMPONENT de10_agilex_mmd + LIBRARY DESTINATION "dla/runtime/lib" COMPONENT de10_agilex_mmd + ARCHIVE DESTINATION "dla/runtime/lib" COMPONENT de10_agilex_mmd +) diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie.cpp b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie.cpp new file mode 100644 index 0000000..527d8bf --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie.cpp @@ -0,0 +1,951 @@ +// (c) 1992-2021 Intel Corporation. +// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words +// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. +// and/or other countries. Other marks and brands may be claimed as the property +// of others. 
See Trademarks on intel.com for full list of Intel trademarks or +// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) +// Your use of Intel Corporation's design tools, logic functions and other +// software and tools, and its AMPP partner logic functions, and any output +// files any of the foregoing (including device programming or simulation +// files), and any associated documentation or information are expressly subject +// to the terms and conditions of the Altera Program License Subscription +// Agreement, Intel MegaCore Function License Agreement, or other applicable +// license agreement, including, without limitation, that your use is for the +// sole purpose of programming logic devices manufactured by Intel and sold by +// Intel or its authorized distributors. Please refer to the applicable +// agreement for further details. + +/* ===- acl_pcie.cpp ------------------------------------------------- C++ -*-=== */ +/* */ +/* Intel(R) OpenCL MMD Driver */ +/* */ +/* ===-------------------------------------------------------------------------=== */ +/* */ +/* This file implements the functions that are defined in aocl_mmd.h */ +/* */ +/* ===-------------------------------------------------------------------------=== */ + +// common and its own header files +#include "acl_pcie.h" + +// other header files inside MMD driver +#include "acl_pcie_debug.h" +#include "acl_pcie_device.h" +#include "hw_pcie_constants.h" +#ifndef DLA_MMD +#include "acl_check_sys_cmd.h" +#endif + +// other standard header files +#include <stdarg.h> +#include <stdlib.h> +#include <string.h> + +#include <map> +#include <sstream> +#include <string> +#include <utility> + +#ifdef DLA_MMD +#include <chrono> +#include <thread> +#endif + +#if defined(LINUX) +#include <fcntl.h> +#include <semaphore.h> +#include <signal.h> +#include <unistd.h> +#endif // LINUX + +// MAX size of line read from pipe-ing the output of system call to MMD +#define BUF_SIZE 1024 
+// MAX size of command passed to system for invoking system call from MMD +#define SYSTEM_CMD_SIZE 4 * 1024 + +#ifndef DLA_MMD +// static helper functions +static bool blob_has_elf_signature(void *data, size_t data_size); +#endif + +// global variables used for handling multi-devices and its helper functions +// Use a DeviceMapManager to manage a heap-allocated map for storing device information +// instead of using a static global map because of a segmentation fault which occurs in +// the following situation: +// 1) Host program contains a global variable which calls clReleaseContext in its destructor. +// When the program ends the global goes out of scope and the destructor is called. +// 2) clReleaseContext calls a function in the MMD library which modifies the static global map in +// the MMD library. +// In this situation it was discovered that the destructor of the static global map is called before +// the destructor of the global in the host program, thus resulting in a segmentation fault when +// clReleaseContext calls a function that modifies the internal map after it has been destroyed. +// Using a heap-allocated map avoids this issue as the lifetime of the map persists until it is +// deleted or the process is completely terminated. +class DeviceMapManager { + public: + typedef std::pair<const std::string, ACL_PCIE_DEVICE *> DeviceInfo; + typedef std::map<int, DeviceInfo> DeviceMap; + + static inline bool empty() { return !s_device_map; } + + // Returns the underlying device map. The map must not be empty when this is called. + static inline const DeviceMap &get_device_map() { + ACL_PCIE_ASSERT(s_device_map, "no devices are open -- aborting\n"); + return *s_device_map; + } + + // Returns the device info associated with the given handle. The handle must exist. + static inline const DeviceInfo &get_pcie_device_info(int handle) { return get_device_it_for_handle(handle)->second; } + + // Returns the device associated with the given handle. 
The handle must exist. + static inline ACL_PCIE_DEVICE *get_pcie_device(int handle) { return get_pcie_device_info(handle).second; } + + // Adds a device with the specified name for the given handle. If a device with the same handle already exists + // it is discarded first. The caller must ensure they don't associate the same device with multiple handles. + static inline void add_pcie_device_handle(int handle, const std::string &name, ACL_PCIE_DEVICE *dev) { + // To avoid memory leaks ensure that only this function ever allocates a new device map because + // we only ever delete the map when the size of the map goes from non-empty to empty. + if (!s_device_map) s_device_map = new DeviceMap(); + + if (s_device_map->count(handle)) discard_pcie_device_handle(handle); + s_device_map->insert(std::pair<int, DeviceInfo>(handle, DeviceInfo(name, dev))); + } + + // Removes the device associated with the given handle. The handle must exist. + static inline void discard_pcie_device_handle(int handle) { + DeviceMap::iterator it = get_device_it_for_handle(handle); + + delete it->second.second; + s_device_map->erase(it); + if (s_device_map->empty()) { + // From a functional perspective the map can remain allocated for + // the entire lifetime the MMD is loaded but there + // is no other good place to clean it up except here. + delete s_device_map; + s_device_map = NULL; + } + } + + // Removes all devices. + static inline void discard_all_pcie_device_handles() { + if (!s_device_map) return; + + for (DeviceMapManager::DeviceMap::iterator it = s_device_map->begin(); it != s_device_map->end(); ++it) { + delete it->second.second; + } + + delete s_device_map; + s_device_map = NULL; + } + + // Returns true if any device is currently being programmed. 
+ static inline bool is_any_device_being_programmed() { + if (!s_device_map) return false; + + for (DeviceMap::iterator it = s_device_map->begin(); it != s_device_map->end(); ++it) { + if (it->second.second->is_being_programmed()) { + return true; + } + } + return false; + } + + private: + static inline DeviceMap::iterator get_device_it_for_handle(int handle) { + ACL_PCIE_ASSERT(s_device_map, "can't find handle %d -- aborting\n", handle); + DeviceMap::iterator it = s_device_map->find(handle); + ACL_PCIE_ASSERT(it != s_device_map->end(), "can't find handle %d -- aborting\n", handle); + return it; + } + + static DeviceMap *s_device_map; +}; +DeviceMapManager::DeviceMap *DeviceMapManager::s_device_map = NULL; + +static int test_device_exception_signal_number = 63; + +// Functions for handling interrupts or signals for multiple devices +// These functions are used inside the ACL_PCIE_DEVICE class +#if defined(WINDOWS) +void pcie_interrupt_handler(void *data) { + ACL_PCIE_DEVICE *device = static_cast<ACL_PCIE_DEVICE *>(data); + device->service_interrupt(); +} + +BOOL ctrl_c_handler(DWORD fdwCtrlType) { + if (fdwCtrlType != CTRL_C_EVENT) return FALSE; + + if (DeviceMapManager::is_any_device_being_programmed()) { + ACL_PCIE_INFO("The device is still being programmed, cannot terminate at this point.\n"); + return TRUE; + } + + // On Windows, the signal handler function is executed by another thread, + // so we cannot simply free all the open devices. + // Just exit when a ctrl-c event is received; the OS will take care of the clean-up. + exit(1); +} +#endif // WINDOWS +#if defined(LINUX) +// On Linux, driver will send a SIG_INT_NOTIFY *signal* to notify about an interrupt. 
+void pcie_linux_signal_handler(int sig, siginfo_t *info, void *unused) { + // test_device_exception_signal_number is reserved for device exception testing + if (sig == test_device_exception_signal_number) { + ACL_PCIE_ERROR_IF(DeviceMapManager::get_device_map().empty(), + return, + "No devices available to trigger test_device_exception_signal_number on.\n"); + // Pick the last (most recent) handle for device exception testing + unsigned int handle = DeviceMapManager::get_device_map().rbegin()->first; + DeviceMapManager::get_pcie_device(handle)->test_trigger_device_interrupt(); + } else { + // the last bit indicates the DMA completion + unsigned int irq_type_flag = info->si_int & 0x1; + // other bits shows the handle value of the device that sent the interrupt + unsigned int handle = info->si_int >> 1; + if (DeviceMapManager::empty() || !DeviceMapManager::get_device_map().count(handle)) { + ACL_PCIE_DEBUG_MSG(":: received an unknown handle %d in signal handler, ignore this.\n", handle); + return; + } + + DeviceMapManager::get_pcie_device(handle)->service_interrupt(irq_type_flag); + } +} + +void ctrl_c_handler(int sig_num) { + if (DeviceMapManager::is_any_device_being_programmed()) { + ACL_PCIE_INFO("The device is still being programmed, cannot terminate at this point.\n"); + return; + } + + // Free all the resource allocated for open devices before exiting the program. + // It also notifies the kernel driver about the termination of the program, + // so that the kernel driver won't try to talk to any user-allocated memory + // space (mainly for the DMA) after the program exit. 
+ DeviceMapManager::discard_all_pcie_device_handles(); + exit(1); +} + +void abort_signal_handler(int sig_num) { + DeviceMapManager::discard_all_pcie_device_handles(); + exit(1); +} + +int allocate_and_register_linux_signal_number_helper(int pid) { + char buffer[4096], *locOfSigCgt; + FILE *fp; + int bytes_read, status, ret = -1; + unsigned long long sigmask = 0; + struct sigaction sigusr {}, sigabrt {}; + + snprintf(buffer, sizeof(buffer), "/proc/%d/status", pid); + fp = fopen(buffer, "rb"); + ACL_PCIE_ERROR_IF(fp == NULL, return -1, "Unable to open file %s\n", buffer); + bytes_read = fread(buffer, sizeof(buffer[0]), sizeof(buffer) - 1, fp); + fclose(fp); + buffer[bytes_read] = 0; // null terminate the string + locOfSigCgt = strstr(buffer, "SigCgt:"); // returns null if can't find, shouldn't happen + ACL_PCIE_ERROR_IF(locOfSigCgt == NULL, return -1, "Did not find SigCgt: for PID %d\n", pid); + sscanf(locOfSigCgt + 7, "%llx", &sigmask); + + // Find an unused signal number + for (int i = SIGRTMAX; i >= SIGRTMIN; i--) { + if (!((sigmask >> (i - 1)) & 1)) { + ret = i; + break; + } + } + ACL_PCIE_ERROR_IF(ret == -1, return -1, "Unable to find an unused signal number\n"); + + // Enable if driver is using signals to communicate with the host. 
+  // Route the allocated signal number to the PCIe event handler (SA_SIGINFO form).
+  sigusr.sa_sigaction = pcie_linux_signal_handler;
+  sigusr.sa_flags = SA_SIGINFO;
+  status = sigaction(ret, &sigusr, NULL);
+  if (getenv("ACL_MMD_TEST_INTELFPGA")) {
+    // Test mode additionally claims test_device_exception_signal_number; fail if already taken.
+    ACL_PCIE_ERROR_IF(((sigmask >> (test_device_exception_signal_number - 1)) & 1),
+                      return -1,
+                      "Signal number %i cannot be occupied\n",
+                      test_device_exception_signal_number);
+    status = sigaction(test_device_exception_signal_number, &sigusr, NULL);
+  }
+  ACL_PCIE_ERROR_IF(status != 0, return -1, "sigaction failed with status %d, signal number %d\n", status, ret);
+
+  // Install signal handler for SIGABRT from assertions in the upper layers
+  sigabrt.sa_handler = abort_signal_handler;
+  sigemptyset(&sigabrt.sa_mask);
+  sigabrt.sa_flags = 0;
+  status = sigaction(SIGABRT, &sigabrt, NULL);
+  ACL_PCIE_ERROR_IF(status != 0, return -1, "sigaction failed with status %d, signal number %d\n", status, SIGABRT);
+
+  // if it makes it here, the user got an unused signal number and we installed all signal handlers
+  return ret;
+}
+
+// returns an unused signal number, -1 means ran into some error
+// Mutex-guarded wrapper: all early-return paths live in the helper so this
+// function always releases the mutex it took.
+int allocate_and_register_linux_signal_number(pthread_mutex_t *mutex) {
+  int pid = getpid();
+  int err = pthread_mutex_lock(mutex);
+  ACL_PCIE_ERROR_IF(err != 0, return -1, "pthread_mutex_lock error %d\n", err);
+
+  // this has multiple return points, put in separate function so that we don't bypass releasing the mutex
+  int ret = allocate_and_register_linux_signal_number_helper(pid);
+
+  err = pthread_mutex_unlock(mutex);
+  ACL_PCIE_ERROR_IF(err != 0, return -1, "pthread_mutex_unlock error %d\n", err);
+
+  return ret;
+}
+#endif // LINUX
+
+// Function to install the signal handler for Ctrl-C
+// If ignore_sig != 0, the ctrl-c signal will be ignored by the program
+// If ignore_sig = 0, the custom signal handler (ctrl_c_handler) will be used
+// NOTE(review): the parameter name "ingore_sig" is a typo for "ignore_sig";
+// kept as-is because the same spelling is used at the extern declaration site.
+int install_ctrl_c_handler(int ingore_sig) {
+#if defined(WINDOWS)
+  SetConsoleCtrlHandler((ingore_sig ?
NULL : (PHANDLER_ROUTINE)ctrl_c_handler), TRUE);
+#endif // WINDOWS
+#if defined(LINUX)
+  struct sigaction sig;
+  sig.sa_handler = (ingore_sig ? SIG_IGN : ctrl_c_handler);
+  sigemptyset(&sig.sa_mask);
+  sig.sa_flags = 0;
+  sigaction(SIGINT, &sig, NULL);
+#endif // LINUX
+
+  return 0;
+}
+
+// Function to return the number of boards installed in the system
+unsigned int get_offline_num_boards() {
+  unsigned int num_boards = 0;
+
+  // These are for reading/parsing the environment variable
+  const char *override_count_string = 0;
+  long parsed_count;
+  char *endptr;
+
+// Windows MMD will try to open all the devices
+#if defined(WINDOWS)
+  fpga_result result;
+  fpga_properties filter = NULL;
+
+  // Any OPAE enumeration failure falls back to reporting ACL_MAX_DEVICE boards.
+  result = fpgaGetProperties(NULL, &filter);
+  if (result != FPGA_OK) {
+    num_boards = ACL_MAX_DEVICE;
+    ACL_PCIE_ERROR_IF(1, goto End, "failed to get properties.\n");
+  }
+
+  result = fpgaPropertiesSetObjectType(filter, FPGA_DEVICE);
+  if (result != FPGA_OK) {
+    num_boards = ACL_MAX_DEVICE;
+
+    if (filter != NULL) fpgaDestroyProperties(&filter);
+
+    ACL_PCIE_ERROR_IF(1, goto End, "failed to set object type.\n");
+  }
+
+  result = fpgaPropertiesSetVendorID(filter, ACL_PCI_INTELFPGA_VENDOR_ID);
+  if (result != FPGA_OK) {
+    num_boards = ACL_MAX_DEVICE;
+
+    if (filter != NULL) fpgaDestroyProperties(&filter);
+
+    ACL_PCIE_ERROR_IF(1, goto End, "failed to set vendor ID.\n");
+  }
+
+  result = fpgaEnumerate(&filter, 1, NULL, 1, &num_boards);
+  if (result != FPGA_OK) {
+    num_boards = ACL_MAX_DEVICE;
+
+    if (filter != NULL) fpgaDestroyProperties(&filter);
+
+    ACL_PCIE_ERROR_IF(1, goto End, "failed to scan for the PCI device.\n");
+  }
+
+  if (filter != NULL) fpgaDestroyProperties(&filter);
+
+  if (num_boards == 0) {
+    num_boards = ACL_MAX_DEVICE;
+  }
+
+End:
+#endif // WINDOWS
+
+// Linux MMD will look into the number of devices
+#if defined(LINUX)
+  FILE *fp;
+  char str_line_in[BUF_SIZE];
+  char str_board_pkg_name[BUF_SIZE];
+  char str_cmd[SYSTEM_CMD_SIZE];
+
+  snprintf(str_board_pkg_name, sizeof(str_board_pkg_name), "acl%s", ACL_BOARD_PKG_NAME);
+  snprintf(str_cmd, sizeof(str_cmd), "ls /sys/class/aclpci_%s 2>/dev/null", ACL_BOARD_PKG_NAME);
+
+#ifndef DLA_MMD
+  ACL_PCIE_ASSERT(system_cmd_is_valid(str_cmd), "Invalid popen() function parameter: %s\n", str_cmd);
+#endif
+  fp = popen(str_cmd, "r");
+
+  if (fp == NULL) {
+    ACL_PCIE_INFO("Couldn't open pipe stream\n");
+    // NOTE(review): "false" converts to 0u here (function returns unsigned int),
+    // i.e. "no boards found"; "return 0;" would state that more clearly.
+    return false;
+  }
+  // Read every line from output
+  while (fgets(str_line_in, BUF_SIZE, fp) != NULL) {
+    // Count sysfs entries whose name starts with "acl<ACL_BOARD_PKG_NAME>".
+    if (strncmp(str_board_pkg_name, str_line_in, strnlen(str_board_pkg_name, MAX_NAME_SIZE)) == 0) {
+      num_boards++;
+    }
+  }
+
+  pclose(fp);
+
+#endif // LINUX
+
+  // CL_OVERRIDE_NUM_DEVICES_INTELFPGA overrides the detected count when it
+  // parses cleanly as an integer in [0, ACL_MAX_DEVICE).
+  override_count_string = getenv("CL_OVERRIDE_NUM_DEVICES_INTELFPGA");
+  if (override_count_string) {
+    endptr = 0;
+    parsed_count = strtol(override_count_string, &endptr, 10);
+    if (endptr == override_count_string  // no valid characters
+        || *endptr                       // an invalid character
+        || (parsed_count < 0 || parsed_count >= (long)ACL_MAX_DEVICE)) {
+      // malformed override string, do nothing
+    } else {
+      // Was ok.
+      num_boards = (unsigned int)parsed_count;
+    }
+  }
+
+  return num_boards;
+}
+
+// Get information about the board using the enum aocl_mmd_offline_info_t for
+// offline info (called without a handle), and the enum aocl_mmd_info_t for
+// info specific to a certain board.
+// These macros expand inside functions that declare param_value / param_value_size /
+// param_size_ret; each writes the result and optionally reports its size.
+#define RESULT_INT(X) \
+  { \
+    *((int *)param_value) = X; \
+    if (param_size_ret) *param_size_ret = sizeof(int); \
+  }
+#define RESULT_UNSIGNED(X) \
+  { \
+    *((unsigned *)param_value) = X; \
+    if (param_size_ret) *param_size_ret = sizeof(unsigned); \
+  }
+#define RESULT_SIZE_T(X) \
+  { \
+    *((size_t *)param_value) = X; \
+    if (param_size_ret) *param_size_ret = sizeof(size_t); \
+  }
+#if defined(WINDOWS)
+#define RESULT_STR(X) \
+  do { \
+    size_t Xlen = strnlen(X, MAX_NAME_SIZE) + 1; \
+    memcpy_s((void *)param_value, param_value_size, X, (param_value_size <= Xlen) ?
param_value_size : Xlen); \
+    if (param_size_ret) *param_size_ret = Xlen; \
+  } while (0)
+#else
+// Linux variant: plain memcpy, clamped to min(param_value_size, Xlen);
+// *param_size_ret still reports the full string length including NUL.
+#define RESULT_STR(X) \
+  do { \
+    size_t Xlen = strnlen(X, MAX_NAME_SIZE) + 1; \
+    memcpy((void *)param_value, X, (param_value_size <= Xlen) ? param_value_size : Xlen); \
+    if (param_size_ret) *param_size_ret = Xlen; \
+  } while (0)
+#endif
+// Handle-less queries (version, board count/names, vendor).
+// NOTE(review): no default case — an unknown requested_info_id leaves
+// param_value untouched and still returns 0 (success).
+int aocl_mmd_get_offline_info(aocl_mmd_offline_info_t requested_info_id,
+                              size_t param_value_size,
+                              void *param_value,
+                              size_t *param_size_ret) {
+  // It might be helpful to cache the info if function aocl_mmd_get_offline_info is called frequently.
+  unsigned int num_boards;
+  switch (requested_info_id) {
+    case AOCL_MMD_VERSION:
+      RESULT_STR(MMD_VERSION);
+      break;
+    case AOCL_MMD_NUM_BOARDS: {
+      num_boards = get_offline_num_boards();
+      RESULT_INT((int)num_boards);
+      break;
+    }
+    case AOCL_MMD_BOARD_NAMES: {
+      // Construct a list of all possible devices supported by this MMD layer
+      std::ostringstream boards;
+      num_boards = get_offline_num_boards();
+      for (unsigned i = 0; i < num_boards; i++) {
+        boards << "acl" << ACL_BOARD_PKG_NAME << i;
+        if (i < num_boards - 1) boards << ";";
+      }
+      RESULT_STR(boards.str().c_str());
+      break;
+    }
+    case AOCL_MMD_VENDOR_NAME: {
+      RESULT_STR(ACL_VENDOR_NAME);
+      break;
+    }
+    case AOCL_MMD_VENDOR_ID:
+      RESULT_INT(ACL_PCI_INTELFPGA_VENDOR_ID);
+      break;
+    case AOCL_MMD_USES_YIELD:
+      RESULT_INT(0);
+      break;
+    case AOCL_MMD_MEM_TYPES_SUPPORTED:
+      RESULT_INT(AOCL_MMD_PHYSICAL_MEMORY);
+      break;
+  }
+  return 0;
+}
+
+// Per-device queries; requires the handle's device to be initialized.
+int aocl_mmd_get_info(
+    int handle, aocl_mmd_info_t requested_info_id, size_t param_value_size, void *param_value, size_t *param_size_ret) {
+  ACL_PCIE_DEVICE *pcie_dev = DeviceMapManager::get_pcie_device(handle);
+  ACL_PCIE_ERROR_IF(!pcie_dev->is_initialized(),
+                    return -1,
+                    "aocl_mmd_get_info failed due to the target device (handle %d) is not properly initialized.\n",
+                    handle);
+
+  switch (requested_info_id) {
+    case AOCL_MMD_BOARD_NAME: {
+      std::ostringstream board_name;
+      board_name << ACL_BOARD_NAME << " (" << DeviceMapManager::get_pcie_device_info(handle).first << ")";
+      RESULT_STR(board_name.str().c_str());
+      break;
+    }
+    case AOCL_MMD_NUM_KERNEL_INTERFACES:
+      RESULT_INT(1);
+      break;
+    case AOCL_MMD_KERNEL_INTERFACES:
+      RESULT_INT(AOCL_MMD_KERNEL);
+      break;
+    case AOCL_MMD_PLL_INTERFACES:
+      RESULT_INT(AOCL_MMD_PLL);
+      break;
+    case AOCL_MMD_MEMORY_INTERFACE:
+      RESULT_INT(AOCL_MMD_MEMORY);
+      break;
+    case AOCL_MMD_PCIE_INFO:
+      RESULT_STR(pcie_dev->get_dev_pcie_info());
+      break;
+    case AOCL_MMD_CONCURRENT_READS:
+      RESULT_INT(1);
+      break;
+    case AOCL_MMD_CONCURRENT_WRITES:
+      RESULT_INT(1);
+      break;
+    case AOCL_MMD_CONCURRENT_READS_OR_WRITES:
+      RESULT_INT(1);
+      break;
+    case AOCL_MMD_MIN_HOST_MEMORY_ALIGNMENT:
+      RESULT_SIZE_T(0);
+      break;
+    case AOCL_MMD_HOST_MEM_CAPABILITIES:
+      RESULT_UNSIGNED(0);
+      break;
+    case AOCL_MMD_SHARED_MEM_CAPABILITIES:
+      RESULT_UNSIGNED(0);
+      break;
+    case AOCL_MMD_DEVICE_MEM_CAPABILITIES:
+      RESULT_UNSIGNED(0);
+      break;
+    case AOCL_MMD_HOST_MEM_CONCURRENT_GRANULARITY:
+      RESULT_SIZE_T(0);
+      break;
+    case AOCL_MMD_SHARED_MEM_CONCURRENT_GRANULARITY:
+      RESULT_SIZE_T(0);
+      break;
+    case AOCL_MMD_DEVICE_MEM_CONCURRENT_GRANULARITY:
+      RESULT_SIZE_T(0);
+      break;
+
+    case AOCL_MMD_TEMPERATURE: {
+      // NOTE(review): writes a float through param_value without checking
+      // param_value_size, unlike the RESULT_* macros above.
+      float *r;
+      int temp;
+      pcie_dev->get_ondie_temp_slow_call(&temp);
+      r = (float *)param_value;
+      *r = ACL_PCIE_TEMP_FORMULA;
+      if (param_size_ret) *param_size_ret = sizeof(float);
+      break;
+    }
+
+    // currently not supported
+    case AOCL_MMD_BOARD_UNIQUE_ID:
+      return -1;
+  }
+  return 0;
+}
+
+// NOTE(review): only two of the four helper macros are undefined here;
+// RESULT_UNSIGNED and RESULT_SIZE_T stay visible for the rest of the file.
+#undef RESULT_INT
+#undef RESULT_STR
+
+// Open and initialize the named device.
+// Parses "acl<ACL_BOARD_PKG_NAME><N>" out of name, allocates a fresh positive
+// unique_id as the MMD handle, installs signal handlers once per process, and
+// constructs the ACL_PCIE_DEVICE. Returns the handle on success, ~handle
+// (negative) when the device opened but failed its initial test, -1 on error.
+int AOCL_MMD_CALL aocl_mmd_open(const char *name) {
+  static int signal_handler_installed = 0;
+  static int unique_id = 0;
+  int dev_num = -1;
+  static int user_signal_number = -1;
+#if defined(LINUX)
+  static pthread_mutex_t linux_signal_arb_mutex =
+      PTHREAD_MUTEX_INITIALIZER;  // initializes as unlocked, static = no cleanup needed
+
+  if (sscanf(name, "acl" ACL_BOARD_PKG_NAME "%d", &dev_num) != 1) {
+    return -1;
+  }
+#endif // LINUX
+
+#if defined(WINDOWS)
+  if (sscanf_s(name, "acl" ACL_BOARD_PKG_NAME "%d", &dev_num) != 1) {
+    return -1;
+  }
+#endif
+  if (dev_num < 0 || dev_num >= ACL_MAX_DEVICE) {
+    return -1;
+  }
+  // Handles are strictly positive; wrap back to 1 on signed overflow.
+  if (++unique_id <= 0) {
+    unique_id = 1;
+  }
+
+  ACL_PCIE_ASSERT(DeviceMapManager::empty() || DeviceMapManager::get_device_map().count(unique_id) == 0,
+                  "unique_id %d is used before.\n",
+                  unique_id);
+
+  // One-time, process-wide signal handler installation.
+  if (signal_handler_installed == 0) {
+#if defined(LINUX)
+    user_signal_number = allocate_and_register_linux_signal_number(&linux_signal_arb_mutex);
+    if (user_signal_number == -1) return -1;
+#endif // LINUX
+
+    install_ctrl_c_handler(0 /* use the custom signal handler */);
+    signal_handler_installed = 1;
+  }
+
+  ACL_PCIE_DEVICE *pcie_dev = NULL;
+
+  try {
+    pcie_dev = new ACL_PCIE_DEVICE(dev_num, name, unique_id, user_signal_number);
+  }
+
+  // Catch any memory allocation failures
+  // (pcie_dev is still NULL if new itself threw, so this delete is a no-op then)
+  catch (std::bad_alloc &) {
+    delete pcie_dev;
+    return -1;
+  }
+
+  if (!pcie_dev->is_valid()) {
+    delete pcie_dev;
+    return -1;
+  }
+
+  DeviceMapManager::add_pcie_device_handle(unique_id, name, pcie_dev);
+  if (pcie_dev->is_initialized()) {
+    return unique_id;
+  } else {
+    // Perform a bitwise-not operation to the unique_id if the device
+    // do not pass the initial test. This negative unique_id indicates
+    // a fail to open the device, but still provide actual the unique_id
+    // to allow reprogram executable to get access to the device and
+    // reprogram the board when the board is not usable.
+    return ~unique_id;
+  }
+}
+
+// Close an opened device, by its handle.
+int AOCL_MMD_CALL aocl_mmd_close(int handle) {
+  DeviceMapManager::discard_pcie_device_handle(handle);
+
+  return 0;
+}
+
+// Set the interrupt handler for the opened device.
+int AOCL_MMD_CALL aocl_mmd_set_interrupt_handler(int handle, aocl_mmd_interrupt_handler_fn fn, void *user_data) {
+  ACL_PCIE_DEVICE *pcie_dev = DeviceMapManager::get_pcie_device(handle);
+  ACL_PCIE_ERROR_IF(
+      !pcie_dev->is_initialized(),
+      return -1,
+      "aocl_mmd_set_interrupt_handler failed due to the target device (handle %d) is not properly initialized.\n",
+      handle);
+
+  return pcie_dev->set_kernel_interrupt(fn, user_data);
+}
+
+// Set the device interrupt handler for the opened device.
+// NOTE(review): the error message below says "aocl_mmd_set_interrupt_handler"
+// (copy-paste from the function above); left unchanged to avoid altering a
+// runtime string, but the text is misleading in logs.
+int AOCL_MMD_CALL aocl_mmd_set_device_interrupt_handler(int handle,
+                                                        aocl_mmd_device_interrupt_handler_fn fn,
+                                                        void *user_data) {
+  ACL_PCIE_DEVICE *pcie_dev = DeviceMapManager::get_pcie_device(handle);
+  ACL_PCIE_ERROR_IF(
+      !pcie_dev->is_initialized(),
+      return -1,
+      "aocl_mmd_set_interrupt_handler failed due to the target device (handle %d) is not properly initialized.\n",
+      handle);
+
+  return pcie_dev->set_device_interrupt(fn, user_data);
+}
+
+// Set the operation status handler for the opened device.
+int AOCL_MMD_CALL aocl_mmd_set_status_handler(int handle, aocl_mmd_status_handler_fn fn, void *user_data) {
+  ACL_PCIE_DEVICE *pcie_dev = DeviceMapManager::get_pcie_device(handle);
+  ACL_PCIE_ERROR_IF(
+      !pcie_dev->is_initialized(),
+      return -1,
+      "aocl_mmd_set_status_handler failed due to the target device (handle %d) is not properly initialized.\n",
+      handle);
+
+  return pcie_dev->set_status_handler(fn, user_data);
+}
+
+// Called when the host is idle and hence possibly waiting for events to be
+// processed by the device
+int AOCL_MMD_CALL aocl_mmd_yield(int handle) { return DeviceMapManager::get_pcie_device(handle)->yield(); }
+
+// Read, write and copy operations on a single interface.
+// Read len bytes from device offset into dst on the given interface.
+int AOCL_MMD_CALL aocl_mmd_read(int handle, aocl_mmd_op_t op, size_t len, void *dst, int mmd_interface, size_t offset) {
+  void *host_addr = dst;
+  size_t dev_addr = offset;
+
+  ACL_PCIE_DEVICE *pcie_dev = DeviceMapManager::get_pcie_device(handle);
+  ACL_PCIE_ERROR_IF(!pcie_dev->is_initialized(),
+                    return -1,
+                    "aocl_mmd_read failed due to the target device (handle %d) is not properly initialized.\n",
+                    handle);
+
+  return pcie_dev->read_block(op, (aocl_mmd_interface_t)mmd_interface, host_addr, dev_addr, len);
+}
+
+// Write len bytes from src to device offset on the given interface.
+int AOCL_MMD_CALL
+aocl_mmd_write(int handle, aocl_mmd_op_t op, size_t len, const void *src, int mmd_interface, size_t offset) {
+  // const_cast only to satisfy write_block's non-const signature; src is not modified here.
+  void *host_addr = const_cast<void *>(src);
+  size_t dev_addr = offset;
+
+  ACL_PCIE_DEVICE *pcie_dev = DeviceMapManager::get_pcie_device(handle);
+  ACL_PCIE_ERROR_IF(!pcie_dev->is_initialized(),
+                    return -1,
+                    "aocl_mmd_write failed due to the target device (handle %d) is not properly initialized.\n",
+                    handle);
+
+  return pcie_dev->write_block(op, (aocl_mmd_interface_t)mmd_interface, host_addr, dev_addr, len);
+}
+
+// Device-to-device copy of len bytes within one interface.
+int AOCL_MMD_CALL
+aocl_mmd_copy(int handle, aocl_mmd_op_t op, size_t len, int mmd_interface, size_t src_offset, size_t dst_offset) {
+  ACL_PCIE_DEVICE *pcie_dev = DeviceMapManager::get_pcie_device(handle);
+  ACL_PCIE_ERROR_IF(!pcie_dev->is_initialized(),
+                    return -1,
+                    "aocl_mmd_copy failed due to the target device (handle %d) is not properly initialized.\n",
+                    handle);
+
+  return pcie_dev->copy_block(op, (aocl_mmd_interface_t)mmd_interface, src_offset, dst_offset, len);
+}
+
+// Initialize host channel specified in channel_name
+int AOCL_MMD_CALL aocl_mmd_hostchannel_create(int handle, char *channel_name, size_t queue_depth, int direction) {
+  ACL_PCIE_DEVICE *pcie_dev = DeviceMapManager::get_pcie_device(handle);
+  ACL_PCIE_ERROR_IF(
+      !pcie_dev->is_initialized(),
+      return -1,
+      "aocl_mmd_create_hostchannel failed due to the target device (handle %d) is not properly initialized.\n",
+      handle);
+
+  return pcie_dev->create_hostchannel(channel_name, queue_depth, direction);
+}
+
+// reset the host channel specified with channel handle
+int AOCL_MMD_CALL aocl_mmd_hostchannel_destroy(int handle, int channel) {
+  ACL_PCIE_DEVICE *pcie_dev = DeviceMapManager::get_pcie_device(handle);
+  ACL_PCIE_ERROR_IF(
+      !pcie_dev->is_initialized(),
+      return -1,
+      "aocl_mmd_create_hostchannel failed due to the target device (handle %d) is not properly initialized.\n",
+      handle);
+
+  return pcie_dev->destroy_channel(channel);
+}
+
+// Get the pointer to buffer the user can write/read from the kernel with
+// NOTE(review): the error message below says "aocl_mmd_read" — copy-paste;
+// left unchanged to avoid altering a runtime string.
+AOCL_MMD_CALL void *aocl_mmd_hostchannel_get_buffer(int handle, int channel, size_t *buffer_size, int *status) {
+  ACL_PCIE_DEVICE *pcie_dev = DeviceMapManager::get_pcie_device(handle);
+  ACL_PCIE_ERROR_IF(!pcie_dev->is_initialized(),
+                    return NULL,
+                    "aocl_mmd_read failed due to the target device (handle %d) is not properly initialized.\n",
+                    handle);
+
+  return pcie_dev->hostchannel_get_buffer(buffer_size, channel, status);
+}
+
+// Acknowledge from the user that they have written/read send_size amount of buffer obtained from get_buffer
+// On an uninitialized device: sets *status to -1 and returns 0 bytes acknowledged.
+size_t AOCL_MMD_CALL aocl_mmd_hostchannel_ack_buffer(int handle, int channel, size_t send_size, int *status) {
+  ACL_PCIE_DEVICE *pcie_dev = DeviceMapManager::get_pcie_device(handle);
+  ACL_PCIE_ERROR_IF(
+      !pcie_dev->is_initialized(), *status = -1;
+      return 0, "aocl_mmd_read failed due to the target device (handle %d) is not properly initialized.\n", handle);
+
+  return pcie_dev->hostchannel_ack_buffer(send_size, channel, status);
+}
+
+#ifdef DLA_MMD
+
+// Pause the device and save PCIe state; 0 on success, -1 on failure.
+AOCL_MMD_CALL int aocl_mmd_save_pcie(int handle)
+{
+  auto ret = DeviceMapManager::get_pcie_device(handle)->pause_and_save_pcie();
+  if (ret) {
+    return -1;
+  }
+  return 0;
+}
+// Restore previously saved PCIe state and resume; 0 on success, -1 on failure.
+AOCL_MMD_CALL int aocl_mmd_restore_pcie(int handle)
+{
+  auto ret = DeviceMapManager::get_pcie_device(handle)->restore_and_resume_pcie();
+  if (ret) {
+    return -1;
+  }
+  return 0;
+}
+// Reprogram the device given the sof file
name
+int AOCL_MMD_CALL aocl_mmd_program_sof(int handle, const char *sof_filename, const bool skipSaveRestore) {
+  if (DeviceMapManager::get_pcie_device(handle)->reprogram_sof(sof_filename, skipSaveRestore))
+  {
+    return -1;
+  }
+  return 0;
+}
+#else
+// Reprogram the device based on the program mode
+int AOCL_MMD_CALL aocl_mmd_program(int handle, void *data, size_t data_size, aocl_mmd_program_mode_t program_mode) {
+  // assuming an ELF-formatted blob.
+  if (!blob_has_elf_signature(data, data_size)) {
+    ACL_PCIE_DEBUG_MSG("ad hoc fpga bin\n");
+    return -1;
+  }
+
+  // program the device based on the certain mode
+  if (program_mode & AOCL_MMD_PROGRAM_PRESERVE_GLOBAL_MEM) {
+    // Partial reconfiguration path: device memory survives, same handle is returned.
+    if (DeviceMapManager::get_pcie_device(handle)->reprogram(data, data_size, ACL_PCIE_PROGRAM_PR)) return -1;
+    return handle;
+  } else {
+    // Full JTAG reprogram: the old handle is discarded and a new one is
+    // returned by re-opening the device.
+    if (DeviceMapManager::get_pcie_device(handle)->reprogram(data, data_size, ACL_PCIE_PROGRAM_JTAG)) return -1;
+    // Re-open the device to reinitialize hardware
+    const std::string device_name = DeviceMapManager::get_pcie_device_info(handle).first;
+    DeviceMapManager::discard_pcie_device_handle(handle);
+
+    return aocl_mmd_open(device_name.c_str());
+  }
+}
+#endif
+// Shared memory allocator
+AOCL_MMD_CALL void *aocl_mmd_shared_mem_alloc(int handle, size_t size, unsigned long long *device_ptr_out) {
+  return DeviceMapManager::get_pcie_device(handle)->shared_mem_alloc(size, device_ptr_out);
+}
+
+// Shared memory de-allocator
+AOCL_MMD_CALL void aocl_mmd_shared_mem_free(int handle, void *host_ptr, size_t size) {
+  DeviceMapManager::get_pcie_device(handle)->shared_mem_free(host_ptr, size);
+}
+
+#ifndef DLA_MMD
+// This function checks if the input data has an ELF-formatted blob.
+// Return true when it does.
+// NOTE(review): requires data_size > 4 (at least 5 bytes); a buffer holding
+// exactly the 4-byte magic is rejected — presumably fine since a real ELF
+// blob is always larger, but confirm if exact-size inputs are possible.
+static bool blob_has_elf_signature(void *data, size_t data_size) {
+  bool result = false;
+  if (data && data_size > 4) {
+    unsigned char *cdata = (unsigned char *)data;
+    const unsigned char elf_signature[4] = {0177, 'E', 'L', 'F'};  // Little endian
+    result = (cdata[0] == elf_signature[0]) && (cdata[1] == elf_signature[1]) && (cdata[2] == elf_signature[2]) &&
+             (cdata[3] == elf_signature[3]);
+  }
+  return result;
+}
+#endif
+
+// Return a positive number when single device open. Otherwise, return -1
+AOCL_MMD_CALL int get_open_handle() {
+  if (DeviceMapManager::empty() || DeviceMapManager::get_device_map().size() != 1) {
+    return -1;
+  }
+  return DeviceMapManager::get_device_map().begin()->first;
+}
+
+AOCL_MMD_CALL void *aocl_mmd_host_alloc(int *handles,
+                                        size_t num_devices,
+                                        size_t size,
+                                        size_t alignment,
+                                        aocl_mmd_mem_properties_t *properties,
+                                        int *error) {
+  // Not supported on this BSP
+  return NULL;
+}
+
+AOCL_MMD_CALL int aocl_mmd_free(void *mem) {
+  // Not supported on this BSP
+  return 0;
+}
+
+AOCL_MMD_CALL void *aocl_mmd_device_alloc(
+    int handle, size_t size, size_t alignment, aocl_mmd_mem_properties_t *properties, int *error) {
+  // Not supported on this BSP
+  return NULL;
+}
+
+AOCL_MMD_CALL void *aocl_mmd_shared_alloc(
+    int handle, size_t size, size_t alignment, aocl_mmd_mem_properties_t *properties, int *error) {
+  // Not supported on this BSP
+  return NULL;
+}
+
+AOCL_MMD_CALL int aocl_mmd_shared_migrate(int handle, void *shared_ptr, size_t size, aocl_mmd_migrate_t destination) {
+  // Not supported on this BSP
+  return 0;
+}
+
+#ifdef DLA_MMD
+// Query functions to get board-specific values
+AOCL_MMD_CALL int dla_mmd_get_max_num_instances() { return 4; }
+AOCL_MMD_CALL uint64_t dla_mmd_get_ddr_size_per_instance() { return 1ULL << 32; }
+AOCL_MMD_CALL double dla_mmd_get_ddr_clock_freq() { return 333.333333; }  // MHz
+
+// Helper functions for the wrapper functions around CSR and DDR
+uint64_t dla_get_raw_csr_address(int instance, uint64_t
addr) { return 0x38000 + (0x1000 * instance) + addr; }
+// NOTE(review): DDR instances are spaced (1ULL << 33) = 8 GB apart while
+// dla_mmd_get_ddr_size_per_instance() reports 1ULL << 32 = 4 GB — presumably
+// an intentional sparse mapping; confirm against the board's address map.
+uint64_t dla_get_raw_ddr_address(int instance, uint64_t addr) { return (1ULL << 33) * instance + addr; }
+
+// Wrappers around CSR and DDR reads and writes to abstract away board-specific offsets
+AOCL_MMD_CALL int dla_mmd_csr_write(int handle, int instance, uint64_t addr, const uint32_t *data) {
+  return aocl_mmd_write(
+      handle, NULL, sizeof(uint32_t), data, ACL_MMD_KERNEL_HANDLE, dla_get_raw_csr_address(instance, addr));
+}
+AOCL_MMD_CALL int dla_mmd_csr_read(int handle, int instance, uint64_t addr, uint32_t *data) {
+  return aocl_mmd_read(
+      handle, NULL, sizeof(uint32_t), data, ACL_MMD_KERNEL_HANDLE, dla_get_raw_csr_address(instance, addr));
+}
+AOCL_MMD_CALL int dla_mmd_ddr_write(int handle, int instance, uint64_t addr, uint64_t length, const void *data) {
+  return aocl_mmd_write(handle, NULL, length, data, ACL_MMD_MEMORY_HANDLE, dla_get_raw_ddr_address(instance, addr));
+}
+AOCL_MMD_CALL int dla_mmd_ddr_read(int handle, int instance, uint64_t addr, uint64_t length, void *data) {
+  return aocl_mmd_read(handle, NULL, length, data, ACL_MMD_MEMORY_HANDLE, dla_get_raw_ddr_address(instance, addr));
+}
+
+// Get the PLL clock frequency in MHz, returns a negative value if there is an error
+// Method: start a hardware counter on clk_dla, sleep ~10 ms, stop it, then
+// divide the counted ticks by the wall time measured on the host's
+// high-resolution clock.
+AOCL_MMD_CALL double dla_mmd_get_coredla_clock_freq(int handle) {
+  constexpr uint64_t hw_timer_address = 0x37000;
+  const uint32_t start_bit = 1;
+  const uint32_t stop_bit = 2;
+
+  // Send the start command to the hardware counter
+  std::chrono::high_resolution_clock::time_point time_before = std::chrono::high_resolution_clock::now();
+  int status = aocl_mmd_write(handle, NULL, sizeof(uint32_t), &start_bit, ACL_MMD_KERNEL_HANDLE, hw_timer_address);
+  assert(status == 0);
+
+  // Unlikely to sleep for exactly 10 milliseconds, but it doesn't matter since we use a high resolution clock to
+  // determine the amount of time between the start and stop commands for the hardware counter
+  std::this_thread::sleep_for(std::chrono::milliseconds(10));
+
+  // Send the stop command to the hardware counter
+  std::chrono::high_resolution_clock::time_point time_after = std::chrono::high_resolution_clock::now();
+  status = aocl_mmd_write(handle, NULL, sizeof(uint32_t), &stop_bit, ACL_MMD_KERNEL_HANDLE, hw_timer_address);
+  assert(status == 0);
+
+  // Read back the value of the counter
+  uint32_t counter = 0;
+  status = aocl_mmd_read(handle, NULL, sizeof(uint32_t), &counter, ACL_MMD_KERNEL_HANDLE, hw_timer_address);
+  assert(status == 0);
+
+  // Calculate the clock frequency of the counter, which is running on clk_dla
+  double elapsed_seconds = std::chrono::duration_cast<std::chrono::duration<double>>(time_after - time_before).count();
+  return 1.0e-6 * counter / elapsed_seconds;  // 1.0e-6 is to convert to MHz
+}
+
+#endif
diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie.h
new file mode 100644
index 0000000..cfba6a3
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie.h
@@ -0,0 +1,177 @@
+#ifndef ACL_PCIE_H
+#define ACL_PCIE_H
+
+/* (c) 1992-2021 Intel Corporation. */
+/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */
+/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */
+/* and/or other countries. Other marks and brands may be claimed as the property */
+/* of others.
See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. */ + +/* ===- acl_pcie.h --------------------------------------------------- C++ -*-=== */ +/* */ +/* Intel(R) OpenCL MMD Driver */ +/* */ +/* ===-------------------------------------------------------------------------=== */ +/* */ +/* This file defines macros and types that are used inside the MMD driver */ +/* */ +/* ===-------------------------------------------------------------------------=== */ + +#ifndef ACL_PCIE_EXPORT +#define ACL_PCIE_EXPORT __declspec(dllimport) +#endif + +#include <assert.h> +#include <stddef.h> +#include <stdio.h> +#ifdef DLA_MMD +#include <cstdint> +#else +#include <CL/cl_platform.h> +#endif +#include "aocl_mmd.h" +#include "hw_pcie_constants.h" + +#define MMD_VERSION AOCL_MMD_VERSION_STRING + +#ifdef DLA_MMD +#include "version.h" +#else +#include <version.h> +#endif + +#define KERNEL_DRIVER_VERSION_EXPECTED ACL_DRIVER_VERSION + +#if defined(_WIN32) || defined(_WIN64) +// Need DWORD, UINT32, etc. +// But windows.h spits out a lot of spurious warnings. 
+#pragma warning(push) +#pragma warning(disable : 4668) +#include <windows.h> +#pragma warning(pop) + +// OPAE header files +#include <initguid.h> +#include <opae/fpga.h> +#include "fpga_cmd_guids.h" + +#define INVALID_DEVICE (NULL) + +// define for the format string for DWORD type +#define DWORD_FMT_U "%lu" +#define DWORD_FMT_X "%lx" +#define DWORD_FMT_4X "%04lX" + +// define for the format string for size_t type +#ifdef _WIN64 +#define SIZE_FMT_U "%zu" +#define SIZE_FMT_X "%zx" +#else +#define SIZE_FMT_U "%Iu" +#define SIZE_FMT_X "%Ix" +#endif + +typedef ULONG64 KPTR; +typedef UINT64 DMA_ADDR; +#endif // WINDOWS + +#if defined(LINUX) +typedef uintptr_t KPTR; +typedef int fpga_handle; +typedef unsigned int fpga_result; +#define FPGA_OK 0 + +typedef unsigned int DWORD; +typedef unsigned long long QWORD; +typedef char INT8; +typedef unsigned char UINT8; +typedef int16_t INT16; +typedef uint16_t UINT16; +typedef int INT32; +typedef unsigned int UINT32; +typedef long long INT64; +typedef unsigned long long UINT64; + +#define INVALID_HANDLE_VALUE ((int)(-1)) + +// Linux driver-specific exports +#include "pcie_linux_driver_exports.h" + +#define INVALID_DEVICE (-1) +#define WD_STATUS_SUCCESS 0 + +// define for the format string for DWORD type +#define DWORD_FMT_U "%u" +#define DWORD_FMT_X "%x" +#define DWORD_FMT_4X "%04X" + +// define for the format string for size_t type +#define SIZE_FMT_U "%zu" +#define SIZE_FMT_X "%zx" + +#endif // LINUX + +#define MAX_NAME_SIZE (1204) + +typedef enum { + AOCL_MMD_KERNEL = ACL_MMD_KERNEL_HANDLE, // Control interface into kernel interface + AOCL_MMD_MEMORY = ACL_MMD_MEMORY_HANDLE, // Data interface to device memory + AOCL_MMD_PLL = ACL_MMD_PLL_HANDLE, // Interface for reconfigurable PLL + AOCL_MMD_HOSTCH = ACL_MMD_HOSTCH_HANDLE +} aocl_mmd_interface_t; + +// Describes the properties of key components in a standard ACL device +#define PCIE_INFO_STR_LEN 1024 +#define PCIE_SLOT_INFO_STR_LEN 128 + +struct ACL_PCIE_DEVICE_DESCRIPTION { + 
DWORD vendor_id; + DWORD device_id; + char pcie_slot_info_str[PCIE_SLOT_INFO_STR_LEN]; + char pcie_info_str[PCIE_INFO_STR_LEN]; + bool interrupt_valid; + UINT32 interrupt_data; + UINT64 interrupt_addr; +}; + +#define ACL_PCIE_ASSERT(COND, ...) \ + do { \ + if (!(COND)) { \ + printf("\nMMD FATAL: %s:%d: ", __FILE__, __LINE__); \ + printf(__VA_ARGS__); \ + fflush(stdout); \ + assert(0); \ + } \ + } while (0) + +#define ACL_PCIE_ERROR_IF(COND, NEXT, ...) \ + do { \ + if (COND) { \ + printf("\nMMD ERROR: " __VA_ARGS__); \ + fflush(stdout); \ + NEXT; \ + } \ + } while (0) + +#define ACL_PCIE_INFO(...) \ + do { \ + printf("MMD INFO : " __VA_ARGS__); \ + fflush(stdout); \ + } while (0) + +// Define the flag of program +#define ACL_PCIE_PROGRAM_PR 1 +#define ACL_PCIE_PROGRAM_JTAG 0 + +#endif // ACL_PCIE_H diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_config.cpp b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_config.cpp new file mode 100644 index 0000000..03c76dd --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_config.cpp @@ -0,0 +1,1049 @@ +// (c) 1992-2021 Intel Corporation. +// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words +// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. +// and/or other countries. Other marks and brands may be claimed as the property +// of others. 
See Trademarks on intel.com for full list of Intel trademarks or +// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) +// Your use of Intel Corporation's design tools, logic functions and other +// software and tools, and its AMPP partner logic functions, and any output +// files any of the foregoing (including device programming or simulation +// files), and any associated documentation or information are expressly subject +// to the terms and conditions of the Altera Program License Subscription +// Agreement, Intel MegaCore Function License Agreement, or other applicable +// license agreement, including, without limitation, that your use is for the +// sole purpose of programming logic devices manufactured by Intel and sold by +// Intel or its authorized distributors. Please refer to the applicable +// agreement for further details. + +/* ===- acl_pcie_config.cpp ------------------------------------------ C++ -*-=== */ +/* */ +/* Intel(R) OpenCL MMD Driver */ +/* */ +/* ===-------------------------------------------------------------------------=== */ +/* */ +/* This file implements the class to handle functions that program the FPGA. */ +/* The declaration of the class lives in the acl_pcie_config.h. 
*/ +/* */ +/* ===-------------------------------------------------------------------------=== */ + +// common and its own header files +#include "acl_pcie_config.h" +#include "acl_pcie.h" + +// other header files inside MMD driver +#include "acl_pcie_debug.h" +#if defined(WINDOWS) +#include "acl_pcie_dma_windows.h" +#endif // WINDOWS + +// other standard header files +#include <stdlib.h> +#include <string.h> +#include <iostream> +#include <sstream> +#if defined(WINDOWS) +#include <process.h> +#endif // WINDOWS + +#if defined(LINUX) +#include <unistd.h> +#endif // LINUX + +#if defined(WINDOWS) +#define FREEZE_STATUS_OFFSET 0 +#define FREEZE_CTRL_OFFSET 4 +#define FREEZE_VERSION_OFFSET 12 +#define FREEZE_BRIDGE_SUPPORTED_VERSION 0xad000003 + +#define FREEZE_REQ 1 +#define RESET_REQ 2 +#define UNFREEZE_REQ 4 + +#define FREEZE_REQ_DONE 1 +#define UNFREEZE_REQ_DONE 2 + +#define ALT_PR_DATA_OFST 0x00 +#define ALT_PR_CSR_OFST 0x04 +#define ALT_PR_VER_OFST 0x08 + +#define ALT_PR_CSR_PR_START 1 +#define ALT_PR_CSR_STATUS_SFT 1 +#define ALT_PR_CSR_STATUS_MSK (7 << ALT_PR_CSR_STATUS_SFT) +#define ALT_PR_CSR_STATUS_NRESET (0 << ALT_PR_CSR_STATUS_SFT) +#define ALT_PR_CSR_STATUS_BUSY (1 << ALT_PR_CSR_STATUS_SFT) +#define ALT_PR_CSR_STATUS_PR_IN_PROG (2 << ALT_PR_CSR_STATUS_SFT) +#define ALT_PR_CSR_STATUS_PR_SUCCESS (3 << ALT_PR_CSR_STATUS_SFT) +#define ALT_PR_CSR_STATUS_PR_ERR (4 << ALT_PR_CSR_STATUS_SFT) + +#define ACL_DMA_PR_ALIGNMENT_BYTES 4096 + +#define PLL_OFFSET_VERSION_ID 0x000 +#define PLL_OFFSET_ROM 0x400 +#define PLL_OFFSET_RECONFIG_CTRL_S10 0x800 +#define PLL_OFFSET_COUNTER 0x100 +#define PLL_OFFSET_RESET 0x110 +#define PLL_OFFSET_LOCK 0x120 + +#define PLL_M_HIGH_REG_S10 0x104 +#define PLL_M_LOW_REG_S10 0x107 +#define PLL_M_BYPASS_ENABLE_REG_S10 0x105 +#define PLL_M_EVEN_DUTY_ENABLE_REG_S10 0x106 + +#define PLL_N_HIGH_REG_S10 0x100 +#define PLL_N_LOW_REG_S10 0x102 +#define PLL_N_BYPASS_ENABLE_REG_S10 0x101 +#define PLL_N_EVEN_DUTY_ENABLE_REG_S10 0x101 + +#define 
PLL_C0_HIGH_REG_S10 0x11B
+#define PLL_C0_LOW_REG_S10 0x11E
+#define PLL_C0_BYPASS_ENABLE_REG_S10 0x11C
+#define PLL_C0_EVEN_DUTY_ENABLE_REG_S10 0x11D
+
+#define PLL_C1_HIGH_REG_S10 0x11F
+#define PLL_C1_LOW_REG_S10 0x122
+#define PLL_C1_BYPASS_ENABLE_REG_S10 0x120
+#define PLL_C1_EVEN_DUTY_ENABLE_REG_S10 0x121
+
+#define PLL_LF_REG_S10 0x10A
+
+#define PLL_CP1_REG_S10 0x101
+#define PLL_CP2_REG_S10 0x10D
+
+#define PLL_REQUEST_CAL_REG_S10 0x149
+#define PLL_ENABLE_CAL_REG_S10 0x14A
+#endif // WINDOWS
+
+#ifndef DLA_MMD
+#include "acl_check_sys_cmd.h"
+#include "pkg_editor.h"
+#endif
+
+// MAX size of line read from pipe-ing the output of find_jtag_cable.tcl to MMD
+#define READ_SIZE 1024
+// MAX size of command passed to system for invoking find_jtag_cable.tcl from MMD
+#define SYSTEM_CMD_SIZE 4 * 1024
+
+// Function to install the signal handler for Ctrl-C
+// Implemented inside acl_pcie.cpp
+extern int install_ctrl_c_handler(int ingore_sig);
+
+// Stores the device handle plus the IO/DMA collaborators; on Windows it also
+// queries and caches the device's supported OPAE command GUIDs.
+ACL_PCIE_CONFIG::ACL_PCIE_CONFIG(fpga_handle Handle, ACL_PCIE_MM_IO_MGR *io, ACL_PCIE_DEVICE *pcie, ACL_PCIE_DMA *dma) {
+  m_handle = Handle;
+  m_io = io;
+  m_pcie = pcie;
+  m_dma = dma;
+
+#if defined(WINDOWS)
+  fpga_result result = FPGA_OK;
+  UINT32 NumCmds = 0;
+  FpgaCmd = NULL;
+
+  // Get the number of supported commands
+  result = fpgaGetSupportedCommands(Handle, NULL, &NumCmds);
+  ACL_PCIE_ERROR_IF(result != FPGA_OK, return, "fpgaGetSupportedCommands failed in ACL_PCIE_CONFIG().\n");
+
+  // Allocate memory for the guid array based on NumCmds
+  FpgaCmd = (fpga_guid *)malloc(NumCmds * sizeof(fpga_guid));
+
+  if (FpgaCmd == NULL) {
+    throw std::bad_alloc();
+  }
+
+  // NOTE(review): unreachable — the throw above already covers FpgaCmd == NULL.
+  ACL_PCIE_ERROR_IF(FpgaCmd == NULL, return, "malloc failed in ACL_PCIE_CONFIG().\n");
+
+  // Populate the guid array
+  result = fpgaGetSupportedCommands(Handle, FpgaCmd, &NumCmds);
+  ACL_PCIE_ERROR_IF(result != FPGA_OK, return, "fpgaGetSupportedCommands failed in ACL_PCIE_CONFIG().\n");
+#endif // WINDOWS
+
+  return;
+}
+
+ACL_PCIE_CONFIG::~ACL_PCIE_CONFIG() {
+#if defined(WINDOWS)
+  // Free the guid array
+  if (FpgaCmd) {
+    free(FpgaCmd);
+    FpgaCmd = NULL;
+  }
+#endif
+}
+
+// Change the kernel region using PR only via PCIe, using an in-memory image of the core.rbf
+// For Linux, the actual implementation of PR is inside the kernel mode driver.
+// Return 0 on success.
+int ACL_PCIE_CONFIG::program_core_with_PR_file_a10(char *core_bitstream, size_t core_rbf_len) {
+  int pr_result = 1;  // set to default - failure
+
+  ACL_PCIE_ERROR_IF(core_bitstream == NULL, return 1, "core_bitstream is an NULL pointer.\n");
+  ACL_PCIE_ERROR_IF(core_rbf_len < 1000000, return 1, "size of core rbf file is suspiciously small.\n");
+
+#if defined(WINDOWS)
+  int i;
+  uint32_t version;
+  UINT32 to_send, status;
+  UINT32 *data;
+  fpga_result result;
+
+  /* Get version ID */
+  result = fpgaReadMMIO32(m_handle, ACL_VERSIONID_BAR, ACL_VERSIONID_OFFSET, &version);
+  ACL_PCIE_DEBUG_MSG(":: VERSION_ID is 0x%08X\n", (int)version);
+
+  /* Check if PR is supported */
+  if (version < (unsigned int)ACL_PR_PIO_VERSIONID) {
+    ACL_PCIE_DEBUG_MSG(":: Currently programmed image does not support PR\n");
+    pr_result = 1;
+    return pr_result;
+  }
+
+  ACL_PCIE_DEBUG_MSG(":: OK to proceed with PR!\n");
+
+  // Check the PR IP status register (offset +4) before starting.
+  MemoryBarrier();
+  result = fpgaReadMMIO32(m_handle, ACL_PRCONTROLLER_BAR, ACL_PRCONTROLLER_OFFSET + 4, &status);
+  ACL_PCIE_DEBUG_MSG(":: Reading 0x%08X from PR IP status register\n", (int)status);
+  ACL_PCIE_ASSERT(result == FPGA_OK, "fpgaReadMMIO32 failed.\n");
+
+  to_send = 0x00000001;
+  ACL_PCIE_DEBUG_MSG(":: Writing 0x%08X to PR IP status register\n", (int)to_send);
+  result = fpgaWriteMMIO32(m_handle, ACL_PRCONTROLLER_BAR, ACL_PRCONTROLLER_OFFSET + 4, to_send);
+  ACL_PCIE_ASSERT(result == FPGA_OK, "fpgaWriteMMIO32 failed.\n");
+
+  MemoryBarrier();
+  result = fpgaReadMMIO32(m_handle, ACL_PRCONTROLLER_BAR, ACL_PRCONTROLLER_OFFSET + 4, &status);
+  ACL_PCIE_ASSERT(result == FPGA_OK, "fpgaReadMMIO32 failed.\n");
+  ACL_PCIE_DEBUG_MSG(":: Reading 0x%08X from PR IP status register\n", (int)status);
+
+  if ((status != 0x10) && (status != 0x0)) {
+    ACL_PCIE_ERROR_IF(1, return 1, ":: PR IP not in an usable state.\n");
+  }
+
+  // Stream the bitstream into the PR IP data register one 32-bit word at a time.
+  data = (UINT32 *)core_bitstream;
+  ACL_PCIE_DEBUG_MSG(":: Writing %d bytes of bitstream file to PR IP at BAR %d, OFFSET 0x%08X\n",
+                     (int)core_rbf_len,
+                     (int)ACL_PRCONTROLLER_BAR,
+                     (int)ACL_PRCONTROLLER_OFFSET);
+  for (i = 0; i < (int)core_rbf_len / 4; i++) {
+    result = fpgaWriteMMIO32(m_handle, ACL_PRCONTROLLER_BAR, ACL_PRCONTROLLER_OFFSET, data[i]);
+    ACL_PCIE_ASSERT(result == FPGA_OK, "fpgaWriteMMIO32 failed.\n");
+  }
+
+  result = fpgaReadMMIO32(m_handle, ACL_PRCONTROLLER_BAR, ACL_PRCONTROLLER_OFFSET, &status);
+  ACL_PCIE_DEBUG_MSG(":: Reading 0x%08X from PR IP data register\n", (int)status);
+  ACL_PCIE_ASSERT(result == FPGA_OK, "fpgaReadMMIO32 failed.\n");
+
+  // Final status read: 0x14 indicates PR success.
+  MemoryBarrier();
+  result = fpgaReadMMIO32(m_handle, ACL_PRCONTROLLER_BAR, ACL_PRCONTROLLER_OFFSET + 4, &status);
+  ACL_PCIE_DEBUG_MSG(":: Reading 0x%08X from PR IP status register\n", (int)status);
+  ACL_PCIE_ASSERT(result == FPGA_OK, "fpgaReadMMIO32 failed.\n");
+
+  if (status == 0x14) {
+    ACL_PCIE_DEBUG_MSG(":: PR done!: 0x%08X\n", (int)status);
+    pr_result = 0;
+  } else {
+    ACL_PCIE_DEBUG_MSG(":: PR error!: 0x%08X\n", (int)status);
+    pr_result = 1;
+  }
+
+  ACL_PCIE_DEBUG_MSG(":: PR completed!\n");
+
+#endif // WINDOWS
+#if defined(LINUX)
+  // Linux delegates PR to the kernel driver via a read() with an acl_cmd payload.
+  struct acl_cmd cmd_pr = {ACLPCI_CMD_BAR, ACLPCI_CMD_DO_PR, NULL, NULL};
+
+  cmd_pr.user_addr = core_bitstream;
+  cmd_pr.size = core_rbf_len;
+
+  pr_result = read(m_handle, &cmd_pr, sizeof(cmd_pr));
+
+#endif // LINUX
+
+  return pr_result;
+}
+
+// Change the kernel region using PR only via PCIe, using an in-memory image of the core.rbf
+// For Linux, the actual implementation of PR is inside the kernel mode driver.
+// Return 0 on success.
// Stratix 10 flavor of PR: freeze the PR region, push the bitstream to the
// Altera PR IP (legacy PIO for older images, DMA for newer ones), then
// dynamically reconfigure the kernel-clock IOPLL from the 8-element
// pll_config_str and unfreeze/reset the region.
// @param core_bitstream  in-memory core.rbf image (non-NULL, >= 1 MB)
// @param core_rbf_len    image size in bytes
// @param pll_config_str  whitespace-separated list of exactly 8 integers:
//                        freq_khz, M, N, C0, C1, LF, CP, RC
int ACL_PCIE_CONFIG::program_core_with_PR_file_s10(char *core_bitstream, size_t core_rbf_len, char *pll_config_str) {
  int pr_result = 1;  // set to default - failure
#if defined(WINDOWS)
  uint32_t pll_config_array[8] = {0};
#else
  int pll_config_array[8] = {0};
#endif // WINDOWS
  std::stringstream converter(pll_config_str);

  ACL_PCIE_ERROR_IF(core_bitstream == NULL, return 1, "core_bitstream is an NULL pointer.\n");
  ACL_PCIE_ERROR_IF(core_rbf_len < 1000000, return 1, "size of core rbf file is suspiciously small.\n");

  /* parse PLL string */
  converter >> pll_config_array[0] >> pll_config_array[1] >> pll_config_array[2] >> pll_config_array[3] >>
      pll_config_array[4] >> pll_config_array[5] >> pll_config_array[6] >> pll_config_array[7];
  if (converter.fail() == true) {
    ACL_PCIE_ERROR_IF(1, return 1, "PLL configuration string requires 8 integer elements\n");
  };

#if defined(WINDOWS)
  int i, j, k, result, count, chunk_num, frames;
  size_t offset;
  uint32_t to_send, status;
  uint32_t version;
  uint32_t *data;
  uint32_t pll_freq_khz, pll_m, pll_n, pll_c0, pll_c1, pll_lf, pll_cp, pll_rc;
  uint32_t pll_m_high, pll_m_low, pll_m_bypass_enable, pll_m_even_duty_enable;
  uint32_t pll_n_high, pll_n_low, pll_n_bypass_enable, pll_n_even_duty_enable;
  uint32_t pll_c0_high, pll_c0_low, pll_c0_bypass_enable, pll_c0_even_duty_enable;
  uint32_t pll_c1_high, pll_c1_low, pll_c1_bypass_enable, pll_c1_even_duty_enable;
  uint32_t pll_cp1, pll_cp2;
  uint32_t pll_byte;

  /* Get version ID */
  result = fpgaReadMMIO32(m_handle, ACL_VERSIONID_BAR, ACL_VERSIONID_OFFSET, &version);
  ACL_PCIE_DEBUG_MSG(":: VERSION_ID is 0x%08X\n", (int)version);

  /* Check if PR is supported */
  if (version < (unsigned int)ACL_PR_PIO_VERSIONID) {
    ACL_PCIE_DEBUG_MSG(":: Currently programmed image does not support PR\n");
    pr_result = 1;
    return pr_result;
  }

  ACL_PCIE_DEBUG_MSG(":: OK to proceed with PR!\n");

  /* freeze bridge */
  // Isolate the PR region from the static region before reprogramming it.
  MemoryBarrier();
  result =
      fpgaReadMMIO32(m_handle, ACL_PRREGIONFREEZE_BAR, ACL_PRREGIONFREEZE_OFFSET + FREEZE_VERSION_OFFSET, &status);
  ACL_PCIE_DEBUG_MSG(":: Freeze bridge version is 0x%08X\n", (int)status);

  result = fpgaReadMMIO32(m_handle, ACL_PRREGIONFREEZE_BAR, ACL_PRREGIONFREEZE_OFFSET + FREEZE_STATUS_OFFSET, &status);
  ACL_PCIE_DEBUG_MSG(":: Freeze bridge status is 0x%08X\n", (int)status);

  ACL_PCIE_DEBUG_MSG(":: Asserting region freeze\n");
  fpgaWriteMMIO32(m_handle, ACL_PRREGIONFREEZE_BAR, ACL_PRREGIONFREEZE_OFFSET + FREEZE_CTRL_OFFSET, FREEZE_REQ);
  Sleep(1);

  result = fpgaReadMMIO32(m_handle, ACL_PRREGIONFREEZE_BAR, ACL_PRREGIONFREEZE_OFFSET + FREEZE_STATUS_OFFSET, &status);
  ACL_PCIE_DEBUG_MSG(":: Freeze bridge status is 0x%08X\n", (int)status);

  ACL_PCIE_DEBUG_MSG(":: PR Beginning\n");

  /* PR IP write initialisation */
  MemoryBarrier();
  result = fpgaReadMMIO32(m_handle, ACL_PRCONTROLLER_BAR, ACL_PRCONTROLLER_OFFSET + ALT_PR_VER_OFST, &status);
  ACL_PCIE_DEBUG_MSG(":: ALT_PR_VER_OFST version is 0x%08X\n", (int)status);

  result = fpgaReadMMIO32(m_handle, ACL_PRCONTROLLER_BAR, ACL_PRCONTROLLER_OFFSET + ALT_PR_CSR_OFST, &status);
  ACL_PCIE_DEBUG_MSG(":: ALT_PR_CSR_OFST status is 0x%08X\n", (int)status);

  to_send = ALT_PR_CSR_PR_START;
  ACL_PCIE_DEBUG_MSG(":: Starting PR by writing 0x%08X to ALT_PR_CSR_OFST\n", (int)to_send);
  fpgaWriteMMIO32(m_handle, ACL_PRCONTROLLER_BAR, ACL_PRCONTROLLER_OFFSET + ALT_PR_CSR_OFST, to_send);

  /* Wait for PR to be in progress */
  // NOTE(review): this poll loop has no timeout — if the IP never reports
  // PR_IN_PROG this spins forever; consider bounding it like the loop below.
  MemoryBarrier();
  result = fpgaReadMMIO32(m_handle, ACL_PRCONTROLLER_BAR, ACL_PRCONTROLLER_OFFSET + ALT_PR_CSR_OFST, &status);
  i = 0;
  while (status != ALT_PR_CSR_STATUS_PR_IN_PROG) {
    Sleep(1);
    i++;
    result = fpgaReadMMIO32(m_handle, ACL_PRCONTROLLER_BAR, ACL_PRCONTROLLER_OFFSET + ALT_PR_CSR_OFST, &status);
  };
  ACL_PCIE_DEBUG_MSG(":: PR IP initialization took %d ms, ALT_PR_CSR_OFST status is 0x%08X\n", i, (int)status);

  // ---------------------------------------------------------------
  // Legacy PR using PIO
  // ---------------------------------------------------------------
  // Images older than the DMA-capable version take the bitstream one
  // 32-bit word at a time, throttled with a 1 ms pause every 4 KB.
  if ((version >= (unsigned int)ACL_PR_PIO_VERSIONID) && (version < (unsigned int)ACL_PR_DMA_VERSIONID)) {
    /* PR IP write bitstream */
    MemoryBarrier();
    data = (UINT32 *)core_bitstream;
    count = (int)core_rbf_len;
    ACL_PCIE_DEBUG_MSG(":: Size of PR RBF is 0x%08X\n", (int)count);

    /* Write out the complete 32-bit chunks */
    /* Wait for a designated amount of time between 4K chunks */
    i = 0;
    j = 0;
    chunk_num = 0;
    while (count >= 4) {
      fpgaWriteMMIO32(m_handle, ACL_PRCONTROLLER_BAR, ACL_PRCONTROLLER_OFFSET + ALT_PR_DATA_OFST, data[i]);
      i++;
      j++;
      count = count - 4;
      if (j >= 1024) {
        chunk_num++;
        j = 0;
        Sleep(1);
      }
    }
    ACL_PCIE_DEBUG_MSG(":: Number of 4K chunks written: %d\n", (int)chunk_num);
    ACL_PCIE_DEBUG_MSG(":: Number of bytes in PR bitstream remaining: %d\n", (int)count);

    /* Write out remaining non 32-bit chunks */
    // NOTE(review): data[i] is read before the switch even when count == 0,
    // i.e. when core_rbf_len is a multiple of 4 this reads one word past the
    // end of the buffer (the value is then discarded by `case 0`). The read
    // should be guarded by `if (count > 0)`.
    to_send = data[i];
    switch (count) {
      case 3:
        to_send = to_send & 0x00ffffff;
        fpgaWriteMMIO32(m_handle, ACL_PRCONTROLLER_BAR, ACL_PRCONTROLLER_OFFSET + ALT_PR_DATA_OFST, to_send);
        break;
      case 2:
        to_send = to_send & 0x0000ffff;
        fpgaWriteMMIO32(m_handle, ACL_PRCONTROLLER_BAR, ACL_PRCONTROLLER_OFFSET + ALT_PR_DATA_OFST, to_send);
        break;
      case 1:
        to_send = to_send & 0x000000ff;
        fpgaWriteMMIO32(m_handle, ACL_PRCONTROLLER_BAR, ACL_PRCONTROLLER_OFFSET + ALT_PR_DATA_OFST, to_send);
        break;
      case 0:
        break;
      default:
        /* This will never happen */
        return 1;
    }
  }

  // ---------------------------------------------------------------
  // PR using DMA
  // ---------------------------------------------------------------
  if (version >= (unsigned int)ACL_PR_DMA_VERSIONID) {
    /* PR IP write bitstream */
    MemoryBarrier();
    ACL_PCIE_DEBUG_MSG(":: Size of PR RBF is 0x%08X, initiating DMA transfer to PR IP\n", (int)core_rbf_len);

    /* Write PR bitstream using DMA */
    // NOTE(review): integer division — any tail of core_rbf_len that is not
    // a whole ACL_DMA_PR_ALIGNMENT_BYTES frame is silently dropped;
    // presumably the rbf is frame-aligned, TODO confirm.
    frames = (int)core_rbf_len / ACL_DMA_PR_ALIGNMENT_BYTES;
    ACL_PCIE_DEBUG_MSG(
        ":: PR bitstream will be sent in %d Byte frames, a total of %d frames\n", ACL_DMA_PR_ALIGNMENT_BYTES, frames);

    // sending in 4kB frames
    for (k = 0; k < frames; k++) {
      offset = (size_t)k * ACL_DMA_PR_ALIGNMENT_BYTES;
      void *host_addr_new = reinterpret_cast<void *>(core_bitstream + offset);
      size_t dev_addr_new = ACL_PCIE_PR_DMA_OFFSET;

      status = (uint32_t)m_dma->read_write(host_addr_new, dev_addr_new, ACL_DMA_PR_ALIGNMENT_BYTES, NULL, false);

      // Busy-wait for the DMA engine to drain before queueing the next frame.
      while (!m_dma->is_idle()) {
        ACL_PCIE_DEBUG_MSG(":: DMA still in progress...\n");
      }
    }
  }

  // Wait for PR complete
  MemoryBarrier();
  result = fpgaReadMMIO32(m_handle, ACL_PRCONTROLLER_BAR, ACL_PRCONTROLLER_OFFSET + ALT_PR_CSR_OFST, &status);
  ACL_PCIE_DEBUG_MSG(":: ALT_PR_CSR_OFST status is 0x%08X\n", (int)status);
  i = 0;
  // wait till we get a PR_SUCCESS, or PR_ERROR, or a 1 second timeout
  // NOTE(review): the comment and the code disagree — 100000 iterations of
  // Sleep(100) is ~2.8 hours, not 1 second; one of the two should change.
  while (status != ALT_PR_CSR_STATUS_PR_SUCCESS && status != ALT_PR_CSR_STATUS_PR_ERR && i < 100000) {
    Sleep(100);
    i++;
    result = fpgaReadMMIO32(m_handle, ACL_PRCONTROLLER_BAR, ACL_PRCONTROLLER_OFFSET + ALT_PR_CSR_OFST, &status);
    ACL_PCIE_DEBUG_MSG(":: ALT_PR_CSR_OFST status is 0x%08X\n", (int)status);
  };

  if (status == ALT_PR_CSR_STATUS_PR_SUCCESS) {
    /* dynamically reconfigure IOPLL for kernel clock */
    /* read kernel clock generation version ID */
    result = fpgaReadMMIO32(
        m_handle, ACL_PCIE_KERNELPLL_RECONFIG_BAR, ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_VERSION_ID, &status);
    ACL_PCIE_DEBUG_MSG(":: Kernel clock generator version ID is 0x%08X\n", (int)status);

    /* extract PLL settings from PLL configuration array */
    pll_freq_khz = pll_config_array[0];
    pll_m = pll_config_array[1];
    pll_n = pll_config_array[2];
    pll_c0 = pll_config_array[3];
    pll_c1 = pll_config_array[4];
    pll_lf = pll_config_array[5];
    pll_cp = pll_config_array[6];
    pll_rc = pll_config_array[7];

    ACL_PCIE_DEBUG_MSG(":: PLL settings are %d %d %d %d %d %d %d %d\n",
                       pll_freq_khz,
                       pll_m,
                       pll_n,
                       pll_c0,
                       pll_c1,
                       pll_lf,
                       pll_cp,
                       pll_rc);

    // Measure kernel clock frequency
    // Writing 0 to the counter register starts a measurement; the value read
    // back after the 1 s sleep is the measured frequency in Hz.
    fpgaWriteMMIO32(
        m_handle, ACL_PCIE_KERNELPLL_RECONFIG_BAR, ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_COUNTER, 0);
    Sleep(1000);
    result = fpgaReadMMIO32(
        m_handle, ACL_PCIE_KERNELPLL_RECONFIG_BAR, ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_COUNTER, &status);
    ACL_PCIE_DEBUG_MSG(":: Before reconfig, kernel clock set to %d Hz\n", (int)status);

    // extract all PLL parameters
    // Each packed config word holds: [17] even-duty enable, [16] bypass
    // enable, [15:8] high count, [7:0] low count.
    pll_m_high = (pll_m >> 8) & 0xFF;
    pll_m_low = pll_m & 0xFF;
    pll_m_bypass_enable = (pll_m >> 16) & 0x01;
    pll_m_even_duty_enable = (pll_m >> 17) & 0x01;

    pll_n_high = (pll_n >> 8) & 0xFF;
    pll_n_low = pll_n & 0xFF;
    pll_n_bypass_enable = (pll_n >> 16) & 0x01;
    pll_n_even_duty_enable = (pll_n >> 17) & 0x01;

    pll_c0_high = (pll_c0 >> 8) & 0xFF;
    pll_c0_low = pll_c0 & 0xFF;
    pll_c0_bypass_enable = (pll_c0 >> 16) & 0x01;
    pll_c0_even_duty_enable = (pll_c0 >> 17) & 0x01;

    pll_c1_high = (pll_c1 >> 8) & 0xFF;
    pll_c1_low = pll_c1 & 0xFF;
    pll_c1_bypass_enable = (pll_c1 >> 16) & 0x01;
    pll_c1_even_duty_enable = (pll_c1 >> 17) & 0x01;

    pll_lf = (pll_lf >> 6) & 0xFF;

    pll_cp = pll_cp & 0xFF;
    pll_cp1 = pll_cp & 0x07;
    pll_cp2 = (pll_cp >> 3) & 0x07;

    pll_rc = pll_rc & 0x03;

    /* read and write PLL settings */
    // Byte-wide writes (length 1) into the IOPLL reconfig address space;
    // register layout per the S10 IOPLL reconfiguration map.
    to_send = pll_m_high;
    fpgaWriteMmio(m_handle,
                  ACL_PCIE_KERNELPLL_RECONFIG_BAR,
                  ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_RECONFIG_CTRL_S10 + PLL_M_HIGH_REG_S10,
                  &to_send,
                  1);
    to_send = pll_m_low;
    fpgaWriteMmio(m_handle,
                  ACL_PCIE_KERNELPLL_RECONFIG_BAR,
                  ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_RECONFIG_CTRL_S10 + PLL_M_LOW_REG_S10,
                  &to_send,
                  1);
    to_send = pll_m_bypass_enable;
    fpgaWriteMmio(m_handle,
                  ACL_PCIE_KERNELPLL_RECONFIG_BAR,
                  ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_RECONFIG_CTRL_S10 + PLL_M_BYPASS_ENABLE_REG_S10,
                  &to_send,
                  1);
    to_send = (pll_m_even_duty_enable << 7);
    fpgaWriteMmio(m_handle,
                  ACL_PCIE_KERNELPLL_RECONFIG_BAR,
                  ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_RECONFIG_CTRL_S10 + PLL_M_EVEN_DUTY_ENABLE_REG_S10,
                  &to_send,
                  1);

    to_send = pll_n_high;
    fpgaWriteMmio(m_handle,
                  ACL_PCIE_KERNELPLL_RECONFIG_BAR,
                  ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_RECONFIG_CTRL_S10 + PLL_N_HIGH_REG_S10,
                  &to_send,
                  1);
    to_send = pll_n_low;
    fpgaWriteMmio(m_handle,
                  ACL_PCIE_KERNELPLL_RECONFIG_BAR,
                  ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_RECONFIG_CTRL_S10 + PLL_N_LOW_REG_S10,
                  &to_send,
                  1);
    // N bypass register also carries CP[2:0] and the N even-duty bit.
    to_send = (pll_n_even_duty_enable << 7) | (pll_cp1 << 4) | pll_n_bypass_enable;
    fpgaWriteMmio(m_handle,
                  ACL_PCIE_KERNELPLL_RECONFIG_BAR,
                  ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_RECONFIG_CTRL_S10 + PLL_N_BYPASS_ENABLE_REG_S10,
                  &to_send,
                  1);

    to_send = pll_c0_high;
    fpgaWriteMmio(m_handle,
                  ACL_PCIE_KERNELPLL_RECONFIG_BAR,
                  ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_RECONFIG_CTRL_S10 + PLL_C0_HIGH_REG_S10,
                  &to_send,
                  1);
    to_send = pll_c0_low;
    fpgaWriteMmio(m_handle,
                  ACL_PCIE_KERNELPLL_RECONFIG_BAR,
                  ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_RECONFIG_CTRL_S10 + PLL_C0_LOW_REG_S10,
                  &to_send,
                  1);
    to_send = pll_c0_bypass_enable;
    fpgaWriteMmio(m_handle,
                  ACL_PCIE_KERNELPLL_RECONFIG_BAR,
                  ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_RECONFIG_CTRL_S10 + PLL_C0_BYPASS_ENABLE_REG_S10,
                  &to_send,
                  1);
    to_send = (pll_c0_even_duty_enable << 7);
    fpgaWriteMmio(m_handle,
                  ACL_PCIE_KERNELPLL_RECONFIG_BAR,
                  ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_RECONFIG_CTRL_S10 + PLL_C0_EVEN_DUTY_ENABLE_REG_S10,
                  &to_send,
                  1);

    to_send = pll_c1_high;
    fpgaWriteMmio(m_handle,
                  ACL_PCIE_KERNELPLL_RECONFIG_BAR,
                  ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_RECONFIG_CTRL_S10 + PLL_C1_HIGH_REG_S10,
                  &to_send,
                  1);
    to_send = pll_c1_low;
    fpgaWriteMmio(m_handle,
                  ACL_PCIE_KERNELPLL_RECONFIG_BAR,
                  ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_RECONFIG_CTRL_S10 + PLL_C1_LOW_REG_S10,
                  &to_send,
                  1);
    to_send = pll_c1_bypass_enable;
    fpgaWriteMmio(m_handle,
                  ACL_PCIE_KERNELPLL_RECONFIG_BAR,
                  ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_RECONFIG_CTRL_S10 + PLL_C1_BYPASS_ENABLE_REG_S10,
                  &to_send,
                  1);
    to_send = (pll_c1_even_duty_enable << 7);
    fpgaWriteMmio(m_handle,
                  ACL_PCIE_KERNELPLL_RECONFIG_BAR,
                  ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_RECONFIG_CTRL_S10 + PLL_C1_EVEN_DUTY_ENABLE_REG_S10,
                  &to_send,
                  1);

    to_send = (pll_cp2 << 5);
    fpgaWriteMmio(m_handle,
                  ACL_PCIE_KERNELPLL_RECONFIG_BAR,
                  ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_RECONFIG_CTRL_S10 + PLL_CP2_REG_S10,
                  &to_send,
                  1);

    to_send = (pll_lf << 3) | (pll_rc << 1);
    fpgaWriteMmio(m_handle,
                  ACL_PCIE_KERNELPLL_RECONFIG_BAR,
                  ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_RECONFIG_CTRL_S10 + PLL_LF_REG_S10,
                  &to_send,
                  1);

    // start PLL calibration
    /* read/modify/write the request calibration */
    ACL_PCIE_DEBUG_MSG(":: Requesting PLL calibration\n");
    result = fpgaReadMmio(m_handle,
                          ACL_PCIE_KERNELPLL_RECONFIG_BAR,
                          ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_RECONFIG_CTRL_S10 + PLL_REQUEST_CAL_REG_S10,
                          &pll_byte,
                          1);
    to_send = pll_byte | 0x40;  // set the "request calibration" bit
    fpgaWriteMmio(m_handle,
                  ACL_PCIE_KERNELPLL_RECONFIG_BAR,
                  ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_RECONFIG_CTRL_S10 + PLL_REQUEST_CAL_REG_S10,
                  &to_send,
                  1);
    /* write 0x03 to enable calibration interface */
    to_send = 0x03;
    fpgaWriteMmio(m_handle,
                  ACL_PCIE_KERNELPLL_RECONFIG_BAR,
                  ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_RECONFIG_CTRL_S10 + PLL_ENABLE_CAL_REG_S10,
                  &to_send,
                  1);
    ACL_PCIE_DEBUG_MSG(":: PLL calibration done\n");

    // Measure kernel clock frequency
    fpgaWriteMMIO32(
        m_handle, ACL_PCIE_KERNELPLL_RECONFIG_BAR, ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_COUNTER, 0);
    Sleep(1000);
    result = fpgaReadMMIO32(
        m_handle, ACL_PCIE_KERNELPLL_RECONFIG_BAR, ACL_PCIE_KERNELPLL_RECONFIG_OFFSET + PLL_OFFSET_COUNTER, &status);
    ACL_PCIE_DEBUG_MSG(":: After reconfig, kernel clock set to %d Hz\n", (int)status);

    /* assert reset */
    // Hold the freshly-programmed region in reset while the bridge unfreezes.
    MemoryBarrier();
    ACL_PCIE_DEBUG_MSG(":: Asserting region reset\n");
    fpgaWriteMMIO32(m_handle, ACL_PRREGIONFREEZE_BAR, ACL_PRREGIONFREEZE_OFFSET + FREEZE_CTRL_OFFSET, RESET_REQ);
    Sleep(10);

    /* unfreeze bridge */
    MemoryBarrier();
    result =
        fpgaReadMMIO32(m_handle, ACL_PRREGIONFREEZE_BAR, ACL_PRREGIONFREEZE_OFFSET + FREEZE_VERSION_OFFSET, &status);
    ACL_PCIE_DEBUG_MSG(":: Freeze bridge version is 0x%08X\n", (int)status);

    result =
        fpgaReadMMIO32(m_handle, ACL_PRREGIONFREEZE_BAR, ACL_PRREGIONFREEZE_OFFSET + FREEZE_STATUS_OFFSET, &status);
    ACL_PCIE_DEBUG_MSG(":: Freeze bridge status is 0x%08X\n", (int)status);

    ACL_PCIE_DEBUG_MSG(":: Removing region freeze\n");
    fpgaWriteMMIO32(m_handle, ACL_PRREGIONFREEZE_BAR, ACL_PRREGIONFREEZE_OFFSET + FREEZE_CTRL_OFFSET, UNFREEZE_REQ);
    Sleep(1);

    ACL_PCIE_DEBUG_MSG(":: Checking freeze bridge status\n");
    result =
        fpgaReadMMIO32(m_handle, ACL_PRREGIONFREEZE_BAR, ACL_PRREGIONFREEZE_OFFSET + FREEZE_STATUS_OFFSET, &status);
    ACL_PCIE_DEBUG_MSG(":: Freeze bridge status is 0x%08X\n", (int)status);

    /* deassert reset */
    MemoryBarrier();
    ACL_PCIE_DEBUG_MSG(":: Deasserting region reset\n");
    fpgaWriteMMIO32(m_handle, ACL_PRREGIONFREEZE_BAR, ACL_PRREGIONFREEZE_OFFSET + FREEZE_CTRL_OFFSET, 0);

    MemoryBarrier();
    result = fpgaReadMMIO32(m_handle, ACL_PRCONTROLLER_BAR, ACL_PRCONTROLLER_OFFSET + ALT_PR_CSR_OFST, &status);
    ACL_PCIE_DEBUG_MSG(":: Reading 0x%08X from PR IP status register\n", (int)status);
    // 0x6 is the final "PR success" CSR state after the region is released.
    if (status == 0x6) {
      ACL_PCIE_DEBUG_MSG(":: PR done! Status is 0x%08X\n", (int)status);
      pr_result = 0;
    } else {
      ACL_PCIE_DEBUG_MSG(":: PR error! Status is 0x%08X\n", (int)status);
      pr_result = 1;
    }
  } else {
    ACL_PCIE_DEBUG_MSG(":: PR error! Status is 0x%08X\n", (int)status);
    pr_result = 1;
  }

  ACL_PCIE_DEBUG_MSG(":: PR completed!\n");

#endif // WINDOWS
#if defined(LINUX)
  // Linux: kernel driver performs PR and PLL reconfig; pass the parsed PLL
  // array through device_addr (repurposed as an extra argument pointer).
  struct acl_cmd cmd_pr = {ACLPCI_CMD_BAR, ACLPCI_CMD_DO_PR, NULL, NULL};

  cmd_pr.user_addr = core_bitstream;
  cmd_pr.size = core_rbf_len;
  cmd_pr.device_addr = pll_config_array;

  pr_result = read(m_handle, &cmd_pr, sizeof(cmd_pr));

#endif // LINUX

  return pr_result;
}

// Windows specific code to disable PCIe advanced error reporting on the
// upstream port.
// No-op in Linux because save_pcie_control_regs() has already disabled
// AER on the upstream port.
// Returns 0 on success
int ACL_PCIE_CONFIG::disable_AER_windows(void) {
  fpga_result result = FPGA_OK;

#if defined(WINDOWS)
  // IOCTL call to disable AER in kernel mode
  result = fpgaProcessDeviceCmd(m_handle, GUID_TO_FPGA_GUID(GUID_PCI_OPENCL_DISABLE_AER), NULL, NULL, 0);
  ACL_PCIE_ERROR_IF(result != FPGA_OK, return -1, "fpgaProcessDeviceCmd failed when disabling AER.\n");
#endif // WINDOWS
  return result;
}

// Windows specific code to enable PCIe advanced error reporting on the
// upstream port.
// No-op in Linux because load_pcie_control_regs() has already enabled
// AER on the upstream port.
// Returns 0 on success
int ACL_PCIE_CONFIG::enable_AER_and_retrain_link_windows(void) {
  fpga_result result = FPGA_OK;

#if defined(WINDOWS)
  // IOCTL call to enable AER and retrain link in kernel mode
  result = fpgaProcessDeviceCmd(m_handle, GUID_TO_FPGA_GUID(GUID_PCI_OPENCL_ENABLE_AER_RETRAIN_LINK), NULL, NULL, 0);
  ACL_PCIE_ERROR_IF(result != FPGA_OK, return -1, "fpgaProcessDeviceCmd failed when enabling AER.\n");
#endif // WINDOWS
  return result;
}

// Program the FPGA using a given SOF file
// Quartus is needed for this, because,
// quartus_pgm is used to program the board through USB blaster
// For Linux, when the kernel driver is asked to save/load_pcie_control_regs(),
// it will also disable/enable the aer on the upstream, so no need to
// implement those here.
// NOTE: This function only works with single device machines - if there
// are multiple cards (and multiple USB-blasters) in the system, it doesn't
// properly determine which card is which. Only the first device will be
// programmed.
// Return 0 on success.
// Full-chip reprogram via JTAG: builds a quartus_pgm command line (cable and
// device index from env vars, JTAG autodetect, or defaults), disables AER so
// the link drop during programming doesn't kill the host, runs the command
// with up to MAX_ATTEMPTS retries, then re-enables AER and retrains the link.
// @return 0 on success (the last system() exit status otherwise)
int ACL_PCIE_CONFIG::program_with_SOF_file(const char *filename, const char *ad_cable, const char *ad_device_index) {
  const int MAX_ATTEMPTS = 3;
  int program_failed = 1;
  int status;
  bool use_cable_autodetect = true;

  // If ad_cable value is "0", either JTAG cable autodetect failed or not
  // supported, then use the default value
  if (strcmp(ad_cable, "0") == 0) use_cable_autodetect = false;

  // Cable selection priority: env override > autodetected value > "1".
  const char *cable = getenv("ACL_PCIE_JTAG_CABLE");
  if (!cable) {
    if (use_cable_autodetect) {
      cable = ad_cable;
      ACL_PCIE_DEBUG_MSG("setting Cable to autodetect value %s\n", cable);
    } else {
      cable = "1";
      ACL_PCIE_DEBUG_MSG("setting Cable to default value %s\n", cable);
    }
  }

  const char *device_index = getenv("ACL_PCIE_JTAG_DEVICE_INDEX");
  if (!device_index) {
    if (use_cable_autodetect) {
      device_index = ad_device_index;
      ACL_PCIE_DEBUG_MSG("setting Device Index to autodetect value %s\n", device_index);
    } else {
      device_index = "1";
      ACL_PCIE_DEBUG_MSG("setting Device Index to default value %s\n", device_index);
    }
  }

  // Build the quartus_pgm invocation. DLA_MMD builds call quartus_pgm
  // directly; non-DLA builds go through "aocl do". Output is suppressed
  // unless verbose debug is enabled (Windows DLA_MMD path only).
  char cmd[4 * 1024];
#ifdef DLA_MMD
#if defined(WINDOWS)
  if ((ACL_PCIE_DEBUG | 0) >= VERBOSITY_DEFAULT) {
    snprintf(cmd, sizeof(cmd), "quartus_pgm -c %s -m jtag -o \"P;%s@%s\"", cable, filename, device_index);
  } else {
    snprintf(cmd, sizeof(cmd), "quartus_pgm -c %s -m jtag -o \"P;%s@%s\" > nul 2>&1", cable, filename, device_index);
  }
#else
  snprintf(cmd, sizeof(cmd), "quartus_pgm -c %s -m jtag -o \"P;%s@%s\" 2>&1 >/dev/null", cable, filename, device_index);
#endif
  ACL_PCIE_INFO("Executing \"%s\"\n", cmd);
#else
#if defined(WINDOWS)
  snprintf(
      cmd, sizeof(cmd), "aocl do quartus_pgm -c %s -m jtag -o \"P;%s@%s\" > nul 2>&1", cable, filename, device_index);
#endif
#if defined(LINUX)
  snprintf(cmd,
           sizeof(cmd),
           "aocl do quartus_pgm -c %s -m jtag -o \"P;%s@%s\" 2>&1 >/dev/null",
           cable,
           filename,
           device_index);
#endif
  ACL_PCIE_DEBUG_MSG("Executing \"%s\"\n", cmd);
#endif

  // Disable AER
  status = disable_AER_windows();
  ACL_PCIE_ERROR_IF(status, return -1, "Failed to disable AER on Windows before programming SOF.\n");

  // Set the program to ignore the ctrl-c signal
  // This setting will be inherited by the system() function call below,
  // so that the quartus_pgm call won't be interrupt by the ctrl-c signal.
  install_ctrl_c_handler(1 /* ignore the signal */);

  // Program FPGA by executing the command
#ifndef DLA_MMD
  ACL_PCIE_ASSERT(system_cmd_is_valid(cmd), "Invalid system() function parameter: %s\n", cmd);
#endif
  // Retry up to MAX_ATTEMPTS times; JTAG programming failures are often
  // transient. The post-attempt sleep gives the link time to settle.
  for (int attempts = 0; attempts < MAX_ATTEMPTS && program_failed; attempts++) {
    if (attempts > 0) {
      ACL_PCIE_INFO("Execution failed. Will try again in case the error was transient.\n");
    }
    program_failed = system(cmd);
#if defined(WINDOWS)
    Sleep(2000);
#endif // WINDOWS
#if defined(LINUX)
    sleep(2);
#endif // LINUX
  }

  // Restore the original custom ctrl-c signal handler
  install_ctrl_c_handler(0 /* use the custom signal handler */);

  // Enable AER
  status = enable_AER_and_retrain_link_windows();
  ACL_PCIE_ERROR_IF(status, return -1, "Failed to enable AER and retrain link on Windows after programming SOF.\n");

  return program_failed;
}

// Locate the JTAG cable/device index for the board whose in-system source
// probe reports `cade_id`, by running the find_jtag_cable.tcl script under
// quartus_stp and scraping its "Matched Cable:...Device Name:@...:" output.
// On success writes NUL-terminated strings into ad_cable / ad_device_index
// (each at least AD_CABLE_SIZE bytes, provided by the caller).
bool ACL_PCIE_CONFIG::find_cable_with_ISSP(unsigned int cade_id, char *ad_cable, char *ad_device_index) {
  FILE *fp;
  int status;
  char line_in[READ_SIZE];
  bool found_cable = false;

  char cmd[SYSTEM_CMD_SIZE];
  const char *aocl_boardpkg_root = getenv("AOCL_BOARD_PACKAGE_ROOT");
  if (!aocl_boardpkg_root) {
    ACL_PCIE_INFO("AOCL_BOARD_PACKAGE_ROOT not set!!!");
    return false;
  }

  snprintf(cmd, sizeof(cmd), "aocl do quartus_stp -t %s/scripts/find_jtag_cable.tcl %X", aocl_boardpkg_root, cade_id);
  ACL_PCIE_DEBUG_MSG("executing \"%s\"\n", cmd);

  // Open PIPE to tcl script
#ifndef DLA_MMD
  ACL_PCIE_ASSERT(system_cmd_is_valid(cmd), "Invalid popen() function parameter: %s\n", cmd);
#endif
#if defined(WINDOWS)
  fp = _popen(cmd, "r");
#endif // WINDOWS
#if defined(LINUX)
  fp = popen(cmd, "r");
#endif // LINUX

  if (fp == NULL) {
    ACL_PCIE_INFO("Couldn't open fp file\n");
  } else {
    // Read everyline and look for matching string from tcl script
    while (fgets(line_in, READ_SIZE, fp) != NULL) {
      ACL_PCIE_DEBUG_MSG("%s", line_in);
      // Expected line shape: "...Matched Cable:<cable>Device Name:@<idx>:..."
      const char *str_match_cable = "Matched Cable:";
      const char *str_match_dev_name = "Device Name:@";
      const char *str_match_end = ":";
      // parsing the string and extracting the cable/index value
      // from the output of find_jtag_cable.tcl script
      char *pos_cable = strstr(line_in, str_match_cable);
      if (pos_cable) {
        found_cable = true;
        // find the sub-string locations in the line
        char *pos_dev_name = strstr(line_in, str_match_dev_name);
        if (pos_dev_name) {
          char *pos_end =
              strstr(pos_dev_name + strnlen(str_match_dev_name, MAX_NAME_SIZE), str_match_end);  // Find the last ":"
          if (pos_end) {
            // calculate the cable/index string size
            size_t i_cable_str_len = pos_dev_name - pos_cable - strnlen(str_match_cable, MAX_NAME_SIZE);
            size_t i_dev_index_str_len = pos_end - pos_dev_name - strnlen(str_match_dev_name, MAX_NAME_SIZE);
            // extract the cable/index value from the line
            snprintf(ad_cable,
                     AD_CABLE_SIZE,
                     "%.*s",
                     (int)i_cable_str_len,
                     pos_cable + strnlen(str_match_cable, MAX_NAME_SIZE));
            snprintf(ad_device_index,
                     AD_CABLE_SIZE,
                     "%.*s",
                     (int)i_dev_index_str_len,
                     pos_dev_name + strnlen(str_match_dev_name, MAX_NAME_SIZE));
            ACL_PCIE_DEBUG_MSG("JTAG Autodetect device found Cable:%s, Device Index:%s\n", ad_cable, ad_device_index);
            break;
          }
        }
      }
    }

#if defined(WINDOWS)
    status = _pclose(fp);
#endif // WINDOWS
#if defined(LINUX)
    status = pclose(fp);
#endif // LINUX

    if (status == -1) {
      /* Error reported by pclose() */
      ACL_PCIE_INFO("Couldn't close find_cable_with_ISSP file\n");
    } else {
      /* Use macros described under wait() to inspect `status' in order
       * to determine success/failure of command executed by popen()
       * */
    }
  }

  if (!found_cable) {
    ACL_PCIE_INFO("Autodetect Cable not found!!\n");
  }

  return found_cable;
}

// Functions to save/load control registers form PCI Configuration Space
// This saved registers are used to restore the PCIe link after reprogramming
// through methods other than PR
// For Windows, the register values are stored in this class, and do
// nothing else
// For Linux, the register values are stored inside the kernel driver,
// And, it will disable the interrupt and the aer on the upstream,
// when the save_pci_control_regs() function is called. They will
// be enable when load_pci_control_regs() is called.
// Return 0 on success
int ACL_PCIE_CONFIG::save_pci_control_regs() {
  int save_failed = 1;

#if defined(WINDOWS)
  fpga_result result = FPGA_OK;

  // IOCTL call to save PCI control register
  result = fpgaProcessDeviceCmd(m_handle, GUID_TO_FPGA_GUID(GUID_PCI_OPENCL_SAVE_PCI_CTRL_REG), NULL, NULL, 0);
  ACL_PCIE_ERROR_IF(result != FPGA_OK, return -1, "fpgaProcessDeviceCmd failed when saving PCI Control registers.\n");

  save_failed = (result == FPGA_OK) ? (0) : (-1);
#endif // WINDOWS
#if defined(LINUX)
  // Kernel driver saves the registers (and disables upstream AER/interrupts).
  struct acl_cmd cmd_save = {ACLPCI_CMD_BAR, ACLPCI_CMD_SAVE_PCI_CONTROL_REGS, NULL, NULL};
  save_failed = read(m_handle, &cmd_save, 0);
#endif // LINUX

  return save_failed;
}

// Counterpart of save_pci_control_regs(): restore the saved PCI config-space
// registers (Linux driver also re-enables upstream AER/interrupts).
// Return 0 on success
int ACL_PCIE_CONFIG::load_pci_control_regs() {
  int load_failed = 1;
#if defined(WINDOWS)

  fpga_result result = FPGA_OK;
  // IOCTL call to load PCI control register
  result = fpgaProcessDeviceCmd(m_handle, GUID_TO_FPGA_GUID(GUID_PCI_OPENCL_LOAD_PCI_CTRL_REG), NULL, NULL, 0);
  ACL_PCIE_ERROR_IF(result != FPGA_OK, return -1, "fpgaProcessDeviceCmd failed when loading PCI Control registers.\n");

  load_failed = (result == FPGA_OK) ? (0) : (-1);
#endif // WINDOWS
#if defined(LINUX)
  struct acl_cmd cmd_load = {ACLPCI_CMD_BAR, ACLPCI_CMD_LOAD_PCI_CONTROL_REGS, NULL, NULL};
  load_failed = read(m_handle, &cmd_load, 0);
#endif // LINUX

  return load_failed;
}

// Functions to query the PCI related information
// Use NULL as input for the info that you don't care about
// Return 0 on success.
int ACL_PCIE_CONFIG::query_pcie_info(unsigned int *pcie_gen, unsigned int *pcie_num_lanes, char *pcie_slot_info_str) {
  int status = 0;
#if defined(WINDOWS)
  fpga_result result = FPGA_OK;
  // IOCTL call to obtain PCIe gen information
  result = fpgaProcessDeviceCmd(
      m_handle, GUID_TO_FPGA_GUID(GUID_PCI_OPENCL_GET_PCI_GEN), NULL, pcie_gen, sizeof(unsigned int));
  ACL_PCIE_ERROR_IF(result != FPGA_OK, return -1, "fpgaProcessDeviceCmd failed when finding PCI device gen info.\n");

  result = fpgaProcessDeviceCmd(
      m_handle, GUID_TO_FPGA_GUID(GUID_PCI_OPENCL_GET_PCI_LANES), NULL, pcie_num_lanes, sizeof(unsigned int));
  ACL_PCIE_ERROR_IF(result != FPGA_OK, return -1, "fpgaProcessDeviceCmd failed when finding PCI device lanes info.\n");

  status = (result == FPGA_OK) ? (0) : (-1);
#endif // WINDOWS
#if defined(LINUX)
  struct acl_cmd driver_cmd;

  if (pcie_gen != NULL) {
    driver_cmd.bar_id = ACLPCI_CMD_BAR;
    driver_cmd.command = ACLPCI_CMD_GET_PCI_GEN;
    driver_cmd.device_addr = NULL;
    driver_cmd.user_addr = pcie_gen;
    driver_cmd.size = sizeof(*pcie_gen);
    status |= read(m_handle, &driver_cmd, sizeof(driver_cmd));
  }

  if (pcie_num_lanes != NULL) {
    driver_cmd.bar_id = ACLPCI_CMD_BAR;
    driver_cmd.command = ACLPCI_CMD_GET_PCI_NUM_LANES;
    driver_cmd.device_addr = NULL;
    driver_cmd.user_addr = pcie_num_lanes;
    driver_cmd.size = sizeof(*pcie_num_lanes);
    status |= read(m_handle, &driver_cmd, sizeof(driver_cmd));
  }

  if (pcie_slot_info_str != NULL) {
    driver_cmd.bar_id = ACLPCI_CMD_BAR;
    driver_cmd.command = ACLPCI_CMD_GET_PCI_SLOT_INFO;
    driver_cmd.device_addr = NULL;
    driver_cmd.user_addr = pcie_slot_info_str;
    // NOTE(review): sizeof(pcie_slot_info_str) is the size of the POINTER
    // (8 bytes on 64-bit), not the caller's buffer — the slot string will be
    // truncated. The caller's buffer length should be passed in instead.
    driver_cmd.size = sizeof(pcie_slot_info_str);
    status |= read(m_handle, &driver_cmd, sizeof(driver_cmd));
  }
#endif // LINUX
  return status;
}

// Platform-agnostic sleep helper (Sleep() takes ms, sleep() takes seconds).
void ACL_PCIE_CONFIG::wait_seconds(unsigned seconds) {
#if defined(WINDOWS)
  Sleep(seconds * 1000);
#endif // WINDOWS

#if defined(LINUX)
  sleep(seconds);
#endif // LINUX
}
diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_config.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_config.h
new file mode 100644
index 0000000..3f07634
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_config.h
@@ -0,0 +1,109 @@
#ifndef ACL_PCIE_CONFIG_H
#define ACL_PCIE_CONFIG_H

/* (c) 1992-2021 Intel Corporation. */
/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */
/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */
/* and/or other countries. Other marks and brands may be claimed as the property */
/* of others.
See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. */ + +/* ===- acl_pcie_config.h -------------------------------------------- C++ -*-=== */ +/* */ +/* Intel(R) OpenCL MMD Driver */ +/* */ +/* ===-------------------------------------------------------------------------=== */ +/* */ +/* This file declares the class to handle functions that program the FPGA. */ +/* The actual implementation of the class lives in the acl_pcie_config.cpp, */ +/* so look there for full documentation. 
*/ +/* */ +/* ===-------------------------------------------------------------------------=== */ + +#ifdef DLA_MMD +#include <cstddef> //size_t +#endif + +// Forward declaration for classes used by ACL_PCIE_DEVICE +class ACL_PCIE_DMA; +class ACL_PCIE_DEVICE; +class ACL_PCIE_MM_IO_MGR; + +#define PCIE_AER_CAPABILITY_ID ((DWORD)0x0001) +#define PCIE_AER_UNCORRECTABLE_STATUS_OFFSET ((DWORD)0x4) +#define PCIE_AER_UNCORRECTABLE_MASK_OFFSET ((DWORD)0x8) +#define PCIE_AER_CORRECTABLE_STATUS_OFFSET ((DWORD)0x10) +#define PCIE_AER_SURPRISE_DOWN_BIT ((DWORD)(1 << 5)) + +// The size of the char array that holds the name of autodetect JTAG cable and device index +#define AD_CABLE_SIZE 10 + +#if defined(LINUX) +typedef int fpga_handle; +#else +#include <opae/fpga.h> +#endif // LINUX + +class ACL_PCIE_CONFIG { + public: + ACL_PCIE_CONFIG(fpga_handle handle, ACL_PCIE_MM_IO_MGR *io, ACL_PCIE_DEVICE *pcie, ACL_PCIE_DMA *dma); + ~ACL_PCIE_CONFIG(); + + // Change the core only via PCIe, using an in-memory image of the core.rbf + // This is supported only for Stratix V and newer devices. + // Return 0 on success. + int program_core_with_PR_file_a10(char *core_bitstream, size_t core_rbf_len); + int program_core_with_PR_file_s10(char *core_bitstream, size_t core_rbf_len, char *pll_config_str); + + // Program the FPGA using a given SOF file + // Input filename, autodetect cable, autodetect device index + // Return 0 on success. + int program_with_SOF_file(const char *filename, const char *ad_cable, const char *ad_device_index); + + // Look up CADEID using ISSP + // Return TRUE with cable value in ad_cable, ad_device_index if cable found + // Otherwise return FALSE + bool find_cable_with_ISSP(unsigned int cade_id, char *ad_cable, char *ad_device_index); + + // Functions to save/load control registers from PCI Configuration Space + // Return 0 on success. 
+ int save_pci_control_regs(); + int load_pci_control_regs(); + + // Functions to query the PCI related information + // Use NULL as input for the info that you don't care about + // Return 0 on success. + int query_pcie_info(unsigned int *pcie_gen, unsigned int *pcie_num_lanes, char *pcie_slot_info_str); + + // Windows-specific code to control AER, and retrain the link + int enable_AER_and_retrain_link_windows(void); + int disable_AER_windows(void); + + // Platform agnostic sleep (in seconds) + void wait_seconds(unsigned seconds); + + private: + ACL_PCIE_CONFIG &operator=(const ACL_PCIE_CONFIG &) { return *this; } + + ACL_PCIE_CONFIG(const ACL_PCIE_CONFIG &src) {} + + fpga_handle m_handle; + ACL_PCIE_MM_IO_MGR *m_io; + ACL_PCIE_DEVICE *m_pcie; + ACL_PCIE_DMA *m_dma; +#if defined(WINDOWS) + fpga_guid *FpgaCmd; +#endif // WINDOWS +}; + +#endif // ACL_PCIE_CONFIG_H diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_debug.cpp b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_debug.cpp new file mode 100644 index 0000000..8afc1c7 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_debug.cpp @@ -0,0 +1,61 @@ +// (c) 1992-2021 Intel Corporation. +// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words +// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. +// and/or other countries. Other marks and brands may be claimed as the property +// of others. 
See Trademarks on intel.com for full list of Intel trademarks or +// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) +// Your use of Intel Corporation's design tools, logic functions and other +// software and tools, and its AMPP partner logic functions, and any output +// files any of the foregoing (including device programming or simulation +// files), and any associated documentation or information are expressly subject +// to the terms and conditions of the Altera Program License Subscription +// Agreement, Intel MegaCore Function License Agreement, or other applicable +// license agreement, including, without limitation, that your use is for the +// sole purpose of programming logic devices manufactured by Intel and sold by +// Intel or its authorized distributors. Please refer to the applicable +// agreement for further details. + +/* ===- acl_pcie_debug.cpp ------------------------------------------- C++ -*-=== */ +/* */ +/* Intel(R) OpenCL MMD Driver */ +/* */ +/* ===-------------------------------------------------------------------------=== */ +/* */ +/* */ +/* ===-------------------------------------------------------------------------=== */ + +#include "acl_pcie_debug.h" +#include <stdio.h> +#include <stdlib.h> + +int ACL_PCIE_DEBUG = 0; +int ACL_PCIE_WARNING = 1; // turn on the warning message by default + +int ACL_PCIE_DEBUG_FLASH_DUMP_BOOT_SECTOR = 0; + +void set_mmd_debug() { + char* mmd_debug_var = getenv("ACL_PCIE_DEBUG"); + if (mmd_debug_var) { + char* endptr = NULL; + long parsed_count; + parsed_count = strtol(mmd_debug_var, &endptr, 10); + if (endptr == mmd_debug_var // no valid characters + || *endptr // an invalid character + || (parsed_count < 0 || parsed_count >= (long)VERBOSITY_EVERYTHING)) { + // malformed string, do nothing + } else { + ACL_PCIE_DEBUG = (int)parsed_count; + printf("\n:: MMD DEBUG LEVEL set to %d\n", ACL_PCIE_DEBUG); + } + } + + char* hal_debug_dump_flash_bootsect = 
getenv("ACL_PCIE_DEBUG_FLASH_DUMP_BOOT_SECTOR"); + if (hal_debug_dump_flash_bootsect) ACL_PCIE_DEBUG_FLASH_DUMP_BOOT_SECTOR = atoi(hal_debug_dump_flash_bootsect); +} + +void set_mmd_warn_msg() { + char* mmd_warn_var = getenv("ACL_PCIE_WARNING"); + if (mmd_warn_var) { + ACL_PCIE_WARNING = atoi(mmd_warn_var); + } +} diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_debug.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_debug.h new file mode 100644 index 0000000..072eabc --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_debug.h @@ -0,0 +1,64 @@ +#ifndef ACL_PCIE_DEBUG_H +#define ACL_PCIE_DEBUG_H + +/* (c) 1992-2021 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. 
*/ + +/* ===- acl_pcie_debug.h --------------------------------------------- C++ -*-=== */ +/* */ +/* Intel(R) OpenCL MMD Driver */ +/* */ +/* ===-------------------------------------------------------------------------=== */ +/* */ +/* */ +/* ===-------------------------------------------------------------------------=== */ + +enum ACL_VERBOSITY { + VERBOSITY_DEFAULT = 1, + VERBOSITY_INVOCATION = 2, // Dump kernel invocation details + VERBOSITY_OP = 3, // Dump operation invocation details + VERBOSITY_IRQ = 5, + VERBOSITY_BLOCKTX = 9, // Dump PCIe block transfers + VERBOSITY_PCIE = 10, // Dump all PCIe transactions + VERBOSITY_EVERYTHING = 100 +}; + +extern int ACL_PCIE_DEBUG; +extern int ACL_PCIE_WARNING; +extern int ACL_PCIE_DEBUG_FLASH_DUMP_BOOT_SECTOR; + +// This function gets the value of ACL_PCIE_DEBUG from the environment variable +void set_mmd_debug(); +void set_mmd_warn_msg(); + +#include <stdio.h> + +#define ACL_PCIE_DEBUG_MSG(m, ...) ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_DEFAULT, m, ##__VA_ARGS__) +#define ACL_PCIE_DEBUG_MSG_VERBOSE(verbosity, m, ...) \ + if ((ACL_PCIE_DEBUG | 0) >= verbosity) do { \ + printf((m), ##__VA_ARGS__), fflush(stdout); \ + } while (0) + +#define ACL_PCIE_WARN_MSG(...) \ + do { \ + if (ACL_PCIE_WARNING) { \ + printf("** WARNING: " __VA_ARGS__); \ + fflush(stdout); \ + } \ + } while (0) + +#endif // ACL_PCIE_DEBUG_H diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_device.cpp b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_device.cpp new file mode 100644 index 0000000..8489c32 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_device.cpp @@ -0,0 +1,2029 @@ +// (c) 1992-2021 Intel Corporation. +// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words +// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. +// and/or other countries. 
Other marks and brands may be claimed as the property +// of others. See Trademarks on intel.com for full list of Intel trademarks or +// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) +// Your use of Intel Corporation's design tools, logic functions and other +// software and tools, and its AMPP partner logic functions, and any output +// files any of the foregoing (including device programming or simulation +// files), and any associated documentation or information are expressly subject +// to the terms and conditions of the Altera Program License Subscription +// Agreement, Intel MegaCore Function License Agreement, or other applicable +// license agreement, including, without limitation, that your use is for the +// sole purpose of programming logic devices manufactured by Intel and sold by +// Intel or its authorized distributors. Please refer to the applicable +// agreement for further details. + +/* ===- acl_pcie_device.cpp ------------------------------------------ C++ -*-=== */ +/* */ +/* Intel(R) OpenCL MMD Driver */ +/* */ +/* ===-------------------------------------------------------------------------=== */ +/* */ +/* This file implements the class to handle operations on a single device. 
*/ +/* The declaration of the class lives in the acl_pcie_device.h */ +/* */ +/* ===-------------------------------------------------------------------------=== */ + +#if defined(WINDOWS) +#define NOMINMAX +#include <time.h> +#endif // WINDOWS + +// common and its own header files +#include "acl_pcie.h" +#include "acl_pcie_device.h" + +// other header files inside MMD driver +#include "acl_pcie_config.h" +#include "acl_pcie_debug.h" +#include "acl_pcie_dma.h" +#include "acl_pcie_mm_io.h" +#if !defined(DLA_MMD) || defined(WINDOWS) +#include "pkg_editor.h" +#endif + +// other standard header files +#include <stdlib.h> +#include <string.h> +#include <fstream> +#include <limits> +#include <random> +#include <sstream> +#include <stdexcept> +#include "acl_pcie_hostch.h" + +#if defined(LINUX) +#include <fcntl.h> +#include <signal.h> +#include <sys/mman.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <unistd.h> +#endif // LINUX + +#define MAX_LEN 1024 + +#define FREEZE_CTRL_OFFSET 4 +#ifndef PAGE_SIZE +#define PAGE_SIZE 4096 +#endif +#define ACL_VERSIONID_MIN 0xA0C7C1E0 + +static int num_open_devices = 0; + +#if defined(WINDOWS) +fpga_handle open_device_windows(ACL_PCIE_DEVICE_DESCRIPTION *info, int dev_num); + +// Interrupt service routine for all interrupts on the PCIe interrupt line +// PCIe interrupts in Windows XP are level-based. The KMD is responsible for +// masking off the interrupt until this routine can service the request at +// user-mode priority. 
+extern void pcie_interrupt_handler(void *data); +#endif // WINDOWS +#if defined(LINUX) +fpga_handle open_device_linux(ACL_PCIE_DEVICE_DESCRIPTION *info, int dev_num); +#endif // LINUX + +ACL_PCIE_DEVICE::ACL_PCIE_DEVICE(int dev_num, const char *name, int handle, int user_signal_number) + : kernel_interrupt(NULL), + kernel_interrupt_user_data(NULL), + device_interrupt(NULL), + device_interrupt_user_data(NULL), + event_update(NULL), + event_update_user_data(NULL), + m_user_signal_number(0), + m_io(NULL), + m_dma(NULL), + m_hostch(NULL), + m_config(NULL), + m_handle(-1), + m_device(INVALID_HANDLE_VALUE), +#if ACL_USE_DMA == 1 + m_use_dma_for_big_transfers(true), +#else + m_use_dma_for_big_transfers(false), +#endif + m_mmd_irq_handler_enable(false), + m_initialized(false), + m_being_programmed(false), + m_skip_quartus_version_check(false), + m_segment(0) { + if (NULL == name) { + // Throw an error and bail out + throw std::runtime_error("Invalid argument, passed in an empty name pointer when creating device object!"); + } + + int status = 0; + + // Set debug level from the environment variable ACL_PCIE_DEBUG + // Determine if warning messages should be disabled depends on ACL_PCIE_WARNING + if (num_open_devices == 0) { + set_mmd_debug(); + set_mmd_warn_msg(); + } + +#if defined(WINDOWS) + strncpy_s(m_name, MAX_NAME_LENGTH, name, (MAX_NAME_LENGTH - 1)); +#else + strncpy(m_name, name, (MAX_NAME_LENGTH - 1)); +#endif + m_name[(MAX_NAME_LENGTH - 1)] = '\0'; + + m_handle = handle; + m_info.vendor_id = ACL_PCI_INTELFPGA_VENDOR_ID; + m_info.device_id = 0; // search for all device id + m_info.interrupt_valid = false; + m_info.interrupt_data = 0x00; + m_info.interrupt_addr = 0x00; + +#if defined(WINDOWS) + m_device = open_device_windows(&m_info, dev_num); +#endif // WINDOWS +#if defined(LINUX) + m_device = open_device_linux(&m_info, dev_num); +#endif // LINUX + + // Return to caller if this is simply an invalid device. 
+ if (m_device == INVALID_HANDLE_VALUE) { + return; + } + + // Initialize device IO and CONFIG objects + m_io = new ACL_PCIE_MM_IO_MGR(m_device); + + // Initialize the DMA object and enable interrupts on the DMA controller + try { + m_dma = new ACL_PCIE_DMA(m_device, m_io, this); + } + + // Catch any memory allocation failures + catch (std::bad_alloc &) { + throw std::bad_alloc(); + } + + try { + m_config = new ACL_PCIE_CONFIG(m_device, m_io, this, m_dma); + } + + catch (std::bad_alloc &) { + throw std::bad_alloc(); + } + + // Set the segment ID to 0 first forcing cached "segment" to all 1s + m_segment = ~m_segment; + if (this->set_segment(0x0)) { + return; + } + + // performance basic I/O tests + if (this->version_id_test()) { + return; + } + if (this->wait_for_uniphy()) { + return; + } + + // Get PCIE information + unsigned int pcie_gen, pcie_num_lanes; + char pcie_slot_info_str[PCIE_SLOT_INFO_STR_LEN] = {0}; + + status = m_config->query_pcie_info(&pcie_gen, &pcie_num_lanes, pcie_slot_info_str); + ACL_PCIE_ERROR_IF(status, return, "[%s] fail to query PCIe related information.\n", m_name); + snprintf(m_info.pcie_info_str, + PCIE_INFO_STR_LEN, + "dev_id = " DWORD_FMT_4X ", bus:slot.func = %s, Gen%u x%u", + m_info.device_id, + pcie_slot_info_str, + pcie_gen, + pcie_num_lanes); + + m_user_signal_number = user_signal_number; + + // Initialize the Host Channel object + m_hostch = new ACL_PCIE_HOSTCH(m_device, m_io, this, m_dma); + + if (this->enable_interrupts(m_user_signal_number)) { + return; + } + + char *str_test_quartus_ver = getenv("ACL_SKIP_QUARTUS_VERSION_CHECK"); + if (str_test_quartus_ver) m_skip_quartus_version_check = 1; + +#if defined(WINDOWS) + enable_msi(true); +#endif + +#ifdef DLA_MMD + // software reset + uint32_t software_reset_data = 0; // value doesn't matter, any write to software reset will cause it to trigger + constexpr int software_reset_offset = 0x8000; + status = m_io->kernel_if->write_block(software_reset_offset, sizeof(uint32_t), 
&software_reset_data); + ACL_PCIE_ERROR_IF(status, return, "[%s] failed to write block.\n", m_name); + // software reset applies backpressure to the avalon interface while the reset counter is running + // issue a read request, which will not return until the reset counter is done + status = m_io->kernel_if->read_block(software_reset_offset, sizeof(uint32_t), &software_reset_data); + ACL_PCIE_ERROR_IF(status, return, "[%s] failed to read block.\n", m_name); +#endif + // Done! + m_initialized = true; + ACL_PCIE_DEBUG_MSG(":: [%s] successfully initialized (device id: " DWORD_FMT_X ").\n", m_name, m_info.device_id); + ACL_PCIE_DEBUG_MSG(":: Using DMA for big transfers? %s\n", (m_use_dma_for_big_transfers ? "yes" : "no")); +} + +ACL_PCIE_DEVICE::~ACL_PCIE_DEVICE() { +#if defined(WINDOWS) + enable_msi(false); +#endif + + int status = this->disable_interrupts(); + ACL_PCIE_ERROR_IF(status, /* do nothing */, "[%s] fail disable interrupt in device destructor.\n", m_name); + + if (m_hostch) { + delete m_hostch; + m_hostch = NULL; + } + if (m_config) { + delete m_config; + m_config = NULL; + } + if (m_dma) { + delete m_dma; + m_dma = NULL; + } + if (m_io) { + delete m_io; + m_io = NULL; + } + + if (is_valid()) { + --num_open_devices; +#if defined(WINDOWS) + fpga_result result = fpgaClose(m_device); + ACL_PCIE_ERROR_IF(result != FPGA_OK, return, "[%s] failed to close the device handle.\n", m_name); + +#endif // WINDOWS +#if defined(LINUX) + close(m_device); +#endif // LINUX + } +} + +#if defined(WINDOWS) +// Enable/Disable MSI +void ACL_PCIE_DEVICE::enable_msi(bool enable) { + int status; + + if (!m_info.interrupt_valid) { + return; + } + + if (!enable) { + // disable MSI DATA + m_io->pcie_cra->write32(PCIE_CRA_MSI_DATA, 0x00); + } else { + status = m_io->pcie_cra->write32(PCIE_CRA_MSI_ADDR_L, m_info.interrupt_addr & 0xffffffff); + status = m_io->pcie_cra->write32(PCIE_CRA_MSI_ADDR_H, (m_info.interrupt_addr >> 0x20) & 0xffffffff); + MemoryBarrier(); + // enable MSI DATA + 
status = m_io->pcie_cra->write32(PCIE_CRA_MSI_DATA, PCIE_CRA_MSI_ENABLE | m_info.interrupt_data ); + } + MemoryBarrier(); +} + +fpga_handle open_device_windows(ACL_PCIE_DEVICE_DESCRIPTION *info, int dev_num) { + fpga_result result; + fpga_handle device = INVALID_HANDLE_VALUE; + DWORD pci_class_code_rev = 0; + DWORD pci_subsystem_ids = 0; + DWORD pci_link_info = 0; + + // Variables for fpga enumerate + fpga_properties filter = NULL; + UINT32 numMatches; + fpga_token afcToken; + volatile PUINT64 mmioPtr = NULL; + + // Variables for fpga properties + fpga_properties prop = nullptr; + UINT8 bus; + UINT8 l_device; + UINT8 function; + + const UINT8 CAP_PTR_ADDRESS = 0x34; + const UINT8 MSI_CAP_ID = 0x05; + UINT8 nextCapPtr; + UINT8 msiCapPtr; + UINT8 capID; + bool hasFound = false; + UINT8 capArray[2]; + UINT16 msi_control; + UINT16 data16 = 0x00; + UINT32 data32 = 0x00; + UINT64 data64 = 0x00; + + // Initialize filter structure + result = fpgaGetProperties(NULL, &filter); + if (result != FPGA_OK) { + device = INVALID_HANDLE_VALUE; + ACL_PCIE_ERROR_IF(1, goto End, "failed to get properties.\n"); + } + + // Set object type in filter structure + result = fpgaPropertiesSetObjectType(filter, FPGA_DEVICE); + if (result != FPGA_OK) { + device = INVALID_HANDLE_VALUE; + ACL_PCIE_ERROR_IF(1, goto DestroyProp, "failed to set object type.\n"); + } + + // Set vendor ID in the filter structure + result = fpgaPropertiesSetVendorID(filter, (uint16_t)info->vendor_id); + if (result != FPGA_OK) { + device = INVALID_HANDLE_VALUE; + ACL_PCIE_ERROR_IF(1, goto DestroyProp, "failed to set vendor ID.\n"); + } + + // Enumerate all PCI devices and find devices matching the filters + result = fpgaEnumerate(&filter, 1, &afcToken, 1, &numMatches); + if (result != FPGA_OK) { + device = INVALID_HANDLE_VALUE; + ACL_PCIE_ERROR_IF(1, goto DestroyProp, "failed to scan for the PCI device.\n"); + } + + if (numMatches < 1) { + ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME "%d] Device not found\n", 
dev_num); + device = INVALID_HANDLE_VALUE; + goto DestroyTok; + } + + // Open the device handle + result = fpgaOpen(afcToken, &device, 0); + if (result != FPGA_OK) { + device = INVALID_HANDLE_VALUE; + ACL_PCIE_ERROR_IF(1, goto DestroyTok, "[acl" ACL_BOARD_PKG_NAME "%d] failed to open the device.\n", dev_num); + } + + // Map MMIO number 0 + result = fpgaMapMMIO(device, 0, (PUINT64 *)&mmioPtr); + if (result != FPGA_OK) { + ACL_PCIE_ERROR_IF(1, goto Close, "[acl" ACL_BOARD_PKG_NAME "%d] failed to map MMIO.\n", dev_num); + } + + // Read SubSystem IDs out of PCI config space + result = fpgaReadPciConfigSpace(device, 0x2C, (PVOID)&pci_subsystem_ids, sizeof(pci_subsystem_ids)); + ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME "%d] PCI SubSystem IDs found: 0x%lx\n", dev_num, pci_subsystem_ids); + if ((ACL_PCIE_READ_BIT_RANGE(pci_subsystem_ids, 31, 16) != ACL_PCI_SUBSYSTEM_DEVICE_ID) || + (ACL_PCIE_READ_BIT_RANGE(pci_subsystem_ids, 15, 0) != ACL_PCI_SUBSYSTEM_VENDOR_ID)) { + ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME + "%d] PCI SubSystem IDs do not match, found %08lx but expected %04x%04x\n", + dev_num, + pci_subsystem_ids, + ACL_PCI_SUBSYSTEM_DEVICE_ID, + ACL_PCI_SUBSYSTEM_VENDOR_ID); + goto Close; + } + // Save device id + info->device_id = ACL_PCI_SUBSYSTEM_DEVICE_ID; + + // Read Class code out of PCI config space + result = fpgaReadPciConfigSpace(device, 8, (PVOID)&pci_class_code_rev, sizeof(pci_class_code_rev)); + ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME "%d] PCI Class Code and Rev is: %lx\n", dev_num, pci_class_code_rev); + if (((pci_class_code_rev & (0xff00ff00)) >> 8) != ACL_PCI_CLASSCODE) { + ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME "%d] PCI Class Code does not match, expected %x, read %ld\n", + dev_num, + ACL_PCI_CLASSCODE, + (pci_class_code_rev & 0xff00ff00) >> 8); + goto Close; + } + + // Check PCI Revision + if ((pci_class_code_rev & 0x0ff) != ACL_PCI_REVISION) { + ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME "%d] PCI Revision does not 
match\n", dev_num); + goto Close; + } + + // Read MSI data and address + info->interrupt_valid = false; + result = fpgaReadPciConfigSpace(device, CAP_PTR_ADDRESS, (PVOID)&nextCapPtr, sizeof(nextCapPtr)); + while (!hasFound && nextCapPtr > CAP_PTR_ADDRESS && FPGA_OK == result) { + result = fpgaReadPciConfigSpace(device, nextCapPtr, (PVOID)&capArray, sizeof(capArray)); + if (FPGA_OK == result) { + capID = capArray[0]; + if (capID == MSI_CAP_ID) { + hasFound = true; + info->interrupt_valid = true; + info->interrupt_addr = 0x00; + info->interrupt_data = 0x00; + msiCapPtr = nextCapPtr; + result = fpgaReadPciConfigSpace(device, msiCapPtr + 0x02, (PVOID)&msi_control, sizeof(msi_control)); + if (FPGA_OK == result) { + ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME "%d] %d-bit address, %d-bit data\n", + dev_num, + (msi_control & 0x0080) ? 64 : 32, + (msi_control & 0x0200) ? 32 : 16); + if (msi_control & 0x0080) { // 64-bit address + result = fpgaReadPciConfigSpace(device, msiCapPtr + 0x04, (PVOID)&data64, sizeof(data64)); + if (FPGA_OK == result) { + info->interrupt_addr = data64; + if (msi_control & 0x0200) { // Extended message enable + result = fpgaReadPciConfigSpace(device, msiCapPtr + 0x0C, (PVOID)&data32, sizeof(data32)); + if (FPGA_OK == result) { + info->interrupt_data = data32; + } + } else { + result = fpgaReadPciConfigSpace(device, msiCapPtr + 0x0C, (PVOID)&data16, sizeof(data16)); + if (FPGA_OK == result) { + info->interrupt_data = data16; + } + } + } + } else { // 32-bit address + result = fpgaReadPciConfigSpace(device, msiCapPtr + 0x04, (PVOID)&data32, sizeof(data32)); + if (FPGA_OK == result) { + info->interrupt_addr = data32; + if (msi_control & 0x0200) { // Extended message enable + result = fpgaReadPciConfigSpace(device, msiCapPtr + 0x08, (PVOID)&data32, sizeof(data32)); + if (FPGA_OK == result) { + info->interrupt_data = data32; + } + } else { + result = fpgaReadPciConfigSpace(device, msiCapPtr + 0x08, (PVOID)&data16, sizeof(data16)); + if (FPGA_OK == 
result) { + info->interrupt_data = data16; + } + } + } + } + } + ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME + "%d] MSI Control = 0x%04x, MSI Address = 0x%llx, MSI Data = 0x%x\n", + dev_num, + msi_control, + info->interrupt_addr, + info->interrupt_data); + } else { + nextCapPtr = capArray[1]; + } + } + } + + if (result != FPGA_OK || !info->interrupt_valid) + { + ACL_PCIE_ERROR_IF(1, goto Close, "[acl" ACL_BOARD_PKG_NAME "%d] failed to read MSI interrupt address/data.\n", dev_num); + } + + result = fpgaGetProperties(afcToken, &prop); + if (prop) { + result = fpgaPropertiesGetBus(prop, &bus); + if (result != FPGA_OK) { + ACL_PCIE_ERROR_IF(1, goto Close, "failed to get bus.\n"); + } + ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME "%d] bus is: %d\n", dev_num, bus); + result = fpgaPropertiesGetDevice(prop, &l_device); + if (result != FPGA_OK) { + ACL_PCIE_ERROR_IF(1, goto Close, "failed to get device.\n"); + } + ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME "%d] device is: %d\n", dev_num, l_device); + result = fpgaPropertiesGetFunction(prop, &function); + if (result != FPGA_OK) { + ACL_PCIE_ERROR_IF(1, goto Close, "failed to get function.\n"); + } + ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME "%d] function is: %d\n", dev_num, function); + snprintf(info->pcie_slot_info_str, + PCIE_SLOT_INFO_STR_LEN, + "%u:%u.%u", + bus, l_device, function); + fpgaDestroyProperties(&prop); + } + // Read Link status out of PCI config space + result = fpgaReadPciConfigSpace(device, 0x80, (PVOID)&pci_link_info, sizeof(pci_link_info)); + ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME "%d] PCI Link Status is: 0x%lx\n", dev_num, pci_link_info); + ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME "%d] PCI Link Speed is: %d\n", + dev_num, + ((pci_link_info >> 16) & 0x0F)); + ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME "%d] PCI Negotiated Link Width is: %d\n", + dev_num, + ((pci_link_info >> 20) & 0x3F)); + + // Read Maximum Payload Size out of PCI config space + result = 
fpgaReadPciConfigSpace(device, 0x78, (PVOID)&pci_link_info, sizeof(pci_link_info)); + ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME "%d] PCI Maximum Payload Size raw data is: 0x%lx\n", dev_num, pci_link_info); + ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME "%d] PCI Maximum Payload Size is: %d\n", dev_num, ((pci_link_info >> 5) & 0x0007)); + switch ((pci_link_info >> 5) & 0x0007) { + case 0: + ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME "%d] PCI Maximum Payload Size is 128-byte\n", dev_num); + break; + case 1: + ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME "%d] PCI Maximum Payload Size is 256-byte\n", dev_num); + break; + case 2: + ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME "%d] PCI Maximum Payload Size is 512-byte\n", dev_num); + break; + case 3: + ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME "%d] PCI Maximum Payload Size is 1024-byte\n", dev_num); + break; + case 4: + ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME "%d] PCI Maximum Payload Size is 2048-byte\n", dev_num); + break; + default: + ACL_PCIE_DEBUG_MSG(":: [acl" ACL_BOARD_PKG_NAME "%d] PCI Maximum Payload Size is Unknown\n", dev_num); + break; + } + + ++num_open_devices; + goto DestroyTok; + + // Resource cleanup + +Close: + fpgaClose(device); + device = INVALID_HANDLE_VALUE; + +DestroyTok: + + if (afcToken != NULL) fpgaDestroyToken(&afcToken); + +DestroyProp: + + if (filter != NULL) fpgaDestroyProperties(&filter); + +End: + return device; +} +#endif // WINDOWS + +#if defined(LINUX) +fpga_handle open_device_linux(ACL_PCIE_DEVICE_DESCRIPTION *info, int dev_num) { + char buf[128] = {0}; + char expected_ver_string[128] = {0}; + int descriptor; + int oldflags; + int bytes_read; + struct acl_cmd driver_cmd; + + snprintf(buf, sizeof(buf), "/dev/acl" ACL_BOARD_PKG_NAME "%d", dev_num); + ssize_t device = open(buf, O_RDWR); + + // Return INVALID_DEVICE when the device is not available + if (device == -1) { + goto Close; + } + + // Make sure the Linux kernel driver is recent + driver_cmd = 
{ACLPCI_CMD_BAR, ACLPCI_CMD_GET_DRIVER_VERSION, NULL, buf, 0}; + bytes_read = read(device, &driver_cmd, 0); + ACL_PCIE_ERROR_IF(bytes_read == -1, goto Close, "Failed to read driver command"); + + snprintf( + expected_ver_string, sizeof(expected_ver_string), "%s.%s", ACL_BOARD_PKG_NAME, KERNEL_DRIVER_VERSION_EXPECTED); + ACL_PCIE_ERROR_IF(strstr(buf, expected_ver_string) != buf, + goto Close, + "Kernel driver mismatch: The board kernel driver version is %s, but\nthis host program expects " + "%s.\n Please reinstall the driver using aocl install.\n", + buf, + expected_ver_string); + + // Save the device id for the selected board + driver_cmd.bar_id = ACLPCI_CMD_BAR; + driver_cmd.command = ACLPCI_CMD_GET_PCI_DEV_ID; + driver_cmd.device_addr = NULL; + driver_cmd.user_addr = &info->device_id; + driver_cmd.size = sizeof(info->device_id); + bytes_read = read(device, &driver_cmd, sizeof(driver_cmd)); + ACL_PCIE_ERROR_IF(bytes_read == -1, goto Close, "Failed to read driver command"); + + // Set the FD_CLOEXEC flag for the file handle to disable the child to + // inherit this file handle. So the jtagd will not hold the file handle + // of the device and keep sending bogus interrupts after we call quartus_pgm. 
+ oldflags = fcntl(device, F_GETFD, 0); + descriptor = fcntl(device, F_SETFD, oldflags | FD_CLOEXEC); + if (descriptor < 0) { + goto Close; + } + + ++num_open_devices; + goto End; + +// I really don't want to use goto but it's for consistency with windows version, and convenience with macros +Close: + if (device >= 0) { + close(device); + } + device = INVALID_HANDLE_VALUE; + +End: + return device; +} + +#endif // LINUX + +// This function can be used for triggering a fake device exception for testing +void ACL_PCIE_DEVICE::test_trigger_device_interrupt() { + // Example: + // Raising ECC NON CORRECTABLE exception (exception code 2) + // Providing integer-type private_info (say, equals to 5) + unsigned long long int exception_type = 2; + int test_private_info = 5; + aocl_mmd_interrupt_info interrupt_data = {exception_type, &test_private_info, sizeof(test_private_info)}; + this->device_interrupt(m_handle, &interrupt_data, this->device_interrupt_user_data); +} + +// Perform operations required when an interrupt is received for this device +void ACL_PCIE_DEVICE::service_interrupt(unsigned int irq_type_flag) { + unsigned int kernel_update = 0; + unsigned int dma_update = 0; + + int status = this->get_interrupt_type(&kernel_update, &dma_update, irq_type_flag); + ACL_PCIE_ERROR_IF(status, return, "[%s] fail to service the interrupt.\n", m_name); + + ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_IRQ, + ":: [%s] Irq service routine called, kernel_update=%d, dma_update=%d \n", + m_name, + kernel_update, + dma_update); + + if (kernel_update && kernel_interrupt != NULL) { +#if defined(WINDOWS) + status = this->mask_irqs(); + ACL_PCIE_ERROR_IF(status, return, "[%s] failed to mask kernel interrupt.\n", m_name); +#endif + // A kernel-status interrupt - update the status of running kernels + ACL_PCIE_ASSERT(kernel_interrupt, "[%s] received kernel interrupt before the handler is installed.\n", m_name); + kernel_interrupt(m_handle, kernel_interrupt_user_data); + } else if (dma_update) { + // 
A DMA-status interrupt - let the DMA object handle this + m_dma->service_interrupt(); + } + + // Unmask the kernel_irq to enable the interrupt again. + if (m_mmd_irq_handler_enable) { + status = this->unmask_irqs(); + } else if (kernel_update) { + status = this->unmask_kernel_irq(); + } + ACL_PCIE_ERROR_IF(status, return, "[%s] fail to service the interrupt.\n", m_name); + + return; +} + +// Enable all interrupts (DMA and Kernel) +// Won't enable kernel irq unless kernel interrupt callback has been initialized +// Return 0 on success +int ACL_PCIE_DEVICE::unmask_irqs() { + int status = 0; + if (kernel_interrupt == NULL) { + // No masking for DMA interrupt. + + } else { + status = m_io->pcie_cra->write32(PCIE_CRA_IRQ_ENABLE, ACL_PCIE_GET_BIT(ACL_PCIE_KERNEL_IRQ_VEC)); + } + ACL_PCIE_ERROR_IF(status, return -1, "[%s] fail to unmask all interrupts.\n", m_name); + + return 0; // success +} + +// Disable all interrupts to service kernel that triggered interrupt +// If other kernels finish while the interrupt is masked, MSI will trigger again when +// interrupts are re-enabled. 
+int ACL_PCIE_DEVICE::mask_irqs() { + int status = 0; + UINT32 val = 0; + status = m_io->pcie_cra->write32(PCIE_CRA_IRQ_ENABLE, val); + ACL_PCIE_ERROR_IF(status, return -1, "[%s] fail to mask the kernel interrupts.\n", m_name); + + return 0; // success +} + +// Enable the kernel interrupt only +// Return 0 on success +int ACL_PCIE_DEVICE::unmask_kernel_irq() { + int status = 0; + UINT32 val = 0; + + status |= (int)(m_io->pcie_cra->read32(PCIE_CRA_IRQ_ENABLE, &val)); + val |= ACL_PCIE_GET_BIT(ACL_PCIE_KERNEL_IRQ_VEC); + status |= (int)(m_io->pcie_cra->write32(PCIE_CRA_IRQ_ENABLE, val)); + + ACL_PCIE_ERROR_IF(status, return -1, "[%s] fail to unmask the kernel interrupts.\n", m_name); + + return 0; // success +} + +// Disable the interrupt +// Return 0 on success +int ACL_PCIE_DEVICE::disable_interrupts() { + int status; + + if (m_mmd_irq_handler_enable) { + ACL_PCIE_DEBUG_MSG(":: [%s] Disabling interrupts.\n", m_name); + + status = m_io->pcie_cra->write32(PCIE_CRA_IRQ_ENABLE, 0); + ACL_PCIE_ERROR_IF(status, return -1, "[%s] failed to disable pcie interrupt.\n", m_name); + +#if defined(WINDOWS) + // Disable KMD interrupt handling for Windows + fpga_properties prop = {0}; + fpga_result result = FPGA_OK; + uint32_t num_interrupts = 0; + uint32_t i = 0; + + // Get number of interrupts in the device from the properties structure + result = fpgaGetPropertiesFromHandle(m_device, &prop); + ACL_PCIE_ERROR_IF(result != FPGA_OK, return -1, "[%s] fpgaGetPropertiesFromHandle Failed\n", m_name); + + result = fpgaPropertiesGetNumInterrupts(prop, &num_interrupts); + if (result != FPGA_OK) { + fpgaDestroyProperties(&prop); + ACL_PCIE_ERROR_IF(1, return -1, "[%s] fpgaPropertiesGetNumInterrupts Failed\n", m_name); + } + + if (dev_event_handle != NULL) { + // Loop through all the interrupts and unregister the event and + // destroy event handle associated with the interrupt + for (i = 0; i < num_interrupts; i++) { + result = fpgaUnregisterEvent(m_device, FPGA_EVENT_INTERRUPT, 
dev_event_handle[i]); + + if (result != FPGA_OK) { + fpgaDestroyProperties(&prop); + ACL_PCIE_ERROR_IF(1, return -1, "[%s] fpgaRegisterEvent Failed\n", m_name); + } + + result = fpgaDestroyEventHandle(&dev_event_handle[i]); + if (result != FPGA_OK) { + fpgaDestroyProperties(&prop); + ACL_PCIE_ERROR_IF(1, return -1, "[%s] fpgaCreateEventHandle Failed\n", m_name); + } + } + free(dev_event_handle); + dev_event_handle = NULL; + } + fpgaDestroyProperties(&prop); +#endif // WINDOWS + m_mmd_irq_handler_enable = false; + } + + return 0; // success +} + +#if defined(WINDOWS) + +// Enable PCI express interrupts. Set up the KMD to mask the interrupt enable bit when +// an interrupt is received to prevent the level-sensitive interrupt from immediately +// firing again. +// Return 0 on success +int ACL_PCIE_DEVICE::enable_interrupts(int user_signal_number) { + int status; + fpga_properties prop = NULL; + fpga_result result = FPGA_OK; + uint32_t num_interrupts = 0; + uint32_t i = 0; + HANDLE deviceStopWaitObj = NULL; + BOOLEAN flag; + int ret_value = 0; // return 0 on success + + ACL_PCIE_DEBUG_MSG(":: [%s] Enabling PCIe interrupts.\n", m_name); + + // Mask off hardware interrupts before enabling them + status = m_io->pcie_cra->write32(PCIE_CRA_IRQ_ENABLE, 0); + ACL_PCIE_ERROR_IF(status, return -1, "[%s] failed to mask off all interrupts before enabling them.\n", m_name); + + // Enable interrupts in the KMD + + // Get number of interrupts in the device from the properties structure + result = fpgaGetPropertiesFromHandle(m_device, &prop); + ACL_PCIE_ERROR_IF(result != FPGA_OK, return -1, "[%s] fpgaGetPropertiesFromHandle Failed\n", m_name); + + result = fpgaPropertiesGetNumInterrupts(prop, &num_interrupts); + if (result != FPGA_OK) { + ret_value = -1; + ACL_PCIE_ERROR_IF(1, goto End, "[%s] fpgaPropertiesGetNumInterrupts Failed\n", m_name); + } + + dev_event_handle = NULL; + dev_event_handle = (fpga_event_handle *)malloc(sizeof(fpga_event_handle) * num_interrupts); + if 
(dev_event_handle == NULL) { + ret_value = -1; + ACL_PCIE_ERROR_IF(1, goto End, "[%s] malloc for event handle array Failed\n", m_name); + } + + // Loop through all the interrupts and register an event and + // create event handle associated with the interrupt + + for (i = 0; i < num_interrupts; i++) { + result = fpgaCreateEventHandle(&dev_event_handle[i]); + if (result != FPGA_OK) { + ret_value = -1; + ACL_PCIE_ERROR_IF(1, goto End, "[%s] fpgaCreateEventHandle Failed\n", m_name); + } + + result = fpgaRegisterEvent(m_device, FPGA_EVENT_INTERRUPT, dev_event_handle[i], i); + if (result != FPGA_OK) { + ret_value = -1; + ACL_PCIE_ERROR_IF(1, goto End, "[%s] fpgaRegisterEvent Failed\n", m_name); + } + + // Register the user-mode interrupt handler + // Executed after interrupt is recieved and processed in kernel + flag = (BOOLEAN)RegisterWaitForSingleObject(&deviceStopWaitObj, + dev_event_handle[i], + (WAITORTIMERCALLBACK)pcie_interrupt_handler, + static_cast<void *>(this), + INFINITE, + WT_EXECUTEINWAITTHREAD); + + if (flag == 0) { + ret_value = -1; + ACL_PCIE_ERROR_IF(1, goto End, "[%s] fpgaRegisterEvent Failed\n", m_name); + } + } + status = this->unmask_irqs(); + if (status) { + ret_value = -1; + ACL_PCIE_ERROR_IF(1, goto End, "[%s] failed to enable interrupts.\n", m_name); + } + + m_mmd_irq_handler_enable = true; + + // Resource cleanup +End: + fpgaDestroyProperties(&prop); + return ret_value; +} + +// Use irq status to determine type of interrupt +// Result is returned in kernel_update/dma_update arguments. 
+// Return 0 on success +int ACL_PCIE_DEVICE::get_interrupt_type(unsigned int *kernel_update, + unsigned int *dma_update, + unsigned int irq_type_flag) { + UINT32 irq_status; + unsigned int dma_status; + int status; + + status = m_io->pcie_cra->read32(PCIE_CRA_IRQ_STATUS, &irq_status); + ACL_PCIE_ERROR_IF(status, return -1, "[%s] fail to interrupt type.\n", m_name); + + *kernel_update = ACL_PCIE_READ_BIT(irq_status, ACL_PCIE_KERNEL_IRQ_VEC); + + status = m_dma->check_dma_interrupt(&dma_status); + if (status != 1) { + *dma_update = dma_status; + } + + return 0; // success +} + +#endif // WINDOWS +#if defined(LINUX) + +// For Linux, it will set-up a signal handler for signals for kernel driver +// Return 0 on success +int ACL_PCIE_DEVICE::enable_interrupts(int user_signal_number) { + int status; + ACL_PCIE_DEBUG_MSG(":: [%s] Enabling PCIe interrupts on Linux (via signals).\n", m_name); + + // All interrupt controls are in the kernel driver. + m_mmd_irq_handler_enable = false; + + // Send the globally allocated signal number to the driver + struct acl_cmd signal_number_cmd {}; + signal_number_cmd.bar_id = ACLPCI_CMD_BAR; + signal_number_cmd.command = ACLPCI_CMD_SET_SIGNAL_NUMBER; + signal_number_cmd.device_addr = NULL; + signal_number_cmd.user_addr = &user_signal_number; + signal_number_cmd.size = sizeof(user_signal_number); + status = write(m_device, &signal_number_cmd, sizeof(signal_number_cmd)); + ACL_PCIE_ERROR_IF(status, return -1, "[%s] failed to set signal number for interrupts.\n", m_name); + + // Sanity check, did the driver get it + int readback_signal_number; + signal_number_cmd.user_addr = &readback_signal_number; + signal_number_cmd.command = ACLPCI_CMD_GET_SIGNAL_NUMBER; + signal_number_cmd.size = sizeof(readback_signal_number); + status = read(m_device, &signal_number_cmd, sizeof(signal_number_cmd)); + ACL_PCIE_ERROR_IF(status, return -1, "[%s] failed to get signal number for interrupts.\n", m_name); + ACL_PCIE_ERROR_IF(readback_signal_number != 
user_signal_number, + return -1, + "[%s] got wrong signal number %d, expected %d\n", + m_name, + readback_signal_number, + user_signal_number); + + // Set "our" device id (the handle id received from acl_pcie.cpp) to correspond to + // the device managed by the driver. Will get back this id + // with signal from the driver. Will allow us to differentiate + // the source of kernel-done signals with multiple boards. + + // the last bit is reserved as a flag for DMA completion + int result = m_handle << 1; + struct acl_cmd read_cmd = {ACLPCI_CMD_BAR, ACLPCI_CMD_SET_SIGNAL_PAYLOAD, NULL, &result}; + status = write(m_device, &read_cmd, sizeof(result)); + ACL_PCIE_ERROR_IF(status, return -1, "[%s] failed to enable interrupts.\n", m_name); + + return 0; // success +} + +// Determine the interrupt type using the irq_type_flag +// Return 0 on success +int ACL_PCIE_DEVICE::get_interrupt_type(unsigned int *kernel_update, + unsigned int *dma_update, + unsigned int irq_type_flag) { + // For Linux, the interrupt type is mutually exclusive + *kernel_update = irq_type_flag ? 
0 : 1; + *dma_update = 1 - *kernel_update; + + return 0; // success +} + +#endif // LINUX + +// Called by the host program when there are spare cycles +int ACL_PCIE_DEVICE::yield() { + // Give the DMA object a chance to crunch any pending data + return m_dma->yield(); +} + +// Set kernel interrupt and event update callbacks +// return 0 on success +int ACL_PCIE_DEVICE::set_kernel_interrupt(aocl_mmd_interrupt_handler_fn fn, void *user_data) { + int status; + + kernel_interrupt = fn; + kernel_interrupt_user_data = user_data; + + if (m_device != INVALID_HANDLE_VALUE) { + status = this->unmask_kernel_irq(); + ACL_PCIE_ERROR_IF(status, return -1, "[%s] failed to set kernel interrupt callback funciton.\n", m_name); + } + + return 0; // success +} + +int ACL_PCIE_DEVICE::set_device_interrupt(aocl_mmd_device_interrupt_handler_fn fn, void *user_data) { + int status; + + device_interrupt = fn; + device_interrupt_user_data = user_data; + + if (m_device != INVALID_HANDLE_VALUE) { + status = this->unmask_kernel_irq(); + ACL_PCIE_ERROR_IF(status, return -1, "[%s] failed to set device interrupt callback funciton.\n", m_name); + } + + return 0; // success +} + +int ACL_PCIE_DEVICE::set_status_handler(aocl_mmd_status_handler_fn fn, void *user_data) { + event_update = fn; + event_update_user_data = user_data; + + return 0; // success +} + +// The callback function set by "set_status_handler" +// It's used to notify/update the host whenever an event is finished +void ACL_PCIE_DEVICE::event_update_fn(aocl_mmd_op_t op, int status) { + ACL_PCIE_ASSERT(event_update, "[%s] event_update is called with a empty update function pointer.\n", m_name); + + ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_OP, ":: [%s] Update for event e=%p.\n", m_name, op); + event_update(m_handle, event_update_user_data, op, status); +} + +// Forward get buffer call to host channel +void *ACL_PCIE_DEVICE::hostchannel_get_buffer(size_t *buffer_size, int channel, int *status) { + return m_hostch->get_buffer(buffer_size, 
channel, status); +} +// Forward ack call to host channel +size_t ACL_PCIE_DEVICE::hostchannel_ack_buffer(size_t send_size, int channel, int *status) { + return m_hostch->ack_buffer(send_size, channel, status); +} + +// Memory I/O +// return 0 on success +int ACL_PCIE_DEVICE::write_block( + aocl_mmd_op_t e, aocl_mmd_interface_t mmd_interface, void *host_addr, size_t dev_addr, size_t size) { +#ifdef DLA_MMD + ACL_PCIE_ASSERT(e == nullptr, "DLA_MMD does not support callback events in ACL_PCIE_DEVICE::write_block"); +#else + ACL_PCIE_ASSERT(event_update, "[%s] event_update callback function is not provided.\n", m_name); +#endif + int status = -1; // assume failure + + switch (mmd_interface) { + case AOCL_MMD_KERNEL: + status = m_io->kernel_if->write_block(dev_addr, size, host_addr); + break; + case AOCL_MMD_MEMORY: + status = read_write_block(e, host_addr, dev_addr, size, false /*writing*/); + break; + case AOCL_MMD_PLL: + status = m_io->pll->write_block(dev_addr, size, host_addr); + break; + case AOCL_MMD_HOSTCH: + default: + ACL_PCIE_ASSERT(0, "[%s] unknown MMD interface.\n", m_name); + } + + ACL_PCIE_ERROR_IF(status, return -1, "[%s] failed to write block.\n", m_name); + + return 0; // success +} + +int ACL_PCIE_DEVICE::read_block( + aocl_mmd_op_t e, aocl_mmd_interface_t mmd_interface, void *host_addr, size_t dev_addr, size_t size) { +#ifdef DLA_MMD + ACL_PCIE_ASSERT(e == nullptr, "DLA_MMD does not support callback events in ACL_PCIE_DEVICE::read_block"); +#else + ACL_PCIE_ASSERT(event_update, "[%s] event_update callback function is not provided.\n", m_name); +#endif + int status = -1; // assume failure + + switch (mmd_interface) { + case AOCL_MMD_KERNEL: + status = m_io->kernel_if->read_block(dev_addr, size, host_addr); + break; + case AOCL_MMD_MEMORY: + status = read_write_block(e, host_addr, dev_addr, size, true /*reading*/); + break; + case AOCL_MMD_PLL: + status = m_io->pll->read_block(dev_addr, size, host_addr); + break; + case AOCL_MMD_HOSTCH: + default: + 
ACL_PCIE_ASSERT(0, "[%s] unknown MMD interface.\n", m_name); + } + + ACL_PCIE_ERROR_IF(status, return -1, "[%s] failed to read block.\n", m_name); + + return 0; // success +} + +// Copy a block between two locations in device memory +// return 0 on success +int ACL_PCIE_DEVICE::copy_block( + aocl_mmd_op_t e, aocl_mmd_interface_t mmd_interface, size_t src, size_t dst, size_t size) { + ACL_PCIE_ASSERT(event_update, "[%s] event_update callback function is not provided.\n", m_name); + ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_OP, + ":: [%s] Copying " SIZE_FMT_U " bytes data from 0x" SIZE_FMT_X " (device) to 0x" SIZE_FMT_X + " (device), with e=%p\n", + m_name, + size, + src, + dst, + e); + +#define BLOCK_SIZE (8 * 1024 * 1024) +#if defined(WINDOWS) + __declspec(align(128)) static unsigned char data[BLOCK_SIZE]; +#endif // WINDOWS +#if defined(LINUX) + static unsigned char data[BLOCK_SIZE] __attribute__((aligned(128))); +#endif // LINUX + + do { + size_t transfer_size = (size > BLOCK_SIZE) ? BLOCK_SIZE : size; + read_block(NULL /* blocking read */, mmd_interface, data, src, transfer_size); + write_block(NULL /* blocking write */, mmd_interface, data, dst, transfer_size); + + src += transfer_size; + dst += transfer_size; + size -= transfer_size; + } while (size > 0); + + if (e) { + this->event_update_fn(e, 0); + } + + return 0; // success +} + +// Forward create hostchannel call to host channel +int ACL_PCIE_DEVICE::create_hostchannel(char *name, size_t queue_depth, int direction) { + return m_hostch->create_hostchannel(name, queue_depth, direction); +} + +// Forward destroy hostchannel call to host channel +int ACL_PCIE_DEVICE::destroy_channel(int channel) { return m_hostch->destroy_hostchannel(channel); } + +// Read or Write a block of data to device memory. 
// Use either DMA or directly read/write through BAR
// Return 0 on success
int ACL_PCIE_DEVICE::read_write_block(aocl_mmd_op_t e, void *host_addr, size_t dev_addr, size_t size, bool reading) {
  const uintptr_t uintptr_host = reinterpret_cast<uintptr_t>(host_addr);

  int status = 0;
  size_t dma_size = 0;

#ifdef DLA_MMD
  // CoreDLA runtime assumes host/device transfers are thread safe, enforce that here
  // mutex will unlock when its lock goes out of scope
  std::unique_lock<std::mutex> dma_mutex_lock(m_dma_mutex);
#endif

  if (reading) {
    ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_OP,
                               ":: [%s] Reading " SIZE_FMT_U " bytes data from 0x" SIZE_FMT_X
                               " (device) to %p (host), with e=%p\n",
                               m_name,
                               size,
                               dev_addr,
                               host_addr,
                               e);
  } else {
    ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_OP,
                               ":: [%s] Writing " SIZE_FMT_U " bytes data from %p (host) to 0x" SIZE_FMT_X
                               " (device), with e=%p\n",
                               m_name,
                               size,
                               host_addr,
                               dev_addr,
                               e);
  }

  // Return immediately if size is zero
  if (size == 0) {
    if (e) {
      this->event_update_fn(e, 0);
    }
    return 0;
  }

  // DMA is used only when both endpoints are alignment-compatible and the
  // transfer is large enough (>= 1024 bytes) to be worth the setup cost.
  bool aligned = ((uintptr_host & DMA_ALIGNMENT_BYTE_MASK) | (dev_addr & DMA_ALIGNMENT_BYTE_MASK)) == 0;
  if (m_use_dma_for_big_transfers && aligned && (size >= 1024)) {
    // DMA transfers must END at aligned boundary.
    // If that's not the case, use DMA up to such boundary, and regular
    // read/write for the remaining part.
    dma_size = size - (size & DMA_ALIGNMENT_BYTE_MASK);
  } else if (m_use_dma_for_big_transfers && (size >= 1024)) {
    ACL_PCIE_WARN_MSG("[%s] NOT using DMA to transfer " SIZE_FMT_U
                      " bytes from %s to %s because of lack of alignment\n"
                      "**                 host ptr (%p) and/or dev offset (0x" SIZE_FMT_X
                      ") is not aligned to %u bytes\n",
                      m_name,
                      size,
                      (reading ? "device" : "host"),
                      (reading ? "host" : "device"),
                      host_addr,
                      dev_addr,
                      DMA_ALIGNMENT_BYTES);
  }

  // Perform read/write through BAR if the data is not fit for DMA or if there is remaining part from DMA
  if (dma_size < size) {
    void *host_addr_new = reinterpret_cast<void *>(uintptr_host + dma_size);
    size_t dev_addr_new = dev_addr + dma_size;
    size_t remain_size = size - dma_size;

    ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_OP,
                               ":: [%s] Perform read/write through BAR for remaining " SIZE_FMT_U
                               " bytes (out of " SIZE_FMT_U " bytes)\n",
                               m_name,
                               remain_size,
                               size);

    status = read_write_block_bar(host_addr_new, dev_addr_new, remain_size, reading);
    ACL_PCIE_ERROR_IF(status, return -1, "[%s] failed to perform read/write through BAR.\n", m_name);
  }

  if (dma_size != 0) {
    // Hand the aligned prefix to the DMA engine; completion is reported
    // through the event e (or by blocking below when e is NULL).
    m_dma->read_write(host_addr, dev_addr, dma_size, e, reading);

    // Block if event is NULL
    if (e == NULL) {
      m_dma->stall_until_idle();
    }
  } else {
    if (e != NULL) {
      this->event_update_fn(e, 0);
    }
  }

  return 0;  // success
}

// Read or Write a block of data to device memory through BAR
// Walks the transfer one memory-window segment at a time, re-pointing the
// window with set_segment() as the device address advances.
// Return 0 on success
int ACL_PCIE_DEVICE::read_write_block_bar(void *host_addr, size_t dev_addr, size_t size, bool reading) {
  void *cur_host_addr = host_addr;
  size_t cur_dev_addr = dev_addr;
  size_t bytes_transfered = 0;

  for (bytes_transfered = 0; bytes_transfered < size;) {
    // decide the size to transfer for current iteration
    size_t cur_size = ACL_PCIE_MEMWINDOW_SIZE - (cur_dev_addr % ACL_PCIE_MEMWINDOW_SIZE);
    if (bytes_transfered + cur_size >= size) {
      cur_size = size - bytes_transfered;
    }

    // set the proper window segment
    set_segment(cur_dev_addr);
    size_t window_rel_ptr_start = cur_dev_addr % ACL_PCIE_MEMWINDOW_SIZE;
    size_t window_rel_ptr = window_rel_ptr_start;

    // A simple blocking read
    // The address should be in the global memory range, we assume
    // any offsets are already accounted for in the offset
    ACL_PCIE_ASSERT(window_rel_ptr + cur_size <= ACL_PCIE_MEMWINDOW_SIZE,
                    "[%s] trying to access out of the range of the memory window.\n",
                    m_name);

    // Workaround a bug in Jungo driver.
    // First, transfer the non 8 bytes data at the front, one byte at a time
    // Then, transfer multiple of 8 bytes (size of size_t) using read/write_block
    // At the end, transfer the remaining bytes, one byte at a time
    size_t dev_odd_start = std::min(sizeof(size_t) - window_rel_ptr % sizeof(size_t), cur_size);
    if (dev_odd_start != sizeof(size_t)) {
      read_write_small_size(cur_host_addr, window_rel_ptr, dev_odd_start, reading);
      incr_ptrs(&cur_host_addr, &window_rel_ptr, &bytes_transfered, dev_odd_start);
      cur_size -= dev_odd_start;
    }

    size_t tail_size = cur_size % sizeof(size_t);
    size_t size_mul_8 = cur_size - tail_size;

    if (size_mul_8 != 0) {
      if (reading) {
        m_io->mem->read_block(window_rel_ptr, size_mul_8, cur_host_addr);
      } else {
        m_io->mem->write_block(window_rel_ptr, size_mul_8, cur_host_addr);
      }
      incr_ptrs(&cur_host_addr, &window_rel_ptr, &bytes_transfered, size_mul_8);
    }

    if (tail_size != 0) {
      read_write_small_size(cur_host_addr, window_rel_ptr, tail_size, reading);
      incr_ptrs(&cur_host_addr, &window_rel_ptr, &bytes_transfered, tail_size);
      cur_size -= tail_size;
    }

    // increase the current device address to be transferred
    cur_dev_addr += (window_rel_ptr - window_rel_ptr_start);
  }

  return 0;  // success
}

// Read or Write a small size of data to device memory, one byte at a time
// Return 0 on success
int ACL_PCIE_DEVICE::read_write_small_size(void *host_addr, size_t dev_addr, size_t size, bool reading) {
  UINT8 *ucharptr_host = static_cast<UINT8 *>(host_addr);
  int status;

  for (size_t i = 0; i < size; ++i) {
    if (reading) {
      status = m_io->mem->read8(dev_addr + i, ucharptr_host + i);
    } else {
      status = m_io->mem->write8(dev_addr + i, ucharptr_host[i]);
    }
    ACL_PCIE_ERROR_IF(status, return -1, "[%s] failed to read write with odd size.\n", m_name);
  }

  return 0;  // success
}

// Set the segment that the memory windows is accessing to
// Caches the current segment in m_segment and only touches the hardware when
// the segment actually changes.
// Return 0 on success
int ACL_PCIE_DEVICE::set_segment(size_t addr) {
  UINT64 segment_readback;
  UINT64 cur_segment = addr & ~(ACL_PCIE_MEMWINDOW_SIZE - 1);
  int status = 0;

  // Only execute the PCI write if we need to *change* segments
  if (cur_segment != m_segment) {
    // PCIe reordering rules could cause the segment change to get reordered,
    // so read before and after!
    status |= (int)(m_io->window->read64(0, &segment_readback));

    status |= (int)(m_io->window->write64(0, cur_segment));
    m_segment = cur_segment;
    ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_BLOCKTX, ":::::: [%s] Changed segment id to %llu.\n", m_name, m_segment);

    status |= (int)(m_io->window->read64(0, &segment_readback));
  }

  ACL_PCIE_ERROR_IF(status, return -1, "[%s] failed to set segment for memory access windows.\n", m_name);

  return 0;  // success
}

// Advance the host pointer, device offset and transfer counter by incr bytes.
void ACL_PCIE_DEVICE::incr_ptrs(void **host, size_t *dev, size_t *counter, size_t incr) {
  const uintptr_t uintptr_host = reinterpret_cast<uintptr_t>(*host);

  *host = reinterpret_cast<void *>(uintptr_host + incr);
  *dev += incr;
  *counter += incr;
}

// Query the on-chip temperature sensor
// Returns false when the board has no sensor; otherwise stores the raw CSR
// reading in *temp and returns true.
bool ACL_PCIE_DEVICE::get_ondie_temp_slow_call(cl_int *temp) {
  cl_int read_data;

  // We assume this during read later
  ACL_PCIE_ASSERT(sizeof(cl_int) == sizeof(INT32), "sizeof(cl_int) != sizeof(INT32)");

#ifndef ACL_PCIE_HAS_TEMP_SENSOR
  ACL_PCIE_DEBUG_MSG(":: [%s] On-chip temperature sensor not supported by this board.\n", m_name);
  return false;
#endif

  ACL_PCIE_DEBUG_MSG(":: [%s] Querying on-chip temperature sensor...\n", m_name);

  // read temperature sensor
  m_io->temp_sensor->read32(0, (UINT32 *)&read_data);

  ACL_PCIE_DEBUG_MSG(":: [%s] Read temp sensor data. Value is: %i\n", m_name, read_data);
  *temp = read_data;
  return true;
}

// mmap the device into the process (Linux + ACL_HOST_MEMORY_SHARED only) and
// translate the host virtual address to the FPGA-usable physical address via
// the kernel driver.  Returns NULL on unsupported platforms or mmap failure.
void *ACL_PCIE_DEVICE::shared_mem_alloc(size_t size, unsigned long long *device_ptr_out) {
#if defined(WINDOWS)
  return NULL;
#endif  // WINDOWS
#if defined(LINUX)
#ifdef ACL_HOST_MEMORY_SHARED
  void *host_ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, m_device, 0);

  if (device_ptr_out != NULL && host_ptr == (void *)-1) {
    // when mmap fails, it returns (void*)-1, not NULL
    host_ptr = NULL;
    *device_ptr_out = (unsigned long long)0;

  } else if (device_ptr_out != NULL) {
    /* map received host_ptr to FPGA-usable address. */
    void *dev_ptr = NULL;
    struct acl_cmd read_cmd = {ACLPCI_CMD_BAR, ACLPCI_CMD_GET_PHYS_PTR_FROM_VIRT, &dev_ptr, &host_ptr, sizeof(dev_ptr)};

    bool failed_flag = (read(m_device, &read_cmd, sizeof(dev_ptr)) != 0);
    ACL_PCIE_DEBUG_MSG(
        "   Mapped vaddr %p to phys addr %p. %s\n", host_ptr, dev_ptr, failed_flag == 0 ? "OK" : "FAILED");
    if (failed_flag) {
      *device_ptr_out = (unsigned long long)NULL;
    } else {
      /* When change to 64-bit pointers on the device, update driver code
       * to deal with larger-than-void* ptrs. */
      *device_ptr_out = (unsigned long long)dev_ptr;

      /* Now need to add offset of the shared system. */
    }
  }

  return host_ptr;
#else
  return NULL;
#endif
#endif  // LINUX
}

// Release a mapping obtained from shared_mem_alloc (no-op on Windows).
void ACL_PCIE_DEVICE::shared_mem_free(void *vptr, size_t size) {
#if defined(WINDOWS)
  return;
#endif  // WINDOWS
#if defined(LINUX)
  if (vptr != NULL) {
    munmap(vptr, size);
  }
#endif  // LINUX
}

#ifdef DLA_MMD

// Quiesce the device before reprogramming: set the being_programmed flag,
// disable interrupts and save the PCIe control registers.
// Returns 0 on success, 1 on failure (flag is cleared again on failure).
int ACL_PCIE_DEVICE::pause_and_save_pcie()
{
  int failed_cont_reg_save;

  // set the being_programmed flag
  m_being_programmed = true;

  // disable interrupt and save control registers
  const int failed_int_disable = this->disable_interrupts();
  ACL_PCIE_ERROR_IF(failed_int_disable, goto cleanup_save, "could not disable interrupt.\n");

  // Do this last before programming
  failed_cont_reg_save = m_config->save_pci_control_regs();
  ACL_PCIE_ERROR_IF(failed_cont_reg_save, goto cleanup_save, "could not save control regs\n");

  return 0;

  cleanup_save:

  m_being_programmed = false;
  return 1;
}

// Counterpart of pause_and_save_pcie(): restore the saved PCIe control
// registers (Linux) and wait for external memory calibration.
// Returns 0 on success, 1 if UniPhy calibration fails.
int ACL_PCIE_DEVICE::restore_and_resume_pcie()
{
#if defined(LINUX)
  m_config->load_pci_control_regs();
#endif

  if (wait_for_uniphy()) {
    ACL_PCIE_DEBUG_MSG(":: [%s] Uniphy failed to calibrate.\n", m_name);

    m_being_programmed = false;

    return 1;
  }

  m_being_programmed = false;
  return 0;
}

// JTAG full-chip programming (using quartus_pgm via USB-Blaster) to replace periphery + core
// Return 0 on success
int ACL_PCIE_DEVICE::reprogram_sof(const char *sof_filename, const bool skipSaveRestore) {
  int saveRetCode = 0;

  if (!skipSaveRestore)
  {
    saveRetCode = pause_and_save_pcie();
    if (saveRetCode)
    {
      return saveRetCode;
    }
  }

  int reprogram_failed = 1;  // assume failure

  // JTAG programming the device
  ACL_PCIE_DEBUG_MSG(":: [%s] Starting JTAG programming of the device...\n", m_name);
  reprogram_failed = m_config->program_with_SOF_file(sof_filename, "0" /*ad_cable*/, "0" /*ad_device_index*/);

  int restoreRetCode = 0;

  if (!skipSaveRestore)
  {
    restoreRetCode = restore_and_resume_pcie();
    if
(restoreRetCode)
    {
      return restoreRetCode;
    }
  }

  if (!(reprogram_failed)) {
    ACL_PCIE_DEBUG_MSG(":: [%s] JTAG programming passed.\n", m_name);
  }

  return reprogram_failed;
}
#else
// perform PR reprogram by attempting to program the board using an RBF. If this is not possible due to
// 1) Envoking the user of JTAG_PROGRAMMING via ACL_PCIE_USE_JTAG_PROGRAMMING
// 2) RBF or HASH are not present
// 3) PR Base ID does not match that with which the RBF was compiled
// 4) UniPhy fails to calibrate
// Then returns 1. Returns 0 on success. Always returns flag from arguments indicating source of failure
int ACL_PCIE_DEVICE::pr_reprogram(struct acl_pkg_file *pkg,
                                  const char *SOFNAME,
                                  int *rbf_or_hash_not_provided,
                                  int *hash_mismatch,
                                  unsigned *use_jtag_programming,
                                  int *quartus_compile_version_mismatch) {
  // Environment variable to control when to use JTAG instead of PR (overriding the default programming method: PR)
  int reprogram_failed = 1;
  size_t core_rbf_len = 0, pr_import_version_len = 0, quartus_version_len = 0, pll_config_len = 0;
  *use_jtag_programming = 0;
  char *str_use_jtag_programming = getenv("ACL_PCIE_USE_JTAG_PROGRAMMING");
  if (str_use_jtag_programming) *use_jtag_programming = 1;

  // 1. Default programming method: PR
  if (!*use_jtag_programming) {
    // checking that rbf and hash sections exist in fpga.bin
    if (acl_pkg_section_exists(pkg, ACL_PKG_SECTION_CORE_RBF, &core_rbf_len) &&
        acl_pkg_section_exists(pkg, ACL_PKG_SECTION_HASH, &pr_import_version_len) &&
        (acl_pkg_section_exists(pkg, ACL_PKG_SECTION_QVERSION, &quartus_version_len) || m_skip_quartus_version_check)) {
      *rbf_or_hash_not_provided = 0;
      ACL_PCIE_DEBUG_MSG(
          ":: [%s] Programming kernel region using PR with rbf file size %i\n", m_name, (UINT32)core_rbf_len);

      // read rbf and hash from fpga.bin
      char *core_rbf;
      acl_aligned_malloc((void **)&core_rbf, core_rbf_len + 1);
      int read_core_rbf_ok = acl_pkg_read_section(pkg, ACL_PKG_SECTION_CORE_RBF, core_rbf, core_rbf_len + 1);

      // Gate PR on the Quartus version recorded in the .acl.qversion section,
      // unless the check is explicitly skipped.
      if (!m_skip_quartus_version_check) {
        char *quartus_compile_version_str = (char *)malloc(quartus_version_len + 1);
        if (quartus_compile_version_str) {
          int quartus_compile_version_ok =
              acl_pkg_read_section(pkg, ACL_PKG_SECTION_QVERSION, quartus_compile_version_str, quartus_version_len + 1);

          if (quartus_compile_version_ok) {
            // Remove Linux and Windows new-line ending in .acl.qversion
            if ((quartus_version_len > 0) && (quartus_compile_version_str[quartus_version_len - 1] == '\n' ||
                                             quartus_compile_version_str[quartus_version_len - 1] == '\r')) {
              quartus_compile_version_str[quartus_version_len - 1] = '\0';
            }
            if ((quartus_version_len > 1) && (quartus_compile_version_str[quartus_version_len - 2] == '\r')) {
              quartus_compile_version_str[quartus_version_len - 2] = '\0';
            }

            *quartus_compile_version_mismatch = quartus_ver_test(quartus_compile_version_str);
          } else {
            *quartus_compile_version_mismatch = 1;
          }
          free(quartus_compile_version_str);
          quartus_compile_version_str = NULL;
        } else {
          *quartus_compile_version_mismatch = 1;
        }
      } else {
        *quartus_compile_version_mismatch = 0;
      }

      if (*quartus_compile_version_mismatch == 0) {
        char *pr_import_version_str = (char *)malloc(pr_import_version_len + 1);
        if (pr_import_version_str) {
          int pr_import_version_ok =
              acl_pkg_read_section(pkg, ACL_PKG_SECTION_HASH, pr_import_version_str, pr_import_version_len + 1);

          // checking that hash was successfully read from section .acl.hash within fpga.bin
          if (pr_import_version_ok) {
            unsigned int pr_import_version = (unsigned int)strtol(pr_import_version_str, NULL, 10);

            // checking that base revision hash matches import revision hash and aocx and programmed sof is from same
            // Quartus version
            if (pr_base_id_test(pr_import_version) == 0) {
              *hash_mismatch = 0;

              // Kernel driver wants it aligned to 4 bytes.
              int aligned_to_4_bytes(0 == (3 & (uintptr_t)(core_rbf)));
              reprogram_failed = 1;  // Default to fail before PRing

              // checking that rbf was successfully read from section .acl.core.rbf within fpga.bin
              if (read_core_rbf_ok && !(core_rbf_len % 4) && aligned_to_4_bytes && !version_id_test()) {
                // reprogram Arria 10 devices
                if (strcmp(ACL_BSP_TYPE, "Arria10") == 0) {
                  ACL_PCIE_DEBUG_MSG(":: [%s] Starting PR programming of the device...\n", m_name);
                  reprogram_failed = m_config->program_core_with_PR_file_a10((char *)core_rbf, core_rbf_len);
                  ACL_PCIE_DEBUG_MSG(":: [%s] Finished PR programming of the device.\n", m_name);
                };

                // reprogram Stratix 10 devices
                // S10 additionally needs the PLL configuration from .acl.pll_config.
                if (strcmp(ACL_BSP_TYPE, "Stratix10") == 0) {
                  acl_pkg_section_exists(pkg, ACL_PKG_SECTION_PLL_CONFIG, &pll_config_len);
                  char *pll_config_str = (char *)malloc(pll_config_len + 1);
                  if (pll_config_str) {
                    int pll_config_ok =
                        acl_pkg_read_section(pkg, ACL_PKG_SECTION_PLL_CONFIG, pll_config_str, pll_config_len + 1);
                    if (pll_config_ok) {
                      ACL_PCIE_DEBUG_MSG(":: [%s] Starting PR programming of the device...\n", m_name);
                      reprogram_failed = m_config->program_core_with_PR_file_s10(
                          (char *)core_rbf, core_rbf_len, (char *)pll_config_str);
                      ACL_PCIE_DEBUG_MSG(":: [%s] Finished PR programming of the device.\n", m_name);
                    };
                  };
                  free(pll_config_str);
                  pll_config_str = NULL;
                };

                if (reprogram_failed) {
                  ACL_PCIE_DEBUG_MSG(":: [%s] PR programming failed.\n", m_name);
                  // PR failed. Check if device I/O is blocked.
                  if (check_kernel_region_status() == -1) {
                    ACL_PCIE_INFO("[%s] Partial Reconfiguration of FPGA has failed.\n", m_name);
                    ACL_PCIE_INFO("[%s] FPGA device will not be available until host has been powercycled.\n", m_name);
                    exit(1);
                  }
                } else if (version_id_test()) {
                  ACL_PCIE_DEBUG_MSG(":: [%s] version_id_test() failed.\n", m_name);
                  reprogram_failed = 1;
                } else if (wait_for_uniphy()) {
                  ACL_PCIE_DEBUG_MSG(":: [%s] Uniphy failed to calibrate.\n", m_name);
                  reprogram_failed = 1;
                } else {
                  ACL_PCIE_DEBUG_MSG(":: [%s] PR programming passed.\n", m_name);
                }
              }
            }
          }
          free(pr_import_version_str);
          pr_import_version_str = NULL;
        }
      }
      acl_aligned_free(core_rbf);
    }
  }

  return reprogram_failed;
}

// Reprogram the device with given binary file.
// There are two ways to program:
// 1.
JTAG full-chip programming (using quartus_pgm via USB-Blaster) to replace periphery + core
// Return 0 on success
//
// Flow:
//  * program_mode == ACL_PCIE_PROGRAM_PR: attempt Partial Reconfiguration only,
//    with no JTAG fallback; returns whether the PR attempt failed.
//  * otherwise: attempt PR first, then fall back to full-chip JTAG programming
//    when PR was not applicable (no rbf/hash in fpga.bin, hash mismatch,
//    explicit JTAG request, or an unskipped Quartus version mismatch).
int ACL_PCIE_DEVICE::reprogram(void *data, size_t data_size, int program_mode) {
  int reprogram_failed = 1;          // assume failure
  int rbf_or_hash_not_provided = 1;  // assume no rbf or hash are provided in fpga.bin
  int hash_mismatch = 1;             // assume base revision and import revision hashes do not match
  unsigned use_jtag_programming = 0; // assume no need for jtag programming
  int quartus_compile_version_mismatch = 1;
  size_t quartus_version_len;

  const char *SOFNAME = "reprogram_temp.sof";
  size_t sof_len = 0;

  ACL_PCIE_DEBUG_MSG(":: [%s] Starting to program device...\n", m_name);

  // fpga.bin is an acl_pkg archive; open it directly from the caller's buffer
  struct acl_pkg_file *pkg = acl_pkg_open_file_from_memory((char *)data, data_size, ACL_PKG_SHOW_ERROR);
  ACL_PCIE_ERROR_IF(pkg == NULL, return reprogram_failed, "cannot open file from memory using pkg editor.\n");

  // set the being_programmed flag
  m_being_programmed = true;

  // the new reprogram flow: first try PR, if failed falls back to the old reprogram flow
  int try_pr_failed = 0;
  // if choose to try reprogram with preserving memory
  if (program_mode == ACL_PCIE_PROGRAM_PR) {
    // only try PR, no fall back to JTAG
    ACL_PCIE_DEBUG_MSG("[%s] Trying Partial Reconfiguration\n", m_name);
    reprogram_failed = pr_reprogram(pkg,
                                    SOFNAME,
                                    &rbf_or_hash_not_provided,
                                    &hash_mismatch,
                                    &use_jtag_programming,
                                    &quartus_compile_version_mismatch);
    // clean up
    // any of these statuses means PR could not (or should not) be used
    if (reprogram_failed || use_jtag_programming || rbf_or_hash_not_provided || hash_mismatch ||
        (quartus_compile_version_mismatch && !m_skip_quartus_version_check)) {
      // try PR failed
      try_pr_failed = 1;
    }
    if (pkg) acl_pkg_close_file(pkg);
    m_being_programmed = false;
    return try_pr_failed;
  }

  // the old reprogram flow. Try PR and then Try JTAG
  // 1. Default to PR reprogramming
  ACL_PCIE_DEBUG_MSG("[%s] Reprogram the device with data saving and restoring\n", m_name);
  ACL_PCIE_DEBUG_MSG("[%s] Trying Partial Reconfiguration\n", m_name);
  reprogram_failed = pr_reprogram(pkg,
                                  SOFNAME,
                                  &rbf_or_hash_not_provided,
                                  &hash_mismatch,
                                  &use_jtag_programming,
                                  &quartus_compile_version_mismatch);

  // Autodetect JTAG cable & device index
  // Cable and Index value shouldn't overflow
  char ad_cable[AD_CABLE_SIZE];
  char ad_device_index[AD_CABLE_SIZE];

  // 2. Fallback programming method: JTAG full-chip programming
  if (use_jtag_programming || rbf_or_hash_not_provided || hash_mismatch ||
      (quartus_compile_version_mismatch && !m_skip_quartus_version_check)) {
    ACL_PCIE_DEBUG_MSG("[%s] Trying Full-Chip Reconfiguration (JTAG)\n", m_name);

    // checking that sof section exists in fpga.bin
    if (acl_pkg_section_exists(pkg, ACL_PKG_SECTION_SOF, &sof_len)) {
      // check if aocx is fast-compiled or not - if so, then sof is a base revision,
      // and does not necessarily contain the desired kernel. Requires sof with
      // matching pr_base.id to be programmed (base.sof) followed by PR programming
      // with the given .rbf
      size_t fast_compile_len = 0;
      char *fast_compile_contents = NULL;
      int fast_compile = 0;
      if (acl_pkg_section_exists(pkg, ACL_PKG_SECTION_FAST_COMPILE, &fast_compile_len) &&
          acl_pkg_read_section_transient(pkg, ACL_PKG_SECTION_FAST_COMPILE, &fast_compile_contents)) {
        fast_compile = 1;
        ACL_PCIE_DEBUG_MSG(":: [%s] Fast-compile fpga.bin detected.\n", m_name);
      }
      // Find jtag cable for the board
      // Returns 0 for both ad_cable,ad_device_index if not found
      // or if Autodetect is disabled
      this->find_jtag_cable(ad_cable, ad_device_index);

      // write out a SOF file
      const int wrote_sof = acl_pkg_read_section_into_file(pkg, ACL_PKG_SECTION_SOF, SOFNAME);
      ACL_PCIE_ERROR_IF(!wrote_sof, goto cleanup, "could not write %s.\n", SOFNAME);

      // disable interrupt and save control registers
      const int failed_int_disable = this->disable_interrupts();
      ACL_PCIE_ERROR_IF(failed_int_disable, goto cleanup, "could not disable interrupt.\n");

      // Do this last before programming
      const int failed_cont_reg_save = m_config->save_pci_control_regs();
      ACL_PCIE_ERROR_IF(failed_cont_reg_save, goto cleanup, "could not save control regs\n");

      // JTAG programming the device
      ACL_PCIE_DEBUG_MSG(":: [%s] Starting JTAG programming of the device...\n", m_name);
      reprogram_failed = m_config->program_with_SOF_file(SOFNAME, ad_cable, ad_device_index);

#if defined(LINUX)
      // restore the PCI config space that full-chip programming clobbered
      m_config->load_pci_control_regs();
#endif

      ACL_PCIE_ERROR_IF(reprogram_failed, goto cleanup, "Failed to JTAG program\n");

      // Mirror the Quartus compile version string from fpga.bin into the
      // on-chip version RAM so later PR attempts can compare against it
      if (!m_skip_quartus_version_check &&
          acl_pkg_section_exists(pkg, ACL_PKG_SECTION_QVERSION, &quartus_version_len)) {
        char *quartus_compile_version_str = (char *)malloc(quartus_version_len + 1);
        if (quartus_compile_version_str) {
          int quartus_compile_version_ok =
              acl_pkg_read_section(pkg, ACL_PKG_SECTION_QVERSION, quartus_compile_version_str, quartus_version_len + 1);
          if (quartus_compile_version_ok) {
            // Remove Linux and Windows new-line ending in .acl.qversion
            if ((quartus_version_len > 0) && (quartus_compile_version_str[quartus_version_len - 1] == '\n' ||
                                              quartus_compile_version_str[quartus_version_len - 1] == '\r')) {
              quartus_compile_version_str[quartus_version_len - 1] = '\0';
            }
            if ((quartus_version_len > 1) && (quartus_compile_version_str[quartus_version_len - 2] == '\r')) {
              quartus_compile_version_str[quartus_version_len - 2] = '\0';
            }
            // Last character is NULL added by acl_pkg_read_section
            m_io->quartus_ver->write_block(0, quartus_version_len + 1, quartus_compile_version_str);
          }
          free(quartus_compile_version_str);
          quartus_compile_version_str = NULL;
        }
      }

      // sanity-check the freshly programmed image before declaring success
      if (version_id_test()) {
        ACL_PCIE_DEBUG_MSG(":: [%s] version_id_test() failed.\n", m_name);
        reprogram_failed = 1;
      } else if (wait_for_uniphy()) {
        ACL_PCIE_DEBUG_MSG(":: [%s] Uniphy failed to calibrate.\n", m_name);
        reprogram_failed = 1;
      }
      if (strcmp(ACL_BSP_TYPE, "Stratix10") == 0) {
        // S10 PR
        if (deassert_pr_reset()) {
          ACL_PCIE_DEBUG_MSG(":: [%s] PR region controller reset source deasserted.\n", m_name);
        }
      };
      if (fast_compile) {
        // need to rerun pr_reprogram because design should be loaded now
        hash_mismatch = 0;
        rbf_or_hash_not_provided = 0;
        reprogram_failed = pr_reprogram(pkg,
                                        SOFNAME,
                                        &rbf_or_hash_not_provided,
                                        &hash_mismatch,
                                        &use_jtag_programming,
                                        &quartus_compile_version_mismatch);
      }
      if (!(reprogram_failed)) {
        ACL_PCIE_DEBUG_MSG(":: [%s] JTAG programming passed.\n", m_name);
      }

    } else {
      ACL_PCIE_DEBUG_MSG(":: [%s] Could not read SOF file from fpga.bin.\n", m_name);
      reprogram_failed = 1;
    }
  }

cleanup:
  // Clean up
  if (pkg) acl_pkg_close_file(pkg);
  m_being_programmed = false;

  return reprogram_failed;
}
#endif

// Perform a simple version id read to test the basic PCIe read functionality
// Return 0 on success
int
ACL_PCIE_DEVICE::version_id_test() { + unsigned int version = ACL_VERSIONID ^ 1; // make sure it's not what we hope to find. + unsigned int iattempt; + unsigned int max_attempts = 1; + unsigned int usleep_per_attempt = 20; // 20 ms per. + + ACL_PCIE_DEBUG_MSG(":: [%s] Doing PCIe-to-fabric read test ...\n", m_name); + for (iattempt = 0; iattempt < max_attempts; iattempt++) { + m_io->version->read32(0, &version); + if ((version >= (unsigned int)ACL_VERSIONID_MIN) && (version <= (unsigned int)ACL_VERSIONID)) { + ACL_PCIE_DEBUG_MSG(":: [%s] PCIe-to-fabric read test passed\n", m_name); + return 0; + } +#if defined(WINDOWS) + Sleep(usleep_per_attempt); +#endif // WINDOWS +#if defined(LINUX) + usleep(usleep_per_attempt * 1000); +#endif // LINUX + } + + // Kernel read command succeed, but got bad data. (version id doesn't match) + ACL_PCIE_INFO("[%s] PCIe-to-fabric read test failed, read 0x%0x after %u attempts\n", m_name, version, iattempt); + return -1; +} + +// Perform a read of the kernel region status IP +// Return 0 on success (PR region is unfrozen and ready to use) +int ACL_PCIE_DEVICE::check_kernel_region_status() { +#if defined(LINUX) + unsigned int value; + struct acl_cmd driver_cmd = {ACLPCI_CMD_BAR, ACLPCI_CMD_GET_PR_REGION_STATUS, NULL, &value, sizeof(value)}; + if (read(m_device, &driver_cmd, sizeof(driver_cmd)) == -1) { + return -1; + } else { + return value; + } +#endif // Linux + return 0; +} + +// Performs a write to PR region controller to deassert reset to PR region +// Return 0 on success +int ACL_PCIE_DEVICE::deassert_pr_reset() { + ACL_PCIE_DEBUG_MSG(":: [%s] Deasserting PR region controller reset ...\n", m_name); + m_io->pr_region_ctrl->write32(FREEZE_CTRL_OFFSET, 0); + + return 0; +} + +// Quartus Compile Version check +// Return 0 on success +int ACL_PCIE_DEVICE::quartus_ver_test(char *pkg_qversion_str) { + char *fpga_qversion_str; + unsigned int version; + + // Check version ID to ensure feature supported in HW + m_io->version->read32(0, 
&version); + if (version < (unsigned int)ACL_QUARTUSVER_VERSIONID) { + ACL_PCIE_DEBUG_MSG(":: [%s] Programming on board without Quartus Version RAM\n", m_name); + return 1; + } + + // Allocate buffer for Quartus version read from FPGA with + // largest expected size + 1 for NULL + fpga_qversion_str = reinterpret_cast<char*>(malloc(ACL_QUARTUSVER_ROM_SIZE + 1)); + if (NULL == fpga_qversion_str) { + ACL_PCIE_DEBUG_MSG(":: Memory allocation failed, allocating %d bytes\n", ACL_QUARTUSVER_ROM_SIZE + 1); + free(fpga_qversion_str); + return 1; + } + // Make sure it's not what we hope to find + memset(fpga_qversion_str, 0, ACL_QUARTUSVER_ROM_SIZE + 1); + + m_io->quartus_ver->read_block(0, ACL_QUARTUSVER_ROM_SIZE, fpga_qversion_str); + + size_t fpga_qversion_len = 0; + fpga_qversion_len = strnlen(fpga_qversion_str, MAX_LEN); + + size_t pkg_qversion_len = 0; + if (pkg_qversion_str) { + pkg_qversion_len = strnlen(pkg_qversion_str, MAX_LEN); + + if (fpga_qversion_len != pkg_qversion_len) { + // Kernel read command succeed, but got bad data. (Quartus Version doesn't match) + ACL_PCIE_DEBUG_MSG("[%s] Quartus versions for base and import compile do not match\n", m_name); + ACL_PCIE_DEBUG_MSG("[%s] Board is currently programmed with sof from Quartus %s\n", m_name, fpga_qversion_str); + ACL_PCIE_DEBUG_MSG("[%s] PR import was compiled with Quartus %s\n", m_name, pkg_qversion_str); + free(fpga_qversion_str); + return 1; + } + + if (strncmp(pkg_qversion_str, fpga_qversion_str, fpga_qversion_len) == 0) { + ACL_PCIE_DEBUG_MSG(":: [%s] Quartus versions for base and import compile match\n", m_name); + ACL_PCIE_DEBUG_MSG(":: [%s] Board is currently programmed with sof from Quartus %s\n", m_name, fpga_qversion_str); + ACL_PCIE_DEBUG_MSG(":: [%s] PR import was compiled with Quartus %s\n", m_name, pkg_qversion_str); + free(fpga_qversion_str); + return 0; + } + + // Kernel read command succeed, but got bad data. 
(Quartus Version doesn't match) + ACL_PCIE_DEBUG_MSG("[%s] Quartus versions for base and import compile do not match\n", m_name); + ACL_PCIE_DEBUG_MSG("[%s] Board is currently programmed with sof from Quartus %s\n", m_name, fpga_qversion_str); + ACL_PCIE_DEBUG_MSG("[%s] PR import was compiled with Quartus %s\n", m_name, pkg_qversion_str); + } + free(fpga_qversion_str); + return 1; +} + +// Perform a simple read to the PR base ID in the static region and compare it with the given ID +// Return 0 on success +int ACL_PCIE_DEVICE::pr_base_id_test(unsigned int pr_import_version) { + unsigned int pr_base_version = 0; // make sure it's not what we hope to find. + + ACL_PCIE_DEBUG_MSG(":: [%s] Reading PR base ID from fabric ...\n", m_name); + m_io->pr_base_id->read32(0, &pr_base_version); + if (pr_base_version == pr_import_version) { + ACL_PCIE_DEBUG_MSG(":: [%s] PR base and import compile IDs match\n", m_name); + ACL_PCIE_DEBUG_MSG(":: [%s] PR base ID currently configured is 0x%0x\n", m_name, pr_base_version); + ACL_PCIE_DEBUG_MSG(":: [%s] PR import compile ID is 0x%0x\n", m_name, pr_import_version); + return 0; + }; + + // Kernel read command succeed, but got bad data. (version id doesn't match) + ACL_PCIE_DEBUG_MSG("[%s] PR base and import compile IDs do not match\n", m_name); + ACL_PCIE_DEBUG_MSG("[%s] PR base ID currently configured is 0x%0x\n", m_name, pr_base_version); + ACL_PCIE_DEBUG_MSG("[%s] PR import compile expects ID to be 0x%0x\n", m_name, pr_import_version); + return -1; +} + +// 1. Write a random value to cade_id register, do a read to confirm the write +// 2. Use the random value to find the JTAG cable for that board +// 3. 
Return "0" on ad_cable,ad_device_index if cable not found
// Autodetection is skipped entirely (outputs forced to "0") when the
// ACL_PCIE_JTAG_CABLE / ACL_PCIE_JTAG_DEVICE_INDEX env vars are set, or when
// the fabric version predates the CADE ID register.
void ACL_PCIE_DEVICE::find_jtag_cable(char *ad_cable, char *ad_device_index) {
  bool jtag_ad_disabled = false;
  bool jtag_ad_cable_found = false;
  unsigned int version = 0;

  // Check if Autodetect is disabled
  const char *cable = getenv("ACL_PCIE_JTAG_CABLE");
  const char *device_index = getenv("ACL_PCIE_JTAG_DEVICE_INDEX");
  if (cable || device_index) {
    jtag_ad_disabled = true;
    ACL_PCIE_DEBUG_MSG(":: [%s] JTAG cable autodetect disabled!!!\n", m_name);
  }

  // Check version ID to ensure feature supported in HW
  m_io->version->read32(0, &version);
  if (version < (unsigned int)ACL_CADEID_VERSIONID) {
    jtag_ad_disabled = true;
    ACL_PCIE_DEBUG_MSG(":: [%s] JTAG cable autodetect disabled due to old HW version!!!\n", m_name);
  }

  // If JTAG autodetect is enabled, program the CADEID register
  // and look for the value using in system sources and probes
  if (!jtag_ad_disabled) {
    // Only use random device here because we only want one value. Normally use mersenne twister for more values
    std::random_device rd;
    std::uniform_int_distribution<unsigned int> dist(0u, 0xFFFFFFFFu);
    unsigned int cade_id_write = dist(rd) & 0xFFFFFFFF;
    cade_id_write = cade_id_write | 0x80000000;  // Write a full 32 bit value
    unsigned int cade_id_read = 0x0;

    ACL_PCIE_DEBUG_MSG(":: [%s] Writing Cade ID to fabric ...\n", m_name);
    m_io->cade_id->write32(0, cade_id_write);

    // read back to confirm the register (and the PCIe link) is alive
    ACL_PCIE_DEBUG_MSG(":: [%s] Reading Cade ID from fabric ...\n", m_name);
    m_io->cade_id->read32(0, &cade_id_read);

    if (cade_id_write == cade_id_read) {
      ACL_PCIE_DEBUG_MSG(":: [%s] Cade ID write/read success ...\n", m_name);
      ACL_PCIE_DEBUG_MSG(
          ":: [%s] Cade ID cade_id_write 0x%0x, cade_id_read 0x%0x\n", m_name, cade_id_write, cade_id_read);

      // find_cable_with_ISSP fills ad_cable,ad_device_index; returns whether a cable was found
      jtag_ad_cable_found = m_config->find_cable_with_ISSP(cade_id_write, ad_cable, ad_device_index);

      if (!jtag_ad_cable_found) {
        ACL_PCIE_DEBUG_MSG(":: [%s] Using default cable 1 ...\n", m_name);
      } else {
        ACL_PCIE_DEBUG_MSG(":: [%s] Found Cable ...\n", m_name);
      }
    } else {
      ACL_PCIE_DEBUG_MSG(":: [%s] Cade ID write/read failed. Check BSP version or PCIE link...\n", m_name);
      ACL_PCIE_DEBUG_MSG(
          ":: [%s] Cade ID cade_id_write 0x%0x, cade_id_read 0x%0x\n", m_name, cade_id_write, cade_id_read);
    }
  }

  // fall back to "0"/"0" when autodetect was disabled or found nothing
  if (jtag_ad_disabled || !jtag_ad_cable_found) {
    snprintf(ad_cable, AD_CABLE_SIZE, "%s", "0");
    snprintf(ad_device_index, AD_CABLE_SIZE, "%s", "0");
  }
}

// Wait until the uniphy calibrated
// Return 0 on success
int ACL_PCIE_DEVICE::wait_for_uniphy() {
  const unsigned int ACL_UNIPHYSTATUS = 0;
  unsigned int status = 1, retries = 0;

  // up to 8 attempts: check status, and if not calibrated pulse the reset and wait 400 ms
  while (retries++ < 8) {
    m_io->uniphy_status->read32(0, &status);

    if (status == ACL_UNIPHYSTATUS) {
      ACL_PCIE_DEBUG_MSG(":: [%s] Uniphys are calibrated\n", m_name);
      return 0;  // success
    }

    ACL_PCIE_DEBUG_MSG(":: [%s] Uniphy status read was %x\n", m_name, status);
    ACL_PCIE_DEBUG_MSG(":: [%s] Resetting Uniphy try %d\n", m_name, retries);
    m_io->uniphy_reset->write32(0, 1);

#if defined(WINDOWS)
    Sleep(400);
#endif  // WINDOWS
#if defined(LINUX)
    usleep(400 * 1000);
#endif  // LINUX
  }

  ACL_PCIE_INFO("[%s] uniphy(s) did not calibrate. Expected 0 but read %x\n", m_name, status);

  // Failure! Was it communication error or actual calibration failure?
  if (ACL_PCIE_READ_BIT(status, 3))  // This bit is hardcoded to 0
    ACL_PCIE_INFO(
        " Uniphy calibration status is corrupt. This is likely a communication error with the board "
        "and/or uniphy_status module.\n");
  else {
    // This is a 32-bit interface with the first 4 bits aggregating the
    // various calibration signals. The remaining 28-bits would indicate
    // failure for their respective memory core.
Tell users which ones + // failed + for (int i = 0; i < 32 - 4; i++) { + if (ACL_PCIE_READ_BIT(status, 4 + i)) ACL_PCIE_INFO(" Uniphy core %d failed to calibrate\n", i); + } + ACL_PCIE_INFO(" If there are more failures than Uniphy controllers connected, \n"); + ACL_PCIE_INFO(" ensure the uniphy_status core is correctly parameterized.\n"); + } + + return -1; // failure +} diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_device.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_device.h new file mode 100644 index 0000000..29f5128 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_device.h @@ -0,0 +1,209 @@ +#ifndef ACL_PCIE_DEVICE_H +#define ACL_PCIE_DEVICE_H + +/* (c) 1992-2021 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. 
*/ + +/* ===- acl_pcie_device.h -------------------------------------------- C++ -*-=== */ +/* */ +/* Intel(R) OpenCL MMD Driver */ +/* */ +/* ===-------------------------------------------------------------------------=== */ +/* */ +/* This file declares the class to handle operations on a single device. */ +/* The actual implementation of the class lives in the acl_pcie_device.cpp */ +/* */ +/* ===-------------------------------------------------------------------------=== */ + +// Forward declaration for classes used by ACL_PCIE_DEVICE +class ACL_PCIE_DMA; +class ACL_PCIE_CONFIG; +class ACL_PCIE_MM_IO_MGR; +class ACL_PCIE_HOSTCH; + +#if defined(LINUX) +typedef int fpga_handle; +#else +#include <opae/fpga.h> +#endif // LINUX + +#ifdef DLA_MMD +// CoreDLA runtime assumes host/device transfers are thread safe +#include <mutex> +// don't assume opencl has been installed +typedef int cl_int; +#endif + +// Encapsulates the functionality of an ACL device connected to the host +// through a PCI express bus. 
class ACL_PCIE_DEVICE {
 public:
  ACL_PCIE_DEVICE(int dev_num, const char *name, int handle, int user_signal_number);
  ~ACL_PCIE_DEVICE();
  // non-copyable: owns a device handle and interrupt registrations
  ACL_PCIE_DEVICE(const ACL_PCIE_DEVICE&) = delete;
  ACL_PCIE_DEVICE& operator= (const ACL_PCIE_DEVICE&) = delete;

  bool is_valid() { return m_device != INVALID_HANDLE_VALUE; };
  bool is_initialized() { return m_initialized; };
  bool is_being_programmed() { return m_being_programmed; };

  // Perform operations required when an interrupt is received for this device
  void service_interrupt(unsigned int irq_type_flag = 0);
  // This function can be used for triggering a fake device exception for
  // testing purposes (NOTE(review): original comment was truncated — confirm)
  void test_trigger_device_interrupt();

  // The callback function set by "set_status_handler"
  // It's used to notify/update the host whenever an event is finished
  void event_update_fn(aocl_mmd_op_t op, int status);

  // Called by the host program when there are spare cycles
  int yield();

  // Memory I/O
  // return 0 on success
  int write_block(aocl_mmd_op_t e, aocl_mmd_interface_t mmd_interface, void *host_addr, size_t dev_addr, size_t size);
  int read_block(aocl_mmd_op_t e, aocl_mmd_interface_t mmd_interface, void *host_addr, size_t dev_addr, size_t size);
  int copy_block(aocl_mmd_op_t e, aocl_mmd_interface_t mmd_interface, size_t src, size_t dst, size_t size);

  // Create channel. return handle to channel on success, negative otherwise
  int create_hostchannel(char *name, size_t queue_depth, int direction);

  // return 0 on success
  int destroy_channel(int channel);

  // return pointer that user can write to for write channel, and read from for read channel
  void *hostchannel_get_buffer(size_t *buffer_size, int channel, int *status);

  // return the size in bytes of the amount of buffer that was acknowledged to channel
  size_t hostchannel_ack_buffer(size_t send_size, int channel, int *status);

  // Set kernel, device interrupts and event update callbacks
  // return 0 on success
  int set_kernel_interrupt(aocl_mmd_interrupt_handler_fn fn, void *user_data);
  int set_device_interrupt(aocl_mmd_device_interrupt_handler_fn fn, void *user_data);
  int set_status_handler(aocl_mmd_status_handler_fn fn, void *user_data);

  // Query PCIe information of the device
  char *get_dev_pcie_info() { return m_info.pcie_info_str; };

  // Query on-die temperature sensor, if available
  bool get_ondie_temp_slow_call(cl_int *temp);

  // Shared memory manipulation functions
  void *shared_mem_alloc(size_t size, unsigned long long *device_ptr_out);
  void shared_mem_free(void *host_ptr, size_t size);

  // Reprogram the device with given binary file
  // return 0 on success
#ifdef DLA_MMD
  int pause_and_save_pcie();
  int restore_and_resume_pcie();
  int reprogram_sof(const char *sof_filename, const bool skipSaveRestore = false);
#else
  int reprogram(void *data, size_t data_size, int program_mode);
#endif

 private:
  // Helper routines for interrupts
  // return 0 on success, negative on error
  int mask_irqs();
  int unmask_irqs();
  int unmask_kernel_irq();
  int disable_interrupts();
  int enable_interrupts(int user_signal_number);
  int get_interrupt_type(unsigned int *kernel_update, unsigned int *dma_update, unsigned int irq_type_flag);
#if defined(WINDOWS)
  void enable_msi(bool enable);
#endif  // WINDOWS

  // Helper routines for read or write operations
  // return 0 on success, negative on error (except for the "incr_ptrs" routine)
  int read_write_block(aocl_mmd_op_t e, void *host_addr, size_t dev_addr, size_t size, bool reading);
  int read_write_block_bar(void *host_addr, size_t dev_addr, size_t size, bool reading);
  int read_write_small_size(void *host_addr, size_t dev_addr, size_t size, bool reading);
  int set_segment(size_t addr);
  void incr_ptrs(void **host, size_t *dev, size_t *counter, size_t incr);
  int does_base_periph_match_new_periph(struct acl_pkg_file *pkg, const char *dev_name);

  // Helper routines for simple functionality test
  // return 0 on success, negative on error
  int version_id_test();
  int wait_for_uniphy();
  int pr_base_id_test(unsigned int pr_import_version);
  int deassert_pr_reset();
  int quartus_ver_test(char *pkg_qversion_str);
  int check_kernel_region_status();

  // Write a random value to cade_id register, do a read to confirm the write
  // Use the random value to find the JTAG cable for that board
  // Return 0 on ad_cable,ad_device_index if cable not found
  void find_jtag_cable(char *ad_cable, char *ad_device_index);

#ifndef DLA_MMD
  // Performs PR reprogramming if possible, and returns different statuses on
  // PR Hash, JTAG programming, RBF or Hash Presence
  // Returns 0 on success, 1 on reprogram fail
  int pr_reprogram(struct acl_pkg_file *pkg,
                   const char *SOFNAME,
                   int *rbf_or_hash_not_provided,
                   int *hash_mismatch,
                   unsigned *use_jtag_programming,
                   int *quartus_compile_version_mismatch);
#endif

  // Kernel interrupt handler and event update callbacks
  aocl_mmd_interrupt_handler_fn kernel_interrupt;
  void *kernel_interrupt_user_data;
  aocl_mmd_device_interrupt_handler_fn device_interrupt;
  void *device_interrupt_user_data;
  aocl_mmd_status_handler_fn event_update;
  void *event_update_user_data;
  int m_user_signal_number;

  // Owned helper objects (BAR register access, DMA engine, host channels)
  ACL_PCIE_MM_IO_MGR *m_io;
  ACL_PCIE_DMA *m_dma;
  ACL_PCIE_HOSTCH *m_hostch;
ACL_PCIE_CONFIG *m_config; + + static const int MAX_NAME_LENGTH = 32; + int m_handle; + char m_name[MAX_NAME_LENGTH]; + fpga_handle m_device; + ACL_PCIE_DEVICE_DESCRIPTION m_info; + + bool m_use_dma_for_big_transfers; + bool m_mmd_irq_handler_enable; + bool m_initialized; + bool m_being_programmed; + bool m_skip_quartus_version_check; + + // IRQ acknowledgement commands in the KMD + static const unsigned int NUM_ACK_CMDS = 3; +#if defined(WINDOWS) + fpga_event_handle *dev_event_handle; +#endif // WINDOWS + + // For the host, memory is segmented. This stores the last used segment + // ID so we don't needlessly update it in hardware + UINT64 m_segment; + +#ifdef DLA_MMD + std::mutex m_dma_mutex; +#endif +}; + +#endif // ACL_PCIE_DEVICE_H diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_dma.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_dma.h new file mode 100644 index 0000000..ec9fdb1 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_dma.h @@ -0,0 +1,37 @@ +#ifndef ACL_PCIE_DMA_H +#define ACL_PCIE_DMA_H + +/* (c) 1992-2021 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. 
See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. */ + +/* ===- acl_pcie_dma.h ----------------------------------------------- C++ -*-=== */ +/* */ +/* Intel(R) OpenCL MMD Driver */ +/* */ +/* ===-------------------------------------------------------------------------=== */ +/* */ +/* */ +/* ===-------------------------------------------------------------------------=== */ + +#if defined(WINDOWS) +#include "acl_pcie_dma_windows.h" +#endif // WINDOWS +#if defined(LINUX) +#include "acl_pcie_dma_linux.h" +#endif // LINUX + +#endif // ACL_PCIE_DMA_H diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_dma_linux.cpp b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_dma_linux.cpp new file mode 100644 index 0000000..a83b0dd --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_dma_linux.cpp @@ -0,0 +1,141 @@ +// (c) 1992-2021 Intel Corporation. +// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words +// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. +// and/or other countries. 
Other marks and brands may be claimed as the property +// of others. See Trademarks on intel.com for full list of Intel trademarks or +// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) +// Your use of Intel Corporation's design tools, logic functions and other +// software and tools, and its AMPP partner logic functions, and any output +// files any of the foregoing (including device programming or simulation +// files), and any associated documentation or information are expressly subject +// to the terms and conditions of the Altera Program License Subscription +// Agreement, Intel MegaCore Function License Agreement, or other applicable +// license agreement, including, without limitation, that your use is for the +// sole purpose of programming logic devices manufactured by Intel and sold by +// Intel or its authorized distributors. Please refer to the applicable +// agreement for further details. + +/* ===- acl_pcie_dma_linux.cpp --------------------------------------- C++ -*-=== */ +/* */ +/* Intel(R) OpenCL MMD Driver */ +/* */ +/* ===-------------------------------------------------------------------------=== */ +/* */ +/* This file implements the class to handle Linux-specific DMA operations. */ +/* The declaration of the class lives in the acl_pcie_dma_linux.h */ +/* The actual implementation of DMA operation is inside the Linux kernel driver. 
*/ +/* */ +/* ===-------------------------------------------------------------------------=== */ + +#if defined(LINUX) + +// common and its own header files +#include "acl_pcie_dma_linux.h" +#include "acl_pcie.h" + +// other header files inside MMD driver +#include "acl_pcie_device.h" +#include "acl_pcie_mm_io.h" + +// other standard header files +#include <stdio.h> +#include <sys/time.h> +#include <unistd.h> + +ACL_PCIE_DMA::ACL_PCIE_DMA(fpga_handle dev, ACL_PCIE_MM_IO_MGR *io, ACL_PCIE_DEVICE *pcie) { + ACL_PCIE_ASSERT(dev != INVALID_DEVICE, "passed in an invalid device when creating dma object.\n"); + ACL_PCIE_ASSERT(io != NULL, "passed in an empty pointer for io when creating dma object.\n"); + ACL_PCIE_ASSERT(pcie != NULL, "passed in an empty pointer for pcie when creating dma object.\n"); + + m_handle = dev; + m_pcie = pcie; + m_io = io; + m_event = NULL; +} + +ACL_PCIE_DMA::~ACL_PCIE_DMA() { + struct acl_cmd driver_cmd = {ACLPCI_CMD_BAR, ACLPCI_CMD_DMA_STOP, NULL, NULL}; + int bytes_read = read(m_handle, &driver_cmd, sizeof(driver_cmd)); + ACL_PCIE_ASSERT(bytes_read != -1, "failed to read driver command \n"); +} + +bool ACL_PCIE_DMA::is_idle() { + unsigned int result = 0; + int bytes_read; + struct acl_cmd driver_cmd; + driver_cmd.bar_id = ACLPCI_CMD_BAR; + driver_cmd.command = ACLPCI_CMD_GET_DMA_IDLE_STATUS; + driver_cmd.device_addr = NULL; + driver_cmd.user_addr = &result; + driver_cmd.size = sizeof(result); + bytes_read = read(m_handle, &driver_cmd, sizeof(driver_cmd)); + + return (bytes_read != -1 && result != 0); +} + +// Perform operations required when a DMA interrupt comes +// For Linux, +// All of the DMA related interrupts are handled inside the kernel driver, +// so when MMD gets a signal from the kernel driver indicating DMA is finished, +// it only needs to call the event_update_fn when it's needed. 
+void ACL_PCIE_DMA::service_interrupt() { + if (m_event) { + // Use a temporary variable to save the event data and reset m_event + // before calling event_update_fn to avoid race condition that the main + // thread may start a new DMA transfer before this work-thread is able to + // reset the m_event. + // therefore, an assertion is implemented here, as defensively preventing + // sending interrupt signals incorrectly. + ACL_PCIE_ASSERT( + this->is_idle(), + "The dma is still in running, cannot service an interrupt to invoke another read/write operation\n"); + aocl_mmd_op_t temp_event = m_event; + m_event = NULL; + + m_pcie->event_update_fn(temp_event, 0); + } +} + +// relinquish the CPU to let any other thread to run +// return 0 since there is no useful work to be performed here +int ACL_PCIE_DMA::yield() { + usleep(0); + return 0; +} + +// Transfer data between host and device +// This function returns right after the transfer is scheduled +// Return 0 on success +int ACL_PCIE_DMA::read_write(void *host_addr, size_t dev_addr, size_t bytes, aocl_mmd_op_t e, bool reading) { + // Currently dma cannot operate multiple read/write the same time. + // This means the read/write should be executed if and only if the dma is idle. + // Otherwise, it would cause assertion failure in the kernel space of the OS, + // which result in hanging, and even kernel panic and machine frozen as worst case. + // An assertion is implemented here, as defensively preventing race condition or incorrect sending of signal. + ACL_PCIE_ASSERT(this->is_idle(), + "The dma is still in running, cannot perform another %s operation concurrently.\n", + reading ? "read" : "write"); + + m_event = e; + + // There are two scenarios of the read/write operation + // 1. the referred event is NULL, MMD would be stalled and keep polling the DMA until it is idle. + // 2. 
the referred event is valid, MMD would return immediately, runtime will wait for + // the DMA service interrupt signal to update the status of the read/write operation. + // + // Therefore, the dma service interrupt is expected only when the event is valid. + struct acl_cmd driver_cmd {}; + driver_cmd.bar_id = ACLPCI_DMA_BAR; + driver_cmd.command = m_event ? ACLPCI_CMD_DMA_SERVICE_SIGNAL : ACLPCI_CMD_DMA_NO_SIGNAL; + driver_cmd.device_addr = reinterpret_cast<void *>(dev_addr); + driver_cmd.user_addr = host_addr; + driver_cmd.size = bytes; + if (reading) { + if (read(m_handle, &driver_cmd, sizeof(driver_cmd)) == -1) return -1; // reading failed + } else { + if (write(m_handle, &driver_cmd, sizeof(driver_cmd)) == -1) return -1; + } + return 0; // success +} + +#endif // LINUX diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_dma_linux.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_dma_linux.h new file mode 100644 index 0000000..2ad1762 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_dma_linux.h @@ -0,0 +1,75 @@ +#ifndef ACL_PCIE_DMA_LINUX_H +#define ACL_PCIE_DMA_LINUX_H + +/* (c) 1992-2021 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. 
See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. */ + +/* ===- acl_pcie_dma_linux.h ----------------------------------------- C++ -*-=== */ +/* */ +/* Intel(R) OpenCL MMD Driver */ +/* */ +/* ===-------------------------------------------------------------------------=== */ +/* */ +/* This file declares the class to handle Linux-specific DMA operations. 
*/
+/* The actual implementation of the class lives in the acl_pcie_dma_linux.cpp */
+/* */
+/* ===-------------------------------------------------------------------------=== */
+
+#if defined(LINUX)
+
+#ifdef DLA_MMD
+#include <cstddef> //size_t
+#include "aocl_mmd.h"
+typedef int fpga_handle;
+#endif
+
+class ACL_PCIE_DEVICE;
+class ACL_PCIE_MM_IO_MGR;
+
+// Linux-side DMA engine front-end. One instance drives one device handle;
+// a single transfer may be in flight at a time (see read_write / is_idle).
+class ACL_PCIE_DMA {
+ public:
+  ACL_PCIE_DMA(fpga_handle dev, ACL_PCIE_MM_IO_MGR *io, ACL_PCIE_DEVICE *pcie);
+  ~ACL_PCIE_DMA();
+
+  // True when no DMA transfer is in flight.
+  bool is_idle();
+  // Busy-wait (yielding the CPU each iteration) until the DMA is idle.
+  void stall_until_idle() {
+    while (!is_idle()) yield();
+  };
+
+  // Perform operations required when a DMA interrupt comes
+  void service_interrupt();
+
+  // Relinquish the CPU to let any other thread to run
+  // Return 0 since there is no useful work to be performed here
+  int yield();
+
+  // Transfer data between host and device
+  // This function returns right after the transfer is scheduled
+  // Return 0 on success
+  // @param e  completion event; when NULL the MMD polls the DMA to
+  //           completion, otherwise the runtime is notified via interrupt.
+  int read_write(void *host_addr, size_t dev_addr, size_t bytes, aocl_mmd_op_t e, bool reading);
+
+ private:
+  // Completion event for the in-flight transfer; NULL when no interrupt
+  // notification was requested (polling mode) or nothing is pending.
+  aocl_mmd_op_t m_event;
+
+  // Device handle plus non-owning back-pointers to the owning device and
+  // its MMIO manager — presumably outlive this object; TODO confirm.
+  fpga_handle m_handle;
+  ACL_PCIE_DEVICE *m_pcie;
+  ACL_PCIE_MM_IO_MGR *m_io;
+};
+
+#endif // LINUX
+
+#endif // ACL_PCIE_DMA_LINUX_H
diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_dma_windows.cpp b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_dma_windows.cpp
new file mode 100644
index 0000000..ab5e7b2
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_dma_windows.cpp
@@ -0,0 +1,1381 @@
+// (c) 1992-2021 Intel Corporation.
+// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words
+// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S.
+// and/or other countries. Other marks and brands may be claimed as the property
+// of others.
See Trademarks on intel.com for full list of Intel trademarks or +// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) +// Your use of Intel Corporation's design tools, logic functions and other +// software and tools, and its AMPP partner logic functions, and any output +// files any of the foregoing (including device programming or simulation +// files), and any associated documentation or information are expressly subject +// to the terms and conditions of the Altera Program License Subscription +// Agreement, Intel MegaCore Function License Agreement, or other applicable +// license agreement, including, without limitation, that your use is for the +// sole purpose of programming logic devices manufactured by Intel and sold by +// Intel or its authorized distributors. Please refer to the applicable +// agreement for further details. + +/* ===- acl_pcie_dma_windows.cpp ------------------------------------- C++ -*-=== */ +/* */ +/* Intel(R) OpenCL MMD Driver */ +/* */ +/* ===-------------------------------------------------------------------------=== */ +/* */ +/* This file implements the class to handle Windows-specific DMA operations. */ +/* The declaration of the class lives in the acl_pcie_dma_windows.h */ +/* */ +/* ===-------------------------------------------------------------------------=== */ + +#if defined(WINDOWS) + +// common and its own header files +#include "acl_pcie.h" +#include "acl_pcie_dma_windows.h" +#include "hw_pcie_constants.h" + +// other header files inside MMD driver +#include "acl_pcie_device.h" +#include "acl_pcie_mm_io.h" +#include "acl_pcie_timer.h" +#include "acl_pcie_debug.h" +#include <iostream> +#include <stdlib.h> + +#define ACL_PCIE_DMA_DEBUG(m, ...) 
ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_BLOCKTX, m, __VA_ARGS__) + +// The callback function to be scheduled inside the interrupt handler +// It will release the semaphore to allow new work to be scheduled and +// perform the dma update function +void CALLBACK myWorkCallback(PTP_CALLBACK_INSTANCE instance, void *context, PTP_WORK work) { + ACL_PCIE_DMA *m_dma = (ACL_PCIE_DMA *)context; + + ReleaseSemaphore(m_dma->m_workqueue_semaphore, 1, NULL); + + m_dma->update(true); +} + +void CALLBACK myWorkUnpinCallback(PTP_CALLBACK_INSTANCE instance, void *context, PTP_WORK work) { + ACL_PCIE_DMA *m_dma = (ACL_PCIE_DMA *)context; + + m_dma->unpin_from_queue(); +} + +void CALLBACK myWorkPinCallback(PTP_CALLBACK_INSTANCE instance, void *context, PTP_WORK work) { + ACL_PCIE_DMA *m_dma = (ACL_PCIE_DMA *)context; + + m_dma->prepin_memory(); +} + +ACL_PCIE_DMA::ACL_PCIE_DMA(fpga_handle handle, ACL_PCIE_MM_IO_MGR *io, ACL_PCIE_DEVICE *pcie) + : hostch_data(), + m_table_virt_addr(NULL), + m_table_dma_addr(), + m_table_dma_phys_addr(0), + m_active_descriptor(NULL), + m_last_pinned_size(0), + m_last_pinned_addr(NULL), + m_prepinned(0), + m_last_id(0), + m_event(NULL), + m_dev_addr(0), + m_host_addr(NULL), + m_bytes(0), + m_bytes_sent(0), + m_bytes_rem(0), + m_read(0), + m_idle(0), + m_interrupt_disabled(0), + m_pcie(NULL), + m_io(NULL), + m_timer(NULL), + m_callback_env(), + m_work(NULL), + m_workqueue_semaphore(NULL), + m_dma_unpin_pending(), + m_unpin_callback_env(), + m_unpin_threadpool(NULL), + m_unpin_work(NULL), + m_pin_callback_env(), + m_pin_threadpool(NULL), + m_pin_work(NULL) { + ACL_PCIE_ASSERT(handle != INVALID_HANDLE_VALUE, "passed in an invalid device when creating dma object.\n"); + ACL_PCIE_ASSERT(io != NULL, "passed in an empty pointer for io when creating dma object.\n"); + ACL_PCIE_ASSERT(pcie != NULL, "passed in an empty pointer for pcie when creating dma object.\n"); + + m_handle = handle; + m_io = io; + m_pcie = pcie; + + HOSTCH_DESC *h = &hostch_data; + + const 
char *use_msi = getenv("ACL_PCIE_DMA_USE_MSI"); + if (use_msi) + m_use_polling = 0; + else + m_use_polling = 1; + + SecureZeroMemory(&m_active_mem, sizeof(PINNED_MEM)); + SecureZeroMemory(&m_pre_pinned_mem, sizeof(PINNED_MEM)); + SecureZeroMemory(&m_done_mem, sizeof(PINNED_MEM)); + + // Initialize Host Channel + SecureZeroMemory(&h->m_hostch_rd_mem, sizeof(PINNED_MEM)); + SecureZeroMemory(&h->m_hostch_wr_mem, sizeof(PINNED_MEM)); + SecureZeroMemory(&h->m_hostch_rd_pointer, sizeof(PINNED_MEM)); + SecureZeroMemory(&h->m_hostch_wr_pointer, sizeof(PINNED_MEM)); + SecureZeroMemory(&h->m_sync_thread_pointer, sizeof(PINNED_MEM)); + h->push_valid = 0; + h->pull_valid = 0; + + m_timer = new ACL_PCIE_TIMER(); + + // create the threadpool to perform work the interrupt + m_threadpool = CreateThreadpool(NULL); + ACL_PCIE_ERROR_IF(m_threadpool == NULL, return, "failed to create threadpool.\n"); + + // set the number of work threads to 1 + // so that no scheduled work will be running in parallel between them + SetThreadpoolThreadMaximum(m_threadpool, 1); + bool status = SetThreadpoolThreadMinimum(m_threadpool, 1); + ACL_PCIE_ERROR_IF(status == false, return, "failed to set # of work thread to 1.\n"); + + // create the work for threadpool and its semaphore + InitializeThreadpoolEnvironment(&m_callback_env); + SetThreadpoolCallbackPool(&m_callback_env, m_threadpool); + + m_work = CreateThreadpoolWork(myWorkCallback, (void *)this, &m_callback_env); + ACL_PCIE_ERROR_IF(m_work == NULL, return, "failed to create work for threadpool.\n"); + + m_workqueue_semaphore = CreateSemaphore(NULL, 1, 1, NULL); + ACL_PCIE_ERROR_IF(m_workqueue_semaphore == NULL, return, "failed to create semaphore.\n"); + + /////////////////////////////////////////////////////////////////////////////////////////// + // Unpin thread + m_unpin_threadpool = CreateThreadpool(NULL); + ACL_PCIE_ERROR_IF(m_unpin_threadpool == NULL, return, "failed to create threadpool.\n"); + + // set the number of work threads to 1 + // 
so that no scheduled work will be running in parallel between them + SetThreadpoolThreadMaximum(m_unpin_threadpool, 1); + status = SetThreadpoolThreadMinimum(m_unpin_threadpool, 1); + ACL_PCIE_ERROR_IF(status == false, return, "failed to set # of work thread to 1.\n"); + + // create the work for threadpool and its semaphore + InitializeThreadpoolEnvironment(&m_unpin_callback_env); + SetThreadpoolCallbackPool(&m_unpin_callback_env, m_unpin_threadpool); + + m_unpin_work = CreateThreadpoolWork(myWorkUnpinCallback, (void *)this, &m_unpin_callback_env); + ACL_PCIE_ERROR_IF(m_unpin_work == NULL, return, "failed to create work for unpin threadpool.\n"); + + /////////////////////////////////////////////////////////////////////////////////////////// + // pin thread + m_pin_threadpool = CreateThreadpool(NULL); + ACL_PCIE_ERROR_IF(m_pin_threadpool == NULL, return, "failed to create threadpool.\n"); + + // set the number of work threads to 1 + // so that no scheduled work will be running in parallel between them + SetThreadpoolThreadMaximum(m_pin_threadpool, 1); + status = SetThreadpoolThreadMinimum(m_pin_threadpool, 1); + ACL_PCIE_ERROR_IF(status == false, return, "failed to set # of work thread to 1.\n"); + + // create the work for threadpool and its semaphore + InitializeThreadpoolEnvironment(&m_pin_callback_env); + SetThreadpoolCallbackPool(&m_pin_callback_env, m_pin_threadpool); + + m_pin_work = CreateThreadpoolWork(myWorkPinCallback, (void *)this, &m_pin_callback_env); + ACL_PCIE_ERROR_IF(m_pin_work == NULL, return, "failed to create work for unpin threadpool.\n"); + + /////////////////////////////////////////////////////////////////////////////////////////// + // Contiguous DMA'able memory allocation for descriptor table + + fpga_result FPGA_status; + size_t desc_table_size = sizeof(struct DMA_DESC_TABLE); + size_t page_table_size = sizeof(struct HOSTCH_TABLE); + + // Lock DMA_DESC_TABLE using WsId + FPGA_status = fpgaPrepareBuffer( + m_handle, (UINT64)desc_table_size, 
(PVOID *)&m_table_virt_addr, &m_table_dma_addr.WsId, FPGA_BUF_QUIET); + ACL_PCIE_ASSERT(FPGA_status == FPGA_OK, "fpgaPrepareBuffer function failed.\n"); + + // IOCTL call to flush CPU buffers + FPGA_status = fpgaProcessDeviceCmd( + m_handle, GUID_TO_FPGA_GUID(GUID_PCI_OPENCL_SYNC_CPU_BUFFERS), &m_table_dma_addr.WsId, NULL, 0); + ACL_PCIE_ASSERT(FPGA_status == FPGA_OK, "fpgaProcessDeviceCmd function failed.\n"); + + // Obtain Physical address for the Page associated with the buffer + FPGA_status = fpgaGetPhysicalAddress(m_handle, m_table_dma_addr.WsId, (uint64_t *)&m_table_dma_addr.dwPages, NULL); + ACL_PCIE_ASSERT(FPGA_status == FPGA_OK, "fpgaGetPhysicalAddress function failed.\n"); + + // Allocate memory for SG List + m_table_dma_addr.Page = (sg_element *)malloc(m_table_dma_addr.dwPages * sizeof(sg_element)); + + // Throw an exception in case of malloc failure + if (m_table_dma_addr.Page == NULL) throw std::bad_alloc(); + + FPGA_status = fpgaGetPhysicalAddress( + m_handle, m_table_dma_addr.WsId, (uint64_t *)&m_table_dma_addr.dwPages, (void *)m_table_dma_addr.Page); + ACL_PCIE_ASSERT(FPGA_status == FPGA_OK, "fpgaGetPhysicalAddress function failed.\n"); + + ACL_PCIE_DMA_DEBUG(":::: [DMA] Successfully locked DMA descriptor table memory.\n"); + ACL_PCIE_ASSERT(m_table_dma_addr.dwPages == 1, "fpgaPrepareBuffer function allocated more than 1 page.\n"); + + if (m_table_dma_addr.Page != NULL) m_table_dma_phys_addr = m_table_dma_addr.Page[0].phys_addr; + + // Lock HOSTCH_TABLE push channel using WsId + FPGA_status = fpgaPrepareBuffer(m_handle, + (UINT64)page_table_size, + (PVOID *)&h->push_page_table, + &hostch_data.push_page_table_addr.WsId, + FPGA_BUF_QUIET); + ACL_PCIE_ASSERT(FPGA_status == FPGA_OK, "fpgaPrepareBuffer function failed.\n"); + + // IOCTL call to flush CPU buffers + FPGA_status = fpgaProcessDeviceCmd(m_handle, + GUID_TO_FPGA_GUID(GUID_PCI_OPENCL_SYNC_CPU_BUFFERS), + (PVOID)&hostch_data.push_page_table_addr.WsId, + NULL, + 0); + ACL_PCIE_ASSERT(FPGA_status 
== FPGA_OK, "fpgaProcessDeviceCmd function failed.\n"); + + // Obtain Physical address for the Page associated with the buffer + FPGA_status = fpgaGetPhysicalAddress( + m_handle, hostch_data.push_page_table_addr.WsId, (uint64_t *)&hostch_data.push_page_table_addr.dwPages, NULL); + ACL_PCIE_ASSERT(FPGA_status == FPGA_OK, "fpgaGetPhysicalAddress function failed.\n"); + + // Allocate memory for SG List + hostch_data.push_page_table_addr.Page = + (sg_element *)malloc(hostch_data.push_page_table_addr.dwPages * sizeof(sg_element)); + + // Throw an exception in case of malloc failure + if (hostch_data.push_page_table_addr.Page == NULL) throw std::bad_alloc(); + + FPGA_status = fpgaGetPhysicalAddress(m_handle, + hostch_data.push_page_table_addr.WsId, + (uint64_t *)&hostch_data.push_page_table_addr.dwPages, + (void *)hostch_data.push_page_table_addr.Page); + ACL_PCIE_ASSERT(FPGA_status == FPGA_OK, "fpgaGetPhysicalAddress function failed.\n"); + + ACL_PCIE_DMA_DEBUG(":::: [DMA] Successfully locked descriptor table for Hostchannel memory.\n"); + ACL_PCIE_ASSERT(hostch_data.push_page_table_addr.dwPages == 1, + "fpgaPrepareBuffer function for HostChannel allocated more than 1 page.\n"); + + if (hostch_data.push_page_table_addr.Page != NULL) + hostch_data.push_page_table_bus_addr = hostch_data.push_page_table_addr.Page[0].phys_addr; + + // Lock HOSTCH_TABLE pull channel + FPGA_status = fpgaPrepareBuffer(m_handle, + (UINT64)page_table_size, + (PVOID *)&h->pull_page_table, + &hostch_data.pull_page_table_addr.WsId, + FPGA_BUF_QUIET); + ACL_PCIE_ASSERT(FPGA_status == FPGA_OK, "fpgaPrepareBuffer function for Hostchannel failed. 
\n"); + + // IOCTL call to flush CPU buffers + FPGA_status = fpgaProcessDeviceCmd(m_handle, + GUID_TO_FPGA_GUID(GUID_PCI_OPENCL_SYNC_CPU_BUFFERS), + (PVOID)&hostch_data.pull_page_table_addr.WsId, + NULL, + 0); + ACL_PCIE_ASSERT(FPGA_status == FPGA_OK, "fpgaProcessDeviceCmd function failed.\n"); + + // Obtain Physical address for the Page associated with the buffer + FPGA_status = fpgaGetPhysicalAddress( + m_handle, hostch_data.pull_page_table_addr.WsId, (uint64_t *)&hostch_data.pull_page_table_addr.dwPages, NULL); + ACL_PCIE_ASSERT(FPGA_status == FPGA_OK, "fpgaGetPhysicalAddress function failed.\n"); + + // Allocate memory for SG List + hostch_data.pull_page_table_addr.Page = + (sg_element *)malloc(hostch_data.pull_page_table_addr.dwPages * sizeof(sg_element)); + + // Throw an exception in case of malloc failure + if (hostch_data.pull_page_table_addr.Page == NULL) throw std::bad_alloc(); + + FPGA_status = fpgaGetPhysicalAddress(m_handle, + hostch_data.pull_page_table_addr.WsId, + (uint64_t *)&hostch_data.pull_page_table_addr.dwPages, + (void *)hostch_data.pull_page_table_addr.Page); + ACL_PCIE_ASSERT(FPGA_status == FPGA_OK, "fpgaGetPhysicalAddress function failed.\n"); + + ACL_PCIE_DMA_DEBUG(":::: [DMA] Successfully locked descriptor table memory.\n"); + ACL_PCIE_ASSERT(hostch_data.pull_page_table_addr.dwPages == 1, + "fpgaPrepareBuffer function for HostChannel allocated more than 1 page.\n"); + + if (hostch_data.pull_page_table_addr.Page != NULL) + hostch_data.pull_page_table_bus_addr = hostch_data.pull_page_table_addr.Page[0].phys_addr; + + // set idle status to true when finish initialization + m_idle = true; +} + +ACL_PCIE_DMA::~ACL_PCIE_DMA() { + fpga_result FPGA_status; + stall_until_idle(); + + // make sure no more work queued for threadpool + WaitForThreadpoolWorkCallbacks(m_work, FALSE); + + // hostch_destroy is expected to be called by user but to make sure, call in the destructor + hostch_destroy(ACL_HOST_CHANNEL_0_ID); + 
hostch_destroy(ACL_HOST_CHANNEL_1_ID); + + // Unlock all the previously allocated tables from the constructor + FPGA_status = fpgaReleaseBuffer(m_handle, m_table_dma_addr.WsId); + ACL_PCIE_ASSERT(FPGA_status == FPGA_OK, "fpgaReleaseBuffer was not successful\n"); + + if (m_table_dma_addr.Page != NULL) { + free(m_table_dma_addr.Page); + m_table_dma_addr.Page = NULL; + } + + FPGA_status = fpgaReleaseBuffer(m_handle, hostch_data.push_page_table_addr.WsId); + ACL_PCIE_ASSERT(FPGA_status == FPGA_OK, "fpgaReleaseBuffer was not successful\n"); + + if (hostch_data.push_page_table_addr.Page != NULL) { + free(hostch_data.push_page_table_addr.Page); + hostch_data.push_page_table_addr.Page = NULL; + } + + FPGA_status = fpgaReleaseBuffer(m_handle, hostch_data.pull_page_table_addr.WsId); + ACL_PCIE_ASSERT(FPGA_status == FPGA_OK, "fpgaReleaseBuffer was not successful\n"); + + if (hostch_data.pull_page_table_addr.Page != NULL) { + free(hostch_data.pull_page_table_addr.Page); + hostch_data.pull_page_table_addr.Page = NULL; + } + + CloseHandle(m_workqueue_semaphore); + CloseThreadpoolWork(m_work); + CloseThreadpool(m_threadpool); + + CloseThreadpoolWork(m_unpin_work); + CloseThreadpool(m_unpin_threadpool); + + CloseThreadpoolWork(m_pin_work); + CloseThreadpool(m_pin_threadpool); + + if (m_timer) { + delete m_timer; + m_timer = NULL; + } +} + +int ACL_PCIE_DMA::check_dma_interrupt(unsigned int *dma_update) { + if (!m_use_polling) { + if (m_last_id > 0 && m_last_id <= ACL_PCIE_DMA_DESC_MAX_ENTRIES) { + *dma_update = (m_table_virt_addr->header.flags[m_last_id - 1]); + } else { + return 1; + } + } + return 0; +} + +void ACL_PCIE_DMA::unpin_from_queue() { + fpga_result result; + ACL_PCIE_ASSERT(!m_dma_unpin_pending.empty(), "m_dma_unpin_pending is empty but unpin mem thread was called\n"); + + QUEUE_STRUCT entry; + + entry = m_dma_unpin_pending.front(); + m_dma_unpin_pending.pop(); + + // IOCTL call to flush IO buffers + result = fpgaProcessDeviceCmd( + m_handle, 
GUID_TO_FPGA_GUID(GUID_PCI_OPENCL_SYNC_IO_BUFFERS), (PVOID) & (entry.WsId), NULL, 0); + ACL_PCIE_ASSERT(result == FPGA_OK, "fpgaProcessDeviceCmd function failed.\n"); + + // Unlock the allocated tables associated with wsId + result = fpgaReleaseBuffer(m_handle, entry.WsId); + ACL_PCIE_ASSERT(result == FPGA_OK, "fpgaReleaseBuffer function failed.\n"); + if (entry.SGListPtr != NULL) free(entry.SGListPtr); +} + +void ACL_PCIE_DMA::prepin_memory() { pin_memory(&m_pre_pinned_mem, true); } + +void ACL_PCIE_DMA::wait_finish() { + UINT32 wait_timer; + + while (1) { + wait_timer = ACL_PCIE_DMA_TIMEOUT; + while (wait_timer > 0) { + wait_timer--; + + if (m_table_virt_addr->header.flags[m_last_id - 1] == 1) { + ACL_PCIE_DMA_DEBUG(":::: [DMA] Wait done\n"); + set_desc_table_header(); + if (WaitForSingleObject(m_workqueue_semaphore, 0L) == WAIT_OBJECT_0) { + SubmitThreadpoolWork(m_work); + } + return; + } + } + + ACL_PCIE_DMA_DEBUG(":::: [DMA] Wait timed out. Sleeping for 1ms.\n"); + Sleep(1); + } +} + +#if defined(GEN3_x16) + // Add extra descriptor for DMA controller to report 'done status' in the DMA table +void ACL_PCIE_DMA::add_extra_dma_desc() { + /* + One extra descriptor is required to be fetched. Two if using interrupts. + For reads (Host <-- FPGA), the last descriptor sets the DMA done status. + For writes (Host --> FPGA), the last descriptor fetches the status + descriptor which then sets the DMA done status. + When using interrupts, there is an additional descriptor that sends the + interrupt, handled in the same way as the above. + */ + // Clear done status flag. + m_table_virt_addr->header.flags[m_last_id - 1] = 0; // ID = m_last_id - 1 + + if (m_read) { + // descriptor[m_last_id]: write 0x1ULL to flags[m_last_id-1] which is used to indicate DMA done. 
+ set_immediate_desc( // Set status bit + &(m_table_virt_addr->descriptors[m_last_id]), // descriptor[m_last_id] location in user space + m_table_dma_phys_addr + 4*(m_last_id - 1), // physical address for 0x1ULL to write (flags[m_last_id].. flag filed size is 4 byte) + 0x1ULL, + 255 + ); + } else { + // Need to fetch status desc into different destination. + // descriptor[m_last_id]: DMA Descriptor[m_last_id+1](32 byte) to WDP register set in DMA controller. + m_active_descriptor = &(m_table_virt_addr->descriptors[m_last_id]); + set_read_desc(m_table_dma_phys_addr + sizeof(DMA_DESC_HEADER) + (m_last_id + 1) * 32, // src: set_immediate_desc descriptor location + WRITE_DESC_PRIO_OFFSET + DESC_OFFSET, // des, location of WDP register set + 32/4 // copy 32-byte, 8 word + ); + + // descriptor[m_last_id+1]: write 0x1ULL(4-byte) to status[m_last_id-1] which is used to indicate DMA done. + set_immediate_desc( // Set status bit + &(m_table_virt_addr->descriptors[m_last_id + 1]), + m_table_dma_phys_addr + 4*(m_last_id - 1), //4: size per status entry + 0x1ULL, + 255 + ); + } + MemoryBarrier(); +} +#endif + +void ACL_PCIE_DMA::send_dma_desc() { + // Disabling interrupt is used in hostch_create function during polling +#if defined(GEN3_x8) + if (m_read) { + m_io->dma->write32(ACL_PCIE_DMA_RC_WR_DESC_BASE_LOW, m_table_dma_phys_addr & 0xffffffffUL); + m_io->dma->write32(ACL_PCIE_DMA_RC_WR_DESC_BASE_HIGH, m_table_dma_phys_addr >> 32); + m_io->dma->write32(ACL_PCIE_DMA_EP_WR_FIFO_BASE_LOW, ACL_PCIE_DMA_ONCHIP_WR_FIFO_BASE_LO); + m_io->dma->write32(ACL_PCIE_DMA_EP_WR_FIFO_BASE_HIGH, ACL_PCIE_DMA_ONCHIP_WR_FIFO_BASE_HI); + m_io->dma->write32(ACL_PCIE_DMA_WR_TABLE_SIZE, ACL_PCIE_DMA_TABLE_SIZE - 1); + if (m_interrupt_disabled) + m_io->dma->write32(ACL_PCIE_DMA_WR_INT_CONTROL, ACL_PCIE_DMA_DISABLE_INT); + else + m_io->dma->write32(ACL_PCIE_DMA_WR_INT_CONTROL, ACL_PCIE_DMA_ENABLE_INT); + MemoryBarrier(); + m_io->dma->write32(ACL_PCIE_DMA_WR_LAST_PTR, m_last_id - 1); + } else { + 
m_io->dma->write32(ACL_PCIE_DMA_RC_RD_DESC_BASE_LOW, m_table_dma_phys_addr & 0xffffffffUL); + m_io->dma->write32(ACL_PCIE_DMA_RC_RD_DESC_BASE_HIGH, m_table_dma_phys_addr >> 32); + m_io->dma->write32(ACL_PCIE_DMA_EP_RD_FIFO_BASE_LOW, ACL_PCIE_DMA_ONCHIP_RD_FIFO_BASE_LO); + m_io->dma->write32(ACL_PCIE_DMA_EP_RD_FIFO_BASE_HIGH, ACL_PCIE_DMA_ONCHIP_RD_FIFO_BASE_HI); + m_io->dma->write32(ACL_PCIE_DMA_RD_TABLE_SIZE, ACL_PCIE_DMA_TABLE_SIZE - 1); + if (m_interrupt_disabled) + m_io->dma->write32(ACL_PCIE_DMA_RD_INT_CONTROL, ACL_PCIE_DMA_DISABLE_INT); + else + m_io->dma->write32(ACL_PCIE_DMA_RD_INT_CONTROL, ACL_PCIE_DMA_ENABLE_INT); + MemoryBarrier(); + m_io->dma->write32(ACL_PCIE_DMA_RD_LAST_PTR, m_last_id - 1); + } +#elif defined(GEN3_x16) + DMA_DESC_ENTRY dt_fetch_desc; + UINT32 ctrl, *pValue32; + UINT64 dt_fetch_queue_addr64; + int i; + + add_extra_dma_desc(); + // init a descriptor for start dma + dt_fetch_desc.src_addr = m_table_dma_phys_addr + sizeof(DMA_DESC_HEADER); // physical addrees of first desciptor (assume dma always start from ID 0) + dt_fetch_desc.dst_addr = m_read ? WRITE_DESC_NORM_OFFSET : READ_DESC_NORM_OFFSET; + dt_fetch_desc.dst_addr += DESC_OFFSET; + ctrl = ((m_last_id - 1) + 2) * 8; // interrupt is not enabled case ... (ID+3)*8 if interrupted is enabled (note: ID = m_last_id-1) + ctrl |= 1 << 20; // Single destination + ctrl |= 0xFE << 24; // Special descriptor ID + dt_fetch_desc.ctrl = ctrl; + + dt_fetch_queue_addr64 = m_read ? READ_DESC_PRIO_OFFSET : READ_DESC_NORM_OFFSET; + pValue32 = (UINT32 *)(&dt_fetch_desc); + for (i = 0; i < 4; i++) { + m_io->dma->write32(DESC_CTRLLER_BASE + dt_fetch_queue_addr64 + i * 4, *(pValue32 + i)); + } + // Most significant DWord must be written last. 
+ MemoryBarrier(); + m_io->dma->write32(DESC_CTRLLER_BASE + dt_fetch_queue_addr64 + 4 * 4,*(((uint32_t *)(&dt_fetch_desc)) + 4)); + MemoryBarrier(); +#else + #error "Define a PCIe 3.0/4.0/5.0 slot with x1, x2, x4, x8, and x16 lanes option" +#endif +} + +void ACL_PCIE_DMA::setup_dma_desc() { +#if defined(GEN3_x8) + m_io->dma->write32(ACL_PCIE_DMA_RC_WR_DESC_BASE_LOW, m_table_dma_phys_addr & 0xffffffffUL); + m_io->dma->write32(ACL_PCIE_DMA_RC_WR_DESC_BASE_HIGH, m_table_dma_phys_addr >> 32); + m_io->dma->write32(ACL_PCIE_DMA_EP_WR_FIFO_BASE_LOW, ACL_PCIE_DMA_ONCHIP_WR_FIFO_BASE_LO); + m_io->dma->write32(ACL_PCIE_DMA_EP_WR_FIFO_BASE_HIGH, ACL_PCIE_DMA_ONCHIP_WR_FIFO_BASE_HI); + m_io->dma->write32(ACL_PCIE_DMA_WR_TABLE_SIZE, ACL_PCIE_DMA_TABLE_SIZE - 1); + + m_io->dma->write32(ACL_PCIE_DMA_RC_RD_DESC_BASE_LOW, m_table_dma_phys_addr & 0xffffffffUL); + m_io->dma->write32(ACL_PCIE_DMA_RC_RD_DESC_BASE_HIGH, m_table_dma_phys_addr >> 32); + m_io->dma->write32(ACL_PCIE_DMA_EP_RD_FIFO_BASE_LOW, ACL_PCIE_DMA_ONCHIP_RD_FIFO_BASE_LO); + m_io->dma->write32(ACL_PCIE_DMA_EP_RD_FIFO_BASE_HIGH, ACL_PCIE_DMA_ONCHIP_RD_FIFO_BASE_HI); + m_io->dma->write32(ACL_PCIE_DMA_RD_TABLE_SIZE, ACL_PCIE_DMA_TABLE_SIZE - 1); +#endif +} + +void ACL_PCIE_DMA::set_read_desc(DMA_ADDR source, UINT64 dest, UINT32 ctl_dma_len) { +#if defined(GEN3_x8) + m_active_descriptor->src_addr_ldw = (source & 0xffffffffUL); + m_active_descriptor->src_addr_udw = (source >> 32); + m_active_descriptor->dest_addr_ldw = (dest & 0xffffffffUL); + m_active_descriptor->dest_addr_udw = (dest >> 32); + m_active_descriptor->ctl_dma_len = (ctl_dma_len | (m_last_id << 18)); + m_active_descriptor->reserved[0] = 0; + m_active_descriptor->reserved[1] = 0; + m_active_descriptor->reserved[2] = 0; +#elif defined(GEN3_x16) + m_active_descriptor->src_addr = source; + m_active_descriptor->dst_addr = dest; + m_active_descriptor->ctrl = (ctl_dma_len | (m_last_id << 24)); + m_active_descriptor->reserved[0] = 0; + m_active_descriptor->reserved[1] 
= 0; + m_active_descriptor->reserved[2] = 0; +#else + #error "Define a PCIe 3.0/4.0/5.0 slot with x1, x2, x4, x8, and x16 lanes option" +#endif +} + +void ACL_PCIE_DMA::set_write_desc(UINT64 source, DMA_ADDR dest, UINT32 ctl_dma_len) { +#if defined(GEN3_x8) + m_active_descriptor->src_addr_ldw = (source & 0xffffffffUL); + m_active_descriptor->src_addr_udw = (source >> 32); + m_active_descriptor->dest_addr_ldw = (dest & 0xffffffffUL); + m_active_descriptor->dest_addr_udw = (dest >> 32); + m_active_descriptor->ctl_dma_len = (ctl_dma_len | (m_last_id << 18)); + m_active_descriptor->reserved[0] = 0; + m_active_descriptor->reserved[1] = 0; + m_active_descriptor->reserved[2] = 0; +#elif defined(GEN3_x16) + set_read_desc(source, dest, ctl_dma_len); +#else + #error "Define a PCIe 3.0/4.0/5.0 slot with x1, x2, x4, x8, and x16 lanes option" +#endif +} + +#if defined(GEN3_x16) +void ACL_PCIE_DMA::set_immediate_desc(DMA_DESC_ENTRY *desc, UINT64 addr, UINT32 data, UINT32 id) { + uint32_t ctrl; + + desc->src_addr = data; // The data to write to given address + desc->dst_addr = addr; + ctrl = 1; // 1 DW status + ctrl |= 1 << 18; // Immediate access + ctrl |= id << 24; // Status descriptor ID + desc->ctrl = ctrl; + desc->reserved[0] = 0x0; + desc->reserved[1] = 0x0; + desc->reserved[2] = 0x0; +} +#endif + +void ACL_PCIE_DMA::set_hostch_page_entry(HOSTCH_ENTRY *page_entry, UINT64 page_addr, UINT32 page_num) { + page_entry->page_addr_ldw = (page_addr & 0xffffffffUL); + page_entry->page_addr_udw = (page_addr >> 32); + page_entry->page_num = page_num; + page_entry->reserved[0] = 0; + page_entry->reserved[1] = 0; + page_entry->reserved[2] = 1; + page_entry->reserved[3] = 0; + page_entry->reserved[4] = 0; +} + +void ACL_PCIE_DMA::set_desc_table_header() { + int i; + for (i = 0; i < ACL_PCIE_DMA_DESC_MAX_ENTRIES; i++) m_table_virt_addr->header.flags[i] = 0; +} + +// Perform operations required when a DMA interrupt comes +void ACL_PCIE_DMA::service_interrupt() { + if (!m_use_polling) { + 
// only submit a new work to the pool when there is not work in queued + if (WaitForSingleObject(m_workqueue_semaphore, 0L) == WAIT_OBJECT_0) { + set_desc_table_header(); + SubmitThreadpoolWork(m_work); + } + } +} + +void ACL_PCIE_DMA::spin_loop_ns(UINT64 wait_ns) { + cl_ulong start = m_timer->get_time_ns(); + cl_ulong finish; + + do { + finish = m_timer->get_time_ns(); + } while (finish - start < wait_ns); +} + +void ACL_PCIE_DMA::check_last_id(UINT32 *last_id) { + ACL_PCIE_ASSERT(*last_id <= (ACL_PCIE_DMA_RESET_ID + 1), "last id was greater than 255.\n"); + + if (*last_id == (ACL_PCIE_DMA_RESET_ID + 1)) { + *last_id = 0; + return; + } else if (*last_id == ACL_PCIE_DMA_TABLE_SIZE) { + *last_id = 0; + return; + } + ACL_PCIE_ASSERT(*last_id < (ACL_PCIE_DMA_TABLE_SIZE), "last id was greater than 127.\n"); +} + +// Relinquish the CPU to let any other thread to run +// Return 0 since there is no useful work to be performed here +int ACL_PCIE_DMA::yield() { + Sleep(0); + return 0; +} + +// Add a byte-offset to a void* pointer +inline void *ACL_PCIE_DMA::compute_address(void *base, uintptr_t offset) { + uintptr_t p = reinterpret_cast<uintptr_t>(base); + return reinterpret_cast<void *>(p + offset); +} + +int ACL_PCIE_DMA::hostch_buffer_lock(void *addr, size_t len, PINNED_MEM *new_mem) { + fpga_result FPGA_status; + UINT64 wsid; + + // No active segment of pinned memory - pin one + + // Lock HOSTCH_TABLE using WsId + FPGA_status = fpgaPrepareBuffer(m_handle, (UINT64)len, (PVOID *)&addr, &wsid, FPGA_BUF_PREALLOCATED); + ACL_PCIE_ASSERT(FPGA_status == FPGA_OK, "HostCh : fpgaPrepareBuffer function for Hostchannel failed.\n"); + + // Obtain Physical address for the Page associated with the buffer + FPGA_status = fpgaGetPhysicalAddress(m_handle, wsid, (PUINT64)&new_mem->pages_rem, NULL); + ACL_PCIE_ASSERT(FPGA_status == FPGA_OK, "HostCh : fpgaGetPhysicalAddress function for Hostchannel failed.\n"); + + new_mem->dma_page = (sg_element *)malloc(new_mem->pages_rem * 
sizeof(sg_element)); + + FPGA_status = fpgaGetPhysicalAddress(m_handle, wsid, (PUINT64)&new_mem->pages_rem, (void *)new_mem->dma_page); + ACL_PCIE_ASSERT(FPGA_status == FPGA_OK, "HostCh : fpgaGetPhysicalAddress function for Hostchannel failed.\n"); + + new_mem->WsId = wsid; + new_mem->UsrVa = (PVOID)addr; + new_mem->next_page = new_mem->dma_page; + + // IOCTL call to flush CPU buffers + FPGA_status = + fpgaProcessDeviceCmd(m_handle, GUID_TO_FPGA_GUID(GUID_PCI_OPENCL_SYNC_CPU_BUFFERS), (PVOID)&wsid, NULL, 0); + ACL_PCIE_ASSERT(FPGA_status == FPGA_OK, "fpgaProcessDeviceCmd function failed.\n"); + ACL_PCIE_DMA_DEBUG(":::: [DMA] HostCh Pinning 0x%zx bytes at 0x%p.\n", len, addr); + + return 0; +} + +// Only 1 pin_memory can be running at a time +void ACL_PCIE_DMA::pin_memory(PINNED_MEM *new_mem, bool prepin) { + fpga_result result; + UINT64 wsid = 0x0; + + // No active segment of pinned memory - pin one + m_bytes_rem = prepin ? (m_bytes_rem - m_last_pinned_size) : (m_bytes - m_bytes_sent); + UINT32 last_id = prepin ? 0 : m_last_id; + check_last_id(&last_id); + size_t last_id_size_offset = last_id * PAGE_SIZE; + size_t lock_size = (m_bytes_rem > ACL_PCIE_DMA_MAX_PINNED_MEM_SIZE - last_id_size_offset) + ? ACL_PCIE_DMA_MAX_PINNED_MEM_SIZE - last_id_size_offset + : m_bytes_rem; + void *lock_addr = + prepin ? compute_address(m_last_pinned_addr, m_last_pinned_size) : compute_address(m_host_addr, m_bytes_sent); + uintptr_t last_page_portion = (reinterpret_cast<uintptr_t>(lock_addr) + lock_size) & ACL_PCIE_DMA_PAGE_ADDR_MASK; + + // If doing max pinning, check if will *end* on page boundary. If not, better + // to pin a bit less and end up on the boundary. This way, will have fewer + // descriptors to send. 
+ if (lock_size == (ACL_PCIE_DMA_MAX_PINNED_MEM_SIZE - last_id_size_offset) && last_page_portion != 0) { + lock_size -= (size_t)last_page_portion; + } + + assert(lock_size < MAXDWORD); + + // Lock memory using WsId + result = fpgaPrepareBuffer(m_handle, (UINT64)lock_size, (PVOID *)&lock_addr, &wsid, FPGA_BUF_PREALLOCATED); + ACL_PCIE_ASSERT(result == FPGA_OK, "HostCh : fpgaPrepareBuffer function failed.\n"); + + // Obtain Physical address for the Page associated with the buffer + result = fpgaGetPhysicalAddress(m_handle, wsid, (PUINT64)&new_mem->pages_rem, NULL); + ACL_PCIE_ASSERT(result == FPGA_OK, "fpgaGetPhysicalAddress function failed.\n"); + + new_mem->dma_page = (sg_element *)malloc(new_mem->pages_rem * sizeof(sg_element)); + + result = fpgaGetPhysicalAddress(m_handle, wsid, (PUINT64)&new_mem->pages_rem, (void *)new_mem->dma_page); + ACL_PCIE_ASSERT(result == FPGA_OK, "fpgaGetPhysicalAddress function failed.\n"); + + new_mem->WsId = wsid; + new_mem->UsrVa = (PVOID)lock_addr; + new_mem->next_page = new_mem->dma_page; + + // IOCTL call to flush CPU buffers + result = fpgaProcessDeviceCmd(m_handle, GUID_TO_FPGA_GUID(GUID_PCI_OPENCL_SYNC_CPU_BUFFERS), (PVOID)&wsid, NULL, 0); + ACL_PCIE_ASSERT(result == FPGA_OK, "fpgaProcessDeviceCmd function failed.\n"); + + m_last_pinned_size = lock_size; + m_last_pinned_addr = lock_addr; + + ACL_PCIE_DMA_DEBUG(":::: [DMA] Pinning 0x%zx bytes at 0x%p.\n", lock_size, lock_addr); +} + +// Unpin Memory +void ACL_PCIE_DMA::unpin_memory(PINNED_MEM *old_mem) { + fpga_result result = FPGA_OK; + UINT64 wsId = old_mem->WsId; + + // IOCTL call to flush I/O buffers + result = fpgaProcessDeviceCmd(m_handle, GUID_TO_FPGA_GUID(GUID_PCI_OPENCL_SYNC_IO_BUFFERS), (PVOID)&wsId, NULL, 0); + ACL_PCIE_ASSERT(result == FPGA_OK, "fpgaProcessDeviceCmd function failed.\n"); + + // UnLock previously locked memory using WsId + result = fpgaReleaseBuffer(m_handle, wsId); + ACL_PCIE_ASSERT(result == FPGA_OK, "fpgaReleaseBuffer function failed.\n"); + + if 
(old_mem->dma_page != NULL) free(old_mem->dma_page); + + old_mem->next_page = NULL; + old_mem->dma_page = NULL; + old_mem->pages_rem = 0; + old_mem->UsrVa = NULL; +} + +// Check if user's 'ack' API updated end pointer of circular buf +// Update end pointer in IP +int ACL_PCIE_DMA::hostch_push_update() { + HOSTCH_DESC *h = &hostch_data; + + if (h->rd_buf_end_pointer != *h->user_rd_end_pointer) { + h->rd_buf_end_pointer = *h->user_rd_end_pointer; + } else { + h->loop_counter = (h->loop_counter > 0) ? h->loop_counter - 1 : h->loop_counter; + return 1; + } + h->loop_counter = HOSTCH_LOOP_COUNTER; + + m_io->dma->write32(ACL_HOST_CHANNEL_0_HOST_ENDP, (UINT32)h->rd_buf_end_pointer); + + return 0; +} + +// Check if user's 'ack' API updated front pointer of circular buf +// Update end pointer in IP +int ACL_PCIE_DMA::hostch_pull_update() { + HOSTCH_DESC *h = &hostch_data; + + if (h->wr_buf_front_pointer != *h->user_wr_front_pointer) { + h->wr_buf_front_pointer = *h->user_wr_front_pointer; + } else { + h->loop_counter = (h->loop_counter > 0) ? 
h->loop_counter - 1 : h->loop_counter; + return 1; + } + h->loop_counter = HOSTCH_LOOP_COUNTER; + + m_io->dma->write32(ACL_HOST_CHANNEL_1_HOST_FRONTP, (UINT32)h->wr_buf_front_pointer); + return 0; +} + +// Transfer data between host and device +// This function returns right after the transfer is scheduled +// Return 0 on success +int ACL_PCIE_DMA::read_write(void *host_addr, size_t dev_addr, size_t bytes, aocl_mmd_op_t e, bool reading) { + ACL_PCIE_ASSERT(m_event == NULL, "non-empty event before a new DMA read/write.\n"); + + // Copy the parameters over and mark the job as running + m_event = e; + m_read = reading; + m_bytes = bytes; + m_host_addr = host_addr; + m_dev_addr = dev_addr; + + // Start processing the request + m_bytes_sent = 0; + m_last_id = ACL_PCIE_DMA_RESET_ID; + m_prepinned = 0; + +#if defined(GEN3_x8) + if (m_read) { + m_io->dma->read32(ACL_PCIE_DMA_WR_LAST_PTR, &m_last_id); + m_last_id++; + } else { + m_io->dma->read32(ACL_PCIE_DMA_RD_LAST_PTR, &m_last_id); + m_last_id++; + } + +#elif defined(GEN3_x16) + m_last_id = 0; +#else + #error "Define a PCIe 3.0/4.0/5.0 slot with x1, x2, x4, x8, and x16 lanes option" +#endif + m_idle = false; + + // setup the work inside the threadpool to perform the first DMA transaction + ACL_PCIE_ERROR_IF(WaitForSingleObject(m_workqueue_semaphore, 0L) != WAIT_OBJECT_0, + return -1, + "failed to schedule the first work for DMA read/write.\n"); + + SubmitThreadpoolWork(m_work); + + return 0; // success +} + +// function to be scheduled to execute whenever an interrupt arrived +bool ACL_PCIE_DMA::update(bool forced) { + cl_ulong start; + int status; + UINT32 max_transfer; + unsigned int i; + HOSTCH_DESC *h = &hostch_data; + size_t current_transfer_size = 0; + + if (!forced) return false; + + if (h->pull_valid && m_idle) { + // Check user memory to see if there was update to user buffer pointer for pull + status = hostch_pull_update(); + } + + if (h->push_valid && m_idle) { + // Check user memory to see if there was update 
to user buffer pointer for push + status = hostch_push_update(); + } + + if ((h->push_valid | h->pull_valid) && m_idle && (h->thread_sync_valid && h->loop_counter > 0)) { + // setup the work inside the threadpool to perform the first DMA transaction + ACL_PCIE_ERROR_IF(WaitForSingleObject(m_workqueue_semaphore, 0L) != WAIT_OBJECT_0, + return false, + "HostCh : failed to schedule the first work for DMA read/write.\n"); + SubmitThreadpoolWork(m_work); + return false; + + } else if (m_idle && (h->thread_sync_valid && h->loop_counter == 0)) { + *h->user_thread_sync = 0; + return false; + + } else if (m_idle) { + return false; + } + + ACL_PCIE_DMA_DEBUG(":::: [DMA] Bytes left %zu\n", m_bytes - m_bytes_sent); + // Process any descriptors that have completed + set_desc_table_header(); + cl_ulong finish = 0; + if (ACL_PCIE_DEBUG >= VERBOSITY_BLOCKTX) finish = m_timer->get_time_ns(); + + // Check if the transaction is complete + if (m_bytes_sent == m_bytes) { + if (m_active_mem.UsrVa != NULL) unpin_memory(&m_active_mem); + ACL_PCIE_DMA_DEBUG(":::: [DMA] Transaction complete!\n"); + ACL_PCIE_ASSERT(m_active_mem.UsrVa == NULL, "there is still active pinned memory after the DMA read/write.\n"); + WaitForThreadpoolWorkCallbacks(m_unpin_work, false); + if (!m_dma_unpin_pending.empty()) { + ACL_PCIE_DMA_DEBUG(":::: [DMA] Done, but pinned memory still in queue. Wait until queue is empty.\n"); + if (WaitForSingleObject(m_workqueue_semaphore, 0L) == WAIT_OBJECT_0) { + SubmitThreadpoolWork(m_work); + } + + Sleep(0); + return true; + } + + m_last_id = ACL_PCIE_DMA_RESET_ID; + m_idle = true; + + if (m_event) { + // Use a temporary variable to save the event data and reset m_event before calling event_update_fn + // to avoid race condition that the main thread may start a new DMA transfer before this work-thread + // is able to reset the m_event. 
+ aocl_mmd_op_t temp_event = m_event; + m_event = NULL; + + m_pcie->event_update_fn(temp_event, 0); + } + + if ((h->push_valid | h->pull_valid) && (h->thread_sync_valid && h->loop_counter > 0)) { + ACL_PCIE_ERROR_IF(WaitForSingleObject(m_workqueue_semaphore, 0L) != WAIT_OBJECT_0, + return false, + "HostCh : failed to schedule the first work for DMA read/write.\n"); + SubmitThreadpoolWork(m_work); + } + + return true; + } + + // Check if we are done with previously pinned memory. + if (m_active_mem.UsrVa == NULL || m_active_mem.pages_rem == 0) { + m_done_mem = m_active_mem; + + WaitForThreadpoolWorkCallbacks(m_pin_work, false); + + // Get pre-pinned memory if there are any. + if (m_pre_pinned_mem.UsrVa != NULL) { + m_active_mem = m_pre_pinned_mem; + m_pre_pinned_mem.UsrVa = NULL; + m_prepinned = 0; + } else if (m_prepinned) { + if (WaitForSingleObject(m_workqueue_semaphore, 0L) == WAIT_OBJECT_0) { + SubmitThreadpoolWork(m_work); + } + Sleep(1); + return true; + } else { + pin_memory(&m_active_mem, false); + } + } + + // Main DMA execution + // 1. Transfers up to 128 descriptors + // - Each descriptor can transfer up to ACL_PCIE_DMA_MAX_TRANSFER_SIZE bytes + // 2. Launch a thread to unpin memory + // 3. 
Launch a thread to pre-pin next memory + if (m_active_mem.pages_rem > 0) { + // Calculate how many descriptors can be sent + check_last_id(&m_last_id); + ACL_PCIE_DMA_DEBUG(":::: [DMA] last id was %u\n", m_last_id); + max_transfer = ACL_PCIE_DMA_TABLE_SIZE - m_last_id; + + ACL_PCIE_DMA_DEBUG(":::: [DMA] max_transfer %u\n", max_transfer); + + // Build descriptor table + for (i = 0; i < max_transfer; i++) { + if (strcmp(ACL_BSP_TYPE, "Arria10") == 0) { + // A10 DMA + m_active_descriptor = &(m_table_virt_addr->descriptors[i]); + }; + if (strcmp(ACL_BSP_TYPE, "Stratix10") == 0) { + // S10 DMA + m_active_descriptor = &(m_table_virt_addr->descriptors[m_last_id]); + }; + if (m_read) { + if (m_active_mem.next_page->length > ACL_PCIE_DMA_MAX_TRANSFER_SIZE) { + ACL_PCIE_DMA_DEBUG(":::: [DMA] page size is larger than %u for read. Page size is %u bytes\n", + ACL_PCIE_DMA_MAX_TRANSFER_SIZE, + m_active_mem.next_page->length); + set_write_desc(m_dev_addr, m_active_mem.next_page->phys_addr, ACL_PCIE_DMA_MAX_TRANSFER_SIZE / 4); + m_active_mem.next_page->length -= ACL_PCIE_DMA_MAX_TRANSFER_SIZE; + m_active_mem.next_page->phys_addr += ACL_PCIE_DMA_MAX_TRANSFER_SIZE; + m_dev_addr += ACL_PCIE_DMA_MAX_TRANSFER_SIZE; + m_bytes_sent += ACL_PCIE_DMA_MAX_TRANSFER_SIZE; + current_transfer_size += ACL_PCIE_DMA_MAX_TRANSFER_SIZE; + } else { + set_write_desc(m_dev_addr, m_active_mem.next_page->phys_addr, m_active_mem.next_page->length / 4); + m_dev_addr += m_active_mem.next_page->length; + m_bytes_sent += m_active_mem.next_page->length; + current_transfer_size += m_active_mem.next_page->length; + ++m_active_mem.next_page; + m_active_mem.pages_rem--; + } + } else { + if (m_active_mem.next_page->length > ACL_PCIE_DMA_MAX_TRANSFER_SIZE) { + ACL_PCIE_DMA_DEBUG(":::: [DMA] page size is larger than %u for write. 
Page size is %u bytes\n", + ACL_PCIE_DMA_MAX_TRANSFER_SIZE, + m_active_mem.next_page->length); + set_read_desc(m_active_mem.next_page->phys_addr, m_dev_addr, ACL_PCIE_DMA_MAX_TRANSFER_SIZE / 4); + m_active_mem.next_page->length -= ACL_PCIE_DMA_MAX_TRANSFER_SIZE; + m_active_mem.next_page->phys_addr += ACL_PCIE_DMA_MAX_TRANSFER_SIZE; + m_dev_addr += ACL_PCIE_DMA_MAX_TRANSFER_SIZE; + m_bytes_sent += ACL_PCIE_DMA_MAX_TRANSFER_SIZE; + current_transfer_size += ACL_PCIE_DMA_MAX_TRANSFER_SIZE; + } else { + set_read_desc(m_active_mem.next_page->phys_addr, m_dev_addr, m_active_mem.next_page->length / 4); + m_dev_addr += m_active_mem.next_page->length; + m_bytes_sent += m_active_mem.next_page->length; + current_transfer_size += m_active_mem.next_page->length; + ++m_active_mem.next_page; + m_active_mem.pages_rem--; + } + } + m_last_id++; + if (m_active_mem.pages_rem == 0) break; + } + ACL_PCIE_DMA_DEBUG(":::: [DMA] Transferring %zu bytes using %u descriptors\n", current_transfer_size, i); + + MemoryBarrier(); + // Send descriptor table to DMA + start = m_timer->get_time_ns(); + m_interrupt_disabled = FALSE; + send_dma_desc(); + int pinning = 0; + int unpinning = 0; + cl_ulong unpin_start = 0, unpin_finish = 0; + + // Launch unpin thread + if (m_done_mem.UsrVa != NULL) { + unpin_start = m_timer->get_time_ns(); + unpinning = 1; + + // wait for previous unpin to finish + WaitForThreadpoolWorkCallbacks(m_unpin_work, false); + + QUEUE_STRUCT entry; + + entry.WsId = m_done_mem.WsId; + entry.SGListPtr = (PVOID)(m_done_mem.dma_page); + + m_dma_unpin_pending.push(entry); + + // Make sure Push into unpin queue comes before launching unpin thread + MemoryBarrier(); + + // Launch unpin thread + SubmitThreadpoolWork(m_unpin_work); + + m_done_mem.next_page = NULL; + + // if (m_done_mem.dma_page != NULL) + // free(m_done_mem.dma_page); + + m_done_mem.dma_page = NULL; + + m_done_mem.UsrVa = NULL; + unpin_finish = m_timer->get_time_ns(); + } + + // Launch pre-pin thread + cl_ulong pin_start = 
0, pin_finish = 0; + if (((m_bytes_rem - m_last_pinned_size) > 0) && (m_prepinned == 0)) { + pin_start = m_timer->get_time_ns(); + pinning = 1; + m_prepinned = 1; + + // This wait should pass right through. + // There is another wait above, before switching active and prepin memory + WaitForThreadpoolWorkCallbacks(m_pin_work, false); + SubmitThreadpoolWork(m_pin_work); + pin_finish = m_timer->get_time_ns(); + } + + if (m_use_polling) { + wait_finish(); + finish = m_timer->get_time_ns(); + ACL_PCIE_DMA_DEBUG( + ":::: [DMA] Transfer (%zu bytes) completed in %.2f us - %.2f MB/s :: pinning %i in %.2f us :: unpinning %i " + "in %.2f us :: pages rem %li\n", + current_transfer_size, + (finish - start) / 1000.0, + 1000000000.0 * current_transfer_size / (finish - start) / (1024.0 * 1024.0), + pinning, + (pin_finish - pin_start) / 1000.0, + unpinning, + (unpin_finish - unpin_start) / 1000.0, + m_active_mem.pages_rem); + } + + return true; + } + + ACL_PCIE_DMA_DEBUG(":::: [DMA] Nothing happened\n"); + return true; +} + +// Poll DMA transfer +// Only used during host channel create +// Used to transfer the page table of pinned down MMD circular buffer to host channel IP +// The size of this transfer is known to be small +void ACL_PCIE_DMA::poll_wait() { + UINT32 wait_timer; + + while (1) { + wait_timer = ACL_PCIE_DMA_TIMEOUT; + while (wait_timer > 0) { + wait_timer--; + + if (m_table_virt_addr->header.flags[m_last_id - 1] == 1) { + ACL_PCIE_DMA_DEBUG(":::: [DMA] HostCh : Wait done\n"); + set_desc_table_header(); +#if defined(GEN3_x8) + if (m_read) + m_io->dma->write32(ACL_PCIE_DMA_WR_INT_CONTROL, ACL_PCIE_DMA_ENABLE_INT); + else + m_io->dma->write32(ACL_PCIE_DMA_RD_INT_CONTROL, ACL_PCIE_DMA_ENABLE_INT); +#endif + m_interrupt_disabled = FALSE; + + return; + } + // Delay the CPU from checking the memory for 1us. CPU is still running this thread. 
+ // but reduces memory access from CPU + spin_loop_ns(1000); + } + + // If DMA hasn't finished yet, free up the CPU for 1ms + ACL_PCIE_DMA_DEBUG( + ":::: [DMA] HostCh : Poll wait failed while transferring host channel page table to IP. Sleeping for 1ms.\n"); + Sleep(1); + } +} + +// Set IP's parameters for host channel. +// Parameters are txs address to write updated front/end pointer to on host memory, +// Address to DMA data to, to stream data into kernel +void ACL_PCIE_DMA::hostch_start(int channel) { + HOSTCH_DESC *h = &hostch_data; + + if (channel == (int)ACL_HOST_CHANNEL_0_ID) { + // Fix this Line + h->user_rd_front_pointer_bus_addr = h->m_hostch_rd_pointer.dma_page[0].phys_addr; + + m_io->dma->write32(ACL_HOST_CHANNEL_0_TXS_ADDR_LOW, h->user_rd_front_pointer_bus_addr & 0xffffffffUL); + m_io->dma->write32(ACL_HOST_CHANNEL_0_TXS_ADDR_HIGH, (h->user_rd_front_pointer_bus_addr) >> 32); + m_io->dma->write32(ACL_HOST_CHANNEL_0_IP_ADDR_LOW, ACL_HOST_CHANNEL_0_DMA_ADDR & 0xffffffffUL); + m_io->dma->write32(ACL_HOST_CHANNEL_0_IP_ADDR_HIGH, ACL_HOST_CHANNEL_0_DMA_ADDR >> 32); + m_io->dma->write32(ACL_HOST_CHANNEL_0_BUF_SIZE, (UINT32)h->buffer_size); + m_io->dma->write32(ACL_HOST_CHANNEL_0_HOST_ENDP, 0); + m_io->dma->write32(ACL_HOST_CHANNEL_0_LOGIC_EN, 1); + + } else if (channel == (int)ACL_HOST_CHANNEL_1_ID) { + h->user_wr_end_pointer_bus_addr = h->m_hostch_wr_pointer.dma_page[0].phys_addr + sizeof(size_t); + + m_io->dma->write32(ACL_HOST_CHANNEL_1_TXS_ADDR_LOW, h->user_wr_end_pointer_bus_addr & 0xffffffffUL); + m_io->dma->write32(ACL_HOST_CHANNEL_1_TXS_ADDR_HIGH, (h->user_wr_end_pointer_bus_addr) >> 32); + m_io->dma->write32(ACL_HOST_CHANNEL_1_IP_ADDR_LOW, ACL_HOST_CHANNEL_1_DMA_ADDR & 0xffffffffUL); + m_io->dma->write32(ACL_HOST_CHANNEL_1_IP_ADDR_HIGH, ACL_HOST_CHANNEL_1_DMA_ADDR >> 32); + m_io->dma->write32(ACL_HOST_CHANNEL_1_BUF_SIZE, (UINT32)h->buffer_size); + m_io->dma->write32(ACL_HOST_CHANNEL_1_HOST_FRONTP, 0); + 
m_io->dma->write32(ACL_HOST_CHANNEL_1_LOGIC_EN, 1); + } +} + +void ACL_PCIE_DMA::hostch_thread_sync(void *user_addr) { + int status; + HOSTCH_DESC *h = &hostch_data; + + if ((user_addr == NULL) & (h->thread_sync_valid)) { + if ((h->push_valid | h->pull_valid) && m_idle && (*h->user_thread_sync == 0)) { + h->loop_counter = HOSTCH_LOOP_COUNTER; + SubmitThreadpoolWork(m_work); + *h->user_thread_sync = 1; + } + } else { + status = hostch_buffer_lock(user_addr, sizeof(size_t), &(h->m_sync_thread_pointer)); + h->user_thread_sync = (size_t *)h->m_sync_thread_pointer.UsrVa; + h->loop_counter = HOSTCH_LOOP_COUNTER; + *h->user_thread_sync = 0; + h->thread_sync_valid = 1; + } +} + +int ACL_PCIE_DMA::hostch_create(void *user_addr, void *buf_pointer, size_t size, int channel) { + int status; + uint32_t i; + HOSTCH_DESC *h = &hostch_data; + + DMA_ADDR dma_address; + h->buffer_size = size; + + setup_dma_desc(); +#if defined(GEN3_x8) + m_io->dma->read32(ACL_PCIE_DMA_RD_LAST_PTR, &m_last_id); + ACL_PCIE_DMA_DEBUG(":::: [DMA] HostCh: read dma_rd_last_id %u\n", (unsigned)m_last_id); + + // Set variables before calling dma helper functions + m_last_id++; +#endif + m_read = 0; + + // Only create push channel if it's not already open + if ((int)ACL_HOST_CHANNEL_0_ID == channel && !h->push_valid) { + h->user_rd_buffer = user_addr; + + // Pin push user buffer + status = hostch_buffer_lock(user_addr, size, &(h->m_hostch_rd_mem)); + status |= hostch_buffer_lock(buf_pointer, 2 * sizeof(size_t), &(h->m_hostch_rd_pointer)); + + // Map circular push buffer's end pointer so that the driver can poll on it for update from user space + h->user_rd_front_pointer = (size_t *)h->m_hostch_rd_pointer.UsrVa; + h->user_rd_end_pointer = h->user_rd_front_pointer + 1; + + // Send the circular push buffer's pinned address to IP, so IP can initiate DMA transfer by itself. 
+ for (i = 0; i < (size / PAGE_SIZE); i++) { + dma_address = h->m_hostch_rd_mem.next_page->phys_addr; + set_hostch_page_entry(&(h->push_page_table->page_entry[i]), (UINT64)dma_address, (UINT32)i); + ACL_PCIE_DMA_DEBUG(":::: [DMA] HostCh: push page entry[%u] = %#016llx size = %#016x\n", + (unsigned)i, + (UINT64)dma_address, + h->m_hostch_rd_mem.next_page->length); + + // Make 4KB pages from an array of pages of m_hostch_rd_mem + if (h->m_hostch_rd_mem.next_page->length == PAGE_SIZE) { + ++h->m_hostch_rd_mem.next_page; + h->m_hostch_rd_mem.pages_rem--; + } else { + h->m_hostch_rd_mem.next_page->length -= PAGE_SIZE; + h->m_hostch_rd_mem.next_page->phys_addr += PAGE_SIZE; + } + } + + set_desc_table_header(); + check_last_id(&m_last_id); + +#if defined(GEN3_x8) + // Set variable before calling dma helper functions + m_active_descriptor = &(m_table_virt_addr->descriptors[0]); + set_read_desc( + h->push_page_table_bus_addr, (UINT64)(ACL_PCIE_DMA_RD_FIFO_BASE), (UINT32)((32 * size / PAGE_SIZE) / 4)); + m_last_id++; + + // Read Interrupt will be disabled from send_dma_desc till poll_wait + m_interrupt_disabled = TRUE; + send_dma_desc(); + poll_wait(); +#endif + + // Reset and enable the push channel on IP + UINT32 data; + m_io->pcie_cra->write32(HOSTCH_CONTROL_ADDR_PUSH + HOSTCH_BASE, 0); + m_io->pcie_cra->read32(HOSTCH_CONTROL_ADDR_PUSH + HOSTCH_BASE, &data); + m_io->pcie_cra->write32(HOSTCH_CONTROL_ADDR_PUSH + HOSTCH_BASE, 1); + m_io->pcie_cra->read32(HOSTCH_CONTROL_ADDR_PUSH + HOSTCH_BASE, &data); + + // Set IP's control registers for push channel + hostch_start((int)ACL_HOST_CHANNEL_0_ID); + + h->push_valid = 1; + + // Only launch queue if pull channel is not open and if there is no DMA transfer + if (!h->pull_valid && m_idle) { + ACL_PCIE_ERROR_IF(WaitForSingleObject(m_workqueue_semaphore, 0L) != WAIT_OBJECT_0, + return -1, + "HostCh : failed to schedule the first work for DMA read/write.\n"); + SubmitThreadpoolWork(m_work); + } + return 0; + + } else if 
((int)ACL_HOST_CHANNEL_1_ID == channel && !h->pull_valid) { + h->user_wr_buffer = user_addr; + + // Pin pull user buffer + status = hostch_buffer_lock(user_addr, size, &(h->m_hostch_wr_mem)); + status |= hostch_buffer_lock(buf_pointer, 2 * sizeof(size_t), &(h->m_hostch_wr_pointer)); + + // Map circular pull buffer's end pointer so that the driver can poll on it for update from user space + h->user_wr_front_pointer = (size_t *)h->m_hostch_wr_pointer.UsrVa; + h->user_wr_end_pointer = h->user_wr_front_pointer + 1; + + // Send the circular pull buffer's pinned address to IP, so IP can initiate DMA transfer by itself. + for (i = 0; i < (size / PAGE_SIZE); i++) { + dma_address = h->m_hostch_wr_mem.next_page->phys_addr; + set_hostch_page_entry(&(h->pull_page_table->page_entry[i]), (UINT64)dma_address, (UINT32)i); + ACL_PCIE_DMA_DEBUG(":::: [DMA] HostCh: pull page entry[%u] = %#016llx size = %#016x\n", + (unsigned)i, + (UINT64)dma_address, + h->m_hostch_wr_mem.next_page->length); + + // Make 4KB pages from an array of pages of m_hostch_wr_mem + if (h->m_hostch_wr_mem.next_page->length == PAGE_SIZE) { + ++h->m_hostch_wr_mem.next_page; + h->m_hostch_wr_mem.pages_rem--; + } else { + h->m_hostch_wr_mem.next_page->length -= PAGE_SIZE; + h->m_hostch_wr_mem.next_page->phys_addr += PAGE_SIZE; + } + } + + set_desc_table_header(); + check_last_id(&m_last_id); + +#if defined(GEN3_x8) + // Set variable before calling dma helper functions + m_active_descriptor = &(m_table_virt_addr->descriptors[0]); + set_read_desc( + h->pull_page_table_bus_addr, (UINT64)(ACL_PCIE_DMA_WR_FIFO_BASE), (UINT32)((32 * size / PAGE_SIZE) / 4)); + m_last_id++; + + // Read Interrupt will be disabled from send_dma_desc till poll_wait + m_interrupt_disabled = TRUE; + send_dma_desc(); + poll_wait(); +#endif + + // Reset and enable the pull channel on IP + UINT32 temp; + m_io->pcie_cra->write32(HOSTCH_CONTROL_ADDR_PULL + HOSTCH_BASE, 0); + m_io->pcie_cra->read32(HOSTCH_CONTROL_ADDR_PULL + HOSTCH_BASE, &temp); + 
m_io->pcie_cra->write32(HOSTCH_CONTROL_ADDR_PULL + HOSTCH_BASE, 1); + m_io->pcie_cra->read32(HOSTCH_CONTROL_ADDR_PULL + HOSTCH_BASE, &temp); + + // Set IP's control registers for pull channel + hostch_start((int)ACL_HOST_CHANNEL_1_ID); + + h->pull_valid = 1; + + // Only launch queue if push channel is not open and if there is no DMA transfer + if (!h->push_valid && m_idle) { + ACL_PCIE_ERROR_IF(WaitForSingleObject(m_workqueue_semaphore, 0L) != WAIT_OBJECT_0, + return -1, + "HostCh : failed to schedule the first work for DMA read/write.\n"); + SubmitThreadpoolWork(m_work); + } + return 0; + + } else { + return ERROR_INVALID_CHANNEL; + } +} + +// Destroy channel call from user. +// Unlock all buffers and reset IP +int ACL_PCIE_DMA::hostch_destroy(int channel) { + HOSTCH_DESC *h = &hostch_data; + + if ((int)ACL_HOST_CHANNEL_0_ID == channel) { + if (h->push_valid) { + ACL_PCIE_DMA_DEBUG(":::: [DMA] HostCh: destroying push host channel."); + m_io->dma->write32(ACL_HOST_CHANNEL_0_LOGIC_EN, 0); + MemoryBarrier(); + m_io->pcie_cra->write32(HOSTCH_CONTROL_ADDR_PUSH + HOSTCH_BASE, 0); + MemoryBarrier(); + + if (h->m_hostch_rd_mem.UsrVa != NULL) unpin_memory(&h->m_hostch_rd_mem); + if (h->m_hostch_rd_pointer.UsrVa != NULL) unpin_memory(&h->m_hostch_rd_pointer); + h->push_valid = 0; + + if (!h->pull_valid) { + if (h->thread_sync_valid) { + h->thread_sync_valid = 0; + if (h->m_sync_thread_pointer.UsrVa != NULL) unpin_memory(&h->m_sync_thread_pointer); + } + if (m_idle) WaitForThreadpoolWorkCallbacks(m_work, false); + } + } + } else if ((int)ACL_HOST_CHANNEL_1_ID == channel) { + if (h->pull_valid) { + ACL_PCIE_DMA_DEBUG(":::: [DMA] HostCh: destroying pull host channel."); + m_io->dma->write32(ACL_HOST_CHANNEL_1_LOGIC_EN, 0); + MemoryBarrier(); + m_io->pcie_cra->write32(HOSTCH_CONTROL_ADDR_PULL + HOSTCH_BASE, 0); + MemoryBarrier(); + + if (h->m_hostch_wr_mem.UsrVa != NULL) unpin_memory(&h->m_hostch_wr_mem); + if (h->m_hostch_wr_pointer.UsrVa != NULL) 
unpin_memory(&h->m_hostch_wr_pointer); + h->pull_valid = 0; + + if (!h->push_valid) { + if (h->thread_sync_valid) { + h->thread_sync_valid = 0; + if (h->m_sync_thread_pointer.UsrVa != NULL) unpin_memory(&h->m_sync_thread_pointer); + } + if (m_idle) WaitForThreadpoolWorkCallbacks(m_work, false); + } + } + } + + return 0; +} + +#endif // WINDOWS diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_dma_windows.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_dma_windows.h new file mode 100644 index 0000000..311c634 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_dma_windows.h @@ -0,0 +1,262 @@ +#ifndef ACL_PCIE_DMA_WINDOWS_H +#define ACL_PCIE_DMA_WINDOWS_H + +/* (c) 1992-2021 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. 
*/

/* ===- acl_pcie_dma_windows.h --------------------------------------- C++ -*-=== */
/*                                                                                */
/* Intel(R) OpenCL MMD Driver                                                     */
/*                                                                                */
/* ===--------------------------------------------------------------------------=== */
/*                                                                                */
/* This file declares the class to handle Windows-specific DMA operations.        */
/* The actual implementation of the class lives in the acl_pcie_dma_windows.cpp   */
/*                                                                                */
/* ===--------------------------------------------------------------------------=== */
// TODO: update DMA related stuff and add wsid

#if defined(WINDOWS)

#include "hw_host_channel.h"
#include "hw_pcie_dma.h"

#include <windows.h>
#include <queue>

class ACL_PCIE_DEVICE;
class ACL_PCIE_MM_IO_MGR;
class ACL_PCIE_TIMER;

// One physically contiguous region: physical address plus its byte length.
typedef struct _PAGE_INFO {
  ULONG64 pPhysicalAddr;
  UINT32 dwBytes;
} PAGE_INFO, *PPAGE_INFO;

// A pinned scatter-gather list together with the workspace id that owns it.
typedef struct _DMA_PAGE {
  sg_element *Page;
  DWORD dwPages;
  UINT64 WsId;
} DMA_PAGE, *PDMA_PAGE;

// Work item handed to the unpin thread: workspace id + its SG list to free.
typedef struct _QUEUE_STRUCT {
  UINT64 WsId;
  PVOID SGListPtr;

} QUEUE_STRUCT, *PQUEUE_STRUCT;

// Windows-specific DMA engine wrapper: schedules read/write transfers on a
// threadpool, pins/unpins host memory, and drives the host-channel IP.
class ACL_PCIE_DMA {
 public:
  ACL_PCIE_DMA(fpga_handle Handle, ACL_PCIE_MM_IO_MGR *io, ACL_PCIE_DEVICE *pcie);
  ~ACL_PCIE_DMA();

  bool is_idle() { return m_idle; };
  // Spin (yielding the CPU each iteration) until the current transfer is done.
  void stall_until_idle() {
    while (!is_idle()) yield();
  };

  // Called by acl_pcie_device to check dma interrupt status
  int check_dma_interrupt(unsigned int *dma_update);

  // Perform operations required when a DMA interrupt comes
  void service_interrupt();

  // Relinquish the CPU to let any other thread to run
  // Return 0 since there is no useful work to be performed here
  int yield();

  // Transfer data between host and device
  // This function returns right after the transfer is scheduled
  // Return 0 on success
  int read_write(void *host_addr, size_t dev_addr, size_t bytes, aocl_mmd_op_t e, bool reading);

  // the callback function to be scheduled inside the interrupt handler
  friend void CALLBACK myWorkCallback(PTP_CALLBACK_INSTANCE instance, void *context, PTP_WORK work);

  // Separate function to unpin memory
  friend void CALLBACK myWorkUnpinCallback(PTP_CALLBACK_INSTANCE instance, void *context, PTP_WORK work);

  // Separate function to pin memory
  friend void CALLBACK myWorkPinCallback(PTP_CALLBACK_INSTANCE instance, void *context, PTP_WORK work);

  // Host channel functions
  int hostch_create(void *user_addr, void *buf_pointer, size_t size, int reading);
  int hostch_destroy(int reading);
  void hostch_thread_sync(void *m_sync_thread);

 private:
  // Copying is intentionally disabled (pre-C++11 style: private + no-op bodies).
  ACL_PCIE_DMA &operator=(const ACL_PCIE_DMA &) { return *this; }

  ACL_PCIE_DMA(const ACL_PCIE_DMA &src) {}

  // A pinned region currently being consumed: next_page/pages_rem advance as
  // descriptors are built; dma_page keeps the original list for freeing.
  struct PINNED_MEM {
    sg_element *next_page;
    DWORD pages_rem;
    sg_element *dma_page;  // Pointer to the original array
    UINT64 WsId;
    PVOID UsrVa;
  };

  // All bookkeeping for the push (channel 0) and pull (channel 1) host channels.
  struct HOSTCH_DESC {
    size_t buffer_size;
    unsigned int loop_counter;

    // Host channel valid
    // If channel is open, equal to 1
    int push_valid;
    int pull_valid;

    // User memory circular buffer
    void *user_rd_buffer;
    void *user_wr_buffer;

    // Array of physical addresses of locked hostch pages
    HOSTCH_TABLE *push_page_table;
    HOSTCH_TABLE *pull_page_table;

    DMA_PAGE push_page_table_addr;
    DMA_PAGE pull_page_table_addr;

    // Physical address of the page table
    DMA_ADDR push_page_table_bus_addr;
    DMA_ADDR pull_page_table_bus_addr;

    PINNED_MEM m_hostch_rd_mem;
    PINNED_MEM m_hostch_wr_mem;

    // User memory circular buffer front and end pointers
    size_t *user_rd_front_pointer;
    size_t *user_rd_end_pointer;
    size_t *user_wr_front_pointer;
    size_t *user_wr_end_pointer;

    DMA_ADDR user_rd_front_pointer_bus_addr;
    DMA_ADDR user_wr_end_pointer_bus_addr;

    PINNED_MEM m_hostch_rd_pointer;
    PINNED_MEM m_hostch_wr_pointer;

    // Keep track of push end pointer
    size_t rd_buf_end_pointer;

    // Keep track of pull front pointer
    size_t wr_buf_front_pointer;

    // User and driver thread synchronizer
    int thread_sync_valid;
    size_t *user_thread_sync;
    DMA_ADDR user_thread_sync_bus_addr;
    PINNED_MEM m_sync_thread_pointer;
  };

  // function to be scheduled to execute whenever an interrupt arrived
  bool update(bool force_update = false);

  // Helper functions
  inline void *compute_address(void *base, uintptr_t offset);
  void set_read_desc(DMA_ADDR source, UINT64 dest, UINT32 ctl_dma_len);
  void set_write_desc(UINT64 source, DMA_ADDR dest, UINT32 ctl_dma_len);
  void set_desc_table_header();
  void send_dma_desc();
  void check_last_id(UINT32 *last_id);
  void pin_memory(PINNED_MEM *new_mem, bool prepin);
  void unpin_memory(PINNED_MEM *old_mem);
  void wait_finish();
  void unpin_from_queue();
  void prepin_memory();

  void set_immediate_desc(DMA_DESC_ENTRY *desc, UINT64 addr, UINT32 data, UINT32 id);
  void add_extra_dma_desc();
  // Hostchannel helper function
  void hostch_start(int channel);
  int hostch_push_update();
  int hostch_pull_update();
  int hostch_buffer_lock(void *addr, size_t len, PINNED_MEM *new_mem);
  void poll_wait();
  void set_hostch_page_entry(HOSTCH_ENTRY *page_entry, UINT64 page_addr, UINT32 page_num);
  void setup_dma_desc();
  void spin_loop_ns(UINT64 wait_ns);

  // From environment variable
  int m_use_polling;

  // The dma object we are currently building transactions for
  PINNED_MEM m_active_mem;
  PINNED_MEM m_pre_pinned_mem;
  PINNED_MEM m_done_mem;

  // Hostchannel Struct
  HOSTCH_DESC hostch_data;

  // The transaction we are currently working on
  DMA_DESC_TABLE *m_table_virt_addr;
  DMA_PAGE m_table_dma_addr;
  DMA_ADDR m_table_dma_phys_addr;
  DMA_DESC_ENTRY *m_active_descriptor;

  size_t m_last_pinned_size;
  void *m_last_pinned_addr;

  // Signal to stop multiple pre-pinning from running
  bool m_prepinned;

  // Local copy of last transfer id. Read once when DMA transfer starts
  UINT32 m_last_id;

  // variables for the read/write request
  aocl_mmd_op_t m_event;
  size_t m_dev_addr;
  void *m_host_addr;
  size_t m_bytes;
  size_t m_bytes_sent;
  size_t m_bytes_rem;
  bool m_read;
  bool m_idle;
  bool m_interrupt_disabled;

  fpga_handle m_handle;
  ACL_PCIE_DEVICE *m_pcie;
  ACL_PCIE_MM_IO_MGR *m_io;
  ACL_PCIE_TIMER *m_timer;

  // variables needed for the threadpool and works that submitted to it
  TP_CALLBACK_ENVIRON m_callback_env;
  PTP_POOL m_threadpool;
  PTP_WORK m_work;

  // This variable is accessed by the callback function defined in acl_pcie_dma_windows.cpp
  // This semaphore is intended to keep at most 1 work in queued (not running)
  HANDLE m_workqueue_semaphore;

  // Separate thread to unpin

  std::queue<QUEUE_STRUCT> m_dma_unpin_pending;

  TP_CALLBACK_ENVIRON m_unpin_callback_env;
  PTP_POOL m_unpin_threadpool;
  PTP_WORK m_unpin_work;

  // Separate thread to pre-pin

  TP_CALLBACK_ENVIRON m_pin_callback_env;
  PTP_POOL m_pin_threadpool;
  PTP_WORK m_pin_work;
};

#endif  // WINDOWS

#endif  // ACL_PCIE_DMA_WINDOWS_H
diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_hostch.cpp b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_hostch.cpp
new file mode 100644
index 0000000..0dc6d74
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_hostch.cpp
@@ -0,0 +1,764 @@
// (c) 1992-2021 Intel Corporation.
// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words
// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S.
// and/or other countries. Other marks and brands may be claimed as the property
// of others.
See Trademarks on intel.com for full list of Intel trademarks or +// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) +// Your use of Intel Corporation's design tools, logic functions and other +// software and tools, and its AMPP partner logic functions, and any output +// files any of the foregoing (including device programming or simulation +// files), and any associated documentation or information are expressly subject +// to the terms and conditions of the Altera Program License Subscription +// Agreement, Intel MegaCore Function License Agreement, or other applicable +// license agreement, including, without limitation, that your use is for the +// sole purpose of programming logic devices manufactured by Intel and sold by +// Intel or its authorized distributors. Please refer to the applicable +// agreement for further details. + +/* ===- acl_pcie_hostch.cpp ------------------------------------------ C++ -*-=== */ +/* */ +/* Intel(R) OpenCL MMD Driver */ +/* */ +/* ===-------------------------------------------------------------------------=== */ +/* */ +/* This file implements the class to handle Linux-specific DMA operations. */ +/* The declaration of the class lives in the acl_pcie_dma_linux.h */ +/* The actual implementation of DMA operation is inside the Linux kernel driver. 
*/ +/* */ +/* ===-------------------------------------------------------------------------=== */ + +// common and its own header files +#include "acl_pcie_hostch.h" +#include "acl_pcie.h" + +// other header files inside MMD driver +#include "acl_pcie_debug.h" +#include "acl_pcie_device.h" +#include "acl_pcie_mm_io.h" +#include "acl_pcie_timer.h" +#include "hw_host_channel.h" + +// other standard header files +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <iostream> + +#if defined(LINUX) +#include <unistd.h> +#endif // LINUX +#if defined(WINDOWS) +#include "acl_pcie_dma_windows.h" +#endif // WINDOWS + +void acl_aligned_malloc(void **result, size_t size) { +#if defined(LINUX) + int posix_success; + *result = NULL; + posix_success = posix_memalign(result, PAGE_SIZE, size); + ACL_PCIE_ASSERT(posix_success == 0, "posix_memalign has failed.\n"); +#endif // LINUX +#if defined(WINDOWS) + *result = _aligned_malloc(size, PAGE_SIZE); +#endif // WINDOWS +} + +void acl_aligned_free(void *ptr) { +#if defined(LINUX) + free(ptr); +#endif // LINUX +#if defined(WINDOWS) + _aligned_free(ptr); +#endif // WINDOWS +} + +ACL_PCIE_HOSTCH::ACL_PCIE_HOSTCH(fpga_handle handle, ACL_PCIE_MM_IO_MGR *io, ACL_PCIE_DEVICE *pcie, ACL_PCIE_DMA *dma) + : m_push_queue(NULL), + m_push_queue_local_end_p(0), + m_push_queue_size(0), + m_pull_queue(NULL), + m_pull_queue_local_front_p(0), + m_pull_queue_size(0), + m_pull_queue_available(0), + m_pull_queue_pointer(NULL), + m_push_queue_pointer(NULL), + m_pull_queue_front_p(NULL), + m_pull_queue_end_p(NULL), + m_push_queue_front_p(NULL), + m_push_queue_end_p(NULL), + m_sync_thread(NULL) { + ACL_PCIE_ASSERT(handle != INVALID_HANDLE_VALUE, "passed in an invalid device when creating dma object.\n"); + ACL_PCIE_ASSERT(io != NULL, "passed in an empty pointer for io when creating dma object.\n"); + ACL_PCIE_ASSERT(pcie != NULL, "passed in an empty pointer for pcie when creating dma object.\n"); + ACL_PCIE_ASSERT(dma != 
NULL, "passed in an empty pointer for dma when creating dma object.\n"); + + m_handle = handle; + m_pcie = pcie; + m_io = io; + m_dma = dma; + m_timer = new ACL_PCIE_TIMER(); + + // Set the valid for all the channels and helper function that checks status of driver thread + // to 0 + m_hostch_push_valid = 0; + m_hostch_pull_valid = 0; + m_sync_thread_valid = 0; + + const char *dma_timer = getenv("ACL_PCIE_DMA_TIMER"); + if (dma_timer) + m_use_timer = 1; + else + m_use_timer = 0; +} + +ACL_PCIE_HOSTCH::~ACL_PCIE_HOSTCH() { + // If push channel (channel 0) is valid, reset its IP and unpin the MMD buffer + if (m_hostch_push_valid) { +#if defined(LINUX) + struct acl_cmd driver_cmd; + int bytes_read; + // Save the device id for the selected board + driver_cmd.bar_id = ACLPCI_CMD_BAR; + driver_cmd.command = ACLPCI_CMD_HOSTCH_DESTROY_RD; + driver_cmd.device_addr = NULL; + driver_cmd.user_addr = NULL; + driver_cmd.size = 0; + bytes_read = read(m_handle, &driver_cmd, sizeof(driver_cmd)); + ACL_PCIE_ASSERT(bytes_read != -1, "error reading driver command.\n"); +#endif // LINUX +#if defined(WINDOWS) + m_dma->hostch_destroy(ACL_HOST_CHANNEL_0_ID); +#endif // WINDOWS + + if (m_push_queue) { + acl_aligned_free(m_push_queue); + m_push_queue = NULL; + } + + if (m_push_queue_pointer) { + acl_aligned_free(m_push_queue_pointer); + m_push_queue_pointer = NULL; + } + + m_hostch_push_valid = 0; + } + + // If pull channel (channel 1) is valid, reset its IP and unpin the MMD buffer + if (m_hostch_pull_valid) { +#if defined(LINUX) + struct acl_cmd driver_cmd; + int bytes_read; + // Save the device id for the selected board + driver_cmd.bar_id = ACLPCI_CMD_BAR; + driver_cmd.command = ACLPCI_CMD_HOSTCH_DESTROY_WR; + driver_cmd.device_addr = NULL; + driver_cmd.user_addr = NULL; + driver_cmd.size = 0; + bytes_read = read(m_handle, &driver_cmd, sizeof(driver_cmd)); + ACL_PCIE_ASSERT(bytes_read != -1, "error reading driver command.\n"); +#endif // LINUX +#if defined(WINDOWS) + 
m_dma->hostch_destroy(ACL_HOST_CHANNEL_1_ID); +#endif // WINDOWS + + if (m_pull_queue) { + acl_aligned_free(m_pull_queue); + m_pull_queue = NULL; + } + + if (m_pull_queue_pointer) { + acl_aligned_free(m_pull_queue_pointer); + m_pull_queue_pointer = NULL; + } + + m_hostch_pull_valid = 0; + } + + if (m_timer) { + delete m_timer; + m_timer = NULL; + } +} + +// Get host channel version of currently programmed device +unsigned int ACL_PCIE_HOSTCH::get_hostch_version() { + // Make sure version is not what you expect + unsigned int version = ACL_VERSIONID ^ 1; + unsigned int hostch_version = ACL_HOSTCH_ZERO_CHANNELS ^ 1; + + // Read device version + m_io->version->read32(0, &version); + + if (!ACL_HOSTCH_ENABLE) { + return ACL_HOSTCH_ZERO_CHANNELS; + } + + // Read hostchannel version + m_io->hostch_ver->read32(0, &hostch_version); + + return hostch_version; +} + +// Function to check that the driver thread that update host channel IP with +// user's updates to MMD buffer's end and front index, is still running. +// Ack call will call sync_thread() if driver thread has timed out. +// Linux kernel space driver thread is set to timeout in 1ms +// if there hasn't been any changes to circular buffer pointer from the host. 
+int ACL_PCIE_HOSTCH::launch_sync_thread() { + if (m_sync_thread_valid == 0) { + acl_aligned_malloc((void **)&m_sync_thread, sizeof(size_t)); + + if (m_sync_thread == NULL) { + ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_BLOCKTX, ":::: [HOST CHANNEL] Internal buffer memory allocation failed.\n"); + return -1; + } + +#if defined(LINUX) + // Save the device id for the selected board + struct acl_cmd driver_cmd; + int bytes_read; + driver_cmd.bar_id = ACLPCI_CMD_BAR; + driver_cmd.command = ACLPCI_CMD_HOSTCH_THREAD_SYNC; + driver_cmd.device_addr = NULL; + driver_cmd.user_addr = m_sync_thread; + driver_cmd.size = 0; + bytes_read = read(m_handle, &driver_cmd, sizeof(driver_cmd)); + ACL_PCIE_ASSERT(bytes_read != -1, "error reading driver command.\n"); +#endif // LINUX +#if defined(WINDOWS) + m_dma->hostch_thread_sync(m_sync_thread); +#endif // WINDOWS + + m_sync_thread_valid = 1; + } else { + return 1; + } + return 0; +} + +int ACL_PCIE_HOSTCH::sync_thread() { + if (m_sync_thread_valid && (*m_sync_thread == 0)) { +#if defined(LINUX) + // Save the device id for the selected board + struct acl_cmd driver_cmd; + int bytes_read; + driver_cmd.bar_id = ACLPCI_CMD_BAR; + driver_cmd.command = ACLPCI_CMD_HOSTCH_THREAD_SYNC; + driver_cmd.device_addr = NULL; + driver_cmd.user_addr = NULL; + driver_cmd.size = 0; + bytes_read = read(m_handle, &driver_cmd, sizeof(driver_cmd)); + ACL_PCIE_ASSERT(bytes_read != -1, "error reading driver command.\n"); +#endif // LINUX +#if defined(WINDOWS) + m_dma->hostch_thread_sync(NULL); +#endif // WINDOWS + + return 0; + } + return 1; +} + +// This is called only when there aren't any host channels open +// m_sync_thread is unpinned as part of destroy call to driver. Now free it. +void ACL_PCIE_HOSTCH::destroy_sync_thread() { + if (m_sync_thread_valid) { + if (m_sync_thread != NULL) acl_aligned_free(m_sync_thread); + + m_sync_thread_valid = 0; + m_sync_thread = NULL; + } +} + +// Create host channel. Allocate circular buffer and pin it. 
+// Then set channel to valid. +int ACL_PCIE_HOSTCH::create_hostchannel(char *name, size_t queue_depth, int direction) { + int status; + unsigned int hostch_version; + + hostch_version = get_hostch_version(); + ACL_PCIE_DEBUG_MSG_VERBOSE( + VERBOSITY_BLOCKTX, ":::: [HOST CHANNEL] Host Channel version read was %u\n", hostch_version); + + // Check if channel name user wants to open exists + if ((strnlen(name, MAX_NAME_SIZE) == strnlen(ACL_HOST_CHANNEL_0_NAME, MAX_NAME_SIZE)) && + (strncmp(ACL_HOST_CHANNEL_0_NAME, name, strnlen(ACL_HOST_CHANNEL_0_NAME, MAX_NAME_SIZE)) == 0)) { + int channel = ACL_HOST_CHANNEL_0_ID; + // Check if hostchannel version is one that has ACL_HOST_CHANNEL_0_ID + if (hostch_version != ACL_HOSTCH_TWO_CHANNELS) { + ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_BLOCKTX, + ":::: [HOST CHANNEL] Host Channel %s does not exist in currently programmed device.\n", + ACL_HOST_CHANNEL_0_NAME); + return ERROR_INVALID_CHANNEL; + } + + // check if the direction for the channel is correct + if (direction != ACL_HOST_CHANNEL_0_WRITE) return ERROR_INCORRECT_DIRECTION; + + // Check if channel was already opened previously + if (m_hostch_push_valid) { + ACL_PCIE_DEBUG_MSG_VERBOSE( + VERBOSITY_BLOCKTX, ":::: [HOST CHANNEL] Host Channel '%s' already open\n", ACL_HOST_CHANNEL_0_NAME); + return ERROR_CHANNEL_PREVIOUSLY_OPENED; + } + + // Make sure the channel depth is at most 1MB, power-of-2, and divisible by page_size + size_t queue_depth_upper_pow2 = (size_t)pow(2, ceil(log((double)queue_depth) / log(2.))); + size_t channel_depth = (queue_depth_upper_pow2 >= HOSTCH_MAX_BUF_SIZE) + ? 
HOSTCH_MAX_BUF_SIZE + : queue_depth_upper_pow2 & (HOSTCH_MAX_BUF_SIZE - PAGE_SIZE); + + // Make sure the channel depth is at least 4KB + if (!channel_depth) channel_depth = PAGE_SIZE; + + // Create circular buffer for push + acl_aligned_malloc(&m_push_queue, channel_depth); + + if (m_push_queue == NULL) { + ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_BLOCKTX, ":::: [HOST CHANNEL] Internal buffer memory allocation failed.\n"); + return -1; + } + + // Create buffer to hold front and end pointer for the circular buffer + acl_aligned_malloc((void **)&m_push_queue_pointer, sizeof(size_t) * 2); + + if (m_push_queue_pointer == NULL) { + ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_BLOCKTX, ":::: [HOST CHANNEL] Internal buffer memory allocation failed.\n"); + acl_aligned_free(m_push_queue); + return -1; + } + + // Set parameters for the push channel + m_push_queue_size = channel_depth; + m_push_queue_local_end_p = 0; + + m_push_queue_front_p = m_push_queue_pointer; + m_push_queue_end_p = (m_push_queue_pointer + 1); + + *m_push_queue_front_p = 0; + *m_push_queue_end_p = 0; + + // sync_thread() used to check if kernel thread is still running when user has additional data available. 
+ status = launch_sync_thread(); + if (status == -1) { + acl_aligned_free(m_push_queue); + acl_aligned_free(m_push_queue_pointer); + return -1; + } + +#if defined(LINUX) + struct acl_cmd driver_cmd; + int bytes_read; + // Send the pointers for the 2 buffers to driver, along with queue size + driver_cmd.bar_id = ACLPCI_CMD_BAR; + driver_cmd.command = ACLPCI_CMD_HOSTCH_CREATE_RD; + driver_cmd.device_addr = m_push_queue_pointer; + driver_cmd.user_addr = m_push_queue; + driver_cmd.size = channel_depth; + bytes_read = read(m_handle, &driver_cmd, sizeof(driver_cmd)); + ACL_PCIE_ASSERT(bytes_read != -1, "error reading driver command.\n"); +#endif // LINUX +#if defined(WINDOWS) + m_dma->hostch_create(m_push_queue, m_push_queue_pointer, channel_depth, channel); +#endif // WINDOWS + + m_hostch_push_valid = 1; + return channel; + } else if ((strnlen(name, MAX_NAME_SIZE) == strnlen(ACL_HOST_CHANNEL_1_NAME, MAX_NAME_SIZE)) && + (strncmp(ACL_HOST_CHANNEL_1_NAME, name, strnlen(ACL_HOST_CHANNEL_1_NAME, MAX_NAME_SIZE)) == 0)) { + int channel = ACL_HOST_CHANNEL_1_ID; + + // Check if hostchannel version is one that has ACL_HOST_CHANNEL_1_ID + if (hostch_version != ACL_HOSTCH_TWO_CHANNELS) { + ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_BLOCKTX, + ":::: [HOST CHANNEL] Host Channel %s does not exist in currently programmed device.\n", + ACL_HOST_CHANNEL_1_NAME); + return ERROR_INVALID_CHANNEL; + } + + // Check if direction is correct + if (direction != ACL_HOST_CHANNEL_1_WRITE) return ERROR_INCORRECT_DIRECTION; + + // Make sure the channel depth is at most 1MB, power-of-2, and divisible by page_size + size_t queue_depth_upper_pow2 = (size_t)pow(2, ceil(log((double)queue_depth) / log(2.))); + size_t channel_depth = (queue_depth_upper_pow2 >= HOSTCH_MAX_BUF_SIZE) + ? 
HOSTCH_MAX_BUF_SIZE + : queue_depth_upper_pow2 & (HOSTCH_MAX_BUF_SIZE - PAGE_SIZE); + + // Make sure the circular buffer is at least 4KB + if (!channel_depth) channel_depth = PAGE_SIZE; + + // Check if pull channel was previously opened + if (m_hostch_pull_valid) { + ACL_PCIE_DEBUG_MSG_VERBOSE( + VERBOSITY_BLOCKTX, ":::: [HOST CHANNEL] Host Channel '%s' already open\n", ACL_HOST_CHANNEL_1_NAME); + return ERROR_CHANNEL_PREVIOUSLY_OPENED; + } + + // Create circular buffer + acl_aligned_malloc(&m_pull_queue, channel_depth); + + if (m_pull_queue == NULL) { + ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_BLOCKTX, ":::: [HOST CHANNEL] Internal buffer memory allocation failed.\n"); + return -1; + } + + // Create buffer to hold front and end pointer of the circular buffer + acl_aligned_malloc((void **)&m_pull_queue_pointer, sizeof(size_t) * 2); + + if (m_pull_queue_pointer == NULL) { + ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_BLOCKTX, ":::: [HOST CHANNEL] Internal buffer memory allocation failed.\n"); + acl_aligned_free(m_pull_queue); + return -1; + } + + // Set pull channel parameters + m_pull_queue_size = channel_depth; + m_pull_queue_available = 0; + m_pull_queue_local_front_p = 0; + + m_pull_queue_front_p = m_pull_queue_pointer; + m_pull_queue_end_p = (m_pull_queue_pointer + 1); + + *m_pull_queue_front_p = 0; + *m_pull_queue_end_p = 0; + + // sync_thread() used to check if kernel thread is dead or alive when user pulls data + status = launch_sync_thread(); + if (status == -1) { + acl_aligned_free(m_pull_queue); + acl_aligned_free(m_pull_queue_pointer); + return -1; + } + +#if defined(LINUX) + // Send the pointers for the 2 buffers to driver, along with queue size, and initiate IP + struct acl_cmd driver_cmd; + int bytes_read; + driver_cmd.bar_id = ACLPCI_CMD_BAR; + driver_cmd.command = ACLPCI_CMD_HOSTCH_CREATE_WR; + driver_cmd.device_addr = m_pull_queue_pointer; + driver_cmd.user_addr = m_pull_queue; + driver_cmd.size = channel_depth; + bytes_read = read(m_handle, &driver_cmd, 
sizeof(driver_cmd)); + ACL_PCIE_ASSERT(bytes_read != -1, "error reading driver command.\n"); +#endif // LINUX +#if defined(WINDOWS) + m_dma->hostch_create(m_pull_queue, m_pull_queue_pointer, channel_depth, channel); +#endif // WINDOWS + + m_hostch_pull_valid = 1; + return channel; + } else { + ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_BLOCKTX, ":::: [HOST CHANNEL] Channel does not exist.\n"); + return ERROR_INVALID_CHANNEL; + } +} + +// Destroy Channel. Unlock all buffer, and set channel to invalid. +int ACL_PCIE_HOSTCH::destroy_hostchannel(int channel) { + if (channel == ACL_HOST_CHANNEL_0_ID) { + if (m_hostch_push_valid) { + // set pull IP to reset and unlock all buffers +#if defined(LINUX) + struct acl_cmd driver_cmd; + int bytes_read; + driver_cmd.bar_id = ACLPCI_CMD_BAR; + driver_cmd.command = ACLPCI_CMD_HOSTCH_DESTROY_RD; + driver_cmd.device_addr = NULL; + driver_cmd.user_addr = NULL; + driver_cmd.size = 0; + bytes_read = read(m_handle, &driver_cmd, sizeof(driver_cmd)); + ACL_PCIE_ASSERT(bytes_read != -1, "error reading driver command.\n"); +#endif // LINUX +#if defined(WINDOWS) + m_dma->hostch_destroy(channel); +#endif // WINDOWS + + if (m_push_queue) { + acl_aligned_free(m_push_queue); + m_push_queue = NULL; + } + if (m_push_queue_pointer) { + acl_aligned_free(m_push_queue_pointer); + m_push_queue_pointer = NULL; + } + + m_hostch_push_valid = 0; + if (m_hostch_pull_valid == 0) { + destroy_sync_thread(); + } + return 0; + } else { + ACL_PCIE_DEBUG_MSG_VERBOSE( + VERBOSITY_BLOCKTX, ":::: [HOST CHANNEL] Host Channel %s is not open.\n", ACL_HOST_CHANNEL_0_NAME); + return ERROR_CHANNEL_CLOSED; + } + } else if (channel == ACL_HOST_CHANNEL_1_ID) { + if (m_hostch_pull_valid) { +#if defined(LINUX) + // set push IP to reset and unlock all buffers + struct acl_cmd driver_cmd; + int bytes_read; + driver_cmd.bar_id = ACLPCI_CMD_BAR; + driver_cmd.command = ACLPCI_CMD_HOSTCH_DESTROY_WR; + driver_cmd.device_addr = NULL; + driver_cmd.user_addr = NULL; + driver_cmd.size = 0; + 
bytes_read = read(m_handle, &driver_cmd, sizeof(driver_cmd)); + ACL_PCIE_ASSERT(bytes_read != -1, "error reading driver command.\n"); +#endif // LINUX +#if defined(WINDOWS) + m_dma->hostch_destroy(channel); +#endif // WINDOWS + + if (m_pull_queue) { + acl_aligned_free(m_pull_queue); + m_pull_queue = NULL; + } + + if (m_pull_queue_pointer) { + acl_aligned_free(m_pull_queue_pointer); + m_pull_queue_pointer = NULL; + } + + m_hostch_pull_valid = 0; + + if (m_hostch_push_valid == 0) { + destroy_sync_thread(); + } + + return 0; + } else { + ACL_PCIE_DEBUG_MSG_VERBOSE( + VERBOSITY_BLOCKTX, ":::: [HOST CHANNEL] Host Channel %s is not open.\n", ACL_HOST_CHANNEL_1_NAME); + return ERROR_CHANNEL_CLOSED; + } + } else { + ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_BLOCKTX, ":::: [HOST CHANNEL] Channel with ID %i does not exist.\n", channel); + } + + return ERROR_INVALID_CHANNEL; +} + +// Call for user to get pointer to location in circular buffer +// User can then write data or read data from the buffer, depending on direction. +void *ACL_PCIE_HOSTCH::get_buffer(size_t *buffer_size, int channel, int *status) { + // Check if channel exists + if (channel == ACL_HOST_CHANNEL_0_ID) { + // Check if channel was created + if (m_hostch_push_valid == 0) { + ACL_PCIE_DEBUG_MSG_VERBOSE( + VERBOSITY_BLOCKTX, ":::: [HOST CHANNEL] Host Channel %s is not open.\n", ACL_HOST_CHANNEL_0_NAME); + *status = ERROR_CHANNEL_CLOSED; + *buffer_size = 0; + return NULL; + } + *status = 0; + + char *temp_input_queue = (char *)m_push_queue; + + size_t push_queue_end, push_queue_front; + + // m_push_queue_front_p is directly updated by host channel IP + // through write over Txs. 
Save value in local variable, + // so it doesn't get modified in middle of get_buffer call + push_queue_end = *m_push_queue_end_p; + push_queue_front = *m_push_queue_front_p; + + // Calculate available free space in host to device push buffer + size_t push_buf_avail; + if (push_queue_end > push_queue_front) + push_buf_avail = m_push_queue_size - push_queue_end + push_queue_front - 32; + else if (push_queue_end < push_queue_front) + push_buf_avail = push_queue_front - push_queue_end - 32; + else + push_buf_avail = m_push_queue_size - 32; + + // Calculate how much of the free space is before loop around and after loop around + size_t cont_push = (m_push_queue_size > m_push_queue_local_end_p + push_buf_avail) + ? push_buf_avail + : m_push_queue_size - m_push_queue_local_end_p; + size_t loop_push = (m_push_queue_size > m_push_queue_local_end_p + push_buf_avail) + ? 0 + : (m_push_queue_local_end_p + push_buf_avail - m_push_queue_size); + + // Return to user the pointer to circular buffer for + // space that's available without loop around + if (cont_push > 0) { + *buffer_size = cont_push; + return temp_input_queue + m_push_queue_local_end_p; + } else if (loop_push > 0) { + *buffer_size = loop_push; + return temp_input_queue; + } else { + *status = 0; + *buffer_size = 0; + + // See if the driver thread is still running + sync_thread(); + + return NULL; + } + } else if (channel == ACL_HOST_CHANNEL_1_ID) { + if (m_hostch_pull_valid == 0) { + ACL_PCIE_DEBUG_MSG_VERBOSE( + VERBOSITY_BLOCKTX, ":::: [HOST CHANNEL] Host Channel %s is not open.\n", ACL_HOST_CHANNEL_1_NAME); + *status = ERROR_CHANNEL_CLOSED; + *buffer_size = 0; + return NULL; + } + *status = 0; + + char *temp_output_queue = (char *)m_pull_queue; + + size_t pull_queue_end, pull_queue_front; + + // m_pull_queue_end_p is directly updated by host channel IP + // through write over Txs. 
Save value in local variable, + // so it doesn't get modified in middle of get_buffer call + pull_queue_end = *m_pull_queue_end_p; + pull_queue_front = *m_pull_queue_front_p; + + // Calculate available new data in device to host pull buffer + if (pull_queue_end > pull_queue_front) + m_pull_queue_available = pull_queue_end - pull_queue_front; + else if (pull_queue_end < pull_queue_front) + m_pull_queue_available = m_pull_queue_size - pull_queue_front + pull_queue_end; + else + m_pull_queue_available = 0; + + // Calculate how much of the data is before loop around and after loop around + size_t cont_pull = (m_pull_queue_size > m_pull_queue_local_front_p + m_pull_queue_available) + ? m_pull_queue_available + : (m_pull_queue_size - m_pull_queue_local_front_p); + size_t loop_pull = (m_pull_queue_size > m_pull_queue_local_front_p + m_pull_queue_available) + ? 0 + : (m_pull_queue_local_front_p + m_pull_queue_available - m_pull_queue_size); + + // Return to user the pointer to circular buffer for + // data that's available without loop around + if (cont_pull > 0) { + *buffer_size = cont_pull; + return temp_output_queue + m_pull_queue_local_front_p; + } else if (loop_pull > 0) { + *buffer_size = loop_pull; + return temp_output_queue; + } else { + *buffer_size = 0; + + // See if the driver thread is still running + sync_thread(); + + return NULL; + } + } else { + ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_BLOCKTX, ":::: [HOST CHANNEL] Channel with ID %i does not exist.\n", channel); + *status = ERROR_INVALID_CHANNEL; + *buffer_size = 0; + return NULL; + } +} + +// User has acknowledged the buffer, meaning data was written to or read from the buffter. +// Hand off to API using end pointer if push channel, and front pointer if pull channel. 
// @param send_size - bytes the user claims to have written (push) or read (pull)
// @param channel - channel handle returned by create_hostchannel
// @param status - out: 0 or a negative error code
// @return bytes actually committed (clamped to available space/data and to the
//         contiguous region before wrap-around)
size_t ACL_PCIE_HOSTCH::ack_buffer(size_t send_size, int channel, int *status) {
  if (channel == ACL_HOST_CHANNEL_0_ID) {
    if (m_hostch_push_valid == 0) {
      ACL_PCIE_DEBUG_MSG_VERBOSE(
          VERBOSITY_BLOCKTX, ":::: [HOST CHANNEL] Host Channel %s is not open.\n", ACL_HOST_CHANNEL_0_NAME);
      *status = ERROR_CHANNEL_CLOSED;
      return 0;
    }
    *status = 0;

    size_t push_queue_end, push_queue_front;

    // Same calculations as get buffer call to see how much
    // space is available in MMD circular buffer
    push_queue_end = *m_push_queue_end_p;
    push_queue_front = *m_push_queue_front_p;

    // 32 here is the channel word size in bytes; one word is kept free so
    // a full buffer is distinguishable from an empty one
    // (NOTE(review): assumed from the /32 word math below — confirm against IP spec)
    size_t push_buf_avail;
    if (push_queue_end > push_queue_front)
      push_buf_avail = m_push_queue_size - push_queue_end + push_queue_front - 32;
    else if (push_queue_end < push_queue_front)
      push_buf_avail = push_queue_front - push_queue_end - 32;
    else
      push_buf_avail = m_push_queue_size - 32;

    // Check to see if user wants to send more than the space available in buffer
    // Choose lesser of the two to send; only whole 32-byte words are committed
    size_t user_words = send_size / 32;
    size_t current_push = ((user_words * 32) > push_buf_avail) ? push_buf_avail : (user_words * 32);

    // User can't write back to beginning of MMD buffer, since they can't loop around from the pointer
    // they got from get_buffer. Only send up to the end of MMD circular buffer to host channel IP
    size_t cont_push = (m_push_queue_size > m_push_queue_local_end_p + current_push)
                           ? current_push
                           : (m_push_queue_size - m_push_queue_local_end_p);

    // Update the end index that the driver thread will read, to write the update to host channel IP
    // and loop around
    m_push_queue_local_end_p =
        (m_push_queue_local_end_p + current_push >= m_push_queue_size) ? 0 : m_push_queue_local_end_p + current_push;
    *m_push_queue_end_p = m_push_queue_local_end_p;

    // See if the driver thread is still running
    sync_thread();

    return cont_push;
  } else if (channel == ACL_HOST_CHANNEL_1_ID) {
    if (m_hostch_pull_valid == 0) {
      ACL_PCIE_DEBUG_MSG_VERBOSE(
          VERBOSITY_BLOCKTX, ":::: [HOST CHANNEL] Host Channel %s is not open.\n", ACL_HOST_CHANNEL_1_NAME);
      *status = ERROR_CHANNEL_CLOSED;
      return 0;
    }
    *status = 0;

    size_t driver_pulled;

    size_t pull_queue_end, pull_queue_front;

    // Same calculations as get buffer call to see how much
    // data is available in MMD circular buffer
    pull_queue_end = *m_pull_queue_end_p;
    pull_queue_front = *m_pull_queue_front_p;

    if (pull_queue_end > pull_queue_front)
      m_pull_queue_available = pull_queue_end - pull_queue_front;
    else if (pull_queue_end < pull_queue_front)
      m_pull_queue_available = m_pull_queue_size - pull_queue_front + pull_queue_end;
    else
      m_pull_queue_available = 0;

    // Check to see if user read more than the data available in buffer
    // Choose lesser of the two to tell the user how much was actually
    // freed up for host channel IP to write to.
    driver_pulled = (send_size > m_pull_queue_available) ? m_pull_queue_available : send_size;

    // User can't loop around and read from the beginning of MMD buffer
    // Tell the host channel IP that the buffer is free, only up to the end of the circular buffer
    size_t cont_pull = (m_pull_queue_size > m_pull_queue_local_front_p + driver_pulled)
                           ? driver_pulled
                           : (m_pull_queue_size - m_pull_queue_local_front_p);

    // Update the front index that the driver thread will read, to write the update to host channel IP
    // and loop around
    m_pull_queue_local_front_p = (m_pull_queue_local_front_p + driver_pulled >= m_pull_queue_size)
                                     ? 0
                                     : m_pull_queue_local_front_p + driver_pulled;
    *m_pull_queue_front_p = m_pull_queue_local_front_p;

    // See if the driver thread is still running
    sync_thread();

    return cont_pull;
  } else {
    ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_BLOCKTX, ":::: [HOST CHANNEL] Channel with ID %i does not exist.\n", channel);
    *status = ERROR_INVALID_CHANNEL;
    return 0;
  }
}
diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_hostch.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_hostch.h
new file mode 100644
index 0000000..e86fa61
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_hostch.h
@@ -0,0 +1,136 @@
#ifndef ACL_PCIE_HOSTCH_H
#define ACL_PCIE_HOSTCH_H

/* (c) 1992-2021 Intel Corporation. */
/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */
/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */
/* and/or other countries. Other marks and brands may be claimed as the property */
/* of others. See Trademarks on intel.com for full list of Intel trademarks or */
/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */
/* Your use of Intel Corporation's design tools, logic functions and other */
/* software and tools, and its AMPP partner logic functions, and any output */
/* files any of the foregoing (including device programming or simulation */
/* files), and any associated documentation or information are expressly subject */
/* to the terms and conditions of the Altera Program License Subscription */
/* Agreement, Intel MegaCore Function License Agreement, or other applicable */
/* license agreement, including, without limitation, that your use is for the */
/* sole purpose of programming logic devices manufactured by Intel and sold by */
/* Intel or its authorized distributors. Please refer to the applicable */
/* agreement for further details.
*/

/* ===- acl_pcie_hostch.h -------------------------------------------- C++ -*-=== */
/*                                                                               */
/* Intel(R) OpenCL MMD Driver                                                    */
/*                                                                               */
/* ===-------------------------------------------------------------------------=== */
/*                                                                               */
/* This file declares the class to handle the host channel (streaming)          */
/* interface of the MMD.                                                        */
/* The actual implementation of the class lives in the acl_pcie_hostch.cpp      */
/*                                                                               */
/* ===-------------------------------------------------------------------------=== */

#ifdef DLA_MMD
#include <cstddef>  // size_t
#if defined(LINUX)
typedef int fpga_handle;
#else
#include <opae/fpga.h>
#endif
#endif

class ACL_PCIE_DEVICE;
class ACL_PCIE_MM_IO_MGR;
class ACL_PCIE_TIMER;
class ACL_PCIE_DMA;

// Manages the host channel (streaming) interface: a pair of page-aligned
// circular buffers shared with the host channel IP, one per direction
// (push = host to device, pull = device to host).
class ACL_PCIE_HOSTCH {
 public:
  ACL_PCIE_HOSTCH(fpga_handle handle, ACL_PCIE_MM_IO_MGR *io, ACL_PCIE_DEVICE *pcie, ACL_PCIE_DMA *dma);

  ~ACL_PCIE_HOSTCH();

  // Initialize host channel specified by name, and return handle to it
  int create_hostchannel(char *name, size_t queue_depth, int direction);

  // Destroy host channel specified by channel handle
  // return 0 on success and negative otherwise
  int destroy_hostchannel(int channel);

  // Provide pointer to user with pointer to write and read to host channel
  // IP with. Pointer is pointer to MMD circular buffer, that's pre-pinned.
  // Address of this pre-pinned memory is transferred to IP during create
  void *get_buffer(size_t *buffer_size, int channel, int *status);

  // Acknowledge from user that send_size bytes of data has be written to
  // or read from host channel MMD buffer, that's provided by the channel
  // handle. This will move end index for push channel, and front index for
  // pull channel
  size_t ack_buffer(size_t send_size, int channel, int *status);

 private:
  // Non-copyable: instances own per-device pinned buffers.
  ACL_PCIE_HOSTCH &operator=(const ACL_PCIE_HOSTCH &) { return *this; }

  ACL_PCIE_HOSTCH(const ACL_PCIE_HOSTCH &src) {}

  // Host Channel version of programmed device
  unsigned int get_hostch_version();

  // Helper functions to see if the thread that updates
  // host channel IP with user's buffer updates, is still running
  int launch_sync_thread();
  int sync_thread();
  void destroy_sync_thread();

  fpga_handle m_handle;
  ACL_PCIE_DEVICE *m_pcie;
  ACL_PCIE_MM_IO_MGR *m_io;
  ACL_PCIE_DMA *m_dma;

  ACL_PCIE_TIMER *m_timer;
  int m_use_timer;  // set from ACL_PCIE_DMA_TIMER environment variable

  // Host Channel valid
  // If channel is open, equal to 1
  int m_hostch_push_valid;
  int m_hostch_pull_valid;

  // Input Queue
  // Write data into circular buffer in MMD, that host channel
  // can read from
  void *m_push_queue;
  size_t m_push_queue_local_end_p;
  size_t m_push_queue_size;

  // Information to track input queue
  void *m_pull_queue;
  size_t m_pull_queue_local_front_p;
  size_t m_pull_queue_size;
  size_t m_pull_queue_available;

  // Shared front and end pointer with driver
  // Circular buffer in MMD that the host channel IP can
  // write into. Host will then read from it
  size_t *m_pull_queue_pointer;
  size_t *m_push_queue_pointer;

  size_t *m_pull_queue_front_p;
  size_t *m_pull_queue_end_p;
  size_t *m_push_queue_front_p;
  size_t *m_push_queue_end_p;

  // User space memory that Linux kernel space has write
  // access to. Since the MMD buffer is circular, whenever
  // user writes to reads from it, the index for end and front
  // changes, respectively. This needs to be sent to host channel IP
  // and the thread in driver handles that. However, this thread will
  // die after 1ms of inactivity to free up the CPU. When it does that,
  // it will write to m_sync_thread with value of 0, so that MMD knows to
  // launch it again, for subsequent get_buffer and ack_buffer calls.
  int m_sync_thread_valid;
  size_t *m_sync_thread;
};

// Page-aligned allocation helpers (posix_memalign / _aligned_malloc).
void acl_aligned_malloc(void **result, size_t size);
void acl_aligned_free(void *ptr);

#endif  // ACL_PCIE_HOSTCH_H
diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_mm_io.cpp b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_mm_io.cpp
new file mode 100644
index 0000000..92c9cf0
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_mm_io.cpp
@@ -0,0 +1,556 @@
// (c) 1992-2021 Intel Corporation.
// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words
// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S.
// and/or other countries. Other marks and brands may be claimed as the property
// of others. See Trademarks on intel.com for full list of Intel trademarks or
// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera)
// Your use of Intel Corporation's design tools, logic functions and other
// software and tools, and its AMPP partner logic functions, and any output
// files any of the foregoing (including device programming or simulation
// files), and any associated documentation or information are expressly subject
// to the terms and conditions of the Altera Program License Subscription
// Agreement, Intel MegaCore Function License Agreement, or other applicable
// license agreement, including, without limitation, that your use is for the
// sole purpose of programming logic devices manufactured by Intel and sold by
// Intel or its authorized distributors. Please refer to the applicable
// agreement for further details.
+ +/* ===- acl_pcie_mm_io.cpp ------------------------------------------- C++ -*-=== */ +/* */ +/* Intel(R) OpenCL MMD Driver */ +/* */ +/* ===-------------------------------------------------------------------------=== */ +/* */ +/* This file implements the class to handle memory mapped IO over PCIe. */ +/* The declaration of the class lives in the acl_pcie_mm_io.h. */ +/* */ +/* ===-------------------------------------------------------------------------=== */ + +// common and its own header files +#include "acl_pcie_mm_io.h" +#include "acl_pcie.h" + +// other header files inside MMD driver +#include "acl_pcie_debug.h" + +// other standard header files +#include <string.h> + +#if defined(LINUX) +#include <unistd.h> // template +#endif // LINUX + +ACL_PCIE_MM_IO_DEVICE::ACL_PCIE_MM_IO_DEVICE( + fpga_handle handle, DWORD bar, KPTR device_offset, const char *name, bool diff_endian) { + ACL_PCIE_ASSERT(handle != INVALID_HANDLE_VALUE, "passed in an invalid handle when creating mm_io object.\n"); + +#if defined(WINDOWS) + strncpy_s(m_name, MAX_NAME_LENGTH - 1, name, (MAX_NAME_LENGTH - 1)); +#else + strncpy(m_name, name, (MAX_NAME_LENGTH - 1)); +#endif + m_name[(MAX_NAME_LENGTH - 1)] = '\0'; + + m_handle = handle; + m_bar = bar; + m_offset = device_offset; + m_diff_endian = diff_endian; + + ACL_PCIE_DEBUG_MSG(":: [%s] Init: Bar " DWORD_FMT_U ", Total offset 0x%zu, diff_endian is %d \n", + m_name, + m_bar, + (size_t)m_offset, + m_diff_endian ? 
                     1 : 0);
}

ACL_PCIE_MM_IO_DEVICE::~ACL_PCIE_MM_IO_DEVICE() {}

#if defined(LINUX)
// Helper functions to implement all other read/write functions
//
// linux_read: reads sizeof(T) bytes from the given BAR/address on the device
// into *data by issuing a single command to the kernel driver via read().
// Returns the driver's status (FPGA_OK / 0 on success).
template <typename T>
DWORD linux_read(fpga_handle device, DWORD bar, KPTR address, T *data) {
  struct acl_cmd driver_cmd;
  driver_cmd.bar_id = bar;
  driver_cmd.command = ACLPCI_CMD_DEFAULT;
  driver_cmd.device_addr = reinterpret_cast<void *>(address);
  driver_cmd.user_addr = data;
  driver_cmd.size = sizeof(*data);
  // Functions invoking linux_read will not write to global memory.
  // So is_diff_endian is always false
  driver_cmd.is_diff_endian = 0;

  return read(device, &driver_cmd, sizeof(driver_cmd));
}

// linux_write: writes the sizeof(T)-byte value `data` to the given BAR/address
// on the device by issuing a single command to the kernel driver via write().
// Note: `data` is taken by value, so user_addr points at this stack copy for
// the duration of the (synchronous) write() call only.
// Returns the driver's status (FPGA_OK / 0 on success).
template <typename T>
DWORD linux_write(fpga_handle device, DWORD bar, KPTR address, T data) {
  struct acl_cmd driver_cmd;
  driver_cmd.bar_id = bar;
  driver_cmd.command = ACLPCI_CMD_DEFAULT;
  driver_cmd.device_addr = reinterpret_cast<void *>(address);
  driver_cmd.user_addr = &data;
  driver_cmd.size = sizeof(data);
  // Functions invoking linux_write will not write to global memory.
+ // So is_diff_endian is always false + driver_cmd.is_diff_endian = 0; + + return write(device, &driver_cmd, sizeof(driver_cmd)); +} +#endif // LINUX + +int ACL_PCIE_MM_IO_DEVICE::read8(size_t addr, UINT8 *data) { + fpga_result status; + KPTR bar_addr = convert_to_bar_addr(addr); +#if defined(WINDOWS) + status = fpgaReadMmio(m_handle, m_bar, bar_addr, (PVOID)data, sizeof(UINT8)); +#endif // WINDOWS +#if defined(LINUX) + status = linux_read(m_handle, m_bar, bar_addr, data); +#endif // LINUX + + ACL_PCIE_ERROR_IF(status != FPGA_OK, + return -1, + "[%s] Read 8 bits from 0x" SIZE_FMT_X " (0x" SIZE_FMT_X " with offset)\n", + m_name, + addr, + (size_t)bar_addr); + ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_PCIE, + ":::::: [%s] Read 8 bits (0x%x) from 0x" SIZE_FMT_X " (0x" SIZE_FMT_X " with offset)\n", + m_name, + *data, + addr, + (size_t)bar_addr); + + return 0; // success +} + +int ACL_PCIE_MM_IO_DEVICE::write8(size_t addr, UINT8 data) { + fpga_result status; + KPTR bar_addr = convert_to_bar_addr(addr); +#if defined(WINDOWS) + status = fpgaWriteMmio(m_handle, m_bar, bar_addr, (PVOID)&data, sizeof(UINT8)); +#endif // WINDOWS +#if defined(LINUX) + status = linux_write(m_handle, m_bar, bar_addr, data); +#endif // LINUX + + ACL_PCIE_ERROR_IF(status != FPGA_OK, + return -1, + "[%s] Writing 8 bits to 0x" SIZE_FMT_X " (0x" SIZE_FMT_X " with offset)\n", + m_name, + addr, + (size_t)bar_addr); + ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_PCIE, + ":::::: [%s] Wrote 8 bits (0x%x) to 0x" SIZE_FMT_X " (0x" SIZE_FMT_X " with offset)\n", + m_name, + data, + addr, + (size_t)bar_addr); + + return 0; // success +} + +int ACL_PCIE_MM_IO_DEVICE::read16(size_t addr, UINT16 *data) { + fpga_result status; + KPTR bar_addr = convert_to_bar_addr(addr); +#if defined(WINDOWS) + status = fpgaReadMmio(m_handle, m_bar, bar_addr, (PVOID)data, sizeof(UINT16)); +#endif // WINDOWS +#if defined(LINUX) + status = linux_read(m_handle, m_bar, bar_addr, data); +#endif // LINUX + + ACL_PCIE_ERROR_IF(status != FPGA_OK, + 
return -1, + "[%s] Read 16 bits from 0x" SIZE_FMT_X " (0x" SIZE_FMT_X " with offset)\n", + m_name, + addr, + (size_t)bar_addr); + ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_PCIE, + ":::::: [%s] Read 16 bits (0x%x) from 0x" SIZE_FMT_X " (0x" SIZE_FMT_X " with offset)\n", + m_name, + *data, + addr, + (size_t)bar_addr); + + return 0; // success +} + +int ACL_PCIE_MM_IO_DEVICE::write16(size_t addr, UINT16 data) { + fpga_result status; + KPTR bar_addr = convert_to_bar_addr(addr); +#if defined(WINDOWS) + status = fpgaWriteMmio(m_handle, m_bar, bar_addr, (PVOID)&data, sizeof(UINT16)); +#endif // WINDOWS +#if defined(LINUX) + status = linux_write(m_handle, m_bar, bar_addr, data); +#endif // LINUX + + ACL_PCIE_ERROR_IF(status != FPGA_OK, + return -1, + "[%s] Writing 16 bits to 0x" SIZE_FMT_X " (0x" SIZE_FMT_X " with offset)\n", + m_name, + addr, + (size_t)bar_addr); + ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_PCIE, + ":::::: [%s] Wrote 16 bits (0x%x) to 0x" SIZE_FMT_X " (0x" SIZE_FMT_X " with offset)\n", + m_name, + data, + addr, + (size_t)bar_addr); + + return 0; // success +} + +int ACL_PCIE_MM_IO_DEVICE::read32(size_t addr, UINT32 *data) { + fpga_result status; + KPTR bar_addr = convert_to_bar_addr(addr); +#if defined(WINDOWS) + status = fpgaReadMMIO32(m_handle, m_bar, bar_addr, data); +#endif // WINDOWS +#if defined(LINUX) + status = linux_read(m_handle, m_bar, bar_addr, data); +#endif // LINUX + + ACL_PCIE_ERROR_IF(status != FPGA_OK, + return -1, + "[%s] Read 32 bits from 0x" SIZE_FMT_X " (0x" SIZE_FMT_X " with offset)\n", + m_name, + addr, + (size_t)bar_addr); + ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_PCIE, + ":::::: [%s] Read 32 bits (0x%x) from 0x" SIZE_FMT_X " (0x" SIZE_FMT_X " with offset)\n", + m_name, + *data, + addr, + (size_t)bar_addr); + + return 0; // success +} + +int ACL_PCIE_MM_IO_DEVICE::write32(size_t addr, UINT32 data) { + fpga_result status; + KPTR bar_addr = convert_to_bar_addr(addr); +#if defined(WINDOWS) + status = fpgaWriteMMIO32(m_handle, m_bar, bar_addr, 
data); +#endif // WINDOWS +#if defined(LINUX) + status = linux_write(m_handle, m_bar, bar_addr, data); +#endif // LINUX + + ACL_PCIE_ERROR_IF(status != FPGA_OK, + return -1, + "[%s] Writing 32 bits to 0x" SIZE_FMT_X " (0x" SIZE_FMT_X " with offset)\n", + m_name, + addr, + (size_t)bar_addr); + ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_PCIE, + ":::::: [%s] Wrote 32 bits (0x%x) to 0x" SIZE_FMT_X " (0x" SIZE_FMT_X " with offset)\n", + m_name, + data, + addr, + (size_t)bar_addr); + + return 0; // success +} + +int ACL_PCIE_MM_IO_DEVICE::read64(size_t addr, UINT64 *data) { + fpga_result status; + KPTR bar_addr = convert_to_bar_addr(addr); +#if defined(WINDOWS) + // Original code had a 32-bit Read + status = fpgaReadMmio(m_handle, m_bar, bar_addr, data, 8); + +#endif // WINDOWS +#if defined(LINUX) + status = linux_read(m_handle, m_bar, bar_addr, data); +#endif // LINUX + + ACL_PCIE_ERROR_IF(status != FPGA_OK, + return -1, + "[%s] Read 64 bits from 0x" SIZE_FMT_X " (0x" SIZE_FMT_X " with offset)\n", + m_name, + addr, + (size_t)bar_addr); + ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_PCIE, + ":::::: [%s] Read 64 bits (0x%llx) from 0x" SIZE_FMT_X " (0x" SIZE_FMT_X " with offset)\n", + m_name, + *data, + addr, + (size_t)bar_addr); + + return 0; // success +} + +int ACL_PCIE_MM_IO_DEVICE::write64(size_t addr, UINT64 data) { + fpga_result status; + KPTR bar_addr = convert_to_bar_addr(addr); +#if defined(WINDOWS) + // Original code had a 32-bit Write + status = fpgaWriteMmio(m_handle, m_bar, bar_addr, (void *)&data, 8); + +#endif // WINDOWS +#if defined(LINUX) + status = linux_write(m_handle, m_bar, bar_addr, data); +#endif // LINUX + + ACL_PCIE_ERROR_IF(status != FPGA_OK, + return -1, + "[%s] Writing 64 bits to 0x" SIZE_FMT_X " (0x" SIZE_FMT_X " with offset)\n", + m_name, + bar_addr, + (size_t)bar_addr); + ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_PCIE, + ":::::: [%s] Wrote 64 bits (0x%llx) to 0x" SIZE_FMT_X " (0x" SIZE_FMT_X " with offset)\n", + m_name, + data, + addr, + (size_t)bar_addr); + + 
return 0; // success +} + +int ACL_PCIE_MM_IO_DEVICE::write_block(size_t addr, size_t size, void *src) { + fpga_result status; + KPTR bar_addr = convert_to_bar_addr(addr); + + ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_PCIE, + ":::::: [%s] Writing block (" SIZE_FMT_U " bytes) to 0x" SIZE_FMT_X " (0x" SIZE_FMT_X + " with offset)\n", + m_name, + size, + addr, + (size_t)bar_addr); + +#if defined(WINDOWS) + DWORD FP_size = static_cast<DWORD>(size); + size_t alignment_size = size % 4; + DWORD FP_alignment_size = static_cast<DWORD>(alignment_size); + // 32-bit MMIO Write + status = fpgaWriteMmio(m_handle, m_bar, bar_addr, src, FP_size - FP_alignment_size); + if (alignment_size) { + void *alignment_addr = compute_address(src, size - alignment_size); + KPTR alignment_bar_addr = bar_addr + size - alignment_size; + status = fpgaWriteMmio(m_handle, m_bar, alignment_bar_addr, alignment_addr, FP_alignment_size); + } + +#endif // WINDOWS +#if defined(LINUX) + // Can't use templated linux_write here because *src doesn't give you the size to read. + struct acl_cmd driver_cmd {}; + driver_cmd.bar_id = m_bar; + driver_cmd.device_addr = reinterpret_cast<void *>(bar_addr); + driver_cmd.user_addr = src; + driver_cmd.size = size; + // Notify the driver if the host and device's memory have different endianess. + driver_cmd.is_diff_endian = m_diff_endian ? 
1 : 0; + status = write(m_handle, &driver_cmd, sizeof(driver_cmd)); +#endif // LINUX + + ACL_PCIE_ERROR_IF(status != FPGA_OK, + return -1, + "[%s] Writing block (" SIZE_FMT_U " bytes) to 0x" SIZE_FMT_X " (0x" SIZE_FMT_X " with offset)\n", + m_name, + size, + addr, + (size_t)bar_addr); + return 0; // success +} + +inline void *ACL_PCIE_MM_IO_DEVICE::compute_address(void *base, uintptr_t offset) { + uintptr_t p = reinterpret_cast<uintptr_t>(base); + return reinterpret_cast<void *>(p + offset); +} + +int ACL_PCIE_MM_IO_DEVICE::read_block(size_t addr, size_t size, void *dst) { + DWORD status; + KPTR bar_addr = convert_to_bar_addr(addr); + + ACL_PCIE_DEBUG_MSG_VERBOSE(VERBOSITY_PCIE, + ":::::: [%s] Reading block (" SIZE_FMT_U " bytes) from 0x" SIZE_FMT_X " (0x" SIZE_FMT_X + " with offset)\n", + m_name, + size, + addr, + (size_t)bar_addr); + +#if defined(WINDOWS) + DWORD FP_size = static_cast<DWORD>(size); + size_t alignment_size = size % 4; + DWORD FP_alignment_size = static_cast<DWORD>(alignment_size); + // 32-bit MMIO Read + status = fpgaReadMmio(m_handle, m_bar, bar_addr, dst, FP_size - FP_alignment_size); + if (alignment_size) { + void *alignment_addr = compute_address(dst, size - alignment_size); + KPTR alignment_bar_addr = bar_addr + size - alignment_size; + status |= fpgaReadMmio(m_handle, m_bar, alignment_bar_addr, alignment_addr, FP_alignment_size); + } + +#endif // WINDOWS +#if defined(LINUX) + // Can't use templated linux_write here because *src doesn't give you the size to read. + struct acl_cmd driver_cmd; + driver_cmd.bar_id = m_bar; + driver_cmd.device_addr = reinterpret_cast<void *>(bar_addr); + driver_cmd.user_addr = dst; + driver_cmd.size = size; + // Notify the driver if the host and device's memory have different endianess. + driver_cmd.is_diff_endian = m_diff_endian ? 
1 : 0; + status = read(m_handle, &driver_cmd, sizeof(driver_cmd)); +#endif // LINUX + + ACL_PCIE_ERROR_IF(status != FPGA_OK, + return -1, + "[%s] Reading block (" SIZE_FMT_U " bytes) to 0x" SIZE_FMT_X " (0x" SIZE_FMT_X " with offset)\n", + m_name, + size, + addr, + (size_t)bar_addr); + return 0; // success +} + +ACL_PCIE_MM_IO_MGR::ACL_PCIE_MM_IO_MGR(fpga_handle handle) + : mem(NULL), + pcie_cra(NULL), + window(NULL), + version(NULL), + pr_base_id(NULL), + pr_region_ctrl(NULL), + quartus_ver(NULL), + cade_id(NULL), + uniphy_status(NULL), + uniphy_reset(NULL), + kernel_if(NULL), + pll(NULL), + temp_sensor(NULL), + hostch_ver(NULL) { + ACL_PCIE_ASSERT(handle != INVALID_HANDLE_VALUE, "passed in an invalid device when creating mm_io_mgr.\n"); + + // This is the PCIe's interface for directly accessing memory (which is + // significantly slower than using DMA). This view of memory is segmented + // so that the size of this address space can be smaller than the amount of + // physical device memory. The window interface controls which region of + // physical memory this interface currently maps to. + // The last flag indicate if the device on both side of transferring have + // different endianess. +#ifdef ACL_BIG_ENDIAN + mem = new ACL_PCIE_MM_IO_DEVICE(handle, ACL_PCI_GLOBAL_MEM_BAR, (KPTR)ACL_PCIE_MEMWINDOW_BASE, "GLOBAL-MEM", true); +#else + mem = new ACL_PCIE_MM_IO_DEVICE(handle, ACL_PCI_GLOBAL_MEM_BAR, (KPTR)ACL_PCIE_MEMWINDOW_BASE, "GLOBAL-MEM", false); +#endif + + // This is the CRA port of our PCIe controller. Used for configuring + // interrupts and things like that. + pcie_cra = new ACL_PCIE_MM_IO_DEVICE(handle, ACL_PCI_CRA_BAR, ACL_PCI_CRA_OFFSET, "PCIE-CRA"); + + // This interface sets the high order address bits for the PCIe's direct + // memory accesses via "mem" (above). 
+ window = new ACL_PCIE_MM_IO_DEVICE(handle, ACL_PCIE_MEMWINDOW_BAR, ACL_PCIE_MEMWINDOW_CRA, "MEMWINDOW"); + + // DMA interfaces + dma = new ACL_PCIE_MM_IO_DEVICE(handle, ACL_PCIE_DMA_INTERNAL_BAR, ACL_PCIE_DMA_INTERNAL_CTR_BASE, "DMA-CTR"); + + // Version ID check + version = new ACL_PCIE_MM_IO_DEVICE(handle, ACL_VERSIONID_BAR, ACL_VERSIONID_OFFSET, "VERSION"); + + // PR base ID check + pr_base_id = new ACL_PCIE_MM_IO_DEVICE(handle, ACL_PRBASEID_BAR, ACL_PRBASEID_OFFSET, "PRBASEID"); + + // PR region controller + pr_region_ctrl = new ACL_PCIE_MM_IO_DEVICE(handle, ACL_PRREGIONFREEZE_BAR, ACL_PRREGIONFREEZE_OFFSET, "PRREGIONCTRL"); + + // Quartus Version + quartus_ver = new ACL_PCIE_MM_IO_DEVICE(handle, ACL_QUARTUSVER_BAR, ACL_QUARTUSVER_OFFSET, "QUARTUS-VERSION"); + + // Quartus Version + hostch_ver = new ACL_PCIE_MM_IO_DEVICE(handle, ACL_HOSTCH_VERSION_BAR, ACL_HOSTCH_VERSION_OFFSET, "HOSTCH-VERSION"); + + // Cable auto detect ID + cade_id = new ACL_PCIE_MM_IO_DEVICE(handle, ACL_CADEID_BAR, ACL_CADEID_OFFSET, "CADEID"); + + // Uniphy Status + uniphy_status = new ACL_PCIE_MM_IO_DEVICE(handle, ACL_UNIPHYSTATUS_BAR, ACL_UNIPHYSTATUS_OFFSET, "UNIPHYSTATUS"); + + // Uniphy Reset + uniphy_reset = new ACL_PCIE_MM_IO_DEVICE(handle, ACL_UNIPHYRESET_BAR, ACL_UNIPHYRESET_OFFSET, "UNIPHYRESET"); + + // Kernel interface + // The DLA BSP eliminates the kernel interface present in the original PR Terasic BSP + // We reuse the kernel_if object here to simplify the DLA-specific changes required +#ifdef DLA_MMD + kernel_if = new ACL_PCIE_MM_IO_DEVICE(handle, ACL_KERNEL_CSR_BAR, ACL_DLA_CSR_OFFSET, "KERNEL"); +#else + kernel_if = new ACL_PCIE_MM_IO_DEVICE(handle, ACL_KERNEL_CSR_BAR, ACL_KERNEL_CSR_OFFSET, "KERNEL"); +#endif // DLA_MMD + + // PLL interface + pll = new ACL_PCIE_MM_IO_DEVICE(handle, ACL_PCIE_KERNELPLL_RECONFIG_BAR, ACL_PCIE_KERNELPLL_RECONFIG_OFFSET, "PLL"); + + // temperature sensor +#ifdef ACL_PCIE_HAS_TEMP_SENSOR + temp_sensor = new ACL_PCIE_MM_IO_DEVICE(handle, 
ACL_VERSIONID_BAR, ACL_PCIE_TEMP_SENSOR_ADDRESS, "TEMP-SENSOR"); +#endif +} + +ACL_PCIE_MM_IO_MGR::~ACL_PCIE_MM_IO_MGR() { + if (mem) { + delete mem; + mem = NULL; + } + if (pcie_cra) { + delete pcie_cra; + pcie_cra = NULL; + } + if (window) { + delete window; + window = NULL; + } + if (version) { + delete version; + version = NULL; + } + if (pr_base_id) { + delete pr_base_id; + pr_base_id = NULL; + } + if (pr_region_ctrl) { + delete pr_region_ctrl; + pr_region_ctrl = NULL; + } + if (quartus_ver) { + delete quartus_ver; + quartus_ver = NULL; + } + if (cade_id) { + delete cade_id; + cade_id = NULL; + } + if (uniphy_status) { + delete uniphy_status; + uniphy_status = NULL; + } + if (uniphy_reset) { + delete uniphy_reset; + uniphy_reset = NULL; + } + if (kernel_if) { + delete kernel_if; + kernel_if = NULL; + } + if (pll) { + delete pll; + pll = NULL; + } + if (temp_sensor) { + delete temp_sensor; + temp_sensor = NULL; + } + if (hostch_ver) { + delete hostch_ver; + hostch_ver = NULL; + } + if (dma) { + delete dma; + dma = NULL; + } +} diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_mm_io.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_mm_io.h new file mode 100644 index 0000000..4db5599 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_mm_io.h @@ -0,0 +1,109 @@ +#ifndef ACL_PCIE_MM_IO_H +#define ACL_PCIE_MM_IO_H + +/* (c) 1992-2021 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. 
See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. */ + +/* ===- acl_pcie_mm_io.h --------------------------------------------- C++ -*-=== */ +/* */ +/* Intel(R) OpenCL MMD Driver */ +/* */ +/* ===-------------------------------------------------------------------------=== */ +/* */ +/* This file declares the class to handle memory mapped IO over PCIe. 
*/ +/* The actual implementation of the class lives in the acl_pcie_mm_io.cpp, */ +/* */ +/* ===-------------------------------------------------------------------------=== */ + +#if defined(LINUX) +typedef int fpga_handle; +#define FPGA_OK 0 +#endif // LINUX + +#ifdef DLA_MMD +#include "acl_pcie.h" +#define ACL_DLA_CSR_OFFSET 0x0000 +#endif +/* + * + */ +class ACL_PCIE_MM_IO_DEVICE { + public: + ACL_PCIE_MM_IO_DEVICE(fpga_handle handle, DWORD bar, KPTR device_offset, const char *name, bool diff_endian = false); + ~ACL_PCIE_MM_IO_DEVICE(); + + DWORD bar_id() { return m_bar; }; + KPTR convert_to_bar_addr(size_t addr) { return addr + m_offset; }; + + // read/write functions to the memory-mapped io device + // return 0 on success, negative on error + int read8(size_t addr, UINT8 *data); + int write8(size_t addr, UINT8 data); + int read16(size_t addr, UINT16 *data); + int write16(size_t addr, UINT16 data); + int read32(size_t addr, UINT32 *data); + int write32(size_t addr, UINT32 data); + int read64(size_t addr, UINT64 *data); + int write64(size_t addr, UINT64 data); + + int read_block(size_t addr, size_t size, void *dst); + int write_block(size_t addr, size_t size, void *src); + + private: + static const int MAX_NAME_LENGTH = 32; + + // Helper functions + inline void *compute_address(void *base, uintptr_t offset); + + char m_name[MAX_NAME_LENGTH]; + fpga_handle m_handle; + DWORD m_bar; + KPTR m_offset; + bool m_diff_endian; // indicates if the host and this device have different endianess +}; + +/* + * Utility functions to clean up the various address translations for reads/writes + */ +class ACL_PCIE_MM_IO_MGR { + private: + ACL_PCIE_MM_IO_MGR &operator=(const ACL_PCIE_MM_IO_MGR &) { return *this; } + + ACL_PCIE_MM_IO_MGR(const ACL_PCIE_MM_IO_MGR &src) {} + + public: + ACL_PCIE_MM_IO_MGR(fpga_handle handle); + ~ACL_PCIE_MM_IO_MGR(); + + ACL_PCIE_MM_IO_DEVICE *mem; + ACL_PCIE_MM_IO_DEVICE *pcie_cra; + ACL_PCIE_MM_IO_DEVICE *dma; + ACL_PCIE_MM_IO_DEVICE *window; + 
ACL_PCIE_MM_IO_DEVICE *version; + ACL_PCIE_MM_IO_DEVICE *pr_base_id; + ACL_PCIE_MM_IO_DEVICE *pr_region_ctrl; + ACL_PCIE_MM_IO_DEVICE *quartus_ver; + ACL_PCIE_MM_IO_DEVICE *cade_id; + ACL_PCIE_MM_IO_DEVICE *uniphy_status; + ACL_PCIE_MM_IO_DEVICE *uniphy_reset; + ACL_PCIE_MM_IO_DEVICE *kernel_if; + ACL_PCIE_MM_IO_DEVICE *pll; + ACL_PCIE_MM_IO_DEVICE *temp_sensor; + ACL_PCIE_MM_IO_DEVICE *hostch_ver; +}; + +#endif // ACL_PCIE_MM_IO_H diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_timer.cpp b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_timer.cpp new file mode 100644 index 0000000..855d6ba --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_timer.cpp @@ -0,0 +1,67 @@ +// (c) 1992-2021 Intel Corporation. +// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words +// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. +// and/or other countries. Other marks and brands may be claimed as the property +// of others. See Trademarks on intel.com for full list of Intel trademarks or +// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) +// Your use of Intel Corporation's design tools, logic functions and other +// software and tools, and its AMPP partner logic functions, and any output +// files any of the foregoing (including device programming or simulation +// files), and any associated documentation or information are expressly subject +// to the terms and conditions of the Altera Program License Subscription +// Agreement, Intel MegaCore Function License Agreement, or other applicable +// license agreement, including, without limitation, that your use is for the +// sole purpose of programming logic devices manufactured by Intel and sold by +// Intel or its authorized distributors. Please refer to the applicable +// agreement for further details. 

/* ===- acl_pcie_timer.cpp ------------------------------------------- C++ -*-=== */
/*                                                                               */
/* Intel(R) OpenCL MMD Driver                                                    */
/*                                                                               */
/* ===-------------------------------------------------------------------------=== */
/*                                                                               */
/* This file implements the class to query the host's system timer.              */
/* The declaration of the class lives in the acl_pcie_timer.h                    */
/*                                                                               */
/* ===-------------------------------------------------------------------------=== */

// common and its own header files
#include "acl_pcie_timer.h"
#include "acl_pcie.h"

// other standard header files
#include <fstream>

// On Windows, caches the QueryPerformanceCounter tick rate needed to convert
// ticks to nanoseconds. On Linux no setup is needed (clock_gettime is used).
ACL_PCIE_TIMER::ACL_PCIE_TIMER() : m_ticks_per_second(0) {
#if defined(WINDOWS)
  // Cache the performance counter frequency
  LARGE_INTEGER li;
  QueryPerformanceFrequency(&li);
  m_ticks_per_second = li.QuadPart;

  ACL_PCIE_ASSERT(m_ticks_per_second != 0, "m_ticks_per_second == 0!\n");
#endif  // WINDOWS
}

ACL_PCIE_TIMER::~ACL_PCIE_TIMER() {}

// Return the current time in nanoseconds.
// NOTE(review): the epoch differs per platform — Windows uses
// QueryPerformanceCounter (arbitrary epoch, monotonic), Linux uses
// CLOCK_REALTIME (Unix epoch, subject to clock adjustments) — so values are
// only meaningful as deltas taken on the same host/platform.
cl_ulong ACL_PCIE_TIMER::get_time_ns() {
#if defined(WINDOWS)
  const INT64 NS_PER_S = 1000000000;
  LARGE_INTEGER li;

  QueryPerformanceCounter(&li);
  INT64 ticks = li.QuadPart;
  double seconds = ticks / (double)m_ticks_per_second;

  return static_cast<cl_ulong>(seconds * NS_PER_S + 0.5);
#endif  // WINDOWS
#if defined(LINUX)
  struct timespec a;
  const cl_ulong NS_PER_S = 1000000000;
  clock_gettime(CLOCK_REALTIME, &a);

  return static_cast<cl_ulong>(a.tv_nsec) + static_cast<cl_ulong>(a.tv_sec * NS_PER_S);
#endif  // LINUX
}
diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_timer.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_timer.h
new file mode 100644
index 0000000..646d681
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/acl_pcie_timer.h
@@ -0,0 +1,50 @@
#ifndef ACL_PCIE_TIMER_H
#define ACL_PCIE_TIMER_H

/* (c) 1992-2021 Intel Corporation.
*/ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. */ + +/* ===- acl_pcie_timer.h --------------------------------------------- C++ -*-=== */ +/* */ +/* Intel(R) OpenCL MMD Driver */ +/* */ +/* ===-------------------------------------------------------------------------=== */ +/* */ +/* This file declares the class to query the host's system timer. 
*/ +/* The actual implementation of the class lives in the acl_pcie_timer.cpp */ +/* */ +/* ===-------------------------------------------------------------------------=== */ + +#ifdef DLA_MMD +// don't assume opencl has been installed +#include "acl_pcie.h" +typedef UINT64 cl_ulong; +#endif + +class ACL_PCIE_TIMER { + public: + ACL_PCIE_TIMER(); + ~ACL_PCIE_TIMER(); + + // function to query the host's system timer + cl_ulong get_time_ns(); + + private: + INT64 m_ticks_per_second; +}; + +#endif // ACL_PCIE_TIMER_H diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/version.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/version.h new file mode 100644 index 0000000..ffecc32 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/host/version.h @@ -0,0 +1 @@ +#define ACL_DRIVER_VERSION "20.4.d41d8cd98f00b204e9800998ecf8427e" diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/aocl_mmd.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/aocl_mmd.h new file mode 100644 index 0000000..6d5c85e --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/aocl_mmd.h @@ -0,0 +1,640 @@ +#ifndef AOCL_MMD_H +#define AOCL_MMD_H + +/* (c) 1992-2021 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. 
See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. */ + +#ifdef DLA_MMD +#include <cstddef> //size_t +#include <cstdint> //uint32_t +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* Support for memory mapped ACL devices. + * + * Typical API lifecycle, from the perspective of the caller. + * + * 1. aocl_mmd_open must be called first, to provide a handle for further + * operations. + * + * 2. The interrupt and status handlers must be set. + * + * 3. Read and write operations are performed. + * + * 4. aocl_mmd_close may be called to shut down the device. No further + * operations are permitted until a subsequent aocl_mmd_open call. + * + * aocl_mmd_get_offline_info can be called anytime including before + * open. aocl_mmd_get_info can be called anytime between open and close. 
+ */ + +#ifndef AOCL_MMD_CALL +#if defined(_WIN32) +#define AOCL_MMD_CALL __declspec(dllimport) +#else +#define AOCL_MMD_CALL __attribute__((visibility ("default"))) +#endif +#endif + +#ifndef WEAK +#if defined(_WIN32) +#define WEAK +#else +/* This normally comes with "__attribute__((weak))" but for reasons not presently + * understood, the shared library is not properly loaded on Ubuntu18 when the functions + * are weak. + */ +#define WEAK +#endif +#endif + +/* The MMD API's version - the runtime expects this string when + * AOCL_MMD_VERSION is queried. This changes only if the API has changed */ +#define AOCL_MMD_VERSION_STRING "20.3" + +/* Memory types that can be supported - bitfield. Other than physical memory + * these types closely align with the OpenCL SVM types. + * + * AOCL_MMD_PHYSICAL_MEMORY - The vendor interface includes IP to communicate + * directly with physical memory such as DDR, QDR, etc. + * + * AOCL_MMD_SVM_COARSE_GRAIN_BUFFER - The vendor interface includes support for + * caching SVM pointer data and requires explicit function calls from the user + * to synchronize the cache between the host processor and the FPGA. This level + * of SVM is not currently supported by Altera except as a subset of + * SVM_FINE_GAIN_SYSTEM support. + * + * AOCL_MMD_SVM_FINE_GRAIN_BUFFER - The vendor interface includes support for + * caching SVM pointer data and requires additional information from the user + * and/or host runtime that can be collected during pointer allocation in order + * to synchronize the cache between the host processor and the FPGA. Once this + * additional data is provided for an SVM pointer, the vendor interface handles + * cache synchronization between the host processor & the FPGA automatically. + * This level of SVM is not currently supported by Altera except as a subset + * of SVM_FINE_GRAIN_SYSTEM support. 
+ * + * AOCL_MMD_SVM_FINE_GRAIN_SYSTEM - The vendor interface includes support for + * caching SVM pointer data and does not require any additional information to + * synchronize the cache between the host processor and the FPGA. The vendor + * interface handles cache synchronization between the host processor & the + * FPGA automatically for all SVM pointers. This level of SVM support is + * currently under development by Altera and some features may not be fully + * supported. + */ +#define AOCL_MMD_PHYSICAL_MEMORY (1 << 0) +#define AOCL_MMD_SVM_COARSE_GRAIN_BUFFER (1 << 1) +#define AOCL_MMD_SVM_FINE_GRAIN_BUFFER (1 << 2) +#define AOCL_MMD_SVM_FINE_GRAIN_SYSTEM (1 << 3) + +/* program modes - bitfield + * + * AOCL_MMD_PROGRAM_PRESERVE_GLOBAL_MEM - preserve contents of global memory + * when this bit is set to 1. If programming can't occur without preserving + * global memory contents, the program function must fail, in which case the + * runtime may re-invoke program with this bit set to 0, allowing programming + * to occur even if doing so destroys global memory contents. + * + * more modes are reserved for stacking on in the future + */ +#define AOCL_MMD_PROGRAM_PRESERVE_GLOBAL_MEM (1 << 0) +typedef int aocl_mmd_program_mode_t; + +typedef void* aocl_mmd_op_t; + +typedef struct { + unsigned lo; /* 32 least significant bits of time value. */ + unsigned hi; /* 32 most significant bits of time value. */ +} aocl_mmd_timestamp_t; + +/* Defines the set of characteristics that can be probed about the board before + * opening a device. The type of data returned by each is specified in + * parentheses in the adjacent comment. + * + * AOCL_MMD_NUM_BOARDS and AOCL_MMD_BOARD_NAMES + * These two fields can be used to implement multi-device support. The MMD + * layer may have a list of devices it is capable of interacting with, each + * identified with a unique name. 
The length of the list should be returned + * in AOCL_MMD_NUM_BOARDS, and the names of these devices returned in + * AOCL_MMD_BOARD_NAMES. The OpenCL runtime will try to call aocl_mmd_open + * for each board name returned in AOCL_MMD_BOARD_NAMES. + */ +typedef enum { + AOCL_MMD_VERSION = 0, /* Version of MMD (char*)*/ + AOCL_MMD_NUM_BOARDS = 1, /* Number of candidate boards (int)*/ + AOCL_MMD_BOARD_NAMES = 2, /* Names of boards available delimiter=; (char*)*/ + AOCL_MMD_VENDOR_NAME = 3, /* Name of vendor (char*) */ + AOCL_MMD_VENDOR_ID = 4, /* An integer ID for the vendor (int) */ + AOCL_MMD_USES_YIELD = 5, /* 1 if yield must be called to poll hw (int) */ + /* The following can be combined in a bit field: + * AOCL_MMD_PHYSICAL_MEMORY, AOCL_MMD_SVM_COARSE_GRAIN_BUFFER, AOCL_MMD_SVM_FINE_GRAIN_BUFFER, + * AOCL_MMD_SVM_FINE_GRAIN_SYSTEM. Prior to 14.1, all existing devices supported physical memory and no types of SVM + * memory, so this is the default when this operation returns '0' for board MMDs with a version prior to 14.1 + */ + AOCL_MMD_MEM_TYPES_SUPPORTED = 6, +} aocl_mmd_offline_info_t; + +/** Possible capabilities to return from AOCL_MMD_*_MEM_CAPABILITIES query */ +/** + * If not set allocation function is not supported, even if other capabilities are set. + */ +#define AOCL_MMD_MEM_CAPABILITY_SUPPORTED (1 << 0) +/** + * Supports atomic access to the memory by either the host or device. + */ +#define AOCL_MMD_MEM_CAPABILITY_ATOMIC (1 << 1) +/** + * Supports concurrent access to the memory either by host or device if the + * accesses are not on the same block. Block granularity is defined by + * AOCL_MMD_*_MEM_CONCURRENT_GRANULARITY., blocks are aligned to this + * granularity + */ +#define AOCL_MMD_MEM_CAPABILITY_CONCURRENT (1 << 2) +/** + * Memory can be accessed by multiple devices at the same time. + */ +#define AOCL_MMD_MEM_CAPABILITY_P2P (1 << 3) + +/* Defines the set of characteristics that can be probed about the board after + * opening a device. 
This can involve communication to the device + * + * AOCL_MMD_NUM_KERNEL_INTERFACES - The number of kernel interfaces, usually 1 + * + * AOCL_MMD_KERNEL_INTERFACES - the handle for each kernel interface. + * param_value will have size AOCL_MMD_NUM_KERNEL_INTERFACES * sizeof int + * + * AOCL_MMD_PLL_INTERFACES - the handle for each pll associated with each + * kernel interface. If a kernel interface is not clocked by acl_kernel_clk + * then return -1 + * + * */ +typedef enum { + AOCL_MMD_NUM_KERNEL_INTERFACES = 1, /* Number of Kernel interfaces (int) */ + AOCL_MMD_KERNEL_INTERFACES = 2, /* Kernel interface (int*) */ + AOCL_MMD_PLL_INTERFACES = 3, /* Kernel clk handles (int*) */ + AOCL_MMD_MEMORY_INTERFACE = 4, /* Global memory handle (int) */ + AOCL_MMD_TEMPERATURE = 5, /* Temperature measurement (float) */ + AOCL_MMD_PCIE_INFO = 6, /* PCIe information (char*) */ + AOCL_MMD_BOARD_NAME = 7, /* Name of board (char*) */ + AOCL_MMD_BOARD_UNIQUE_ID = 8, /* Unique ID of board (int) */ + AOCL_MMD_CONCURRENT_READS = 9, /* # of parallel reads; 1 is serial*/ + AOCL_MMD_CONCURRENT_WRITES = 10, /* # of parallel writes; 1 is serial*/ + AOCL_MMD_CONCURRENT_READS_OR_WRITES = 11, /* total # of concurrent operations read + writes*/ + AOCL_MMD_MIN_HOST_MEMORY_ALIGNMENT = 12, /* Min alignment that the BSP supports for host allocations (size_t) */ + AOCL_MMD_HOST_MEM_CAPABILITIES = 13, /* Capabilities of aocl_mmd_host_alloc() (unsigned int)*/ + AOCL_MMD_SHARED_MEM_CAPABILITIES = 14, /* Capabilities of aocl_mmd_shared_alloc (unsigned int)*/ + AOCL_MMD_DEVICE_MEM_CAPABILITIES = 15, /* Capabilities of aocl_mmd_device_alloc (unsigned int)*/ + AOCL_MMD_HOST_MEM_CONCURRENT_GRANULARITY = 16, /*(size_t)*/ + AOCL_MMD_SHARED_MEM_CONCURRENT_GRANULARITY = 17, /*(size_t)*/ + AOCL_MMD_DEVICE_MEM_CONCURRENT_GRANULARITY = 18, /*(size_t)*/ +} aocl_mmd_info_t; + +typedef struct { + unsigned long long int exception_type; + void* user_private_info; + size_t user_cb; +} aocl_mmd_interrupt_info; + +typedef 
void (*aocl_mmd_interrupt_handler_fn)(int handle, void* user_data); +typedef void (*aocl_mmd_device_interrupt_handler_fn)(int handle, aocl_mmd_interrupt_info* data_in, void* user_data); +typedef void (*aocl_mmd_status_handler_fn)(int handle, void* user_data, aocl_mmd_op_t op, int status); + +/* Get information about the board using the enum aocl_mmd_offline_info_t for + * offline info (called without a handle), and the enum aocl_mmd_info_t for + * info specific to a certain board. + * Arguments: + * + * requested_info_id - a value from the aocl_mmd_offline_info_t enum + * + * param_value_size - size of the param_value field in bytes. This should + * match the size of the return type expected as indicated in the enum + * definition. For example, the AOCL_MMD_TEMPERATURE returns a float, so + * the param_value_size should be set to sizeof(float) and you should + * expect the same number of bytes returned in param_size_ret. + * + * param_value - pointer to the variable that will receive the returned info + * + * param_size_ret - receives the number of bytes of data actually returned + * + * Returns: a negative value to indicate error. + */ +AOCL_MMD_CALL int aocl_mmd_get_offline_info(aocl_mmd_offline_info_t requested_info_id, + size_t param_value_size, + void* param_value, + size_t* param_size_ret) WEAK; + +AOCL_MMD_CALL int aocl_mmd_get_info(int handle, + aocl_mmd_info_t requested_info_id, + size_t param_value_size, + void* param_value, + size_t* param_size_ret) WEAK; + +/* Open and initialize the named device. + * + * The name is typically one specified by the AOCL_MMD_BOARD_NAMES offline + * info. + * + * Arguments: + * name - open the board with this name (provided as a C-style string, + * i.e. NUL terminated ASCII.) + * + * Returns: the non-negative integer handle for the board, otherwise a + * negative value to indicate error. 
Upon receiving the error, the OpenCL + * runtime will proceed to open other known devices, hence the MMD mustn't + * exit the application if an open call fails. + */ +AOCL_MMD_CALL int aocl_mmd_open(const char* name) WEAK; + +/* Close an opened device, by its handle. + * Returns: 0 on success, negative values on error. + */ +AOCL_MMD_CALL int aocl_mmd_close(int handle) WEAK; + +/* Set the interrupt handler for the opened device. + * The interrupt handler is called whenever the client needs to be notified + * of an asynchronous event signaled by the device internals. + * For example, the kernel has completed or is stalled. + * + * Important: Interrupts from the kernel must be ignored until this handler is + * set + * + * Arguments: + * fn - the callback function to invoke when a kernel interrupt occurs + * user_data - the data that should be passed to fn when it is called. + * + * Returns: 0 if successful, negative on error + */ +AOCL_MMD_CALL int aocl_mmd_set_interrupt_handler(int handle, aocl_mmd_interrupt_handler_fn fn, void* user_data) WEAK; + +/* Set the device interrupt handler for the opened device. + * The device interrupt handler is called whenever the client needs to be notified + * of a device event signaled by the device internals. + * For example, an ECC error has been reported. + * + * Important: Interrupts from the device must be ignored until this handler is + * set + * + * Arguments: + * fn - the callback function to invoke when a device interrupt occurs + * user_data - the data that should be passed to fn when it is called. + * + * Returns: 0 if successful, negative on error + */ +AOCL_MMD_CALL int aocl_mmd_set_device_interrupt_handler(int handle, + aocl_mmd_device_interrupt_handler_fn fn, + void* user_data) WEAK; + +/* Set the operation status handler for the opened device. + * The operation status handler is called with + * status 0 when the operation has completed successfully. + * status negative when the operation completed with errors. 
+ * + * Arguments: + * fn - the callback function to invoke when a status update is to be + * performed. + * user_data - the data that should be passed to fn when it is called. + * + * Returns: 0 if successful, negative on error + */ +AOCL_MMD_CALL int aocl_mmd_set_status_handler(int handle, aocl_mmd_status_handler_fn fn, void* user_data) WEAK; + +/* If AOCL_MMD_USES_YIELD is 1, this function is called when the host is idle + * and hence possibly waiting for events to be processed by the device. + * If AOCL_MMD_USES_YIELD is 0, this function is never called and the MMD is + * assumed to provide status/event updates via some other execution thread + * such as through an interrupt handler. + * + * Returns: non-zero if the yield function performed useful work such as + * processing DMA transactions, 0 if there is no useful work to be performed + * + * NOTE: yield may be called continuously as long as it reports that it has useful work + */ +AOCL_MMD_CALL int aocl_mmd_yield(int handle) WEAK; + +/* Read, write and copy operations on a single interface. + * If op is NULL + * - Then these calls must block until the operation is complete. + * - The status handler is not called for this operation. + * + * If op is non-NULL, then: + * - These may be non-blocking calls + * - The status handler must be called upon completion, with status 0 + * for success, and a negative value for failure. + * + * Arguments: + * op - the operation object used to track this operations progress + * + * len - the size in bytes to transfer + * + * src - the host buffer being read from + * + * dst - the host buffer being written to + * + * mmd_interface - the handle to the interface being accessed. E.g. To + * access global memory this handle will be whatever is returned by + * aocl_mmd_get_info when called with AOCL_MMD_MEMORY_INTERFACE. + * + * offset/src_offset/dst_offset - the byte offset within the interface that + * the transfer will begin at. 
+ * + * The return value is 0 if the operation launch was successful, and + * negative otherwise. + */ +AOCL_MMD_CALL int aocl_mmd_read( + int handle, aocl_mmd_op_t op, size_t len, void* dst, int mmd_interface, size_t offset) WEAK; +AOCL_MMD_CALL int aocl_mmd_write( + int handle, aocl_mmd_op_t op, size_t len, const void* src, int mmd_interface, size_t offset) WEAK; +AOCL_MMD_CALL int aocl_mmd_copy( + int handle, aocl_mmd_op_t op, size_t len, int mmd_interface, size_t src_offset, size_t dst_offset) WEAK; + +/* Host Channel create operation + * Opens channel between host and kernel. + * + * Arguments: + * channel_name - name of channel to initialize. Same name as used in board_spec.xml + * + * queue_depth - the size in bytes of pinned memory queue in system memory + * + * direction - the direction of the channel + * + * The return value is negative if initialization was unsuccessful, and + * positive otherwise. Positive return value is handle to the channel to be used for + * subsequent calls for the channel. + */ +AOCL_MMD_CALL int aocl_mmd_hostchannel_create(int handle, char* channel_name, size_t queue_depth, int direction) WEAK; + +/* Host Channel destroy operation + * Closes channel between host and kernel. + * + * Arguments: + * channel - the handle to the channel to close, that was obtained with + * create channel + * + * The return value is 0 if the destroy was successful, and negative + * otherwise. + */ +AOCL_MMD_CALL int aocl_mmd_hostchannel_destroy(int handle, int channel) WEAK; + +/* Host Channel get buffer operation + * Provide host with pointer to buffer they can access to write or + * read from kernel, along with space or data available in the buffer + * in bytes. 
+ * + * Arguments: + * channel - the handle to the channel to get the buffer for + * + * buffer_size - the address that this call will write the amount of + * space or data that's available in the buffer, + * depending on direction of the channel, in bytes + * + * status - the address that this call will write to for result of this + * call. Value will be 0 for success, and negative otherwise + * + * The return value is the pointer to the buffer that host can write + * to or read from. NULL if the status is negative. + */ +AOCL_MMD_CALL void* aocl_mmd_hostchannel_get_buffer(int handle, int channel, size_t* buffer_size, int* status) WEAK; + +/* Host Channel acknowledge buffer operation + * Acknowledge to the channel that the user has written or read data from + * it. This will make the data or additional buffer space available to + * write to or read from kernel. + * + * Arguments: + * channel - the handle to the channel that user is acknowledging + * + * send_size - the size in bytes that the user is acknowledging + * + * status - the address that this call will write to for result of this + * call. Value will be 0 for success, and negative otherwise + * + * The return value is equal to send_size if send_size was less than or + * equal to the buffer_size from get buffer call. If send_size was + * greater, then return value is the amount that was actually sent. + */ +AOCL_MMD_CALL size_t aocl_mmd_hostchannel_ack_buffer(int handle, int channel, size_t send_size, int* status) WEAK; + +/* Program the device + * + * The host will guarantee that no operations are currently executing on the + * device. That means the kernels will be idle and no read/write/copy + * commands are active. Interrupts should be disabled and the FPGA should + * be reprogrammed with the data from user_data which has size size. The host + * will then call aocl_mmd_set_status_handler and aocl_mmd_set_interrupt_handler + * again. At this point interrupts can be enabled. 
+ * + * The new handle to the board after reprogram does not have to be the same as + * the one before. + * + * Arguments: + * user_data - The binary contents of the fpga.bin file created during + * Quartus II compilation. + * size - the size in bytes of user_data + * program_mode - bit field for programming attributes. See + * aocl_mmd_program_mode_t definition + * + * Returns: the new non-negative integer handle for the board, otherwise a + * negative value to indicate error. + */ + +#ifdef DLA_MMD +AOCL_MMD_CALL int aocl_mmd_save_pcie(int handle) WEAK; +AOCL_MMD_CALL int aocl_mmd_restore_pcie(int handle) WEAK; +// CoreDLA BSP has removed some stuff that MMD tries to handshake with, so provide a "raw access" function to +// reprogram the FPGA directly from the sof. Can't call quartus_pgm directly since the MMD still needs to mask +// the PCIe surprise down error (when full-chip programming the FPGA, the CPU thinks a PCIe device has disappeared). +// BEWARE: reprogramming will invalidate the handle +AOCL_MMD_CALL int aocl_mmd_program_sof(int handle, const char* sof_filename, const bool skipSaveRestore = false) WEAK; +#else +AOCL_MMD_CALL int aocl_mmd_program(int handle, void* user_data, size_t size, aocl_mmd_program_mode_t program_mode) WEAK; +#endif + +/** Error values*/ +#define AOCL_MMD_ERROR_SUCCESS 0 +#define AOCL_MMD_ERROR_INVALID_HANDLE -1 +#define AOCL_MMD_ERROR_OUT_OF_MEMORY -2 +#define AOCL_MMD_ERROR_UNSUPPORTED_ALIGNMENT -3 +#define AOCL_MMD_ERROR_UNSUPPORTED_PROPERTY -4 +#define AOCL_MMD_ERROR_INVALID_POINTER -5 +#define AOCL_MMD_ERROR_INVALID_MIGRATION_SIZE -6 + +/** Memory properties*/ +typedef enum { + /** + * Specifies the name of a global memory that can be found in the + * board_spec.xml file for the BSP. Allocations will be allocated to this + * global memory interface. 
+ */ + AOCL_MMD_MEM_PROPERTIES_GLOBAL_MEMORY = 1, + /** + * Specifies the index of a bank inside the global memory interface that can be found in + * the board_spec.xml file for the BSP. Allocations will be allocated to this + * memory bank. It is invalid to specify this property without also specifying + * AOCL_MMD_GLOBAL_MEMORY_INTERFACE. + */ + AOCL_MMD_MEM_PROPERTIES_MEMORY_BANK +} aocl_mmd_mem_properties_t; + +/** + * Host allocations provide memory that is allocated on the host. Host + * allocations are accessible by the host and one or more devices. + * The same pointer to a host allocation may be used on the host and all + * supported devices; they have address equivalence. This memory must be + * deallocated with aocl_mmd_free(); + * + * Once the device has signaled completion through + * aocl_mmd_interrupt_handler_fn() the host can assume it has access to the + * latest contents of the memory, allocated by this call. + * + * @param handles Handles for devices that will need access to this memory + * @param num_devices Number of devices in the handles + * @param size The size of the memory region + * @param alignment The alignment in bytes of the allocation + * @param properties Specifies additional information about the allocated + * memory, described by a property type name and its corresponding value. + * Each property type name is immediately followed by the corresponding + * desired value. The list is terminated with 0. Supported values are + * described above. Example: [<property1>, <value1>, <property2>, <value2>, 0] + * @param error The error code defined by AOCL_MMD_ERROR* + * @return valid pointer, on error NULL + */ +AOCL_MMD_CALL void* aocl_mmd_host_alloc(int* handles, + size_t num_devices, + size_t size, + size_t alignment, + aocl_mmd_mem_properties_t* properties, + int* error) WEAK; + +/** + * Frees memory that has been allocated by MMD + * + * @param mem The pointer to the memory region. Must be a pointer that is + * allocated by the MMD. 
+ * @return AOCL_MMD_ERROR_SUCCESS if success, else error code + */ +AOCL_MMD_CALL int aocl_mmd_free(void* mem) WEAK; + +/** + * Allocate memory that is owned by the device. This pointer can only be + * accessed by the kernel; can't be accessed by the host. The host is able to + * manipulate the pointer (e.g. increment it) just not access the underlying + * data. This memory must be deallocated by aocl_mmd_free(); + * + * @param handle Device that will have access to this memory + * @param size The size of the memory region + * @param alignment The alignment in bytes of the memory region + * @param properties Specifies additional information about the allocated + * memory, described by a property type name and its corresponding value. + * Each property type name is immediately followed by the corresponding + * desired value. The list is terminated with 0. Supported values are + * described above. Example: [<property1>, <value1>, <property2>, <value2>, 0] + * @param error The error code defined by AOCL_MMD_ERROR* + * @return Pointer that can be passed into the kernel. NULL on failure. + */ +AOCL_MMD_CALL void* aocl_mmd_device_alloc( + int handle, size_t size, size_t alignment, aocl_mmd_mem_properties_t* properties, int* error) WEAK; + +/** + * Shared allocations may migrate between the host and one or more associated + * device. The same pointer to a shared allocation may be used on the host and + * the supported device; they have address equivalence. + * + * If the device does not support concurrent access to memory allocated by + * aocl_mmd_shared_alloc() then a call must be made to + * aocl_mmd_shared_mem_migrate() to indicate that the shared allocation should + * be migrated to the device before the device accesses this memory. For + * example, a call to aocl_mmd_shared_mem_migrate() should be made before a + * kernel accessing this memory is launched). 
Conversely, + * aocl_mmd_shared_mem_migrate() should be called again to indicate that the + * shared allocation should be migrated to the host before the host accesses + * this memory again. If the device supports concurrent access to memory + * allocated with aocl_mmd_shared_alloc(), then the call to + * aocl_mmd_shared_mem_migrate() is not necessary, but may still be made. In + * the case of concurrent access, it is the responsibility of the MMD to ensure + * both the device and host can access aocl_mmd_shared_alloc() allocations at + * all times. + * + * Memory allocated by aocl_mmd_shared_alloc() must be deallocated with + * aocl_mmd_free(). + * + * @param handle Device that will have access to this memory + * @param size The size of the memory region + * @param alignment The alignment in bytes of the memory region + * @param properties Specifies additional information about the allocated + * memory, described by a property type name and its corresponding value. + * Each property type name is immediately followed by the corresponding + * desired value. The list is terminated with 0. Supported properties are + * listed above and have the prefix AOCL_MMD_MEM_PROPERTIES_. + * Example: [<property1>, <value1>, <property2>, <value2>, 0] + * @param error The error code defined by AOCL_MMD_ERROR* + * @return valid pointer, on error NULL + */ +AOCL_MMD_CALL void* aocl_mmd_shared_alloc( + int handle, size_t size, size_t alignment, aocl_mmd_mem_properties_t* properties, int* error) WEAK; + +typedef enum { AOCL_MMD_MIGRATE_TO_HOST = 0, AOCL_MMD_MIGRATE_TO_DEVICE = 1 } aocl_mmd_migrate_t; + +/** + * A call to aocl_mmd_shared_migrate() must be made for non-concurrent shared + * allocations any time the accessor of the allocation changes. For example, + * aocl_mmd_shared_migrate() should be called indicating that the allocation + * should be migrated to the device before a kernel accessing the allocation + * is launched on the device. 
Similarly, aocl_mmd_shared_migrate() should be + * called indicating that the allocation is migrated to the host before the + * host accesses the memory after kernel completion. + * + * For concurrent allocations this call may be used as a performance hint, but + * is not strictly required for functionality. + * + * @param handle Device that will have access to this memory + * @param shared_ptr Pointer allocated by aocl_mmd_shared_alloc() + * @param size In bytes, the size of the migration. Must be of multiple of a + * page boundary that the BSP supports. + * @param destination The destination of migration + * @return The error code defined by AOCL_MMD_ERROR* + */ +AOCL_MMD_CALL int aocl_mmd_shared_migrate(int handle, + void* shared_ptr, + size_t size, + aocl_mmd_migrate_t destination) WEAK; + +// CoreDLA modifications +// To support multiple different FPGA boards, anything board specific must be implemented in a +// board-specific MMD instead of the CoreDLA runtime layer. +#ifdef DLA_MMD +// Query functions to get board-specific values +AOCL_MMD_CALL int dla_mmd_get_max_num_instances() WEAK; +AOCL_MMD_CALL uint64_t dla_mmd_get_ddr_size_per_instance() WEAK; +AOCL_MMD_CALL double dla_mmd_get_ddr_clock_freq() WEAK; + +// Wrappers around CSR and DDR reads and writes to abstract away board-specific offsets +AOCL_MMD_CALL int dla_mmd_csr_write(int handle, int instance, uint64_t addr, const uint32_t* data) WEAK; +AOCL_MMD_CALL int dla_mmd_csr_read(int handle, int instance, uint64_t addr, uint32_t* data) WEAK; +AOCL_MMD_CALL int dla_mmd_ddr_write(int handle, int instance, uint64_t addr, uint64_t length, const void* data) WEAK; +AOCL_MMD_CALL int dla_mmd_ddr_read(int handle, int instance, uint64_t addr, uint64_t length, void* data) WEAK; + +// Get the PLL clock frequency in MHz, returns a negative value if there is an error +AOCL_MMD_CALL double dla_mmd_get_coredla_clock_freq(int handle) WEAK; +#endif + +#ifdef __cplusplus +} +#endif + +#endif diff --git 
a/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/access.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/access.h new file mode 100644 index 0000000..dc3eae2 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/access.h @@ -0,0 +1,100 @@ +// Copyright(c) 2017 - 2019, Intel Corporation +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Intel Corporation nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. 
+ +/** + * @file access.h + * @brief Functions to acquire, release, and reset OPAE FPGA resources + */ + +#ifndef __FPGA_ACCESS_H__ +#define __FPGA_ACCESS_H__ + +#include <opae/types.h> + +BEGIN_C_DECL + +/** + * Open an FPGA object + * + * Acquires ownership of the FPGA resource referred to by 'token'. + * + * Most often this will be used to open an accelerator object to directly interact + * with an accelerator function, or to open an FPGA object to perform + * management functions. + * + * @param[in] token Pointer to token identifying resource to acquire + * ownership of + * @param[out] handle Pointer to preallocated memory to place a handle in. + * This handle will be used in subsequent API calls. + * @param[in] flags One of the following flags: + * * FPGA_OPEN_SHARED allows the resource to be opened + * multiple times (not supported in ASE) + * @returns FPGA_OK on success. FPGA_NOT_FOUND if the resource for + * 'token' could not be found. FPGA_INVALID_PARAM if + * 'token' does not refer to a resource that can be + * opened, or if either argument is NULL or invalid. + * FPGA_EXCEPTION if an internal exception occurred while + * creating the handle. FPGA_NO_DRIVER if the driver is + * not loaded. FPGA_BUSY if trying to open a resource that + * has already been opened in exclusive mode. + * FPGA_NO_ACCESS if the current process' privileges are + * not sufficient to open the resource. + */ + __FPGA_API__ fpga_result fpgaOpen(fpga_token token, fpga_handle *handle, + int flags); + +/** + * Close a previously opened FPGA object + * + * Relinquishes ownership of a previously fpgaOpen()ed resource. This enables + * others to acquire ownership if the resource was opened exclusively. + * Also deallocates / unmaps MMIO and UMsg memory areas. + * + * @param[in] handle Handle to previously opened FPGA object + * @returns FPGA_OK on success. FPGA_INVALID_PARAM if handle does + * not refer to an acquired resource, or if handle is NULL. 
+ * FPGA_EXCEPTION if an internal error occurred while + * accessing the handle. + */ +__FPGA_API__ fpga_result fpgaClose(fpga_handle handle); + +/** + * Reset an FPGA object + * + * Performs an accelerator reset. + * + * @param[in] handle Handle to previously opened FPGA object + * @returns FPGA_OK on success. FPGA_INVALID_PARAM if handle does + * not refer to an acquired resource or to a resource that + * cannot be reset. FPGA_EXCEPTION if an internal error + * occurred while trying to access the handle or resetting + * the resource. + */ +__FPGA_API__ fpga_result fpgaReset(fpga_handle handle); + +END_C_DECL + +#endif // __FPGA_ACCESS_H__ diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/buffer.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/buffer.h new file mode 100644 index 0000000..e848182 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/buffer.h @@ -0,0 +1,154 @@ +// Copyright(c) 2017 - 2019, Intel Corporation +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Intel Corporation nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+
+/**
+ * @file buffer.h
+ * @brief Functions for allocating and sharing system memory with an FPGA
+ * accelerator
+ *
+ * To share memory between a software application and an FPGA accelerator,
+ * these functions set up system components (e.g. an IOMMU) to allow
+ * accelerator access to a provided memory region.
+ *
+ * There are a number of restrictions on what memory can be shared, depending
+ * on platform capabilities. Usually, FPGA accelerators do not have access to
+ * virtual address mappings of the CPU, so they can only access physical
+ * addresses. To support this, the OPAE C library on Linux uses hugepages to
+ * allocate large, contiguous pages of physical memory that can be shared with
+ * an accelerator. It also supports sharing memory that has already been
+ * allocated by an application, as long as that memory satisfies the
+ * requirements of being physically contiguous and page-aligned.
+ */
+
+#ifndef __FPGA_BUFFER_H__
+#define __FPGA_BUFFER_H__
+
+#include <opae/types.h>
+
+BEGIN_C_DECL
+
+/**
+ * Prepare a shared memory buffer
+ *
+ * Prepares a memory buffer for shared access between an accelerator and the calling
+ * process. This may either include allocation of physical memory, or
+ * preparation of already allocated memory for sharing. The latter case is
+ * indicated by supplying the FPGA_BUF_PREALLOCATED flag.
+ * + * This function will ask the driver to pin the indicated memory (make it + * non-swappable), and program the IOMMU to allow access from the accelerator. If the + * buffer was not pre-allocated (flag FPGA_BUF_PREALLOCATED), the function + * will also allocate physical memory of the requested size and map the + * memory into the caller's process' virtual address space. It returns in + * 'wsid' an fpga_buffer object that can be used to program address registers + * in the accelerator for shared access to the memory. + * + * When using FPGA_BUF_PREALLOCATED, the input len must be a non-zero multiple + * of the page size, else the function returns FPGA_INVALID_PARAM. When not + * using FPGA_BUF_PREALLOCATED, the input len is rounded up to the nearest + * multiple of page size. + * + * @param[in] handle Handle to previously opened accelerator resource + * @param[in] len Length of the buffer to allocate/prepare in bytes + * @param[inout] buf_addr Virtual address of buffer. Contents may be NULL (OS + * will choose mapping) or non-NULL (OS will take + * contents as a hint for the virtual address). + * @param[out] wsid Handle to the allocated/prepared buffer to be used + * with other functions + * @param[in] flags Flags. FPGA_BUF_PREALLOCATED indicates that memory + * pointed at in '*buf_addr' is already allocated an + * mapped into virtual memory. + * @returns FPGA_OK on success. FPGA_NO_MEMORY if the requested memory could + * not be allocated. FPGA_INVALID_PARAM if invalid parameters were provided, or + * if the parameter combination is not valid. FPGA_EXCEPTION if an internal + * exception occurred while trying to access the handle. + */ +__FPGA_API__ fpga_result fpgaPrepareBuffer(fpga_handle handle, + uint64_t len, + void **buf_addr, uint64_t *wsid, int flags); + +/** + * Release a shared memory buffer + * + * Releases a previously prepared shared buffer. 
If the buffer was allocated
+ * using fpgaPrepareBuffer (FPGA_BUF_PREALLOCATED was not specified), this call
+ * will deallocate/free that memory. Otherwise, it will only be returned to
+ * its previous state (pinned/unpinned, cached/non-cached).
+ *
+ * @param[in] handle Handle to previously opened accelerator resource
+ * @param[in] wsid Handle to the allocated/prepared buffer
+ * @returns FPGA_OK on success. FPGA_INVALID_PARAM if invalid parameters were
+ * provided, or if the parameter combination is not valid. FPGA_EXCEPTION if an
+ * internal exception occurred while trying to access the handle.
+ */
+__FPGA_API__ fpga_result fpgaReleaseBuffer(fpga_handle handle, uint64_t wsid);
+
+/**
+ * Retrieve base IO address for buffer
+ *
+ * This function is used to acquire the physical base address (on some platforms
+ * called IO Virtual Address or IOVA) for a shared buffer identified by wsid.
+ *
+ * @param[in] handle Handle to previously opened accelerator resource
+ * @param[in] wsid Buffer handle / workspace ID referring to the buffer for
+ * which the IO address is requested
+ * @param[out] ioaddr Pointer to memory where the IO address will be returned
+ * @returns FPGA_OK on success. FPGA_INVALID_PARAM if invalid parameters were
+ * provided, or if the parameter combination is not valid. FPGA_EXCEPTION if an
+ * internal exception occurred while trying to access the handle.
+ * FPGA_NOT_FOUND if `wsid` does not refer to a previously shared buffer.
+ */
+__FPGA_API__ fpga_result fpgaGetIOAddress(fpga_handle handle, uint64_t wsid,
+ uint64_t *ioaddr);
+
+/**
+ * Retrieve physical address for buffer
+ *
+ * This function is used to acquire the physical addresses in a scatter gather
+ * list form for a shared buffer identified by wsid.
+ * + * @param[in] handle Handle to previously opened accelerator resource + * @param[in] wsid Buffer handle / workspace ID referring to the buffer for + * which the physical address is requested + * @param[out] num_pages Number of physical pages + * @param[out] sglist SG list structure where physical addresses of pages and + * number of bytes in that page used will be returned. + * + * Note: Call this API with sg_list as NULL to update num_pages. Allocate upto + * (num_pages * sg_list) memory and call the API again with a pointer to this + * memory location as the last argument to retrieve the sg_list struct. + * + * @returns FPGA_OK on success. FPGA_INVALID_PARAM if invalid parameters were + * provided, or if the parameter combination is not valid. FPGA_EXCEPTION if an + * internal exception occurred while trying to access the handle. + * FPGA_NOT_FOUND if `wsid` does not refer to a previously shared buffer. + */ +__FPGA_API__ fpga_result fpgaGetPhysicalAddress(fpga_handle handle, uint64_t wsid, uint64_t *num_pages, + void *sglist); + +END_C_DECL + +#endif // __FPGA_BUFFER_H__ diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/dma.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/dma.h new file mode 100644 index 0000000..8febd44 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/dma.h @@ -0,0 +1,144 @@ +// Copyright(c) 2017 - 2019, Intel Corporation +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// * Neither the name of Intel Corporation nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. + +/** + * @file dma.h + * @brief Functions to acquire, release, and reset OPAE FPGA DMA resources + */ + +#ifndef __DMA_ACCESS_H__ +#define __DMA_ACCESS_H__ + +#include <opae/types.h> + +BEGIN_C_DECL + +/* +* The DMA driver supports host to FPGA, FPGA to host +* and FPGA to FPGA transfers. The FPGA interface can +* be streaming or memory-mapped. Streaming interfaces +* are not currently +* supported. +*/ +typedef enum { + HOST_TO_FPGA_MM = 0, + FPGA_TO_HOST_MM, + FPGA_TO_FPGA_MM, + FPGA_MAX_TRANSFER_TYPE, +}fpga_dma_transfer; + + +typedef enum +{ + DMA_OPEN = 1, + DMA_BUSY, + DMA_CLOSED +}fpga_dma_status; + +/* + * Dma handle in user space that will be populated during fpgaDmaOpen call. 
+ */ +typedef struct _fpga_dma_handle +{ + // + // Stores the handle to the fpga that was opened after fpgaOpen + // + fpga_handle fpga_h; + + // + // Stores the current status of the DMA AFC + // Set to the following values: + // DMA_OPEN - After call to fpgaDmaOpen() and when fpgaDmaTransferSync() exits + // DMA_BUSY - When fpgaDmaTransferSync() is called + // + uint64_t dma_status; +}dma_handle, *fpga_dma_handle; + + + +/** +* +* Opens a handle to DMA +* Sets the status of DMA engine to DMA_OPEN +* @param[in] handle Handle to previously opened FPGA object +* @param[in] dma_h DMA handle allocated by the user +* @returns FPGA_OK on success. FPGA_INVALID_PARAM if handle does +* not refer to an acquired resource. +* +*/ +__FPGA_API__ +fpga_result +fpgaDmaOpen( + fpga_handle handle, + fpga_dma_handle *dma_h +); + +/** +* +* Closes a handle to DMA +* Sets the status of DMA engine to DMA_CLOSED +* @param[in] handle Handle to previously opened FPGA object +* @param[in] dma_h DMA handle allocated by the user +* @returns FPGA_OK on success. FPGA_INVALID_PARAM if handle does +* not refer to an acquired resource. +* +*/ +__FPGA_API__ +fpga_result +fpgaDmaClose( + fpga_dma_handle dma_h +); + + +/** +* +* Performs a synchronous DMA transfer between FPGA and host memory. +* +* @param[in] handle Handle to previously opened FPGA object +* @param[in] dst Destination address for the data transfer +* @param[in] src Source address for the data transfer +* @param[in] count Length of data to be transferred from src to dst +* @param[in] flag Flag to indicate nature of data transfer. Flag types = + HOST_TO_FPGA_MM and FPGA_TO_HOST_MM. +* @returns FPGA_OK on success. FPGA_INVALID_PARAM if handle does +* not refer to an acquired resource or to a resoure that +* cannot be reset. FPGA_EXCEPTION if an internal error +* occurred while trying to access the handle or resetting +* the resource. 
+*/ +__FPGA_API__ +fpga_result +fpgaDmaTransferSync( + fpga_dma_handle handle, + ULONG64 dst, + ULONG64 src, + ULONG64 count, + ULONG64 flag +); + +END_C_DECL + +#endif // __DMA_ACCESS_H__ diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/enum.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/enum.h new file mode 100644 index 0000000..ee3349b --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/enum.h @@ -0,0 +1,129 @@ +// Copyright(c) 2017 - 2019, Intel Corporation +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Intel Corporation nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. + +/** + * @file enum.h + * @brief APIs for resource enumeration and managing tokens + * + * These APIs are the first step for any application using OPAE to discover + * resources that are present on the system. They allow selective enumeration + * (i.e. getting a list of resources that match a given list of criteria) and + * methods to manage the lifecycle of tokens generated by fpgaEnumerate(). + */ + +#ifndef __FPGA_ENUM_H__ +#define __FPGA_ENUM_H__ + +#include <opae/types.h> + +BEGIN_C_DECL + +/** + * Enumerate FPGA resources present in the system + * + * This call allows the user to query the system for FPGA resources that match + * a certain set of criteria, e.g. all accelerators that are assigned to a host + * interface and available, all FPGAs of a specific type, etc. + * + * fpgaEnumerate() will create a number of `fpga_token`s to represent the + * matching resources and populate the array `tokens` with these tokens. The + * `max_tokens` argument can be used to limit the number of tokens + * allocated/returned by fpgaEnumerate(); i.e., the number of tokens in the + * returned `tokens` array will be either `max_tokens` or `num_matches` (the + * number of resources matching the filter), whichever is smaller. Use + * fpgaDestroyToken() to destroy tokens that are no longer needed. + * + * To query the number of matches for a particular set of filters (e.g. 
to + * allocate a `tokens` array of the appropriate size), call fpgaEnumerate() + * with the parameter `tokens` set to NULL; this will only return the number of + * matches in `num_matches`. + * + * @Note fpgaEnumerate() will allocate memory for the created tokens returned + * in `tokens`. It is the responsibility of the using application to free this + * memory after use by calling fpgaDestroyToken() for each of the returned + * tokens. + * + * @param[in] filters Array of `fpga_properties` objects describing the + * properties of the objects that should be returned. A + * resource is considered matching if its properties + * match any one of the supplied filters. Passing NULL + * will match all FPGA resources present in the system. + * @param[in] num_filters Number of entries in the `filters` array. + * @param[out] tokens Pointer to an array of fpga_token variables to be + * populated. If NULL is supplied, fpgaEnumerate() will + * not create any tokens, but it will return the + * number of possible matches in `num_match`. + * @param[in] max_tokens Maximum number of tokens that fpgaEnumerate() shall + * return (length of `tokens` array). There may be more + * or fewer matches than this number; `num_matches` is + * set to the number of actual matches. + * @param[out] num_matches Number of resources matching the `filter` criteria. + * This number can be higher than the number of tokens + * returned in the `tokens` array (depending on the + * value of `max_tokens`). + * @returns FPGA_OK on success. + * FPGA_INVALID_PARAM if invalid pointers or objects + * are passed into the function. + * FPGA_NO_DRIVER if OPAE can't find the respective + * enumeration data structures usually provided by the + * driver. + * FPGA_NO_MEMORY if there was not enough memory to + * create tokens. 
+ */ +__FPGA_API__ fpga_result fpgaEnumerate(const fpga_properties *filters, + uint32_t num_filters, fpga_token *tokens, + uint32_t max_tokens ,uint32_t *num_matches); + +/** + * Clone a fpga_token object + * + * Creates a copy of an fpga_token object. + * + * @Note This call creates a new token object and allocates memory for it. It + * is the responsibility of the using application to free this memory after use + * by calling fpgaDestroyToken() for the cloned token. + * + * @param[in] src fpga_token object to copy + * @param[out] dst New fpga_token object cloned from 'src' + * @returns FPGA_OK on success + */ +__FPGA_API__ fpga_result fpgaCloneToken(fpga_token src, fpga_token *dst); + +/** + * Destroy a Token + * + * This function destroys a token created by fpgaEnumerate() and frees the + * associated memory. + * + * @param[in] token fpga_token to destroy + * @returns FPGA_OK on success + */ +__FPGA_API__ fpga_result fpgaDestroyToken(fpga_token *token); + +END_C_DECL + +#endif // __FPGA_ENUM_H__ + diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/event.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/event.h new file mode 100644 index 0000000..3d53554 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/event.h @@ -0,0 +1,151 @@ +// Copyright(c) 2017 - 2019, Intel Corporation +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// * Neither the name of Intel Corporation nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. + +/** + * @file event.h + * @brief Functions for registering events and managing the lifecycle for + * `fpga_event_handle`s. + * + * OPAE provides an interface to asynchronous events that can be generated by + * different FPGA resources. The event API provides functions to register for + * these events; associated with every event a process has registered for is an + * fpga_event_handle, which encapsulates the OS-specific data structure for + * event objects. On Linux, an fpga_event_handle can be used as a file + * descriptor and passed to select(), poll(), epoll() and similar functions to + * wait for asynchronous events. + */ + +#ifndef __FPGA_EVENT_H__ +#define __FPGA_EVENT_H__ + +#include <opae/types.h> + +BEGIN_C_DECL + +/** + * Initialize an event_handle + * + * Platform independent way to initialize an event_handle used for + * notifications from the driver to application. 
For Linux, this function + * creates an eventfd and returns the eventfd file descriptor in + * `*event_handle`. + * + * @param[out] event_handle Pointer to event handle variable. + * + * @returns FPGA_OK on success. FPGA_INVALID_PARAM if `event_handle` is NULL. + * FPGA_NOT_SUPPORTED if platform does not support events. + */ +__FPGA_API__ fpga_result fpgaCreateEventHandle(fpga_event_handle *event_handle); + +/** + * Destroy an event_handle + * + * Destroy handle and free resources. On Linux this corresponds + * to closing the file descriptor pointed to by handle + * + * @param[in] event_handle Pointer to handle to be destroyed + * + * @returns FPGA_OK on success. FPGA_INVALID_PARAM if `event_handle` is NULL. + */ +__FPGA_API__ fpga_result fpgaDestroyEventHandle(fpga_event_handle *event_handle); + +/** + * Register an FPGA event + * + * This function tells the driver that the caller is interested in notification + * for the event specified by the type and flags pair. + * + * The event_handle points to an OS specific mechanism for event notification. + * An event_handle is associated with only a single event. + * + * @todo define if calling fpgaRegisterEvent multiple times with the + * same event_handle is an error condition or if it is silently ignored. + * + * @note This function is currently not supported. + * + * @param[in] handle Handle to previously opened FPGA resource. + * @param[in] event_type Type of event + * @param[in] event_handle Handle to previously opened resource for event + * notification. + * @param[in] flags Optional argument for specifying additional + * information about event. For example irq number + * for interrupt events. + * @returns FPGA_OK on success. FPGA_INVALID_PARAM if handle does not refer to + * a resource supporting the requested event, or if event_handle is not valid. + * FPGA_EXCEPTION if an internal exception occurred while accessing the handle + * or the event_handle. 
On Linux: FPGA_NO_DAEMON if the driver does not support the
+ * requested event and there is no FPGA Daemon (fpgad) running to proxy it.
+ */
+__FPGA_API__ fpga_result fpgaRegisterEvent(fpga_handle handle,
+ fpga_event_type event_type,
+ fpga_event_handle event_handle,
+ uint32_t flags);
+
+/**
+ * Unregister an FPGA event
+ *
+ * This function tells the driver that the caller is no longer interested in
+ * notification for the event associated with the event_handle.
+ *
+ * The event_handle points to an OS specific mechanism for event notification.
+ * An event_handle is associated with only a single event.
+ *
+ * @todo define if calling fpgaUnregisterEvent multiple times with the
+ * same event_handle is an error condition or if it is silently ignored.
+ *
+ * @note This function is currently not supported.
+ *
+ * @param[in] handle Handle to previously opened FPGA resource.
+ * @param[in] event_type Type of event.
+ * @param[in] event_handle Handle to previously opened resource for event
+ * notification.
+ * @returns FPGA_OK on success. FPGA_INVALID_PARAM if handle does
+ * not refer to a resource supporting the requested event,
+ * or if event_handle is not valid. FPGA_EXCEPTION if an
+ * internal error occurred accessing the handle or the
+ * event_handle.
+ */
+__FPGA_API__ fpga_result fpgaUnregisterEvent(fpga_handle handle, fpga_event_type event_type,
+ fpga_event_handle event_handle);
+
+/**
+* Get OS object from event handle
+*
+* Check validity of event handle, and get the OS object used to
+* subscribe and unsubscribe to events. On Linux, the object corresponds
+* to a file descriptor.
+*
+* @param[in] event_handle Event handle to get the descriptor value from
+* @param[out] fd integer to store the descriptor value
+*
+* @returns FPGA_OK on success. FPGA_INVALID_PARAM if `event_handle` is invalid.
+*/ +__FPGA_API__ fpga_result fpgaGetOSObjectFromEventHandle(const fpga_event_handle event_handle, + int *fd); + +END_C_DECL + +#endif // __FPGA_EVENT_H__ diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/flash.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/flash.h new file mode 100644 index 0000000..f7a2c5c --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/flash.h @@ -0,0 +1,87 @@ +// Copyright(c) 2017 - 2019, Intel Corporation +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Intel Corporation nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+
+/**
+ * @file flash.h
+ * @brief Functions to erase the flash memory and reconfigure a slot with a new bitstream.
+ */
+
+#ifndef __FLASH_H__
+#define __FLASH_H__
+
+BEGIN_C_DECL
+
+/**
+*
+* Erase flash memory
+*
+* This function erases the flash memory of the FPGA device
+*
+* Arguments:
+* @param[in] fpga_handle handle to previously opened FPGA_DEVICE resource
+*
+* Return Value:
+* FPGA_OK on success.
+* FPGA_INVALID_PARAM if the handle does not refer to an owned resource.
+* FPGA_NOT_FOUND if this host interface number is not found.
+* FPGA_NOT_SUPPORTED if functionality not supported
+*
+**/
+__FPGA_API__ fpga_result
+fpgaEraseFlash(
+ fpga_handle fpga_handle
+ );
+
+
+/**
+* Writes flash memory
+*
+* This function programs the flash chip on the FPGA with the provided bitstream.
+*
+* Arguments:
+* @param[in] handle handle to an FPGA_DEVICE resource
+* @param[in] flashBitstream pointer to memory holding the flash bitstream
+* @param[in] flashBitstreamLen length of the bitstream in bytes
+* @param[in] offset offset in flash controller to begin writing from
+*
+* Return Value:
+* FPGA_OK on success.
+* FPGA_INVALID_PARAM if the handle does not refer to an owned resource.
+* FPGA_NOT_FOUND if this host interface number is not found.
+* FPGA_NOT_SUPPORTED if functionality not supported.
+*/ + +__FPGA_API__ fpga_result +fpgaWriteFlash( + fpga_handle handle, + PUINT8 flashBitstream, + UINT64 flashBitstreamLen, + UINT64 offset +); + +END_C_DECL + +#endif // __FLASH_H__ diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/fpga.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/fpga.h new file mode 100644 index 0000000..e6668e8 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/fpga.h @@ -0,0 +1,60 @@ +// Copyright(c) 2017 - 2019, Intel Corporation +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Intel Corporation nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. + +/** + * \file fpga.h + * \brief FPGA API + * + * This conveniently includes all APIs that a part of the OPAE release (base and + * extensions). + */ + +#ifndef __FPGA_FPGA_H__ +#define __FPGA_FPGA_H__ + +#define FPGA_API_VERSION_MAJOR 0 +#define FPGA_API_VERSION_MINOR 1 + +#ifdef _WIN32 +#include <Windows.h> +#endif + +#include <opae/types.h> +#include <opae/access.h> +#include <opae/buffer.h> +#include <opae/dma.h> +#include <opae/enum.h> +#include <opae/event.h> +#include <opae/flash.h> +#include <opae/manage.h> +#include <opae/mmio.h> +#include <opae/properties.h> +#include <opae/umsg.h> +#include <opae/utils.h> +#include <opae/version.h> + +#endif // __FPGA_FPGA_H__ + diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/macrodefs.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/macrodefs.h new file mode 100644 index 0000000..365cdaf --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/macrodefs.h @@ -0,0 +1,70 @@ +// Copyright(c) 2017 - 2019, Intel Corporation +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Intel Corporation nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+
+/**
+ * @file macrodefs.h
+ * @brief Definitions of convenience macros for the OPAE C API
+ *
+ * This file defines convenience macros for the OPAE C API functions.
+ */ + +#ifndef __FPGA_MACRODEFS_H__ +#define __FPGA_MACRODEFS_H__ + +// Check for conflicting definitions +#ifdef BEGIN_C_DECL +#error BEGIN_C_DECL already defined, but used by the OPAE library +#endif + +#ifdef END_C_DECL +#error END_C_DECL already defined, but used by the OPAE library +#endif + +#ifdef __FPGA_API__ +#error __FPGA_API__ already defined, but used by the OPAE library +#endif + +// Macro for symbol visibility +#ifdef _WIN32 +#ifdef FpgaLib_EXPORTS +#define __FPGA_API__ __declspec(dllexport) +#else +#define __FPGA_API__ __declspec(dllimport) +#endif +#else +#define __FPGA_API__ __attribute__((visibility("default"))) +#endif + +// Macro for disabling name mangling +#ifdef __cplusplus +#define BEGIN_C_DECL extern "C" { +#define END_C_DECL } +#else +#define BEGIN_C_DECL +#define END_C_DECL +#endif + +#endif // __FPGA_MACRODEFS_H__ diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/manage.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/manage.h new file mode 100644 index 0000000..f93a1b1 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/manage.h @@ -0,0 +1,176 @@ +// Copyright(c) 2017 - 2019, Intel Corporation +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Intel Corporation nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. + +/** + * @file manage.h + * @brief Functions for managing FPGA configurations + * + * FPGA accelerators can be reprogrammed at run time by providing new partial + * bitstreams ("green bitstreams"). This file defines API functions for + * programming green bitstreams as well as for assigning accelerators to host + * interfaces for more complex deployment setups, such as virtualized systems. + */ + +#ifndef __FPGA_MANAGE_H__ +#define __FPGA_MANAGE_H__ + +#include <opae/types.h> + +BEGIN_C_DECL + +/** +* Assign Port to a host interface. +* +* This function assign Port to a host interface for subsequent use. Only +* Port that have been assigned to a host interface can be opened by +* fpgaOpen(). +* +* @param[in] fpga Handle to an FPGA object previously opened that +* both the host interface and the slot belong to +* @param[in] interface_num Host interface number +* @param[in] slot_num Slot number +* @param[in] flags Flags (to be defined) +* @returns FPGA_OK on success +* FPGA_INVALID_PARAM if input parameter combination +* is not valid. +* FPGA_EXCEPTION if an exception occcurred accessing +* the `fpga` handle. +* FPGA_NOT_SUPPORTED if driver does not support +* assignment. 
+*/ +__FPGA_API__ fpga_result fpgaAssignPortToInterface(fpga_handle fpga, + uint32_t interface_num, + uint32_t slot_num, + int flags); + +/** + * Assign an accelerator to a host interface + * + * This function assigns an accelerator to a host interface for subsequent use. Only + * accelerators that have been assigned to a host interface can be opened by + * fpgaOpen(). + * + * @note This function is currently not supported. + * + * @param[in] fpga Handle to an FPGA object previously opened that + * both the host interface and the accelerator belong to + * @param[in] afc Accelerator to assign + * @param[in] host_interface Host interface to assign accelerator to + * @param[in] flags Flags (to be defined) + * @returns FPGA_OK on success + */ +__FPGA_API__ fpga_result fpgaAssignToInterface(fpga_handle fpga, + fpga_token afc, + uint32_t host_interface, + int flags); + +/** + * Unassign a previously assigned accelerator + * + * This function removes the assignment of an accelerator to an host interface (e.g. to + * be later assigned to a different host interface). As a consequence, the accelerator + * referred to by token 'accelerator' will be reset during the course of this function. + * + * @note This function is currently not supported. + * + * @param[in] fpga Handle to an FPGA object previously opened that + * both the host interface and the accelerator belong to + * @param[in] afc Accelerator to unassign/release + * @returns FPGA_OK on success + */ +__FPGA_API__ fpga_result fpgaReleaseFromInterface(fpga_handle fpga, + fpga_token afc); + +/** + * Reconfigure a slot + * + * Sends a green bitstream file to an FPGA to reconfigure a specific slot. This + * call, if successful, will overwrite the currently programmed AFU in that + * slot with the AFU in the provided bitstream. + * + * As part of the reconfiguration flow, all accelerators associated with this slot will + * be unassigned and reset. 
+ * + * @param[in] fpga Handle to an FPGA object previously opened + * @param[in] slot Token identifying the slot to reconfigure + * @param[in] bitstream Pointer to memory holding the bitstream + * @param[in] bitstream_len Length of the bitstream in bytes + * @param[in] flags Flags (to be defined) + * @returns FPGA_OK on success. FPGA_INVALID_PARAM if the provided parameters + * are not valid. FPGA_EXCEPTION if an internal error occurred accessing the + * handle or while sending the bitstream data to the driver. FPGA_RECONF_ERROR + * on errors reported by the driver (such as CRC or protocol errors). + */ +__FPGA_API__ fpga_result fpgaReconfigureSlot(fpga_handle fpga, + uint32_t slot, + const uint8_t *bitstream, + size_t bitstream_len, int flags); + +/** + * Process device specific commands + * + * Sends a device specific command to the driver and driver performs that action + * and returns if needed with the data. + * + * @param[in] fpga Handle to an FPGA object previously opened + * @param[in] cmd GUID identifying the command to process + * @param[in] buffer Pointer to memory where data will be returned. + * @param[in] buffer_len Length of the buffer passed. + * @returns FPGA_OK on success. FPGA_INVALID_PARAM if the provided parameters + * are not valid. FPGA_EXCEPTION if an internal error occurred accessing the + * handle or while sending the data to the driver. + */ +__FPGA_API__ fpga_result fpgaProcessDeviceCmd(fpga_handle fpga, + fpga_guid cmd, + void *arg, + void *buffer, + size_t buffer_len); + +/** + * Enumerate all the commands supported by the device. + * + * To enumerate all the commands supported by a specific device, call this + * function by passing NULL to buffer arg and it returns the number of bytes + * that needs to be allocated to get all the commands. + * + * Then allocate buffer for that size and call this function to get the list + * of all device supported CMDs. 
+ * + * @param[in] fpga Handle to an FPGA object previously opened + * @param[in] cmds Pointer to memory where cmds will be returned. + * @param[in] num_cmds Pointer to memory where num cmds will be returned. + * @returns FPGA_OK on success. FPGA_INVALID_PARAM if the provided parameters + * are not valid. FPGA_EXCEPTION if an internal error occurred accessing the + * handle or while sending the data to the driver. + */ +__FPGA_API__ fpga_result fpgaGetSupportedCommands(fpga_handle fpga, + fpga_guid *cmds, + uint32_t *num_cmds); + +END_C_DECL + +#endif // __FPGA_MANAGE_H__ + diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/mmio.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/mmio.h new file mode 100644 index 0000000..7c26d3f --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/mmio.h @@ -0,0 +1,342 @@ +// Copyright(c) 2017 - 2019, Intel Corporation +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Intel Corporation nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. + +/** + * @file mmio.h + * @brief Functions for mapping and accessing MMIO space + * + * Most FPGA accelerators provide access to control registers through + * memory-mappable address spaces, commonly referred to as "MMIO spaces". This + * file provides functions to map, unmap, read, and write MMIO spaces. + * + * Note that an accelerator may have multiple MMIO spaces, denoted by the + * `mmio_num` argument of the APIs below. The meaning and properties of each + * MMIO space are up to the accelerator designer. + */ + +#ifndef __FPGA_MMIO_H__ +#define __FPGA_MMIO_H__ + +#include <opae/types.h> + +BEGIN_C_DECL + +/** + * Write 64 bit value to MMIO space + * + * This function will write to MMIO space of the target object at a specified + * offset. + * + * In order to access a resource's MMIO space using this function, it has to be + * mapped to the application's address space using fpgaMapMMIO(). + * + * @param[in] handle Handle to previously opened accelerator resource + * @param[in] mmio_num Number of MMIO space to access + * @param[in] offset Byte offset into MMIO space + * @param[in] value Value to write (64 bit) + * @returns FPGA_OK on success. FPGA_INVALID_PARAM if any of the supplied + * parameters is invalid. FPGA_EXCEPTION if an internal exception occurred + * while trying to access the handle. FPGA_NOT_FOUND if the MMIO space + * `mmio_num` was not mapped using fpgaMapMMIO() before calling this function. 
+ */ +__FPGA_API__ fpga_result fpgaWriteMMIO64(fpga_handle handle, + uint32_t mmio_num, uint64_t offset, + uint64_t value); + +/** + * Read 64 bit value from MMIO space + * + * This function will read from MMIO space of the target object at a specified + * offset. + * + * In order to access a resource's MMIO space using this function, it has to be + * mapped to the application's address space using fpgaMapMMIO(). + * + * @param[in] handle Handle to previously opened accelerator resource + * @param[in] mmio_num Number of MMIO space to access + * @param[in] offset Byte offset into MMIO space + * @param[out] value Pointer to memory where read value is returned (64 bit) + * @returns FPGA_OK on success. FPGA_INVALID_PARAM if any of the supplied + * parameters is invalid. FPGA_EXCEPTION if an internal exception occurred + * while trying to access the handle. FPGA_NOT_FOUND if the MMIO space + * `mmio_num` was not mapped using fpgaMapMMIO() before calling this function. + */ +__FPGA_API__ fpga_result fpgaReadMMIO64(fpga_handle handle, + uint32_t mmio_num, + uint64_t offset, uint64_t *value); + +/** + * Write 32 bit value to MMIO space + * + * This function will write to MMIO space of the target object at a specified + * offset. + * + * In order to access a resource's MMIO space using this function, it has to be + * mapped to the application's address space using fpgaMapMMIO(). + * + * @param[in] handle Handle to previously opened accelerator resource + * @param[in] mmio_num Number of MMIO space to access + * @param[in] offset Byte offset into MMIO space + * @param[in] value Value to write (32 bit) + * @returns FPGA_OK on success. FPGA_INVALID_PARAM if any of the supplied + * parameters is invalid. FPGA_EXCEPTION if an internal exception occurred + * while trying to access the handle. FPGA_NOT_FOUND if the MMIO space + * `mmio_num` was not mapped using fpgaMapMMIO() before calling this function. 
+ */ +__FPGA_API__ fpga_result fpgaWriteMMIO32(fpga_handle handle, +                                         uint32_t mmio_num, uint64_t offset, +                                         uint32_t value); + +/** + * Read 32 bit value from MMIO space + * + * This function will read from MMIO space of the target object at a specified + * offset. + * + * In order to access a resource's MMIO space using this function, it has to be + * mapped to the application's address space using fpgaMapMMIO(). + * + * @param[in] handle Handle to previously opened accelerator resource + * @param[in] mmio_num Number of MMIO space to access + * @param[in] offset Byte offset into MMIO space + * @param[out] value Pointer to memory where read value is returned (32 bit) + * @returns FPGA_OK on success. FPGA_INVALID_PARAM if any of the supplied + * parameters is invalid. FPGA_EXCEPTION if an internal exception occurred + * while trying to access the handle. FPGA_NOT_FOUND if the MMIO space + * `mmio_num` was not mapped using fpgaMapMMIO() before calling this function. + */ +__FPGA_API__ fpga_result fpgaReadMMIO32(fpga_handle handle, +                                        uint32_t mmio_num, +                                        uint64_t offset, uint32_t *value); + +/** + * Map MMIO space + * + * This function will return a pointer to the specified MMIO space of the + * target object in process virtual memory. Some MMIO spaces may be restricted + * to privileged processes, depending on the used handle and type. + * + * After mapping the respective MMIO space, you can access it either through + * direct pointer operations (observing supported access sizes and alignments + * of the target platform and accelerator), or by using fpgaReadMMIO32(), + * fpgaWriteMMIO32(), fpgaReadMMIO64(), fpgaWriteMMIO64(), fpgaReadMmio() + * and fpgaWriteMmio(). + * + * @note This call only supports returning an actual mmio_ptr for hardware + * targets, not for ASE simulation. Use fpgaReadMMIO32(), fpgaWriteMMIO32(), + * fpgaReadMMIO64(), and fpgaWriteMMIO64() if you need ASE simulation + * capabilities.
You will still need to call fpgaMapMMIO() before using these + * functions, though. + * + * If the caller passes in NULL for mmio_ptr, no virtual address will be + * returned. This implies that all accesses will be performed through + * fpgaReadMMIO32(), fpgaWriteMMIO32(), fpgaReadMMIO64(), fpgaWriteMMIO64(), + * fpgaReadMmio() and fpgaWriteMmio(). This is the only supported case for ASE. + * + * The number of available MMIO spaces can be retrieved through the num_mmio + * property (fpgaPropertyGetNumMMIO()). + * + * @param[in] handle Handle to previously opened resource + * @param[in] mmio_num Number of MMIO space to access + * @param[out] mmio_ptr Pointer to memory where a pointer to the MMIO space + * will be returned. May be NULL, in which case no pointer + * is returned. + * @returns FPGA_OK on success. FPGA_INVALID_PARAM if any of the supplied + * parameters is invalid. FPGA_EXCEPTION if an internal exception occurred + * while trying to access the handle. FPGA_NO_ACCESS if the process' + * permissions are not sufficient to map the requested MMIO space. + */ +__FPGA_API__ fpga_result fpgaMapMMIO(fpga_handle handle, +                                     uint32_t mmio_num, uint64_t **mmio_ptr); + +/** + * Unmap MMIO space + * + * This function will unmap a previously mapped MMIO space of the target object, + * rendering any pointers to it invalid. + * + * @note This call is only supported by hardware targets, not by ASE + * simulation. + * + * @param[in] handle Handle to previously opened resource + * @param[in] mmio_num Number of MMIO space to access + * @returns FPGA_OK on success. FPGA_INVALID_PARAM if any of the supplied + * parameters is invalid. FPGA_EXCEPTION if an internal exception occurred + * while trying to access the handle. + */ +__FPGA_API__ fpga_result fpgaUnmapMMIO(fpga_handle handle, +                                       uint32_t mmio_num); + +/** +* Reads the value from MMIO space. +* +* This function will read from MMIO space of the target object at a specified +* offset and length.
+* +* In order to access a resource's MMIO space using this function, it has to be +* mapped to the application's address space using fpgaMapMMIO(). +* +* @param[in] handle Handle to previously opened accelerator resource +* @param[in] mmio_num Number of MMIO space to access +* @param[in] offset Byte offset into MMIO space +* @param[out] buffer Pointer to memory where read value is returned +* @param[in] length Length of the MMIO to read. +* @param[in] accessType Read MMIO as 8/16/32/64-bit reads. +* @returns FPGA_OK on success. FPGA_INVALID_PARAM if any of the supplied +* parameters is invalid. FPGA_EXCEPTION if an internal exception occurred +* while trying to access the handle. FPGA_NOT_FOUND if the MMIO space +* `mmio_num` was not mapped using fpgaMapMMIO() before calling this function. +*/ +__FPGA_API__ fpga_result fpgaReadMmioType(fpga_handle handle, + uint32_t mmio_num, + uint64_t offset, + void* buffer, + uint32_t length, + uint32_t accessType); + +/** +* Write the value to MMIO space. +* +* This function will write to MMIO space of the target object at a specified +* offset and length. +* +* In order to access a resource's MMIO space using this function, it has to be +* mapped to the application's address space using fpgaMapMMIO(). +* +* @param[in] handle Handle to previously opened accelerator resource +* @param[in] mmio_num Number of MMIO space to access +* @param[in] offset Byte offset into MMIO space +* @param[in] buffer Pointer to memory from where data to be written. +* @param[in] length Length of the MMIO to write. +* @param[in] accessType Write MMIO as 8/16/32/64-bit writes. +* @returns FPGA_OK on success. FPGA_INVALID_PARAM if any of the supplied +* parameters is invalid. FPGA_EXCEPTION if an internal exception occurred +* while trying to access the handle. FPGA_NOT_FOUND if the MMIO space +* `mmio_num` was not mapped using fpgaMapMMIO() before calling this function. 
+*/ +__FPGA_API__ fpga_result fpgaWriteMmioType(fpga_handle handle, + uint32_t mmio_num, + uint64_t offset, + void* buffer, + uint32_t length, + uint32_t accessType); + + +/** +* Reads the value from MMIO space. +* +* This function will read from MMIO space of the target object at a specified +* offset and length. +* +* In order to access a resource's MMIO space using this function, it has to be +* mapped to the application's address space using fpgaMapMMIO(). +* +* @param[in] handle Handle to previously opened accelerator resource +* @param[in] mmio_num Number of MMIO space to access +* @param[in] offset Byte offset into MMIO space +* @param[out] buffer Pointer to memory where read value is returned +* @param[in] length Length of the MMIO to read. +* @returns FPGA_OK on success. FPGA_INVALID_PARAM if any of the supplied +* parameters is invalid. FPGA_EXCEPTION if an internal exception occurred +* while trying to access the handle. FPGA_NOT_FOUND if the MMIO space +* `mmio_num` was not mapped using fpgaMapMMIO() before calling this function. +*/ +__FPGA_API__ fpga_result fpgaReadMmio(fpga_handle handle, + uint32_t mmio_num, + uint64_t offset, + void *buffer, + uint32_t length); + +/** +* Write the value to MMIO space. +* +* This function will write to MMIO space of the target object at a specified +* offset and length. +* +* In order to access a resource's MMIO space using this function, it has to be +* mapped to the application's address space using fpgaMapMMIO(). +* +* @param[in] handle Handle to previously opened accelerator resource +* @param[in] mmio_num Number of MMIO space to access +* @param[in] offset Byte offset into MMIO space +* @param[in] buffer Pointer to memory from where data to be written. +* @param[in] length Length of the MMIO to write. +* @returns FPGA_OK on success. FPGA_INVALID_PARAM if any of the supplied +* parameters is invalid. FPGA_EXCEPTION if an internal exception occurred +* while trying to access the handle. 
FPGA_NOT_FOUND if the MMIO space +* `mmio_num` was not mapped using fpgaMapMMIO() before calling this function. +*/ +__FPGA_API__ fpga_result fpgaWriteMmio(fpga_handle handle, +                                       uint32_t mmio_num, +                                       uint64_t offset, +                                       void *buffer, +                                       uint32_t length); + +/** +* Read the config space of the device. +* +* This function will read the configuration space of the FPGA device +* +* @note This call is only supported by PCIe hardware targets, not by ASE +* simulation. +* +* @param[in] handle Handle to previously opened resource +* @param[in] offset Offset within the config space of the device. +* @param[in] buffer Pointer to the buffer where data read will be returned. +* @param[in] length Number of bytes to read. +* @returns FPGA_OK on success. FPGA_INVALID_PARAM if any of the supplied +* parameters is invalid. FPGA_EXCEPTION if an internal exception occurred +* while trying to access the handle. +*/ +__FPGA_API__ fpga_result fpgaReadPciConfigSpace(fpga_handle handle, +                                                 uint32_t offset, +                                                 void* buffer, +                                                 uint32_t length); + +/** +* Write to config space of the device. +* +* This function will write to configuration space of the FPGA device +* +* @note This call is only supported by PCIe hardware targets, not by ASE +* simulation. +* +* @param[in] handle Handle to previously opened resource +* @param[in] offset Offset within the config space of the device. +* @param[in] buffer Pointer to the buffer holding the data to be written. +* @param[in] length Number of bytes to write. +* @returns FPGA_OK on success. FPGA_INVALID_PARAM if any of the supplied +* parameters is invalid. FPGA_EXCEPTION if an internal exception occurred +* while trying to access the handle.
+*/ +__FPGA_API__ fpga_result fpgaWritePciConfigSpace(fpga_handle handle, + uint32_t offset, + void* buffer, + uint32_t length); + +END_C_DECL + +#endif // __FPGA_MMIO_H__ diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/properties.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/properties.h new file mode 100644 index 0000000..03e5e79 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/properties.h @@ -0,0 +1,689 @@ +// Copyright(c) 2017 - 2019, Intel Corporation +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Intel Corporation nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. + +/** + * @file properties.h + * @brief Functions for examining and manipulating `fpga_properties` objects + * + * In OPAE, `fpga_properties` objects are used both for obtaining information + * about resources and for selectively enumerating resources based on their + * properties. This file provides accessor functions (get/set) to allow reading + * and writing individual items of an `fpga_properties` object. Generally, not + * all object types supported by OPAE carry all properties. If you call a + * property accessor method on a `fpga_properties` object that does not support + * this particular property, it will return FPGA_INVALID_PARAM. + * + * # Accessor Return Values + * In addition to the return values specified in the documentation below, all + * accessor functions return FPGA_OK on success, FPGA_INVALID_PARAM if you pass + * NULL or invalid parameters (i.e. non-initialized properties objects), + * FPGA_EXCEPTION if an internal exception occurred trying to access the + * properties object, FPGA_NOT_FOUND if the requested property is not part of + * the supplied properties object. + */ + +#ifndef __FPGA_PROPERTIES_H__ +#define __FPGA_PROPERTIES_H__ + +#include <opae/types.h> + +BEGIN_C_DECL + +/** + * Create a fpga_properties object + * + * Initializes the memory pointed at by `prop` to represent a properties + * object, and populates it with the properties of the resource referred to by + * `token`. 
Individual properties can then be queried using fpgaPropertiesGet*() + * accessor functions. + * + * If `token` is NULL, an "empty" properties object is created to be used as a + * filter for fpgaEnumerate(). All individual fields are set to `don't care`, + * which implies that the fpga_properties object would match all FPGA resources + * if used for an fpgaEnumerate() query. The matching criteria can be further + * refined by using fpgaSet* functions on the properties object, or the + * object can be populated with the actual properties of a resource by using + * fpgaUpdateProperties(). + * + * @Note fpgaGetProperties() will allocate memory for the created properties + * object returned in `prop`. It is the responsibility of the using application + * to free this memory after use by calling fpgaDestroyProperties(). + * + * @param[in] token Token to get properties for. Can be NULL, which will + * create an empty properties object to be used as a + * filter for fpgaEnumerate(). + * @param[out] prop Pointer to a variable of type fpga_properties + * @returns FPGA_OK on success. FPGA_NO_MEMORY if no memory could be allocated + * to create the `fpga_properties` object. FPGA_EXCEPTION if an exception + * happened while initializing the `fpga_properties` object. + */ +__FPGA_API__ fpga_result fpgaGetProperties(fpga_token token, fpga_properties *prop); + +/** + * Update a fpga_properties object + * + * Populates the properties object 'prop' with properties of the resource + * referred to by 'token'. Unlike fpgaGetProperties(), this call will not create + * a new properties object or allocate memory for it, but use a previously + * created properties object. + * + * @param[in] token Token to retrieve properties for + * @param[in] prop fpga_properties object to update + * @returns FPGA_OK on success. FPGA_INVALID_PARAM if `token` or `prop` are not + * valid objects. FPGA_NOT_FOUND if the resource referred to by `token` was + * not found.
FPGA_NO_DRIVER if no driver is loaded. FPGA_EXCEPTION if an + * internal exception occurred when trying to update `prop`. + */ +__FPGA_API__ fpga_result fpgaUpdateProperties(fpga_token token, fpga_properties prop); + +/** + * Clear a fpga_properties object + * + * Sets all fields of the properties object pointed at by 'prop' to 'don't + * care', which implies that the fpga_properties object would match all FPGA + * resources if used for an fpgaEnumerate() query. The matching criteria can be + * further refined by using fpgaSet* functions on the properties object. + * + * Instead of creating a new fpga_properties object every time, this function + * can be used to re-use fpga_properties objects from previous queries. + * + * @param[in] prop fpga_properties object to clear + * @returns FPGA_OK on success. FPGA_INVALID_PARAM if `prop` is not a valid + * object. FPGA_EXCEPTION if an internal exception occurred when trying to + * access `prop`. + */ +__FPGA_API__ fpga_result fpgaClearProperties(fpga_properties prop); + +/** + * Clone a fpga_properties object + * + * Creates a copy of an fpga_properties object. + * + * @Note This call creates a new properties object and allocates memory for it. + * Both the 'src' and the newly created 'dst' objects will eventually need to be + * destroyed using fpgaDestroyProperties(). + * + * @param[in] src fpga_properties object to copy + * @param[out] dst New fpga_properties object cloned from 'src' + * @returns FPGA_OK on success. FPGA_INVALID_PARAM if `src` is not a valid + * object, or if `dst` is NULL. FPGA_NO_MEMORY if there was not enough memory + * to allocate an `fpga_properties` object for `dst`. FPGA_EXCEPTION if an + * internal exception occurred either accessing `src` or updating `dst`.
+ */ +__FPGA_API__ fpga_result fpgaCloneProperties(fpga_properties src, fpga_properties *dst); + +/** + * Destroy a fpga_properties object + * + * Destroys an existing fpga_properties object that the caller has previously + * created using fpgaGetProperties() or fpgaCloneProperties(). + * + * @param[inout] prop Pointer to the fpga_properties object to destroy + * @returns FPGA_OK on success. FPGA_INVALID_PARAM if `prop` is not a valid + * object. FPGA_EXCEPTION if an internal exception occurred while trying to + * access `prop`. + */ +__FPGA_API__ fpga_result fpgaDestroyProperties(fpga_properties *prop); + +/** + * Get the token of the parent object + * + * Returns the token of the parent of the queried resource in '*parent'. + * + * @param[in] prop Properties object to query + * @param[out] parent Pointer to a token variable of the resource 'prop' is + * associated with + * @returns FPGA_NOT_FOUND if resource does not have a + * parent (e.g. an FPGA_DEVICE resource does not have parents). Also see + * "Accessor Return Values" in [properties.h](#properties-h). + */ +__FPGA_API__ fpga_result fpgaPropertiesGetParent(const fpga_properties prop, +                                                 fpga_token *parent); + +/** + * Set the token of the parent object + * + * @param[in] prop Properties object to modify + * @param[out] parent Pointer to a token variable of the resource 'prop' is + * associated with + * @returns See "Accessor Return Values" in [properties.h](#properties-h). + */ +__FPGA_API__ fpga_result fpgaPropertiesSetParent(fpga_properties prop, +                                                 fpga_token parent); + +/** + * Get the object type of a resource + * + * Returns the object type of the queried resource. + * + * @param[in] prop Properties object to query + * @param[out] objtype Pointer to an object type variable of the resource + * 'prop' is associated with + * @returns See "Accessor Return Values" in [properties.h](#properties-h).
+ */ +__FPGA_API__ fpga_result fpgaPropertiesGetObjectType(const fpga_properties prop, + fpga_objtype *objtype); + +/** + * Set the object type of a resource + * + * Sets the object type of the resource. * Currently supported object types are + * FPGA_DEVICE and FPGA_ACCELERATOR. + * + * @param[in] prop Properties object to modify + * @returns See "Accessor Return Values" in [properties.h](#properties-h). + */ +__FPGA_API__ fpga_result fpgaPropertiesSetObjectType(fpga_properties prop, + fpga_objtype objtype); + +/** + * Get the PCI bus number of a resource + * + * Returns the bus number the queried resource. + * + * @param[in] prop Properties object to query + * @param[out] bus Pointer to a PCI bus variable of the resource 'prop' + * is associated with + * @returns See "Accessor Return Values" in [properties.h](#properties-h). + */ +__FPGA_API__ fpga_result fpgaPropertiesGetBus(const fpga_properties prop, uint8_t *bus); + +/** + * Set the PCI bus number of a resource + * + * @param[in] prop Properties object to modify + * @param[in] bus PCI bus number of the resource 'prop' is associated with + * @returns See "Accessor Return Values" in [properties.h](#properties-h). + */ +__FPGA_API__ fpga_result fpgaPropertiesSetBus(fpga_properties prop, uint8_t bus); + +/** + * Get the PCI device number of a resource + * + * Returns the device number the queried resource. + * + * @param[in] prop Properties object to query + * @param[out] device Pointer to a PCI device variable of the resource 'prop' + * is associated with + * @returns See "Accessor Return Values" in [properties.h](#properties-h). + */ +__FPGA_API__ fpga_result fpgaPropertiesGetDevice(const fpga_properties prop, + uint8_t *device); + +/** + * Set the PCI device number of a resource + * + * Enforces the limitation on the number of devices as specified in the + * PCI spec. 
+ * + * @param[in] prop Properties object to modify + * @param[in] device PCI device number of the resource 'prop' is associated + * with + * @returns See "Accessor Return Values" in [properties.h](#properties-h). + */ +__FPGA_API__ fpga_result fpgaPropertiesSetDevice(fpga_properties prop, + uint8_t device); + +/** + * Get the PCI function number of a resource + * + * Returns the function number the queried resource. + * + * @param[in] prop Properties object to query + * @param[out] function Pointer to PCI function variable of the + * resource 'prop' is associated with + * @returns See "Accessor Return Values" in [properties.h](#properties-h). + */ +__FPGA_API__ fpga_result fpgaPropertiesGetFunction(const fpga_properties prop, + uint8_t *function); + +/** + * Set the PCI function number of a resource + * + * Enforces the limitation on the number of functions as specified in the + * PCI spec. + * + * @param[in] prop Properties object to modify + * @param[in] function PCI function number of the resource 'prop' is + * associated with + * @returns See "Accessor Return Values" in [properties.h](#properties-h). + */ +__FPGA_API__ fpga_result fpgaPropertiesSetFunction(fpga_properties prop, + uint8_t function); + +/** + * Get the socket id of a resource + * + * Returns the socket id of the queried resource. + * + * @param[in] prop Properties object to query + * @param[out] socket_id Pointer to a socket id variable of the + * resource 'prop' + * is associated with + * @returns See "Accessor Return Values" in [properties.h](#properties-h). + * See also "Accessor Return Values" in [properties.h](#properties-h). + */ +__FPGA_API__ fpga_result fpgaPropertiesGetSocketID(const fpga_properties prop, + uint8_t *socket_id); + +/** + * Set the socket id of the resource + * + * @param[in] prop Properties object to modify + * @param[in] socket_id Socket id of the resource 'prop' is + * associated with + * @returns See "Accessor Return Values" in [properties.h](#properties-h). 
+ */ +__FPGA_API__ fpga_result fpgaPropertiesSetSocketID(fpga_properties prop, + uint8_t socket_id); + +/** + * Get the device id of the resource + * + * @param[in] prop Properties object to query + * @param[out] device_id Pointer to a device id variable of the + * resource 'prop' is associated with + * @returns See "Accessor Return Values" in [properties.h](#properties-h). + */ +__FPGA_API__ fpga_result fpgaPropertiesGetDeviceID(const fpga_properties prop, + uint32_t *device_id); + +/** + * Set the device id of the resource + * + * @param[in] prop Properties object to modify + * @param[in] device_id Device id of the resource 'prop' is associated with + * @returns See "Accessor Return Values" in [properties.h](#properties-h). + */ +__FPGA_API__ fpga_result fpgaPropertiesSetDeviceID(fpga_properties prop, + uint32_t device_id); + +/** + * Get the number of slots of an FPGA resource property + * + * Returns the number of slots present in an FPGA. + * + * @param[in] prop Properties object to query - must be of type FPGA_DEVICE + * @param[out] num_slots Pointer to number of slots variable of the FPGA + * @returns FPGA_INVALID_PARAM if object type is not FPGA_DEVICE. See also + * "Accessor Return Values" in [properties.h](#properties-h). + */ +__FPGA_API__ fpga_result fpgaPropertiesGetNumSlots(const fpga_properties prop, + uint32_t *num_slots); + +/** + * Set the number of slots of an FPGA resource property + * + * @param[in] prop Properties object to modify - must be of type + * FPGA_DEVICE + * @param[in] num_slots Number of slots of the FPGA + * @returns FPGA_INVALID_PARAM if object type is not FPGA_DEVICE. See also + * "Accessor Return Values" in [properties.h](#properties-h). + */ +__FPGA_API__ fpga_result fpgaPropertiesSetNumSlots(fpga_properties prop, + uint32_t num_slots); + +/** + * Get the BBS ID of an FPGA resource property + * + * Returns the blue bitstream id of an FPGA. 
+ * + * @param[in] prop Properties object to query - must be of type FPGA_DEVICE + * @param[out] bbs_id Pointer to a bbs id variable of the FPGA + * @returns FPGA_INVALID_PARAM if object type is not FPGA_DEVICE. See also + * "Accessor Return Values" in [properties.h](#properties-h). + */ +__FPGA_API__ fpga_result fpgaPropertiesGetBBSID(const fpga_properties prop, + uint64_t *bbs_id); + +/** + * Set the BBS ID of an FPGA resource property + * + * @param[in] prop Properties object to modify - must be of type + * FPGA_DEVICE + * @param[in] bbs_id Blue bitstream id of the FPGA resource + * @returns FPGA_INVALID_PARAM if object type is not FPGA_DEVICE. See also + * "Accessor Return Values" in [properties.h](#properties-h). + */ +__FPGA_API__ fpga_result fpgaPropertiesSetBBSID(fpga_properties prop, + uint64_t bbs_id); + +/** + * Get the BBS Version of an FPGA resource property + * + * Returns the blue bitstream version of an FPGA. + * + * @param[in] prop Properties object to query - must be of type + * FPGA_DEVICE + * @param[out] bbs_version Pointer to a bbs version variable of the FPGA + * @returns FPGA_INVALID_PARAM if object type is not FPGA_DEVICE. See also + * "Accessor Return Values" in [properties.h](#properties-h). + */ +__FPGA_API__ fpga_result fpgaPropertiesGetBBSVersion(const fpga_properties prop, + fpga_version *bbs_version); + +/** + * Set the BBS Version of an FPGA resource property + * + * @param[in] prop Properties object to modify - must be of type + * FPGA_DEVICE + * @param[in] bbs_version Blue bitstream version of the FPGA resource + * @returns FPGA_INVALID_PARAM if object type is not FPGA_DEVICE. See also + * "Accessor Return Values" in [properties.h](#properties-h). + */ +__FPGA_API__ fpga_result fpgaPropertiesSetBBSVersion(fpga_properties prop, + fpga_version version); + +/** + * Get the vendor id of an FPGA resource property + * + * Returns the vendor id of an FPGA. 
+ * + * @param[in] prop Properties object to query - must be of type FPGA_DEVICE + * @param[out] vendor_id Pointer to a vendor id variable of the FPGA + * @returns FPGA_INVALID_PARAM if object type is not FPGA_DEVICE. See also + * "Accessor Return Values" in [properties.h](#properties-h). + * + * @note This API is not currently supported. + */ +__FPGA_API__ fpga_result fpgaPropertiesGetVendorID(const fpga_properties prop, + uint16_t *vendor_id); + +/** + * Set the vendor id of an FPGA resource property + * + * @param[in] prop Properties object to modify - must be of type FPGA_DEVICE + * @param[in] vendor_id Vendor id of the FPGA resource + * @returns FPGA_INVALID_PARAM if object type is not FPGA_DEVICE. See also + * "Accessor Return Values" in [properties.h](#properties-h). + * + * @note This API is not currently supported. + */ +__FPGA_API__ fpga_result fpgaPropertiesSetVendorID(fpga_properties prop, + uint16_t vendor_id); + +/** + * Get the model of an FPGA resource property + * + * Returns the model of an FPGA. + * + * @param[in] prop Properties object to query - must be of type FPGA_DEVICE + * @param[in] model Model of the FPGA resource (string of minimum + * FPGA_MODEL_LENGTH length + * @returns FPGA_INVALID_PARAM if object type is not FPGA_DEVICE. See also + * "Accessor Return Values" in [properties.h](#properties-h). + * + * @note This API is not currently supported. + */ +__FPGA_API__ fpga_result fpgaPropertiesGetModel(const fpga_properties prop, + char *model); + +/** + * Set the model of an FPGA resource property + * + * @param[in] prop Properties object to modify - must be of type FPGA_DEVICE + * @param[in] model Model of the FPGA resource (string of maximum + * FPGA_MODEL_LENGTH length + * @returns FPGA_INVALID_PARAM if object type is not FPGA_DEVICE. See also + * "Accessor Return Values" in [properties.h](#properties-h). + * + * @note This API is not currently supported. 
+ */ +__FPGA_API__ fpga_result fpgaPropertiesSetModel(fpga_properties prop, + char *model); + +/** + * Get the local memory size of an FPGA resource property + * + * Returns the local memory size of an FPGA. + * + * @param[in] prop Properties object to query - must be of type FPGA_DEVICE + * @param[out] lms Pointer to a memory size variable of the FPGA + * @returns FPGA_INVALID_PARAM if object type is not FPGA_DEVICE. See also + * "Accessor Return Values" in [properties.h](#properties-h). + * + * @note This API is not currently supported. + */ +__FPGA_API__ fpga_result fpgaPropertiesGetLocalMemorySize(const fpga_properties prop, + uint64_t *lms); + +/** + * Set the local memory size of an FPGA resource property + * + * @param[in] prop Properties object to modify - must be of type FPGA_DEVICE + * @param[in] lms Local memory size of the FPGA resource + * @returns FPGA_INVALID_PARAM if object type is not FPGA_DEVICE. See also + * "Accessor Return Values" in [properties.h](#properties-h). + * + * @note This API is not currently supported. + */ +__FPGA_API__ fpga_result fpgaPropertiesSetLocalMemorySize(fpga_properties prop, + uint64_t lms); + +/** + * Get the capabilities FPGA resource property + * + * Returns the capabilities of an FPGA. + * Capabilities is a bitfield value + * + * @param[in] prop Properties object to query - must be of type + * FPGA_DEVICE + * @param[out] capabilities Pointer to a capabilities variable of the FPGA + * @returns FPGA_INVALID_PARAM if object type is not FPGA_DEVICE. See also + * "Accessor Return Values" in [properties.h](#properties-h). + * + * @note This API is not currently supported. 
+ */ +__FPGA_API__ fpga_result fpgaPropertiesGetCapabilities(const fpga_properties prop, + uint64_t *capabilities); + +/** + * Set the capabilities of an FPGA resource property + * + * Capabilities is a bitfield value + * + * @param[in] prop Properties object to modify - must be of type + * FPGA_DEVICE + * @param[in] capabilities Capabilities of the FPGA resource + * @returns FPGA_INVALID_PARAM if object type is not FPGA_DEVICE. See also + * "Accessor Return Values" in [properties.h](#properties-h). + * + * @note This API is not currently supported. + */ +__FPGA_API__ fpga_result fpgaPropertiesSetCapabilities(fpga_properties prop, + uint64_t capabilities); + +/** + * Get the GUID of a resource + * + * Returns the GUID of an FPGA or accelerator object. + * + * For an accelerator, the GUID uniquely identifies a specific accelerator context type, + * i.e. different accelerators will have different GUIDs. For an FPGA, the GUID + * is used to identify a certain instance of an FPGA, e.g. to determine whether + * a given bitstream would be compatible. + * + * @param[in] prop Properties object to query + * @param[out] guid Pointer to a GUID of the slot variable + * @returns See "Accessor Return Values" in [properties.h](#properties-h). + */ +__FPGA_API__ fpga_result fpgaPropertiesGetGUID(const fpga_properties prop, + fpga_guid *guid); + +/** + * Set the GUID of a resource + * + * Sets the GUID of an FPGA or accelerator object. + * + * For an accelerator, the GUID uniquely identifies a specific accelerator context type, + * i.e. different accelerators will have different GUIDs. For an FPGA, the GUID + * is used to identify a certain instance of an FPGA, e.g. to determine whether + * a given bitstream would be compatible. + * + * @param[in] prop Properties object to modify + * @param[out] guid Pointer to a GUID of the slot variable + * @returns See "Accessor Return Values" in [properties.h](#properties-h). 
+ */ +__FPGA_API__ fpga_result fpgaPropertiesSetGUID(fpga_properties prop, fpga_guid guid); + +/** + * Get the number of mmio spaces + * + * Returns the number of mmio spaces of an AFU properties structure. + * + * @param[in] prop Properties object to query - must be of type FPGA_ACCELERATOR + * @param[out] mmio_spaces Pointer to a variable for number of mmio spaces + * @returns FPGA_INVALID_PARAM if object type is not FPGA_ACCELERATOR. See also + * "Accessor Return Values" in [properties.h](#properties-h). + */ +__FPGA_API__ fpga_result fpgaPropertiesGetNumMMIO(const fpga_properties prop, + uint32_t *mmio_spaces); + +/** + * Set the number of mmio spaces + * + * Sets the number of mmio spaces of an AFU properties structure. + * + * @param[in] prop Properties object to modify - must be of type FPGA_ACCELERATOR + * @param[in] mmio_spaces Number of MMIO spaces of the accelerator + * @returns FPGA_INVALID_PARAM if object type is not FPGA_ACCELERATOR. See also + * "Accessor Return Values" in [properties.h](#properties-h). + */ +__FPGA_API__ fpga_result fpgaPropertiesSetNumMMIO(fpga_properties prop, + uint32_t mmio_spaces); + +/** + * Get the number of interrupts + * + * Returns the number of interrupts of an accelerator properties structure. + * + * @param[in] prop Properties object to query - must be of type FPGA_ACCELERATOR + * @param[out] num_interrupts Pointer to a variable for number of interrupts + * @returns FPGA_INVALID_PARAM if object type is not FPGA_ACCELERATOR. See also + * "Accessor Return Values" in [properties.h](#properties-h). + */ +__FPGA_API__ fpga_result fpgaPropertiesGetNumInterrupts(const fpga_properties prop, + uint32_t *num_interrupts); + +/** + * Set the number of interrupts + * + * Sets the number of interrupts of an accelerator properties structure. 
+ * + * @param[in] prop Properties object to modify - must be of type FPGA_ACCELERATOR + * @param[in] num_interrupts Number of interrupts of the accelerator + * @returns FPGA_INVALID_PARAM if object type is not FPGA_ACCELERATOR. See also + * "Accessor Return Values" in [properties.h](#properties-h). + */ +__FPGA_API__ fpga_result fpgaPropertiesSetNumInterrupts(fpga_properties prop, + uint32_t num_interrupts); + +/** + * Get the state of an accelerator resource property + * + * Returns the accelerator state of an accelerator. + * + * @param[in] prop Properties object to query - must be of type FPGA_ACCELERATOR + * @param[out] state Pointer to an accelerator state variable of the accelerator + * @returns FPGA_INVALID_PARAM if object type is not FPGA_ACCELERATOR. See also + * "Accessor Return Values" in [properties.h](#properties-h). + */ +__FPGA_API__ fpga_result fpgaPropertiesGetAcceleratorState(const fpga_properties prop, + fpga_accelerator_state *state); + + +/** + * Set the state of an accelerator resource property + * + * @param[in] prop Properties object to modify - must be of type FPGA_ACCELERATOR + * @param[in] state accelerator state of the accelerator resource + * @returns FPGA_INVALID_PARAM if object type is not FPGA_ACCELERATOR. See also + * "Accessor Return Values" in [properties.h](#properties-h). + */ +__FPGA_API__ fpga_result fpgaPropertiesSetAcceleratorState(fpga_properties prop, + fpga_accelerator_state state); + +/** +* Get the object ID of a resource +* +* Returns the object ID of a resource. The object ID is a 64 bit identifier +* that is unique within a single node or system. It represents a similar +* concept as the token, but can be used across processes (e.g. passed on the +* command line). +* +* @param[in] prop Properties object to query +* @param[out] object_id Pointer to a 64bit memory location to store the object +* ID in +* @returns See "Accessor Return Values" in [properties.h](#properties-h). 
+*/ +__FPGA_API__ fpga_result fpgaPropertiesGetObjectID(fpga_properties prop, + uint64_t *object_id); + + +/** +* Set the object ID of a resource +* +* Sets the object ID of a resource. The object ID is a 64 bit identifier +* that is unique within a single node or system. It represents a similar +* concept as the token, but can be used across processes (e.g. passed on the +* command line). +* +* @param[in] prop Properties object to query +* @param[in] object_id A 64bit value to use as the object ID +* @returns See "Accessor Return Values" in [properties.h](#properties-h). +*/ +__FPGA_API__ fpga_result fpgaPropertiesSetObjectID(fpga_properties prop, + uint64_t object_id); + +/** +* Create a fpga_properties object +* +* Initializes the memory pointed at by `prop` to represent a properties +* object, and populates it with the properties of the resource referred to by +* `handle`. Individual properties can then be queried using fpgaPropertiesGet*() +* accessor functions. +* +* @note fpgaGetPropertiesFromHandle() will allocate memory for the created properties +* object returned in `prop`. It is the responsibility of the caller +* to free this memory after use by calling fpgaDestroyProperties(). +* +* @param[in] handle Open handle to get properties for. +* @param[out] prop Pointer to a variable of type fpga_properties +* @returns FPGA_OK on success. FPGA_NO_MEMORY if no memory could be allocated +* to create the `fpga_properties` object. FPGA_EXCEPTION if an exception +* happend while initializing the `fpga_properties` object. 
+**/ +__FPGA_API__ +fpga_result +fpgaGetPropertiesFromHandle( + fpga_handle handle, + fpga_properties *prop + ); + +END_C_DECL + +#endif // __FPGA_PROPERTIES_H__ + diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/types.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/types.h new file mode 100644 index 0000000..481e6ae --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/types.h @@ -0,0 +1,173 @@ +// Copyright(c) 2017 - 2019, Intel Corporation +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Intel Corporation nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. + +/** + * @file types.h + * @brief Type definitions for FPGA API + * + * OPAE uses the three opaque types fpga_properties, fpga_token, and + * fpga_handle to create a hierarchy of objects that can be used to enumerate, + * reference, acquire, and query FPGA resources. This object model is designed + * to be extensible to account for different FPGA architectures and platforms. + * + * Initialization + * -------------- + * OPAEs management of the opaque types `fpga_properties`, + * `fpga_token`, and `fpga_handle` relies on the proper initialization of + * variables of these types. In other words, before doing anything with a + * variable of one of these opaque types, you need to first initialize them. + * + * The respective functions that initizalize opaque types are: + * + * * fpgaGetProperties() and fpgaCloneProperties() for `fpga_properties` + * * fpgaEnumerate() and fpgaCloneToken() for `fpga_token` + * * fpgaOpen() for `fpga_handle` + * + * This should intuitively make sense - fpgaGetProperties() creates + * `fpga_properties` objects, fpgaEnumerate() creates `fpga_token` objects, + * fpgaOpen() creates `fpga_handle` objects, and fpgaCloneProperties() and + * fpgaCloneToken() clone (create) `fpga_properties` and `fpga_token` objects, + * respectively. 
+ * + * Since these opaque types are interpreted as pointers (they are typedef'd to + * a `void *`), passing an uninitialized opaque type into any function except + * the respective initailzation function will result in undefined behaviour, + * because OPAE will try to follow an invalid pointer. Undefined behaviour in + * this case may include an unexpected error code, or an application crash. + * + */ + +#ifndef __FPGA_TYPES_H__ +#define __FPGA_TYPES_H__ + +#include <stdint.h> +#include <stddef.h> +#include <opae/types_enum.h> + +/** + * Object for expressing FPGA resource properties + * + * `fpga_properties` objects encapsulate all enumerable information about an + * FPGA resources. They can be used for two purposes: selective enumeration + * (discovery) and querying information about existing resources. + * + * For selective enumeration, usually an empty `fpga_properties` object is + * created (using fpgaGetProperties()) and then populated with the desired + * criteria for enumeration. An array of `fpga_properties` can then be passed + * to fpgaEnumerate(), which will return a list of `fpga_token` objects + * matching these criteria. + * + * For querying properties of existing FPGA resources, fpgaGetProperties() can + * also take an `fpga_token` and will return an `fpga_properties` object + * populated with information about the resource referenced by that token. + * + * After use, `fpga_properties` objects should be destroyed using + * fpga_destroyProperties() to free backing memory used by the + * `fpga_properties` object. + */ +typedef void *fpga_properties; + +/** + * Token for referencing FPGA resources + * + * An `fpga_token` serves as a reference to a specific FPGA resource present in + * the system. Holding an `fpga_token` does not constitute ownership of the + * FPGA resource - it merely allows the user to query further information about + * a resource, or to use fpgaOpen() to acquire ownership. 
+ * + * `fpga_token`s are usually returned by fpgaEnumerate() or + * fpgaPropertiesGetParent(), and used by fpgaOpen() to acquire ownership and + * yield a handle to the resource. Some API calls also take `fpga_token`s as + * arguments if they don't require ownership of the resource in question. + */ +typedef void *fpga_token; + +/** + * Handle to an FPGA resource + * + * A valid `fpga_handle` object, as populated by fpgaOpen(), denotes ownership + * of an FPGA resource. Note that ownership can be exclusive or shared, + * depending on the flags used in fpgaOpen(). Ownership can be released by + * calling fpgaClose(), which will render the underlying handle invalid. + * + * Many OPAE C API functions require a valid token (which is synonymous with + * ownership of the resource). + */ +typedef void *fpga_handle; + +/** + * Globally unique identifier (GUID) + * + * GUIDs are used widely within OPAE for helping identify FPGA resources. For + * example, every FPGA resource has a `guid` property, which can be (and in the + * case of FPGA_ACCELERATOR resource primarily is) used for enumerating a resource of a + * specific type. + * + * `fpga_guid` is compatible with libuuid's uuid_t, so users can use libuuid + * functions like uuid_parse() to create and work with GUIDs. + */ +typedef uint8_t fpga_guid[16]; + +/** + * Semantic version + * + * Data structure for expressing version identifiers following the semantic + * versioning scheme. Used in various properties for tracking component + * versions. 
+ */ +typedef struct { + uint8_t major; /**< Major version */ + uint8_t minor; /**< Minor version */ + uint16_t patch; /**< Revision or patchlevel */ +} fpga_version; + +/* + * Scatter Gather list in userspace that will be populated during fpgaGetPhysicalAddress call + */ +typedef struct _sg_element { + uint64_t phys_addr; /**< Starting physical address of this scatter/gather region */ + uint32_t length; /**< length, in bytes, of a physically contiguous SG region */ +} sg_element, *psg_element; + +/** Handle to an event object + * + * OPAE provides an interface to asynchronous events that can be generated by + * different FPGA resources. The event API provides functions to register for + * these events; associated with every event a process has registered for is an + * `fpga_event_handle`, which encapsulates the OS-specific data structure for + * event objects. + * + * On Linux, an `fpga_event_handle` can be used as a file descriptor and passed + * to select(), poll(), epoll() and similar functions to wait for asynchronous + * events. + */ +#ifndef _WIN32 +typedef int fpga_event_handle; +#else +typedef HANDLE fpga_event_handle; +#endif + +#endif // __FPGA_TYPES_H__ diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/types_enum.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/types_enum.h new file mode 100644 index 0000000..6fc4de2 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/types_enum.h @@ -0,0 +1,196 @@ +// Copyright(c) 2017 - 2019, Intel Corporation +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Intel Corporation nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. + +/** + * @file types_enum.h + * @brief Definitions of enumerated types for the OPAE C API + * + * This file defines return and error codes, event and object types, states, + * and flags as used or reported by OPAE C API functions. 
+ */ + +#ifndef __FPGA_TYPES_ENUM_H__ +#define __FPGA_TYPES_ENUM_H__ + +#ifdef _WIN32 +#ifdef FpgaLib_EXPORTS +#define __FPGA_API__ __declspec(dllexport) +#else +#define __FPGA_API__ __declspec(dllimport) +#endif +#else +#define __FPGA_API__ __attribute__((visibility("default"))) +#endif + +#ifdef __cplusplus +#define BEGIN_C_DECL extern "C" { +#define END_C_DECL } +#else +#define BEGIN_C_DECL +#define END_C_DECL +#endif + +/** + * OPAE C API function return codes + * + * Every public API function exported by the OPAE C library will return one of + * these codes. Usually, FPGA_OK denotes successful completion of the requested + * operation, while any return code *other* than FPGA_OK indicates an error or + * other deviation from the expected behavior. Users of the OPAE C API should + * always check the return codes of the APIs they call, and not use output + * parameters of functions that did not execute successfully. + + * The fpgaErrStr() function converts error codes into printable messages. + * + * OPAE also has a logging mechanism that allows a developer to get more + * information about why a particular call failed with a specific message. If + * enabled, any function that returns an error code different from FPGA_OK will + * also print out a message with further details. This mechanism can be enabled + * by setting the environment variable `LIBOPAE_LOG` to 1 before running the + * respective application. + */ + +// +// Minimum alignment requirement for DMA BBB +// +#define FPGA_DMA_ALIGN_BYTES 64 + +// +// Maximum size (in bytes0 descriptor of each SGDMA +// block can transfer. For pre-alpha maximum transfer size is +// One Meg minus some bytes. + +#define FPGA_DMA_BUF_SIZE (1020*1024) + +// +// Number of DMA blocks supported by SGDMA. 
+// Currently only one is supported by pre-alpha +// bitstream +// +#define NDMA 1 + +typedef enum { + FPGA_OK = 0, /**< Operation completed successfully */ + FPGA_INVALID_PARAM, /**< Invalid parameter supplied */ + FPGA_BUSY, /**< Resource is busy */ + FPGA_EXCEPTION, /**< An exception occurred */ + FPGA_NOT_FOUND, /**< A required resource was not found */ + FPGA_NO_MEMORY, /**< Not enough memory to complete operation */ + FPGA_NOT_SUPPORTED, /**< Requested operation is not supported */ + FPGA_NO_DRIVER, /**< Driver is not loaded */ + FPGA_NO_DAEMON, /**< FPGA Daemon (fpgad) is not running */ + FPGA_NO_ACCESS, /**< Insufficient privileges or permissions */ + FPGA_RECONF_ERROR /**< Error while reconfiguring FPGA */ +} fpga_result; + + /* + * FPGA events + * + * OPAE currently defines the following event types that applications can + * register for.Note that not all FPGA resources and target platforms may + * support all event types. + */ +typedef enum +{ + FPGA_NO_EVENT = 0, + FPGA_EVENT_INTERRUPT, /**< Interrupt generated by an accelerator */ + FPGA_EVENT_ERROR, /**< Infrastructure error event */ + FPGA_EVENT_POWER_THERMAL, /**< Infrastructure thermal event */ + FPGA_EVENT_PORT_ERROR, + FPGA_EVENT_FME_ERROR, + FPGA_LIFECYCLE_APPEAR_EVENT, + FPGA_LIFECYCLE_DISAPPEAR_EVENT, + FPGA_EVENT_AFC_INTERRUPT, + FPGA_EVENT_TYPE_MAX, + FPGA_EVENT_AP_EVENT, + FPGA_MAX_EVENT +} fpga_event_type; + +/* TODO: consider adding lifecycle events in the future + * to help with orchestration. Need a complete specification + * before including them in the API. 
Proposed events: + * FPGA_EVENT_APPEAR + * FPGA_EVENT_DISAPPEAR + * FPGA_EVENT_CHANGE + */ + +/** accelerator state */ +typedef enum { + FPGA_ACCELERATOR_ASSIGNED = 0, /**< accelerator is opened exclusively by another process */ + FPGA_ACCELERATOR_UNASSIGNED, /**< accelerator is free to be opened */ + FPGA_ACCELERATOR_STATE_MAX +} fpga_accelerator_state; + +/** + * OPAE FPGA resources (objects) + * + * These are the FPGA resources currently supported by the OPAE object model. + */ +typedef enum { + /** FPGA_DEVICE objects represent FPGA devices and their management functionality. + * These objects can be opened (typically requires a certain privilege level or + * access permissions) and used for management functions like fpgaReconfigreSlot(). */ + FPGA_DEVICE = 0, + /** FPGA_ACCELERATOR objects represent allocatable units for accessing + * accelerated functions on the FPGA. They are frequently opened for + * interacting via control registers (MMIO), shared memory, or other, + * possibly platform-specific functions. */ + FPGA_ACCELERATOR, + FPGA_OBJTYPE_MAX +} fpga_objtype; + +/** + * Buffer flags + * + * These flags can be passed to the fpgaPrepareBuffer() function. + */ +enum fpga_buffer_flags { + FPGA_BUF_PREALLOCATED = (1u << 0), /**< Use existing buffer */ + FPGA_BUF_QUIET = (1u << 1), /**< Suppress error messages */ + FPGA_BUF_NOCACHE = (1u << 2), + FPGA_BUF_LARGE_PAGE = (1u << 4) /*< For 2MB page support in VTP */ +}; + +/** + * Open flags + * + * These flags can be passed to the fpgaOpen() function. + */ +enum fpga_open_flags { + FPGA_OPEN_SHARED = (1u << 0) /**< Open FPGA resource for shared access */ +}; + +/** + * Reconfiguration flags + * + * These flags can be passed to the fpgaReconfigure() function. 
+ */ +enum fpga_reconf_flags { + /** Reconfigure the slot without checking if it is in use */ + FPGA_RECONF_FORCE = (1u << 0) +}; + +#endif // __FPGA_TYPES_ENUM_H__ diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/umsg.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/umsg.h new file mode 100644 index 0000000..6e073ee --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/umsg.h @@ -0,0 +1,112 @@ +// Copyright(c) 2017 - 2019, Intel Corporation +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of Intel Corporation nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+
+/**
+ * \file umsg.h
+ * \brief FPGA UMsg API
+ */
+
+#ifndef __FPGA_UMSG_H__
+#define __FPGA_UMSG_H__
+
+#include <opae/types.h>
+
+BEGIN_C_DECL
+
+/**
+ * Get number of Umsgs
+ *
+ * Returns number of UMsgs supported by AFU.
+ *
+ *
+ * @param[in] handle Handle to previously opened accelerator resource
+ * @param[out] value Returns number of UMsgs
+ * @returns FPGA_OK on success.
+ * FPGA_INVALID_PARAM if input parameter combination
+ * is not valid.
+ * FPGA_EXCEPTION if input parameter fpga handle is not
+ * valid.
+ */
+__FPGA_API__ fpga_result fpgaGetNumUmsg(fpga_handle handle, uint64_t *value);
+
+/**
+ * Sets Umsg hint
+ *
+ * Writes UMsg hint bit.
+ *
+ *
+ * @param[in] handle Handle to previously opened accelerator resource
+ * @param[in] value Value to use for UMsg hint, UMsg hint is N wide bitvector
+ * where N = number of Umsgs.
+ * @returns FPGA_OK on success.
+ * FPGA_INVALID_PARAM if input parameter combination
+ * is not valid.
+ * FPGA_EXCEPTION if input parameter fpga handle is not
+ * valid.
+ */
+__FPGA_API__ fpga_result fpgaSetUmsgAttributes(fpga_handle handle,
+ uint64_t value);
+
+/**
+ * Trigger Umsg
+ *
+ * Writes a 64-bit value to trigger low-latency accelerator notification mechanism
+ * (UMsgs).
+ *
+ * @param[in] handle Handle to previously opened accelerator resource
+ * @param[in] value Value to use for UMsg
+ * @returns FPGA_OK on success. 
+ * FPGA_INVALID_PARAM if input parameter combination + * is not valid. + * FPGA_EXCEPTION if input parameter fpga handle is not + * valid. + */ +__FPGA_API__ fpga_result fpgaTriggerUmsg(fpga_handle handle, uint64_t value); + +/** + * Access UMsg memory directly + * + * This function will return a pointer to the memory allocated for low latency + * accelerator notifications (UMsgs). + * @note This call is only supported by hardware targets, not by ASE + * simulation. Use fpgaTriggerUmsg() if you need ASE simulation capabilities. + * + * @param[in] handle Handle to previously opened accelerator resource + * @param[out] umsg_ptr Pointer to memory where a pointer to the virtual + * address space will be returned + * @returns FPGA_OK on success. + * FPGA_INVALID_PARAM if input parameter combination + * is not valid. + * FPGA_EXCEPTION if input parameter fpga handle is not + * valid. + * FPGA_NO_MEMORY if memory allocation fails or system + * doesn't configure huge pages. + */ +__FPGA_API__ fpga_result fpgaGetUmsgPtr(fpga_handle handle, uint64_t **umsg_ptr); + +END_C_DECL + +#endif // __FPGA_UMSG_H__ diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/utils.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/utils.h new file mode 100644 index 0000000..5b57cbd --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/utils.h @@ -0,0 +1,54 @@ +// Copyright(c) 2017 - 2019, Intel Corporation +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// * Neither the name of Intel Corporation nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. + +/** + * \file utils.h + * \brief Utility functions and macros for the FPGA API + */ + +#ifndef __FPGA_UTILS_H__ +#define __FPGA_UTILS_H__ + +#include <opae/types.h> +#include <stdio.h> + +BEGIN_C_DECL + +/** + * Return human-readable error message + * + * Returns a pointer to a human-readable error message corresponding to the + * provided fpga_error error code. 
+ *
+ * @param[in] e Error code (as returned by another FPGA API function)
+ * @returns Pointer to a descriptive error message string
+ */
+__FPGA_API__ const char *fpgaErrStr(fpga_result e);
+
+END_C_DECL
+
+#endif // __FPGA_UTILS_H__
+
diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/version.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/version.h
new file mode 100644
index 0000000..66bd18b
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/opae/version.h
@@ -0,0 +1,79 @@
+// Copyright(c) 2017 - 2019, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+// * Neither the name of Intel Corporation nor the names of its contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. + +#ifndef __FPGA_VERSION_H__ +#define __FPGA_VERSION_H__ + +#include <opae/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Get version information about the OPAE library + * + * Retrieve major version, minor version, and revision information about the + * OPAE library. + * + * @param[out] version FPGA version + * @returns FPGA_INVALID_PARAM if any of the output parameters is NULL, FPGA_OK + * otherwise. + */ +__FPGA_API__ fpga_result fpgaGetOPAECVersion(fpga_version *version); + +/** + * Get version information about the OPAE library as a string + * + * Retrieve major version, minor version, and revision information about the + * OPAE library, encoded in a human-readable string (e.g. "1.0.0"). + * + * @param[out] version_str String to copy version information into + * @param[in] len Length of `version_str` + * @returns FPGA_INVALID_PARAM if `version_str` is NULL, FPGA_EXCEPTION if the + * version string cannot be copied into `version_str`, FPGA_OK otherwise. + */ +__FPGA_API__ fpga_result fpgaGetOPAECVersionString(char *version_str, size_t len); +#define FPGA_VERSION_STR_MAX 10 + +/** + * Get build information about the OPAE library as a string + * + * Retrieve the build identifier of the OPAE library. 
+ * + * @param[out] build_str String to copy build information into + * @param[in] len Length of `build_str` + * @returns FPGA_INVALID_PARAM if `build_str` is NULL, FPGA_EXCEPTION if the + * version string cannot be copied into `build_str`, FPGA_OK otherwise. + */ +__FPGA_API__ fpga_result fpgaGetOPAECBuildString(char *build_str, size_t len); +#define FPGA_BUILD_STR_MAX 41 + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + +#endif // __FPGA_VERSION_H__ diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/pkg_editor.h b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/pkg_editor.h new file mode 100644 index 0000000..27f4f1e --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/include/pkg_editor.h @@ -0,0 +1,170 @@ +/* Editor for Altera OpenCL package files + * + * Dmitry Denisenko, June 2012. + * + * This provides higher-level functions for ELF work. + * The idea is to put content into sections, one "piece" of content + * per section, and use section names to identify the content. + * The interface enforces unique section names (not true for generic ELFs) + * and hides all the ugly ELF interface calls and structures. + */ + +#ifndef PKG_FILE_EDITOR_H +#define PKG_FILE_EDITOR_H + +#ifdef __cplusplus +extern "C" { +#endif + +#define MAX_STRING_LENGTH 100000 + +/* Modes for open_struct acl_pkg_file() call. + * Exactly one of ACL_PKG_READ, ACL_PKG_READ_WRITE must be supplied. + * Other flags may be bitwise OR'd into the mode. + * + * You can combine other modes with ACL_PKG_SHOW_* to control messages. + */ +#define ACL_PKG_READ (1<<0) /* Only reading the package */ +#define ACL_PKG_READ_WRITE (1<<1) /* Expect to read and write the binary. File must already exist. */ +#define ACL_PKG_CREATE (1<<2) /* Also creating. 
Can only be used with ACL_PKG_READ_WRITE */ + +#define ACL_PKG_SHOW_ERROR (1<<8) /*print errors to stderr*/ +#define ACL_PKG_SHOW_INFO (1<<9) /*print info messages to stdout*/ + +#define ACL_PKG_SECTION_ACL_VERSION ".acl.version" +#define ACL_PKG_SECTION_ACL_BUILD ".acl.build" +#define ACL_PKG_SECTION_QVERSION ".acl.qversion" +#define ACL_PKG_SECTION_HASH ".acl.hash" +#define ACL_PKG_SECTION_BOARD ".acl.board" +#define ACL_PKG_SECTION_COMPILEOPTIONS ".acl.compileoptions" +#define ACL_PKG_SECTION_SOURCE ".acl.source" +#define ACL_PKG_SECTION_LLVMIR ".acl.llvmir" +#define ACL_PKG_SECTION_VERILOG ".acl.verilog" +#define ACL_PKG_SECTION_PROFILE_BASE ".acl.profile_base" +#define ACL_PKG_SECTION_AUTODISCOVERY ".acl.autodiscovery" +#define ACL_PKG_SECTION_RBF ".acl.rbf" +#define ACL_PKG_SECTION_CORE_RBF ".acl.core.rbf" +#define ACL_PKG_SECTION_PERIPH_RBF ".acl.periph.rbf" +#define ACL_PKG_SECTION_BASE_RBF ".acl.base_revision.rbf" +#define ACL_PKG_SECTION_SOF ".acl.sof" +#define ACL_PKG_SECTION_VFABRIC ".acl.vfabric" +#define ACL_PKG_SECTION_PLL_CONFIG ".acl.pll_config" +#define ACL_PKG_SECTION_FPGA_BIN ".acl.fpga.bin" +#define ACL_PKG_SECTION_EMULATOR_OBJ_LINUX ".acl.emulator_object.linux" +#define ACL_PKG_SECTION_EMULATOR_OBJ_WINDOWS ".acl.emulator_object.windows" +#define ACL_PKG_SECTION_AUTODISCOVERY_XML ".acl.autodiscovery.xml" +#define ACL_PKG_SECTION_BOARDSPEC_XML ".acl.board_spec.xml" +#define ACL_PKG_SECTION_PERIPH_HASH ".acl.periph.hash" +#define ACL_PKG_SECTION_PROFILER_XML ".acl.profiler.xml" +#define ACL_PKG_SECTION_COMPILE_REV ".acl.compile_revision" +#define ACL_PKG_SECTION_PCIE_DEV_ID ".acl.pcie.dev_id" +#define ACL_PKG_SECTION_BASE_PERIPH_HASH ".acl.base_revision.periph.hash" +#define ACL_PKG_SECTION_ADJUST_PLLS_OUTPUT ".acl.quartus_report" +#define ACL_PKG_SECTION_KERNEL_ARG_INFO_XML ".acl.kernel_arg_info.xml" +#define ACL_PKG_SECTION_FAST_COMPILE ".acl.fast_compile" + +/* Minimum alignment in memory. 
*/ +#define ACL_PKG_MIN_SECTION_ALIGNMENT 128 + +/* Open and close the pkg file */ +struct acl_pkg_file *acl_pkg_open_file (const char *fname, int mode); +/* You can call close on a NULL pointer: it will do nothing. + * Closing the package file will also free its memory, so you better lose + * the pointer reference. + */ +int acl_pkg_close_file (struct acl_pkg_file *pkg); + +/* Set message output mode: show_mode is some combination of the bits + * in ACL_PKG_SHOW_INFO and ACL_PKG_SHOW_ERROR + */ +void acl_pkg_set_show_mode( struct acl_pkg_file* pkg, int show_mode ); + +/* Open memory image of pkg file. Only good for reading! + * The show_mode argument is an OR combination of zero or more of + * ACL_PKG_SHOW_INFO, + * ACL_PKG_SHOW_ERROR. + */ +struct acl_pkg_file *acl_pkg_open_file_from_memory (char *pkg_image, size_t pkg_image_size, int show_mode); + + +/* Does the given named section exist? + * Returns 1 for yes, 0 for no. + * If the section exists, and size_ret is not-NULL, then the size (in bytes) of the + * section is stored into *size_ret. The size does NOT include NULL terminator, just like strlen(). + */ +int acl_pkg_section_exists (const struct acl_pkg_file *pkg, const char *sect_name, size_t* size_ret); + +/* Return list of ALL (useful) section names in the package. + * The buffer must be pre-allocated by the caller upto max_len bytes. + * Each section name is separated by '\n' + * Returns 1 on success, 0 on failure. + */ +int acl_pkg_section_names (const struct acl_pkg_file *pkg, char *buf, size_t max_len); + + +/* Add a new section with specified content. + * If a section with such name already exists, nothing is done. + * Returns 0 on failure, non-zero on success. + */ +int acl_pkg_add_data_section (struct acl_pkg_file *pkg, const char *sect_name, const void* content, size_t len); +int acl_pkg_add_data_section_from_file (struct acl_pkg_file *pkg, const char *sect_name, const char *in_file); + +/* Read content of an existing section. 
+ * For read_section(), the buffer must be pre-allocated by caller to hold at least len bytes. + * This function will add '\0' at the end, therefore, the 'len' argument passed to this function + * must be one larger than the value returned by acl_pkg_section_exists. + * Returns 0 on failure, non-zero on success. + */ +int acl_pkg_read_section (const struct acl_pkg_file *pkg, const char *sect_name, char *buf, size_t len); +int acl_pkg_read_section_into_file (struct acl_pkg_file *pkg, const char *sect_name, const char *out_file); + +/* Get a transient pointer to a section's data, via buf_ptr. + * The pointer is transient: It might move if you update the package in any way. + * This is a "fast" path in comparison to acl_pkg_read_section, so you + * don't have to allocate space to copy into. + * Returns 0 on failure, non-zero on success. + */ +int acl_pkg_read_section_transient(const struct acl_pkg_file *pkg, const char *sect_name, char** buf_ptr); + +/* Update content of an existing section. + * Old content is discarded. The section must already exist. + * Returns 0 on failure, non-zero on success. + */ +int acl_pkg_update_section (struct acl_pkg_file *pkg, const char *sect_name, const void *new_content, size_t new_len); +int acl_pkg_update_section_from_file (struct acl_pkg_file *pkg, const char *sect_name, const char *in_file); + +/* List all pkg sections to stdout. + * Returns 0 on failure, non-zero on success. + */ +int acl_pkg_list_file_sections (struct acl_pkg_file *pkg); + +/* Read full content of file into a buffer. + * The buffer is allocated by this function but must be freed by the caller. + * File length is returned in the second argument */ +void *acl_pkg_read_file_into_buffer (const char *in_file, size_t *file_size_out); + +/* support for package/unpackage */ + +/* Package the input files and directory trees (NULL terminated list in input_files_dirs) + * and put them into the output file (out_file). 
+ * Returns 0 on failure, non-zero on success + */ +int acl_pkg_pack (const char* out_file, const char** input_files_dirs); + +/* Unpack the input file (or stdin if filename is ACL_PKG_UNPACKAGE_STDIN) + * created by acl_pkg_pack into directory out_dir. + * Returns 0 on failure, non-zero on success + */ +#define ACL_PKG_UNPACKAGE_STDIN "-" +int acl_pkg_unpack (const char* in_file, const char* out_dir); + +/* Unpack the buffer created by acl_pkg_pack into directory out_dir. + * Returns 0 on failure, non-zero on success + */ +int acl_pkg_unpack_buffer (const char* buffer, size_t buffer_size, const char* out_dir); + +#ifdef __cplusplus +} +#endif + +#endif /* PKG_FILE_EDITOR_H */ diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/lib/win64/FpgaLib.lib b/python/openvino/runtime/coredla_device/mmd/de10_agilex/lib/win64/FpgaLib.lib Binary files differnew file mode 100755 index 0000000..2f26b62 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/lib/win64/FpgaLib.lib diff --git a/python/openvino/runtime/coredla_device/mmd/de10_agilex/lib/win64/acl_check_sys_cmd.lib b/python/openvino/runtime/coredla_device/mmd/de10_agilex/lib/win64/acl_check_sys_cmd.lib Binary files differnew file mode 100755 index 0000000..6c7f423 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/de10_agilex/lib/win64/acl_check_sys_cmd.lib diff --git a/python/openvino/runtime/coredla_device/mmd/hps_platform/.gitignore b/python/openvino/runtime/coredla_device/mmd/hps_platform/.gitignore new file mode 100644 index 0000000..0948b39 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/hps_platform/.gitignore @@ -0,0 +1,20 @@ +*~ +*# +*.marks +release_build/ +build/ +example_designs/mem_bandwidth/bin/ +example_designs/mem_bandwidth/simulation.tar.gz +example_designs/mem_bandwidth/temp_simulation/ +linux64/lib/ +linux64/libexec/diagnose +linux64/libexec/program +ase/mpf_src +*.pyc +*.swp +*.kwlp +*.kwps +temp_simulation/ +simulation.tar.gz + +backup 
diff --git a/python/openvino/runtime/coredla_device/mmd/hps_platform/CMakeLists.txt b/python/openvino/runtime/coredla_device/mmd/hps_platform/CMakeLists.txt new file mode 100644 index 0000000..d8bf50d --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/hps_platform/CMakeLists.txt @@ -0,0 +1,59 @@ +# (C) 2017 Intel Corporation. All rights reserved. +# Your use of Intel Corporation's design tools, logic functions and other +# software and tools, and its AMPP partner logic functions, and any output +# files any of the foregoing (including device programming or simulation +# files), and any associated documentation or information are expressly subject +# to the terms and conditions of the Intel Program License Subscription +# Agreement, Intel MegaCore Function License Agreement, or other applicable +# license agreement, including, without limitation, that your use is for the +# sole purpose of programming logic devices manufactured by Intel and sold by +# Intel or its authorized distributors. Please refer to the applicable +# agreement for further details. 
+ +cmake_minimum_required(VERSION 2.8.12) +project(mmd) + +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules") + +# DLA specific modifications made to the MMD +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDLA_MMD") + +# Select PCIE Gen3 x8 +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGEN3_x8") + +# from the opencl makefile +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DKERNEL_64BIT -DOPTION3=1 -DACL_USE_DMA=1 -DACL_COMPILER_IS_MSVC=0 -Wall -Wno-unknown-pragmas -DACL_HAS_STDLIB_STDIO") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fstack-protector -Wformat -Wformat-security -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -DACL_HOST_RUNTIME_IS_STATIC=0") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DACL_OPENCL_HOST_SYS=linux -DACL_OPENCL_HOST_BIT=64 -DACL_TARGET_SYS=linux -DACL_TARGET_BIT=64 -DLINUX -DACL_MAX_DEVICE=128") + +set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -D_FORTIFY_SOURCE=2 -O3") +enable_language(C ASM) + +set(ASM_OPTIONS "-x assembler-with-cpp") +if(${CMAKE_C_COMPILER_ID} STREQUAL "Clang") + set(ASM_OPTIONS "${ASM_OPTIONS} -no-integrated-as") +endif() + +set(CMAKE_ASM_FLAGS "${CFLAGS} ${ASM_OPTIONS}") + +if(RUNTIME_POLLING) + add_definitions(-DRUNTIME_POLLING) +endif(RUNTIME_POLLING) + +set(MMD_SRC + ./host/acl_hps.cpp + ./host/mmd_device.cpp + ./host/dma_device.cpp + ./host/uio_device.cpp +) + +add_library(hps_platform_mmd SHARED ${MMD_SRC}) + +target_include_directories(hps_platform_mmd PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include) + +target_link_libraries(hps_platform_mmd) + +install(TARGETS hps_platform_mmd + LIBRARY DESTINATION lib + COMPONENT hps_platform_mmd +) diff --git a/python/openvino/runtime/coredla_device/mmd/hps_platform/host/acl_hps.cpp b/python/openvino/runtime/coredla_device/mmd/hps_platform/host/acl_hps.cpp new file mode 100644 index 0000000..53055ef --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/hps_platform/host/acl_hps.cpp @@ -0,0 +1,473 @@ +// (c) 1992-2021 Intel Corporation. 
+// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words +// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. +// and/or other countries. Other marks and brands may be claimed as the property +// of others. See Trademarks on intel.com for full list of Intel trademarks or +// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) +// Your use of Intel Corporation's design tools, logic functions and other +// software and tools, and its AMPP partner logic functions, and any output +// files any of the foregoing (including device programming or simulation +// files), and any associated documentation or information are expressly subject +// to the terms and conditions of the Altera Program License Subscription +// Agreement, Intel MegaCore Function License Agreement, or other applicable +// license agreement, including, without limitation, that your use is for the +// sole purpose of programming logic devices manufactured by Intel and sold by +// Intel or its authorized distributors. Please refer to the applicable +// agreement for further details. 
+ +/* ===- HPS.cpp ------------------------------------------------- C++ -*-=== */ +/* */ +/* Intel(R) HPS MMD Driver */ +/* */ +/* ===-------------------------------------------------------------------------=== */ +/* */ +/* This file implements the functions that are defined in aocl_mmd.h */ +/* */ +/* ===-------------------------------------------------------------------------=== */ + +// common and its own header files +#include "acl_hps.h" + +// other standard header files +#include <stdarg.h> +#include <stdlib.h> +#include <string.h> + +#include <memory> +#include <map> +#include <sstream> +#include <string> +#include <utility> + +#include "mmd_device.h" + +#ifdef DLA_MMD +#include <chrono> +#include <thread> +#endif + +#if defined(LINUX) +#include <fcntl.h> +#include <semaphore.h> +#include <signal.h> +#include <unistd.h> +#endif // LINUX + +#define MAX_HPS_FPGA_DEVICES (1) + +// MAX size of line read from pipe-ing the output of system call to MMD +#define BUF_SIZE 1024 +// MAX size of command passed to system for invoking system call from MMD +#define SYSTEM_CMD_SIZE 4 * 1024 + +#ifndef DLA_MMD +// static helper functions +static bool blob_has_elf_signature(void *data, size_t data_size); +#endif + + +// Function to return the number of boards installed in the system +unsigned int get_offline_num_boards() { + board_names names = mmd_get_devices(MAX_HPS_FPGA_DEVICES); + return (unsigned int)names.size(); +} + +// Get information about the board using the enum aocl_mmd_offline_info_t for +// offline info (called without a handle), and the enum aocl_mmd_info_t for +// info specific to a certain board. 
+#define RESULT_INT(X) \ + { \ + *((int *)param_value) = X; \ + if (param_size_ret) *param_size_ret = sizeof(int); \ + } +#define RESULT_UNSIGNED(X) \ + { \ + *((unsigned *)param_value) = X; \ + if (param_size_ret) *param_size_ret = sizeof(unsigned); \ + } +#define RESULT_SIZE_T(X) \ + { \ + *((size_t *)param_value) = X; \ + if (param_size_ret) *param_size_ret = sizeof(size_t); \ + } +#if defined(WINDOWS) +#define RESULT_STR(X) \ + do { \ + size_t Xlen = strnlen(X, MAX_NAME_SIZE) + 1; \ + memcpy_s((void *)param_value, param_value_size, X, (param_value_size <= Xlen) ? param_value_size : Xlen); \ + if (param_size_ret) *param_size_ret = Xlen; \ + } while (0) +#else +#define RESULT_STR(X) \ + do { \ + size_t Xlen = strnlen(X, MAX_NAME_SIZE) + 1; \ + memcpy((void *)param_value, X, (param_value_size <= Xlen) ? param_value_size : Xlen); \ + if (param_size_ret) *param_size_ret = Xlen; \ + } while (0) +#endif +#define ACL_VENDOR_NAME "Intel" +int aocl_mmd_get_offline_info(aocl_mmd_offline_info_t requested_info_id, + size_t param_value_size, + void *param_value, + size_t *param_size_ret) { + unsigned int num_boards; + switch (requested_info_id) { + case AOCL_MMD_VERSION: + RESULT_STR(MMD_VERSION); + break; + case AOCL_MMD_NUM_BOARDS: { + num_boards = MAX_HPS_FPGA_DEVICES; + RESULT_INT((int)num_boards); + break; + } + case AOCL_MMD_BOARD_NAMES: { + // Retrieve all the CoreDLA cores in the system + board_names names = mmd_get_devices(MAX_HPS_FPGA_DEVICES); + // Construct a list of all possible devices supported by this MMD layer + std::ostringstream board; + auto name = names.begin(); + while(name != names.end() ) + { + board << *name; + name++; + if( name != names.end() ) + { + board << ";"; + } + } + + RESULT_STR(board.str().c_str()); + break; + } + case AOCL_MMD_VENDOR_NAME: { + RESULT_STR(ACL_VENDOR_NAME); + break; + } + case AOCL_MMD_VENDOR_ID: + RESULT_INT(0); + break; + case AOCL_MMD_USES_YIELD: + RESULT_INT(0); /* TODO: Can we yield? 
*/ + break; + case AOCL_MMD_MEM_TYPES_SUPPORTED: + RESULT_INT(AOCL_MMD_PHYSICAL_MEMORY); /* TODO: Confirm this is the right memory type */ + break; + } + return 0; +} + +// If the MMD is loaded dynamically, destructors in the MMD will execute before the destructors in the runtime +// upon program termination. The DeviceMapManager guards accesses to the device/handle maps to make sure +// the runtime doesn't get to reference them after MMD destructors have been called. +// Destructor makes sure that all devices are closed at program termination regardless of what the runtime does. +// Implemented as a singleton. +class DeviceMapManager final { +public: + typedef std::map<int, mmd_device_ptr> map_handle_to_dev_t; + ~DeviceMapManager() + { + } + + int add_device(const char *name) + { + int handle = idx++; + + mmd_device_ptr spDevice = std::make_shared<mmd_device>(name, handle); + if( spDevice->bValid() ) + { + auto it = handle_to_dev.find(handle); + HPS_ERROR_IF( it != handle_to_dev.end(), return FAILURE, "Error: Handle already used.\n" ); + handle_to_dev.insert({handle, spDevice}); + return handle; + } + return FAILURE; + } + + mmd_device_ptr get_device(const int handle) + { + auto it = handle_to_dev.find(handle); + HPS_ERROR_IF( it == handle_to_dev.end(), return nullptr, "Error: Invalid handle.\n" ); + return it->second; + } + + bool remove_device(const int handle) + { + auto it = handle_to_dev.find(handle); + HPS_ERROR_IF( it == handle_to_dev.end(), return false, "Error: Handle does not exist.\n" ); + handle_to_dev.erase(it); + return true; + } + + DeviceMapManager() + { + } +private: + map_handle_to_dev_t handle_to_dev = {}; + int idx = {0}; +}; +static DeviceMapManager _gDeviceMapManager; + +int aocl_mmd_get_info( + int handle, aocl_mmd_info_t requested_info_id, size_t param_value_size, void *param_value, size_t *param_size_ret) { + HPS_ERROR_IF(true, + return FAILURE, + "aocl_mmd_get_info not supported on platform. 
\n"); +} + +#undef RESULT_INT +#undef RESULT_STR + + +// Open and initialize the named device. +int AOCL_MMD_CALL aocl_mmd_open(const char *name) { + return _gDeviceMapManager.add_device(name); +} + +// Close an opened device, by its handle. +int AOCL_MMD_CALL aocl_mmd_close(int handle) { + if ( _gDeviceMapManager.remove_device(handle) ) + return SUCCESS; + return FAILURE; +} + +// Set the interrupt handler for the opened device. +int AOCL_MMD_CALL aocl_mmd_set_interrupt_handler(int handle, aocl_mmd_interrupt_handler_fn fn, void *user_data) { + mmd_device_ptr spDevice = _gDeviceMapManager.get_device(handle); + if( nullptr == spDevice ) { + return FAILURE; + } + return spDevice->set_interrupt_handler(fn, user_data); +} + +// Set the device interrupt handler for the opened device. +int AOCL_MMD_CALL aocl_mmd_set_device_interrupt_handler(int handle, + aocl_mmd_device_interrupt_handler_fn fn, + void *user_data) { + printf("%s:%s:%d\n", __FILE__, __PRETTY_FUNCTION__, __LINE__); + return -1; +} + +// Set the operation status handler for the opened device. +int AOCL_MMD_CALL aocl_mmd_set_status_handler(int handle, aocl_mmd_status_handler_fn fn, void *user_data) { + printf("%s:%s:%d\n", __FILE__, __PRETTY_FUNCTION__, __LINE__); + return -1; +} + +// Called when the host is idle and hence possibly waiting for events to be +// processed by the device +int AOCL_MMD_CALL aocl_mmd_yield(int handle) +{ + printf("%s:%s:%d\n", __FILE__, __PRETTY_FUNCTION__, __LINE__); + return -1; +} + +// Read, write and copy operations on a single interface. 
+int AOCL_MMD_CALL aocl_mmd_read(int handle, aocl_mmd_op_t op, size_t len, void *dst, int mmd_interface, size_t offset) { + mmd_device_ptr spDevice = _gDeviceMapManager.get_device(handle); + if( nullptr == spDevice ) { + return FAILURE; + } + return spDevice->read_block(op, mmd_interface, dst, offset, len); +} + +int AOCL_MMD_CALL +aocl_mmd_write(int handle, aocl_mmd_op_t op, size_t len, const void *src, int mmd_interface, size_t offset) { + mmd_device_ptr spDevice = _gDeviceMapManager.get_device(handle); + if( nullptr == spDevice ) { + return FAILURE; + } + return spDevice->write_block(op, mmd_interface, src, offset, len); +} + +int AOCL_MMD_CALL +aocl_mmd_copy(int handle, aocl_mmd_op_t op, size_t len, int mmd_interface, size_t src_offset, size_t dst_offset) { + printf("%s:%s:%d\n", __FILE__, __PRETTY_FUNCTION__, __LINE__); + /* Not called by CoreDLA, so not implementing */ + return -1; +} + +// Initialize host channel specified in channel_name +int AOCL_MMD_CALL aocl_mmd_hostchannel_create(int handle, char *channel_name, size_t queue_depth, int direction) { + printf("%s:%s:%d\n", __FILE__, __PRETTY_FUNCTION__, __LINE__); + /* Not called by CoreDLA, so not implementing */ + return -1; +} + +// reset the host channel specified with channel handle +int AOCL_MMD_CALL aocl_mmd_hostchannel_destroy(int handle, int channel) { + printf("%s:%s:%d\n", __FILE__, __PRETTY_FUNCTION__, __LINE__); + /* Not called by CoreDLA, so not implementing */ + return -1; +} + +// Get the pointer to buffer the user can write/read from the kernel with +AOCL_MMD_CALL void *aocl_mmd_hostchannel_get_buffer(int handle, int channel, size_t *buffer_size, int *status) { + printf("%s:%s:%d\n", __FILE__, __PRETTY_FUNCTION__, __LINE__); + /* Not called by CoreDLA, so not implementing */ + return NULL; +} + +// Acknolwedge from the user that they have written/read send_size amount of buffer obtained from get_buffer +size_t AOCL_MMD_CALL aocl_mmd_hostchannel_ack_buffer(int handle, int channel, size_t 
send_size, int *status) { + printf("%s:%s:%d\n", __FILE__, __PRETTY_FUNCTION__, __LINE__); + /* Not called by CoreDLA, so not implementing */ + return -1; +} + +#ifdef DLA_MMD +// Reprogram the device given the sof file name +int AOCL_MMD_CALL aocl_mmd_program_sof(int handle, const char *sof_filename) { + printf("%s:%s:%d\n", __FILE__, __PRETTY_FUNCTION__, __LINE__); + /* We don't support reprogramming the SOF on a HPS device */ + return -1; +} +#else +// Reprogram the device based on the program mode +int AOCL_MMD_CALL aocl_mmd_program(int handle, void *data, size_t data_size, aocl_mmd_program_mode_t program_mode) { + printf("%s:%s:%d\n", __FILE__, __PRETTY_FUNCTION__, __LINE__); + /* We don't support reprogramming the SOF on a HPS device */ + return -1; +} +#endif +// Shared memory allocator +AOCL_MMD_CALL void *aocl_mmd_shared_mem_alloc(int handle, size_t size, unsigned long long *device_ptr_out) { + printf("%s:%s:%d\n", __FILE__, __PRETTY_FUNCTION__, __LINE__); + /* Not called by CoreDLA, so not implementing */ + return NULL; +} + +// Shared memory de-allocator +AOCL_MMD_CALL void aocl_mmd_shared_mem_free(int handle, void *host_ptr, size_t size) { + printf("%s:%s:%d\n", __FILE__, __PRETTY_FUNCTION__, __LINE__); + /* Not called by CoreDLA, so not implementing */ + return; +} + +#ifndef DLA_MMD +// This function checks if the input data has an ELF-formatted blob. +// Return true when it does. +static bool blob_has_elf_signature(void *data, size_t data_size) { + bool result = false; + printf("%s:%s:%d\n", __FILE__, __PRETTY_FUNCTION__, __LINE__); + if (data && data_size > 4) { + unsigned char *cdata = (unsigned char *)data; + const unsigned char elf_signature[4] = {0177, 'E', 'L', 'F'}; // Little endian + result = (cdata[0] == elf_signature[0]) && (cdata[1] == elf_signature[1]) && (cdata[2] == elf_signature[2]) && + (cdata[3] == elf_signature[3]); + } + return result; +} +#endif + +// Return a positive number when single device open. 
Otherwise, return -1 +AOCL_MMD_CALL int get_open_handle() { + printf("%s:%s:%d\n", __FILE__, __PRETTY_FUNCTION__, __LINE__); + return -1; +} + +AOCL_MMD_CALL void *aocl_mmd_host_alloc(int *handles, + size_t num_devices, + size_t size, + size_t alignment, + aocl_mmd_mem_properties_t *properties, + int *error) { + printf("%s:%s:%d\n", __FILE__, __PRETTY_FUNCTION__, __LINE__); + // Not supported on this BSP + return NULL; +} + +AOCL_MMD_CALL int aocl_mmd_free(void *mem) { + printf("%s:%s:%d\n", __FILE__, __PRETTY_FUNCTION__, __LINE__); + // Not supported on this BSP + return 0; +} + +AOCL_MMD_CALL void *aocl_mmd_device_alloc( + int handle, size_t size, size_t alignment, aocl_mmd_mem_properties_t *properties, int *error) { + printf("%s:%s:%d\n", __FILE__, __PRETTY_FUNCTION__, __LINE__); + // Not supported on this BSP + return NULL; +} + +AOCL_MMD_CALL void *aocl_mmd_shared_alloc( + int handle, size_t size, size_t alignment, aocl_mmd_mem_properties_t *properties, int *error) { + printf("%s:%s:%d\n", __FILE__, __PRETTY_FUNCTION__, __LINE__); + // Not supported on this BSP + return NULL; +} + +AOCL_MMD_CALL int aocl_mmd_shared_migrate(int handle, void *shared_ptr, size_t size, aocl_mmd_migrate_t destination) { + printf("%s:%s:%d\n", __FILE__, __PRETTY_FUNCTION__, __LINE__); + // Not supported on this BSP + return 0; +} + +#ifdef DLA_MMD +// Query functions to get board-specific values +AOCL_MMD_CALL int dla_mmd_get_max_num_instances() +{ + return 1; +} + +AOCL_MMD_CALL uint64_t dla_mmd_get_ddr_size_per_instance() { + return 1ULL << 29; +} + +// AGX7 HPS board uses 333.3325 MHz (1333.33/4) for the DLA DDR Clock +// All other boards use 266.666666 MHz (1066.66666/4) +AOCL_MMD_CALL double dla_mmd_get_ddr_clock_freq() { +#ifdef HPS_AGX7 + return 333.332500; +#else + return 266.666666; +#endif +} // MHz + +// Helper functions for the wrapper functions around CSR and DDR +uint64_t dla_get_raw_csr_address(int instance, uint64_t addr) { + return (0x1000 * instance) + addr; +} 
+uint64_t dla_get_raw_ddr_address(int instance, uint64_t addr) { + return addr; +} + +// Wrappers around CSR and DDR reads and writes to abstract away board-specific offsets +AOCL_MMD_CALL int dla_mmd_csr_write(int handle, int instance, uint64_t addr, const uint32_t *data) { + return aocl_mmd_write( + handle, NULL, sizeof(uint32_t), data, HPS_MMD_COREDLA_CSR_HANDLE, dla_get_raw_csr_address(instance, addr)); +} +AOCL_MMD_CALL int dla_mmd_csr_read(int handle, int instance, uint64_t addr, uint32_t *data) { + return aocl_mmd_read( + handle, NULL, sizeof(uint32_t), data, HPS_MMD_COREDLA_CSR_HANDLE, dla_get_raw_csr_address(instance, addr)); +} +AOCL_MMD_CALL int dla_mmd_ddr_write(int handle, int instance, uint64_t addr, uint64_t length, const void *data) { + return aocl_mmd_write(handle, NULL, length, data, HPS_MMD_MEMORY_HANDLE, dla_get_raw_ddr_address(instance, addr)); +} +AOCL_MMD_CALL int dla_mmd_ddr_read(int handle, int instance, uint64_t addr, uint64_t length, void *data) { + return aocl_mmd_read(handle, NULL, length, data, HPS_MMD_MEMORY_HANDLE, dla_get_raw_ddr_address(instance, addr)); +} + +#ifdef STREAM_CONTROLLER_ACCESS +AOCL_MMD_CALL bool dla_is_stream_controller_valid(int handle, int instance) { + mmd_device_ptr spDevice = _gDeviceMapManager.get_device(handle); + if( nullptr == spDevice ) { + return FAILURE; + } + return spDevice->bStreamControllerValid(); +} + +AOCL_MMD_CALL int dla_mmd_stream_controller_write(int handle, int instance, uint64_t addr, uint64_t length, const void *data) { + return aocl_mmd_write(handle, NULL, length, data, HPS_MMD_STREAM_CONTROLLER_HANDLE, addr); +} + +AOCL_MMD_CALL int dla_mmd_stream_controller_read(int handle, int instance, uint64_t addr, uint64_t length, void* data) { + return aocl_mmd_read( + handle, NULL, length, data, HPS_MMD_STREAM_CONTROLLER_HANDLE, addr); +} +#endif + +AOCL_MMD_CALL double dla_mmd_get_coredla_clock_freq(int handle) { + return 200; +} + +#endif diff --git 
a/python/openvino/runtime/coredla_device/mmd/hps_platform/host/acl_hps.h b/python/openvino/runtime/coredla_device/mmd/hps_platform/host/acl_hps.h new file mode 100644 index 0000000..7c85a24 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/hps_platform/host/acl_hps.h @@ -0,0 +1,111 @@ +#ifndef ACL_HPS_H +#define ACL_HPS_H + +/* (c) 1992-2021 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. 
*/ + +/* ===- acl_hps.h --------------------------------------------------- C++ -*-=== */ +/* */ +/* Intel(R) HPS MMD Driver */ +/* */ +/* ===-------------------------------------------------------------------------=== */ +/* */ +/* This file defines macros and types that are used inside the MMD driver */ +/* */ +/* ===-------------------------------------------------------------------------=== */ + +#ifndef ACL_HPS_EXPORT +#define ACL_HPS_EXPORT __declspec(dllimport) +#endif + +#define MMD_VERSION AOCL_MMD_VERSION_STRING + +#include <assert.h> +#include <stddef.h> +#include <stdio.h> +#ifdef DLA_MMD +#include <cstdint> +#endif +#include "aocl_mmd.h" + +#include "hps_types.h" + +#if defined(WINDOWS) +#error Currently not available for windows +#endif + +#if defined(LINUX) +typedef uintptr_t KPTR; +typedef int fpga_handle; +typedef unsigned int fpga_result; +#define FPGA_OK 0 + +typedef unsigned int DWORD; +typedef unsigned long long QWORD; +typedef char INT8; +typedef unsigned char UINT8; +typedef int16_t INT16; +typedef uint16_t UINT16; +typedef int INT32; +typedef unsigned int UINT32; +typedef long long INT64; +typedef unsigned long long UINT64; + +#define INVALID_HANDLE_VALUE ((int)(-1)) + +#define INVALID_DEVICE (-1) +#define WD_STATUS_SUCCESS 0 + +// define for the format string for DWORD type +#define DWORD_FMT_U "%u" +#define DWORD_FMT_X "%x" +#define DWORD_FMT_4X "%04X" + +// define for the format string for size_t type +#define SIZE_FMT_U "%zu" +#define SIZE_FMT_X "%zx" + +#endif // LINUX + +#define MAX_NAME_SIZE (1204) + +#define HPS_ASSERT(COND, ...) \ + do { \ + if (!(COND)) { \ + printf("\nMMD FATAL: %s:%d: ", __FILE__, __LINE__); \ + printf(__VA_ARGS__); \ + fflush(stdout); \ + assert(0); \ + } \ + } while (0) + +#define HPS_ERROR_IF(COND, NEXT, ...) \ + do { \ + if (COND) { \ + printf("\nMMD ERROR: " __VA_ARGS__); \ + fflush(stdout); \ + NEXT; \ + } \ + } while (0) + +#define HPS_INFO(...) 
\ + do { \ + printf("MMD INFO : " __VA_ARGS__); \ + fflush(stdout); \ + } while (0) + +#endif // ACL_HPS_H diff --git a/python/openvino/runtime/coredla_device/mmd/hps_platform/host/dma_device.cpp b/python/openvino/runtime/coredla_device/mmd/hps_platform/host/dma_device.cpp new file mode 100644 index 0000000..e403823 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/hps_platform/host/dma_device.cpp @@ -0,0 +1,120 @@ +/* (c) 1992-2021 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. 
*/ + +/* ===- dma_device.h ------------------------------------------------- C++ -*-=== */ +/* */ +/* dma device access functions */ +/* */ +/* ===-------------------------------------------------------------------------=== */ +/* */ +/* This file implements the functions used access the dma device objects */ +/* */ +/* ===-------------------------------------------------------------------------=== */ + +// common and its own header files +#include "dma_device.h" +#include <unistd.h> +#include <glob.h> +#include <sys/mman.h> +#include <fcntl.h> +#include <stdio.h> + +#include <memory.h> + +// Copied from Linux driver: /drivers/dma/altera-msgdma.c +#define MSGDMA_DESC_NUM 1024 + +// Same page size as used in /meta-intel-fpga-coredla/recipes-drivers/msgdma-userio/files/msgdma_userio_chr.c +#define PAGE_SIZE 4096 + +////////////////////////////////////////////////////// + +#define ERR(format, ...) \ +printf("%s:%u() **ERROR** : " format, \ + __func__, __LINE__, ##__VA_ARGS__) + +////////////////////////////////////////////////////// +dma_device::dma_device(std::string &name) +{ + _pFile = fopen(name.c_str(), "r+"); + if( _pFile == nullptr ) + { + ERR("dma_device::dma_device failed to open %s\n", name.c_str()); + return; + } + + // Turn off buffering + setvbuf(_pFile, NULL, _IONBF, 0); +} + +dma_device::~dma_device() +{ + if( _pFile ) + { + fclose(_pFile); + _pFile = NULL; + } +} + +int dma_device::read_block(void *host_addr, size_t offset, size_t size) +{ + // Use 32bit seek as DDR memory current < 32bits + if( fseek(_pFile, (uint32_t)offset, SEEK_SET) != 0 ) { + return FAILURE; + } + + size_t read_size = fread(host_addr, 1, size, _pFile); + return (read_size == size) ? SUCCESS : FAILURE; +} + +int dma_device::write_block(const void *host_addr, size_t offset, size_t size) +{ + // The MSGDMA driver only supports a maximum of 1024 x 4096 = 4MBytes in the worst case scenario, + // in the event that the virtual buffer is fully fragmented. 
As the buffer gets more fragmented it's + // possible to run out of DMA descriptors. To prevent this, slice the data into 4MB chunks. + + // chunk_size is chosen based on the size of a page (12 bits) and default number of descriptors (1024). + // The descriptor count is reduced by 1 since if the host_addr is not aligned to a page then an extra page + // will be added at the end. This would then increase the descriptor count by 1. + size_t chunk_size = PAGE_SIZE * (MSGDMA_DESC_NUM - 1); + size_t write_size = 0; + + // Use 32bit seek as DDR memory current < 32bits + if( fseek(_pFile, (uint32_t)offset, SEEK_SET) != 0 ) { + return FAILURE; + } + + for (size_t host_addr_offset = 0; host_addr_offset < size; host_addr_offset += chunk_size) { + size_t current_size = chunk_size; + + // If the current address is within one chunk_size from the end of the data, set current_size + // to the bytes left to send + if (size - host_addr_offset < chunk_size) { + current_size = size - host_addr_offset; + } + + size_t current_write_size = fwrite((uint8_t *)host_addr + host_addr_offset, 1, current_size, _pFile); + + if (current_write_size != current_size) { + return FAILURE; + } + + write_size += current_write_size; + } + + return (write_size == size) ? SUCCESS : FAILURE; +} diff --git a/python/openvino/runtime/coredla_device/mmd/hps_platform/host/dma_device.h b/python/openvino/runtime/coredla_device/mmd/hps_platform/host/dma_device.h new file mode 100644 index 0000000..24f89e4 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/hps_platform/host/dma_device.h @@ -0,0 +1,56 @@ +#ifndef DMA_DEVICE_H_ +#define DMA_DEVICE_H_ + +/* (c) 1992-2021 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. 
See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. */ + +/* ===- dma_device.h ------------------------------------------------- C++ -*-=== */ +/* */ +/* dma device access functions */ +/* */ +/* ===-------------------------------------------------------------------------=== */ +/* */ +/* This file implements the functions used access the dma device objects */ +/* */ +/* ===-------------------------------------------------------------------------=== */ +#include <vector> +#include <string> +#include <memory> + +#include "hps_types.h" + +class dma_device +{ +public: + dma_device(std::string &name); + ~dma_device(); + + int read_block(void *host_addr, size_t offset, size_t size); + int write_block(const void *host_addr, size_t offset, size_t size); + + bool bValid() { return _pFile != nullptr; }; +private: + + dma_device() = delete; + dma_device(dma_device const&) = delete; + void operator=(dma_device const &) = delete; + + FILE *_pFile = {nullptr}; // File pointer to UIO - Used to indicate the the uio_device is valid +}; +typedef std::shared_ptr<dma_device> dma_device_ptr; + +#endif // DMA_DEVICE_H_ diff --git 
a/python/openvino/runtime/coredla_device/mmd/hps_platform/host/hps_types.h b/python/openvino/runtime/coredla_device/mmd/hps_platform/host/hps_types.h new file mode 100644 index 0000000..3f11c4a --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/hps_platform/host/hps_types.h @@ -0,0 +1,44 @@ +#ifndef HPS_TYPES_H_ +#define HPS_TYPES_H_ + +/* (c) 1992-2021 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. 
*/ + +/* ===- hps_types.h -------------------------------------------------- C++ -*-=== */ +/* */ +/* Useful HPS Types */ +/* */ +/* ===-------------------------------------------------------------------------=== */ +/* */ +/* This file contains useful type definition */ +/* */ +/* ===-------------------------------------------------------------------------=== */ +#include <vector> +#include <string> + +#define SUCCESS (0) +#define FAILURE (1) + +typedef std::vector<std::string> board_names; + +typedef enum { + HPS_MMD_COREDLA_CSR_HANDLE = 1, // COREDLA CSR Interface + HPS_MMD_MEMORY_HANDLE = 2, // Device Memory transfers + HPS_MMD_STREAM_CONTROLLER_HANDLE = 3 // Stream Controller Interface +} hps_mmd_interface_t; + +#endif // HPS_TYPES_H_ diff --git a/python/openvino/runtime/coredla_device/mmd/hps_platform/host/mmd_device.cpp b/python/openvino/runtime/coredla_device/mmd/hps_platform/host/mmd_device.cpp new file mode 100644 index 0000000..b52c1d8 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/hps_platform/host/mmd_device.cpp @@ -0,0 +1,129 @@ +/* (c) 1992-2021 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. 
See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. */ + +/* ===- mmd_device.h ------------------------------------------------- C++ -*-=== */ +/* */ +/* mmd device access functions */ +/* */ +/* ===-------------------------------------------------------------------------=== */ +/* */ +/* This file implements the functions used access the mmd device object */ +/* */ +/* ===-------------------------------------------------------------------------=== */ + +#include "mmd_device.h" + +// Defined names of the UIO Nodes +#define UIO_COREDLA_PREFIX "coredla" +#define STREAM_CONTROLLER_PREFIX "stream_controller" + +// Defined name of the msgdma device +#define DMA_DEVICE_PREFIX "/dev/msgdma_coredla" +#define UIO_DEVICE_PREFIX "uio" + +board_names mmd_get_devices(const int max_fpga_devices) +{ + return uio_get_devices(UIO_COREDLA_PREFIX, max_fpga_devices); +} + + +///////////////////////////////////////////////////////// +mmd_device::mmd_device(std::string name, const int mmd_handle) +: _name(name), _mmd_handle(mmd_handle) { + _spCoredlaDevice = std::make_shared<uio_device>(name, _mmd_handle, true); + int32_t index = 
extract_index(_name); + if( (index >= 0) && _spCoredlaDevice && _spCoredlaDevice->bValid() ) + { + std::string dma_name(DMA_DEVICE_PREFIX); + dma_name += std::to_string(index); + _spDmaDevice = std::make_shared<dma_device>(dma_name); + + if( (_spDmaDevice==nullptr) || (!_spDmaDevice->bValid()) ) { + _spDmaDevice = nullptr; + return; + } + std::string stream_controller_name = uio_get_device(STREAM_CONTROLLER_PREFIX, index); + if( !stream_controller_name.empty() ) { + // Create a uio_device but don't attach any interrupt support as the stream controller + // does not require interrupts + _spStreamControllerDevice = std::make_shared<uio_device>(stream_controller_name, _mmd_handle, false); + if( _spStreamControllerDevice && !_spStreamControllerDevice->bValid() ) { + // The stream controller does not exist + _spStreamControllerDevice = nullptr; + } + } + } +} + +int mmd_device::read_block(aocl_mmd_op_t op, int mmd_interface, void *host_addr, size_t offset, size_t size) +{ + if( op ) { + LOG_ERR("op not support : %s\n", __func__ ); + return FAILURE; + } + if( mmd_interface == HPS_MMD_MEMORY_HANDLE ) { + return _spDmaDevice->read_block(host_addr, offset, size); + } else if( mmd_interface == HPS_MMD_COREDLA_CSR_HANDLE ) { + return _spCoredlaDevice->read_block(host_addr, offset, size); + } else if( mmd_interface == HPS_MMD_STREAM_CONTROLLER_HANDLE ) { + if ( _spStreamControllerDevice ) { + return _spStreamControllerDevice->read_block(host_addr, offset, size); + } + } + + return FAILURE; +} + +int mmd_device::write_block(aocl_mmd_op_t op, int mmd_interface, const void *host_addr, size_t offset, size_t size) +{ + if( op ) { + LOG_ERR("op not support : %s\n", __func__ ); + return FAILURE; + } + if( mmd_interface == HPS_MMD_MEMORY_HANDLE ) { + return _spDmaDevice->write_block(host_addr, offset, size); + } else if ( mmd_interface == HPS_MMD_COREDLA_CSR_HANDLE ) { + return _spCoredlaDevice->write_block(host_addr, offset, size); + } else if ( mmd_interface == 
HPS_MMD_STREAM_CONTROLLER_HANDLE ) { + if( _spStreamControllerDevice ) { + return _spStreamControllerDevice->write_block(host_addr, offset, size); + } + } + return FAILURE; +} + +int mmd_device::set_interrupt_handler(aocl_mmd_interrupt_handler_fn fn, void *user_data) { + if( _spCoredlaDevice ) { + return _spCoredlaDevice->set_interrupt_handler(fn, user_data); + } + return FAILURE; +} + +// Returns the index of a uio device +// If index cannot be found then returns -1 +int mmd_device::extract_index(const std::string name) { + std::string prefix(UIO_DEVICE_PREFIX); + + if (name.length() <= prefix.length() && name.compare(0, prefix.length(), prefix)) { + LOG_ERR("Error parsing device name '%s'\n", name.c_str()); + return -1; + } + + std::string device_num_str = name.substr(prefix.length()); + int32_t index = std::stoi(device_num_str, 0, 10); + return index; +} diff --git a/python/openvino/runtime/coredla_device/mmd/hps_platform/host/mmd_device.h b/python/openvino/runtime/coredla_device/mmd/hps_platform/host/mmd_device.h new file mode 100644 index 0000000..9cb0c71 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/hps_platform/host/mmd_device.h @@ -0,0 +1,75 @@ +#ifndef MMD_DEVICE_H_ +#define MMD_DEVICE_H_ + +/* (c) 1992-2021 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. 
See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. */ + +/* ===- mmd_device.h ------------------------------------------------- C++ -*-=== */ +/* */ +/* mmd device access functions */ +/* */ +/* ===-------------------------------------------------------------------------=== */ +/* */ +/* This file implements the functions used access the mmd device object */ +/* */ +/* ===-------------------------------------------------------------------------=== */ +#include <memory> +#include <string> + +#include "hps_types.h" +#include "dma_device.h" +#include "uio_device.h" + +#include "aocl_mmd.h" + +// LOG ERRORS +#define MMD_ERR_LOGGING 1 +#ifdef MMD_ERR_LOGGING +#define LOG_ERR(...) fprintf(stderr, __VA_ARGS__) +#else +#define LOG_ERR(...) 
+#endif + +class mmd_device { +public: + mmd_device(std::string name, const int mmd_handle); + + bool bValid() { return _spCoredlaDevice && _spCoredlaDevice->bValid() && _spDmaDevice && _spDmaDevice->bValid(); }; + bool bStreamControllerValid() { return _spCoredlaDevice && _spStreamControllerDevice && _spStreamControllerDevice->bValid(); }; + int write_block(aocl_mmd_op_t op, int mmd_interface, const void *host_addr, size_t offset, size_t size); + int read_block(aocl_mmd_op_t op, int mmd_interface, void *host_addr, size_t offset, size_t size); + + int set_interrupt_handler(aocl_mmd_interrupt_handler_fn fn, void *user_data); +private: + int32_t extract_index(const std::string name); + + mmd_device() = delete; + mmd_device(mmd_device const&) = delete; + void operator=(mmd_device const &) = delete; + std::string _name; + + uio_device_ptr _spCoredlaDevice; + uio_device_ptr _spStreamControllerDevice; + dma_device_ptr _spDmaDevice; + int _mmd_handle; +}; + +typedef std::shared_ptr<mmd_device> mmd_device_ptr; + +extern board_names mmd_get_devices(const int max_fpga_devices); + +#endif // MMD_DEVICE_H_ diff --git a/python/openvino/runtime/coredla_device/mmd/hps_platform/host/uio_device.cpp b/python/openvino/runtime/coredla_device/mmd/hps_platform/host/uio_device.cpp new file mode 100644 index 0000000..95a9567 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/hps_platform/host/uio_device.cpp @@ -0,0 +1,469 @@ +// (c) 1992-2021 Intel Corporation. +// Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words +// and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. +// and/or other countries. Other marks and brands may be claimed as the property +// of others. 
See Trademarks on intel.com for full list of Intel trademarks or +// the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) +// Your use of Intel Corporation's design tools, logic functions and other +// software and tools, and its AMPP partner logic functions, and any output +// files any of the foregoing (including device programming or simulation +// files), and any associated documentation or information are expressly subject +// to the terms and conditions of the Altera Program License Subscription +// Agreement, Intel MegaCore Function License Agreement, or other applicable +// license agreement, including, without limitation, that your use is for the +// sole purpose of programming logic devices manufactured by Intel and sold by +// Intel or its authorized distributors. Please refer to the applicable +// agreement for further details. + +/* ===- uio_device.cpp ----------------------------------------------- C++ -*-=== */ +/* */ +/* uio device access functions */ +/* */ +/* ===-------------------------------------------------------------------------=== */ +/* */ +/* This file implements the functions used access the uio device objects */ +/* */ +/* ===-------------------------------------------------------------------------=== */ + +// common and its own header files +#include "uio_device.h" +#include <unistd.h> +#include <glob.h> +#include <sys/mman.h> +#include <fcntl.h> +#include <stdio.h> +#include <poll.h> + +#include <cinttypes> +#include <memory.h> + +////////////////////////////////////////////////////// +#define UIO_BASE_NAME "uio*" +#define UIO_BASE_PATH "/sys/class/uio/" +#define UIO_BASE_SEARCH UIO_BASE_PATH UIO_BASE_NAME +#define UIO_MAX_PATH (256) + +#define ERR(format, ...) 
\ +fprintf(stderr, "%s:%u **ERROR** : " format, \ + __FILE__, __LINE__, ##__VA_ARGS__) + +////////////////////////////////////////////////////// +#define MAX_NAME (20) +bool uio_read_sysfs_uint64(const char *device_name, const char *sysfs_name, uint64_t &value) +{ + FILE *fp; + char param_path[UIO_MAX_PATH]; + + if( snprintf(param_path, sizeof(param_path), "%s/%s", device_name, sysfs_name) < 0 ) + { + ERR("Path too long. %s, %s\n", device_name, sysfs_name); + return false; + } + + fp = fopen(param_path, "r"); + if( !fp ) + { + ERR("Failed to fopen - %s\n", param_path); + return false; + } + + if( fscanf(fp, "%" PRIx64, &value) != 1 ) + { + ERR("Failed fscanf - %s\n", param_path); + fclose(fp); + return false; + } + + fclose(fp); + return true; +} + +bool uio_read_sysfs_string(const char *uio_path, const char *sysfs_name, std::string &result) +{ + char uio_name[MAX_NAME]; + FILE *fp; + char param_path[UIO_MAX_PATH]; + + if( snprintf(param_path, sizeof(param_path), "%s/%s", uio_path, sysfs_name) < 0 ) + { + ERR("Path too long. 
%s, %s\n", uio_path, sysfs_name); + return false; + } + + fp = fopen(param_path, "r"); + if( !fp ) + { + ERR("Failed to fopen - %s\n", param_path); + return false; + } + + int num_read = fread(uio_name, 1, MAX_NAME, fp); + if( num_read <= 0 ) + { + ERR("Failed to read name - %s\n", param_path); + fclose(fp); + return false; + } + + uio_name[num_read-1] = '\0'; // Terminate + result = std::string(uio_name); + fclose(fp); + + return true; +} + +std::string uio_get_device(const std::string prefix, const int32_t index) +{ + glob_t globbuf = {0}; + std::string uio_name; + + int glob_res = glob(UIO_BASE_SEARCH, GLOB_NOSORT, NULL, &globbuf); + if( (glob_res == 0) && (globbuf.gl_pathc) ) + { + std::string device_name; + device_name = prefix + std::to_string(index); + + for( size_t i=0; i<globbuf.gl_pathc; i++ ) + { + std::string name; + uio_read_sysfs_string(globbuf.gl_pathv[i], "name", name); + + if( name.find(device_name) != std::string::npos ) + { + // We will return just the device name without the UIO_BASE_PATH + std::string name = std::string(globbuf.gl_pathv[i]); + uio_name = name.substr(sizeof(UIO_BASE_PATH)-1); + } + } + } + return uio_name; +} + +board_names uio_get_devices(const std::string device_name, const int max_devices) +{ + board_names names; + int device = 0; + + glob_t globbuf = {0}; + + int glob_res = glob(UIO_BASE_SEARCH, GLOB_NOSORT, NULL, &globbuf); + if( (glob_res == 0) && (globbuf.gl_pathc) ) + { + for( size_t i=0; (i<globbuf.gl_pathc) && (device < max_devices); i++ ) + { + std::string name; + uio_read_sysfs_string(globbuf.gl_pathv[i], "name", name); + + if( name.find(device_name) != std::string::npos ) + { + // We will return just the device name without the UIO_BASE_PATH + std::string name = std::string(globbuf.gl_pathv[i]); + name = name.substr(sizeof(UIO_BASE_PATH)-1); + names.push_back(name); + device++; + } + } + } + return names; +} + +////////////////////////////////////////////////////////////// +uio_device::uio_device(std::string &name, 
const int mmd_handle, const bool bEnableIRQ) +: _mmd_handle(mmd_handle) +{ + // Map the first address space + if ( !map_region(name, 0) ) { + ERR("Failed to map region 0 on %s\n", name.c_str()); + return; + } +#ifndef RUNTIME_POLLING + if( bEnableIRQ ) { + _spInterrupt = std::make_shared<uio_interrupt>(_fd, _mmd_handle); + if( !_spInterrupt->initialized() ) { + _spInterrupt = nullptr; // If the uio_interrupt failed to initialize then delete + } + _bIrqEnabled = bEnableIRQ; + } +#endif +} + +bool uio_device::bValid() { + bool bValid = (_fd >=0); +#ifndef RUNTIME_POLLING // If we're not polling check that the interrupt handling is working + if( _bIrqEnabled ) { + bValid &= (_spInterrupt != nullptr); + } +#endif + return bValid; +}; + +uio_device::~uio_device() +{ +#ifndef RUNTIME_POLLING + _spInterrupt = nullptr; // Shutdown the interrupt handler +#endif + unmap_region(); +} + +uint32_t uio_device::read(const uint32_t reg) +{ + // NOT YET IMPLEMENTED + return 0; +} + +void uio_device::write(const uint32_t reg, const uint32_t value) +{ + // NOT YET IMPLEMENTED + return; +} + +// Copies the block of data from the FPGA to the host +// memcpy is not used as this can cause multiple transfers of the AXI bus depending +// on the implementation of memcpy +int uio_device::read_block(void *host_addr, size_t offset, size_t size) +{ + // Support for only 32bit aligned transfers + if( (offset % sizeof(uint32_t)) || (size % sizeof(uint32_t)) ){ + return FAILURE; + } + + // Transfer the data in 32bit chunks + volatile const uint32_t *pDeviceMem32 = reinterpret_cast<volatile const uint32_t*>(reinterpret_cast<uint8_t*>(_pPtr) + offset); + uint32_t *host_addr32 = reinterpret_cast<uint32_t *>(host_addr); + while (size >= sizeof(uint32_t)) { + *host_addr32++ = *pDeviceMem32++; + size -= sizeof(uint32_t); + } + + return SUCCESS; +} + +// Copies the block of data from the host to the FPGA +// memcpy is not used as this can cause multiple transfers of the AXI bus depending +// on the 
implementation of memcpy +int uio_device::write_block(const void *host_addr, size_t offset, size_t size) +{ + // Support for only 32bit aligned transfers + if( (offset % sizeof(uint32_t)) || (size % sizeof(uint32_t)) ){ + return FAILURE; + } + + // Transfer the remaining 32bits of data + volatile uint32_t *pDeviceMem32 = reinterpret_cast<volatile uint32_t*>(reinterpret_cast<uint8_t*>(_pPtr) + offset); + const uint32_t *host_addr32 = reinterpret_cast<const uint32_t*>(host_addr); + while( size >= sizeof(uint32_t) ) { + *pDeviceMem32++ = *host_addr32++; + size -= sizeof(uint32_t); + } + return SUCCESS; +} + +int uio_device::set_interrupt_handler(aocl_mmd_interrupt_handler_fn fn, void* user_data) { +#ifndef RUNTIME_POLLING + if( _spInterrupt ) { + return _spInterrupt->set_interrupt_handler(fn, user_data); + } +#endif + return FAILURE; +} + +///////////////////////////////////////////////////////////////// +void uio_device::unmap_region() +{ + if( _pBase ) + { + munmap(_pBase, _size); + _pBase = nullptr; + } + + if( _fd >= 0 ) + { + close(_fd); + _fd = -1; + } +} + +bool uio_device::map_region( std::string &name, const uint32_t index) +{ + char map_path[UIO_MAX_PATH]; + + std::string uio_params_path(UIO_BASE_PATH); + uio_params_path += name; + + // char device_path[UIO_MAX_PATH]; + // const char *p; + + if( snprintf(map_path, sizeof(map_path), "maps/map%d/size", index ) < 0 ) + { + ERR("Failed to make map addr name.\n"); + return false; + } + if( !uio_read_sysfs_uint64(uio_params_path.c_str(), map_path, _size) ) + { + ERR("Failed to read size\n"); + return false; + } + // Make sure that the size doesn't exceed 32bits, as this will fail the mapping + // call on 32bit systems + if( _size > UINT32_MAX ) { + ERR("Invalid size value\n"); + return false; + } + + if( snprintf(map_path, sizeof(map_path), "maps/map%d/offset", index ) < 0 ) + { + ERR("Failed to make map offset name.\n"); + return false; + } + if( !uio_read_sysfs_uint64(uio_params_path.c_str(), map_path, _offset) 
) + { + ERR("Failed to read offset\n"); + return false; + } + + std::string uio_dev_path("/dev/"); + uio_dev_path += name; + + _fd = open(uio_dev_path.c_str(), O_RDWR ); + if( _fd < 0 ) + { + ERR("Failed to open - %s\n", uio_dev_path.c_str()); + return false; + } + // Map the region into userspace + // The base of the region is the page_size offset of the index + uint32_t page_size = (uint32_t)sysconf(_SC_PAGESIZE); + + _pBase = (uint8_t*)mmap(NULL, (size_t)_size, PROT_READ|PROT_WRITE, MAP_SHARED, _fd, (off_t) (index * page_size)); + if( _pBase == MAP_FAILED ) + { + ERR("Failed to map uio region.\n"); + close(_fd); + _fd = -1; + return false; + } + // CST base address is at _pBase + _offset + _pPtr = (uint32_t*)(_pBase + _offset); + + return true; +}; + +#ifndef RUNTIME_POLLING +/////////////////////////////////////////////////////////////////////////////////// +uio_interrupt::uio_interrupt(const int fd, const int mmd_handle) +: _device_fd(fd), _mmd_handle(mmd_handle) { + if( is_irq_available() ) { + // Create a eventfd_object to be used for shutting down the work_thread + _spShutdown_event = std::make_shared<eventfd_object>(); + if( _spShutdown_event->initialized() ) { + _pThread = new std::thread(work_thread, std::ref(*this)); + } else { + _spShutdown_event = nullptr; + } + } else { + ERR("No device interrupt found.\n"); + } +} + +uio_interrupt::~uio_interrupt() { + // kill the thread + if (_pThread && _spShutdown_event) { + // send message to thread to end it + _spShutdown_event->notify(1); + + // join with thread until it ends + _pThread->join(); + + delete _pThread; + _pThread = NULL; + + _spShutdown_event = nullptr; + } +} + +bool uio_interrupt::is_irq_available() { + // Disable the interrupt handling, this will fail if the IRQ has not been setup correctly. + // For example devicetree is incorrect. 
+ return disable_irq(); +} + +bool uio_interrupt::enable_irq() { + // Enable interrupts from the device + uint32_t info = 1; + ssize_t nb = write(_device_fd, &info, sizeof(info)); + if( nb != (ssize_t)sizeof(info) ) { + ERR( "Failed in enable CoreDLA Interrupt = %s\n", strerror(errno)); + return false; + } + return true; +} + +bool uio_interrupt::disable_irq() { + // Disable interrupts from the device + uint32_t info = 0; + ssize_t nb = write(_device_fd, &info, sizeof(info)); + if( nb != (ssize_t)sizeof(info) ) { + ERR( "Failed in disable CoreDLA Interrupt = %s\n", strerror(errno)); + return false; + } + return true; +} + +void uio_interrupt::work_thread(uio_interrupt& obj) { + obj.run_thread(); +} + +#define UIO_INTERRUPT_TIMEOUT (-1) +void uio_interrupt::run_thread() { + while( true ) { + // Need to re-enable the UIO interrupt handling as UIO disables the IRQ each time it is fired + if ( !enable_irq() ) { + exit(-1); + } + // Poll for the shutdown_event and uio interrupt + struct pollfd pollfd_arr[2]; + pollfd_arr[0].fd = _spShutdown_event->get_fd(); + pollfd_arr[0].events = POLLIN; + pollfd_arr[0].revents = 0; + pollfd_arr[1].fd = _device_fd; + pollfd_arr[1].events = POLLIN; + pollfd_arr[1].revents = 0; + + int res = poll(pollfd_arr, 2, UIO_INTERRUPT_TIMEOUT); + if (res < 0) { + ERR( "Poll error errno = %s\n", strerror(errno)); + exit(-1); + } else if (res > 0 && (pollfd_arr[0].revents & POLLIN)) { + uint64_t count; + ssize_t bytes_read = read(pollfd_arr[0].fd, &count, sizeof(count)); + if (bytes_read > 0) { + break; // We've been asked to shutdown + } else { + ERR( "Error: poll failed: %s\n", bytes_read < 0 ? 
 strerror(errno) : "zero bytes read"); + exit(-1); + } + } else if (res > 0 && (pollfd_arr[1].revents & POLLIN)) { + uint32_t count; + ssize_t bytes_read = read(pollfd_arr[1].fd, &count, sizeof(count)); + if (bytes_read > 0) { + if( _interrupt_fn ) { // Run the callback to the application + _interrupt_fn(get_mmd_handle(), _interrupt_fn_user_data ); + } + } else { + ERR( "Error: poll failed: %s\n", bytes_read < 0 ? strerror(errno) : "zero bytes read"); + exit(-1); + } + } + } + // Disable interrupt handling in UIO + if( !disable_irq() ){ + exit(-1); + } +} + +int uio_interrupt::set_interrupt_handler(aocl_mmd_interrupt_handler_fn fn, void* user_data) { + _interrupt_fn = fn; + _interrupt_fn_user_data = user_data; + return SUCCESS; +} +#endif diff --git a/python/openvino/runtime/coredla_device/mmd/hps_platform/host/uio_device.h b/python/openvino/runtime/coredla_device/mmd/hps_platform/host/uio_device.h new file mode 100644 index 0000000..c5f3ed5 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/hps_platform/host/uio_device.h @@ -0,0 +1,162 @@ +#ifndef UIO_DEVICE_H_ +#define UIO_DEVICE_H_ + +/* (c) 1992-2021 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. 
See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. */ + +/* ===- uio_device.h ------------------------------------------------- C++ -*-=== */ +/* */ +/* uio device access functions */ +/* */ +/* ===-------------------------------------------------------------------------=== */ +/* */ +/* This file implements the functions used access the uio device objects */ +/* */ +/* ===-------------------------------------------------------------------------=== */ +#include <vector> +#include <string> +#include <string.h> +#include <memory> +#include <thread> +#include <mutex> +#include <sys/eventfd.h> +#include <unistd.h> + +#include "aocl_mmd.h" +#include "hps_types.h" + +// simple wrapper class for managing eventfd objects +class eventfd_object final { + public: + eventfd_object() { + m_initialized = false; + // Note: EFD_SEMAPHORE and EFD_NONBLOCK are not set + // The implementation of functions using eventfd assumes that + m_fd = eventfd(0, 0); + if (m_fd < 0) { + fprintf(stderr, "eventfd : %s", strerror(errno)); + return; + } + + m_initialized = true; + } + + ~eventfd_object() { + if (m_initialized) { + if (close(m_fd) 
< 0) { + fprintf(stderr, "eventfd : %s", strerror(errno)); + } + } + } + + bool notify(uint64_t count) { + ssize_t res = write(m_fd, &count, sizeof(count)); + if (res < 0) { + fprintf(stderr, "eventfd : %s", strerror(errno)); + return false; + } + return true; + } + + int get_fd() { return m_fd; } + bool initialized() { return m_initialized; } + + private: + // not used and not implemented + eventfd_object(eventfd_object& other); + eventfd_object& operator=(const eventfd_object& other); + + // member varaibles + int m_fd; + int m_initialized; +}; // class eventfd_object +typedef std::shared_ptr<eventfd_object> eventfd_object_ptr; + +#ifndef RUNTIME_POLLING +class uio_interrupt final { + public: + uio_interrupt(const int fd, const int mmd_handle); + ~uio_interrupt(); + bool initialized() { return _pThread != nullptr; }; // If the thread is not created then must be invalid + int set_interrupt_handler(aocl_mmd_interrupt_handler_fn fn, void* user_data); + + private: + bool is_irq_available(); // Checks that the interrupt has been mapped into userspace + bool enable_irq(); // Enables UIO Irq handling + bool disable_irq(); // Disabled UIO Irq handling + + static void work_thread(uio_interrupt &obj); + void run_thread(); // Function which handles waiting for interrupts + + uio_interrupt() = delete; + uio_interrupt(uio_interrupt const&) = delete; + void operator=(uio_interrupt const&) = delete; + + int get_mmd_handle() {return _mmd_handle; }; + + std::thread *_pThread = {nullptr}; // Pointer to a thread object for waiting for interrupts + int _device_fd = {-1}; // /dev/uio* device pointer + int _mmd_handle = {-1}; // handle to the parent mmd_device + eventfd_object_ptr _spShutdown_event = {nullptr}; // Shutdown thread event object + + aocl_mmd_interrupt_handler_fn _interrupt_fn = {nullptr}; + void *_interrupt_fn_user_data = {nullptr}; +}; +typedef std::shared_ptr<uio_interrupt> uio_interrupt_ptr; +#endif + +class uio_device +{ +public: + uio_device(std::string &name, const 
int mmd_handle, const bool bEnableIrq=false); + ~uio_device(); + + uint32_t read(const uint32_t reg); + void write(const uint32_t reg, const uint32_t value); + + int read_block(void *host_addr, size_t offset, size_t size); + int write_block(const void *host_addr, size_t offset, size_t size); + int set_interrupt_handler(aocl_mmd_interrupt_handler_fn fn, void* user_data); + + bool bValid(); + +private: + bool map_region( std::string &name, const uint32_t index ); + void unmap_region(); + + uio_device() = delete; + uio_device(uio_device const&) = delete; + void operator=(uio_device const &) = delete; + + int _mmd_handle; // Handle to the parent mmd device + int _fd = {-1}; // File pointer to UIO - Used to indicate that the uio_device is valid + uint64_t _size = {0}; // Size of the mmapped region + uint64_t _offset = {0}; // Offset of the first register + uint8_t *_pBase = {nullptr}; // Base of the mmapped region + + uint32_t *_pPtr = {nullptr}; // The first register +#ifndef RUNTIME_POLLING + bool _bIrqEnabled = {false}; // Indicates that we tried to create with IRQ + uio_interrupt_ptr _spInterrupt; // Object to handle UIO Interrupts +#endif +}; +typedef std::shared_ptr<uio_device> uio_device_ptr; + +extern board_names uio_get_devices(const std::string name, const int max_devices); +extern std::string uio_get_device(const std::string prefix, const int32_t index); + +#endif // UIO_DEVICE_H_ diff --git a/python/openvino/runtime/coredla_device/mmd/hps_platform/include/aocl_mmd.h b/python/openvino/runtime/coredla_device/mmd/hps_platform/include/aocl_mmd.h new file mode 100644 index 0000000..7c1c73d --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/hps_platform/include/aocl_mmd.h @@ -0,0 +1,645 @@ +#ifndef AOCL_MMD_H +#define AOCL_MMD_H + +/* (c) 1992-2021 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. 
Other marks and brands may be claimed as the property */ +/* of others. See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* Support for memory mapped ACL devices. + * + * Typical API lifecycle, from the perspective of the caller. + * + * 1. aocl_mmd_open must be called first, to provide a handle for further + * operations. + * + * 2. The interrupt and status handlers must be set. + * + * 3. Read and write operations are performed. + * + * 4. aocl_mmd_close may be called to shut down the device. No further + * operations are permitted until a subsequent aocl_mmd_open call. + * + * aocl_mmd_get_offline_info can be called anytime including before + * open. aocl_mmd_get_info can be called anytime between open and close. 
+ */ + +#ifndef AOCL_MMD_CALL +#if defined(_WIN32) +#define AOCL_MMD_CALL __declspec(dllimport) +#else +#define AOCL_MMD_CALL __attribute__((visibility ("default"))) +#endif +#endif + +#ifndef WEAK +#if defined(_WIN32) +#define WEAK +#else +/* This normally comes with "__attribute__((weak))" but for reasons not presently + * understood, the shared library is not properly loaded on Ubuntu18 when the functions + * are weak. + */ +#define WEAK +#endif +#endif + +#ifdef DLA_MMD +#include <cstddef> //size_t +#include <cstdint> //uint32_t +#endif + +/* The MMD API's version - the runtime expects this string when + * AOCL_MMD_VERSION is queried. This changes only if the API has changed */ +#define AOCL_MMD_VERSION_STRING "20.3" + +/* Memory types that can be supported - bitfield. Other than physical memory + * these types closely align with the OpenCL SVM types. + * + * AOCL_MMD_PHYSICAL_MEMORY - The vendor interface includes IP to communicate + * directly with physical memory such as DDR, QDR, etc. + * + * AOCL_MMD_SVM_COARSE_GRAIN_BUFFER - The vendor interface includes support for + * caching SVM pointer data and requires explicit function calls from the user + * to synchronize the cache between the host processor and the FPGA. This level + * of SVM is not currently supported by Altera except as a subset of + * SVM_FINE_GAIN_SYSTEM support. + * + * AOCL_MMD_SVM_FINE_GRAIN_BUFFER - The vendor interface includes support for + * caching SVM pointer data and requires additional information from the user + * and/or host runtime that can be collected during pointer allocation in order + * to synchronize the cache between the host processor and the FPGA. Once this + * additional data is provided for an SVM pointer, the vendor interface handles + * cache synchronization between the host processor & the FPGA automatically. + * This level of SVM is not currently supported by Altera except as a subset + * of SVM_FINE_GRAIN_SYSTEM support. 
+ * + * AOCL_MMD_SVM_FINE_GRAIN_SYSTEM - The vendor interface includes support for + * caching SVM pointer data and does not require any additional information to + * synchronize the cache between the host processor and the FPGA. The vendor + * interface handles cache synchronization between the host processor & the + * FPGA automatically for all SVM pointers. This level of SVM support is + * currently under development by Altera and some features may not be fully + * supported. + */ +#define AOCL_MMD_PHYSICAL_MEMORY (1 << 0) +#define AOCL_MMD_SVM_COARSE_GRAIN_BUFFER (1 << 1) +#define AOCL_MMD_SVM_FINE_GRAIN_BUFFER (1 << 2) +#define AOCL_MMD_SVM_FINE_GRAIN_SYSTEM (1 << 3) + +/* program modes - bitfield + * + * AOCL_MMD_PROGRAM_PRESERVE_GLOBAL_MEM - preserve contents of global memory + * when this bit is set to 1. If programming can't occur without preserving + * global memory contents, the program function must fail, in which case the + * runtime may re-invoke program with this bit set to 0, allowing programming + * to occur even if doing so destroys global memory contents. + * + * more modes are reserved for stacking on in the future + */ +#define AOCL_MMD_PROGRAM_PRESERVE_GLOBAL_MEM (1 << 0) +typedef int aocl_mmd_program_mode_t; + +typedef void* aocl_mmd_op_t; + +typedef struct { + unsigned lo; /* 32 least significant bits of time value. */ + unsigned hi; /* 32 most significant bits of time value. */ +} aocl_mmd_timestamp_t; + +/* Defines the set of characteristics that can be probed about the board before + * opening a device. The type of data returned by each is specified in + * parentheses in the adjacent comment. + * + * AOCL_MMD_NUM_BOARDS and AOCL_MMD_BOARD_NAMES + * These two fields can be used to implement multi-device support. The MMD + * layer may have a list of devices it is capable of interacting with, each + * identified with a unique name. 
The length of the list should be returned + * in AOCL_MMD_NUM_BOARDS, and the names of these devices returned in + * AOCL_MMD_BOARD_NAMES. The OpenCL runtime will try to call aocl_mmd_open + * for each board name returned in AOCL_MMD_BOARD_NAMES. + */ +typedef enum { + AOCL_MMD_VERSION = 0, /* Version of MMD (char*)*/ + AOCL_MMD_NUM_BOARDS = 1, /* Number of candidate boards (int)*/ + AOCL_MMD_BOARD_NAMES = 2, /* Names of boards available delimiter=; (char*)*/ + AOCL_MMD_VENDOR_NAME = 3, /* Name of vendor (char*) */ + AOCL_MMD_VENDOR_ID = 4, /* An integer ID for the vendor (int) */ + AOCL_MMD_USES_YIELD = 5, /* 1 if yield must be called to poll hw (int) */ + /* The following can be combined in a bit field: + * AOCL_MMD_PHYSICAL_MEMORY, AOCL_MMD_SVM_COARSE_GRAIN_BUFFER, AOCL_MMD_SVM_FINE_GRAIN_BUFFER, + * AOCL_MMD_SVM_FINE_GRAIN_SYSTEM. Prior to 14.1, all existing devices supported physical memory and no types of SVM + * memory, so this is the default when this operation returns '0' for board MMDs with a version prior to 14.1 + */ + AOCL_MMD_MEM_TYPES_SUPPORTED = 6, +} aocl_mmd_offline_info_t; + +/** Possible capabilities to return from AOCL_MMD_*_MEM_CAPABILITIES query */ +/** + * If not set allocation function is not supported, even if other capabilities are set. + */ +#define AOCL_MMD_MEM_CAPABILITY_SUPPORTED (1 << 0) +/** + * Supports atomic access to the memory by either the host or device. + */ +#define AOCL_MMD_MEM_CAPABILITY_ATOMIC (1 << 1) +/** + * Supports concurrent access to the memory either by host or device if the + * accesses are not on the same block. Block granularity is defined by + * AOCL_MMD_*_MEM_CONCURRENT_GRANULARITY., blocks are aligned to this + * granularity + */ +#define AOCL_MMD_MEM_CAPABILITY_CONCURRENT (1 << 2) +/** + * Memory can be accessed by multiple devices at the same time. + */ +#define AOCL_MMD_MEM_CAPABILITY_P2P (1 << 3) + +/* Defines the set of characteristics that can be probed about the board after + * opening a device. 
This can involve communication to the device + * + * AOCL_MMD_NUM_KERNEL_INTERFACES - The number of kernel interfaces, usually 1 + * + * AOCL_MMD_KERNEL_INTERFACES - the handle for each kernel interface. + * param_value will have size AOCL_MMD_NUM_KERNEL_INTERFACES * sizeof int + * + * AOCL_MMD_PLL_INTERFACES - the handle for each pll associated with each + * kernel interface. If a kernel interface is not clocked by acl_kernel_clk + * then return -1 + * + * */ +typedef enum { + AOCL_MMD_NUM_KERNEL_INTERFACES = 1, /* Number of Kernel interfaces (int) */ + AOCL_MMD_KERNEL_INTERFACES = 2, /* Kernel interface (int*) */ + AOCL_MMD_PLL_INTERFACES = 3, /* Kernel clk handles (int*) */ + AOCL_MMD_MEMORY_INTERFACE = 4, /* Global memory handle (int) */ + AOCL_MMD_TEMPERATURE = 5, /* Temperature measurement (float) */ + AOCL_MMD_PCIE_INFO = 6, /* PCIe information (char*) */ + AOCL_MMD_BOARD_NAME = 7, /* Name of board (char*) */ + AOCL_MMD_BOARD_UNIQUE_ID = 8, /* Unique ID of board (int) */ + AOCL_MMD_CONCURRENT_READS = 9, /* # of parallel reads; 1 is serial*/ + AOCL_MMD_CONCURRENT_WRITES = 10, /* # of parallel writes; 1 is serial*/ + AOCL_MMD_CONCURRENT_READS_OR_WRITES = 11, /* total # of concurrent operations read + writes*/ + AOCL_MMD_MIN_HOST_MEMORY_ALIGNMENT = 12, /* Min alignment that the BSP supports for host allocations (size_t) */ + AOCL_MMD_HOST_MEM_CAPABILITIES = 13, /* Capabilities of aocl_mmd_host_alloc() (unsigned int)*/ + AOCL_MMD_SHARED_MEM_CAPABILITIES = 14, /* Capabilities of aocl_mmd_shared_alloc (unsigned int)*/ + AOCL_MMD_DEVICE_MEM_CAPABILITIES = 15, /* Capabilities of aocl_mmd_device_alloc (unsigned int)*/ + AOCL_MMD_HOST_MEM_CONCURRENT_GRANULARITY = 16, /*(size_t)*/ + AOCL_MMD_SHARED_MEM_CONCURRENT_GRANULARITY = 17, /*(size_t)*/ + AOCL_MMD_DEVICE_MEM_CONCURRENT_GRANULARITY = 18, /*(size_t)*/ +} aocl_mmd_info_t; + +typedef struct { + unsigned long long int exception_type; + void* user_private_info; + size_t user_cb; +} aocl_mmd_interrupt_info; + +typedef 
void (*aocl_mmd_interrupt_handler_fn)(int handle, void* user_data); +typedef void (*aocl_mmd_device_interrupt_handler_fn)(int handle, aocl_mmd_interrupt_info* data_in, void* user_data); +typedef void (*aocl_mmd_status_handler_fn)(int handle, void* user_data, aocl_mmd_op_t op, int status); + +/* Get information about the board using the enum aocl_mmd_offline_info_t for + * offline info (called without a handle), and the enum aocl_mmd_info_t for + * info specific to a certain board. + * Arguments: + * + * requested_info_id - a value from the aocl_mmd_offline_info_t enum + * + * param_value_size - size of the param_value field in bytes. This should + * match the size of the return type expected as indicated in the enum + * definition. For example, the AOCL_MMD_TEMPERATURE returns a float, so + * the param_value_size should be set to sizeof(float) and you should + * expect the same number of bytes returned in param_size_ret. + * + * param_value - pointer to the variable that will receive the returned info + * + * param_size_ret - receives the number of bytes of data actually returned + * + * Returns: a negative value to indicate error. + */ +AOCL_MMD_CALL int aocl_mmd_get_offline_info(aocl_mmd_offline_info_t requested_info_id, + size_t param_value_size, + void* param_value, + size_t* param_size_ret) WEAK; + +// AOCL_MMD_CALL int aocl_mmd_get_info(int handle, +// aocl_mmd_info_t requested_info_id, +// size_t param_value_size, +// void* param_value, +// size_t* param_size_ret) WEAK; + +/* Open and initialize the named device. + * + * The name is typically one specified by the AOCL_MMD_BOARD_NAMES offline + * info. + * + * Arguments: + * name - open the board with this name (provided as a C-style string, + * i.e. NUL terminated ASCII.) + * + * Returns: the non-negative integer handle for the board, otherwise a + * negative value to indicate error. 
Upon receiving the error, the OpenCL + * runtime will proceed to open other known devices, hence the MMD mustn't + * exit the application if an open call fails. + */ +AOCL_MMD_CALL int aocl_mmd_open(const char* name) WEAK; + +/* Close an opened device, by its handle. + * Returns: 0 on success, negative values on error. + */ +AOCL_MMD_CALL int aocl_mmd_close(int handle) WEAK; + +/* Set the interrupt handler for the opened device. + * The interrupt handler is called whenever the client needs to be notified + * of an asynchronous event signaled by the device internals. + * For example, the kernel has completed or is stalled. + * + * Important: Interrupts from the kernel must be ignored until this handler is + * set + * + * Arguments: + * fn - the callback function to invoke when a kernel interrupt occurs + * user_data - the data that should be passed to fn when it is called. + * + * Returns: 0 if successful, negative on error + */ +AOCL_MMD_CALL int aocl_mmd_set_interrupt_handler(int handle, aocl_mmd_interrupt_handler_fn fn, void* user_data) WEAK; + +/* Set the device interrupt handler for the opened device. + * The device interrupt handler is called whenever the client needs to be notified + * of a device event signaled by the device internals. + * For example, an ECC error has been reported. + * + * Important: Interrupts from the device must be ignored until this handler is + * set + * + * Arguments: + * fn - the callback function to invoke when a device interrupt occurs + * user_data - the data that should be passed to fn when it is called. + * + * Returns: 0 if successful, negative on error + */ +// AOCL_MMD_CALL int aocl_mmd_set_device_interrupt_handler(int handle, +// aocl_mmd_device_interrupt_handler_fn fn, +// void* user_data) WEAK; + +/* Set the operation status handler for the opened device. + * The operation status handler is called with + * status 0 when the operation has completed successfully. + * status negative when the operation completed with errors. 
+ * + * Arguments: + * fn - the callback function to invoke when a status update is to be + * performed. + * user_data - the data that should be passed to fn when it is called. + * + * Returns: 0 if successful, negative on error + */ +//AOCL_MMD_CALL int aocl_mmd_set_status_handler(int handle, aocl_mmd_status_handler_fn fn, void* user_data) WEAK; + +/* If AOCL_MMD_USES_YIELD is 1, this function is called when the host is idle + * and hence possibly waiting for events to be processed by the device. + * If AOCL_MMD_USES_YIELD is 0, this function is never called and the MMD is + * assumed to provide status/event updates via some other execution thread + * such as through an interrupt handler. + * + * Returns: non-zero if the yield function performed useful work such as + * processing DMA transactions, 0 if there is no useful work to be performed + * + * NOTE: yield may be called continuously as long as it reports that it has useful work + */ +//AOCL_MMD_CALL int aocl_mmd_yield(int handle) WEAK; + +/* Read, write and copy operations on a single interface. + * If op is NULL + * - Then these calls must block until the operation is complete. + * - The status handler is not called for this operation. + * + * If op is non-NULL, then: + * - These may be non-blocking calls + * - The status handler must be called upon completion, with status 0 + * for success, and a negative value for failure. + * + * Arguments: + * op - the operation object used to track this operations progress + * + * len - the size in bytes to transfer + * + * src - the host buffer being read from + * + * dst - the host buffer being written to + * + * mmd_interface - the handle to the interface being accessed. E.g. To + * access global memory this handle will be whatever is returned by + * aocl_mmd_get_info when called with AOCL_MMD_MEMORY_INTERFACE. + * + * offset/src_offset/dst_offset - the byte offset within the interface that + * the transfer will begin at. 
+ * + * The return value is 0 if the operation launch was successful, and + * negative otherwise. + */ +AOCL_MMD_CALL int aocl_mmd_read( + int handle, aocl_mmd_op_t op, size_t len, void* dst, int mmd_interface, size_t offset) WEAK; +AOCL_MMD_CALL int aocl_mmd_write( + int handle, aocl_mmd_op_t op, size_t len, const void* src, int mmd_interface, size_t offset) WEAK; +// AOCL_MMD_CALL int aocl_mmd_copy( +// int handle, aocl_mmd_op_t op, size_t len, int mmd_interface, size_t src_offset, size_t dst_offset) WEAK; + +/* Host Channel create operation + * Opens channel between host and kernel. + * + * Arguments: + * channel_name - name of channel to initialize. Same name as used in board_spec.xml + * + * queue_depth - the size in bytes of pinned memory queue in system memory + * + * direction - the direction of the channel + * + * The return value is negative if initialization was unsuccessful, and + * positive otherwise. Positive return value is handle to the channel to be used for + * subsequent calls for the channel. + */ +//AOCL_MMD_CALL int aocl_mmd_hostchannel_create(int handle, char* channel_name, size_t queue_depth, int direction) WEAK; + +/* Host Channel destroy operation + * Closes channel between host and kernel. + * + * Arguments: + * channel - the handle to the channel to close, that was obtained with + * create channel + * + * The return value is 0 if the destroy was successful, and negative + * otherwise. + */ +//AOCL_MMD_CALL int aocl_mmd_hostchannel_destroy(int handle, int channel) WEAK; + +/* Host Channel get buffer operation + * Provide host with pointer to buffer they can access to write or + * read from kernel, along with space or data available in the buffer + * in bytes. 
+ * + * Arguments: + * channel - the handle to the channel to get the buffer for + * + * buffer_size - the address that this call will write the amount of + * space or data that's available in the buffer, + * depending on direction of the channel, in bytes + * + * status - the address that this call will write to for result of this + * call. Value will be 0 for success, and negative otherwise + * + * The return value is the pointer to the buffer that host can write + * to or read from. NULL if the status is negative. + */ +//AOCL_MMD_CALL void* aocl_mmd_hostchannel_get_buffer(int handle, int channel, size_t* buffer_size, int* status) WEAK; + +/* Host Channel acknowledge buffer operation + * Acknowledge to the channel that the user has written or read data from + * it. This will make the data or additional buffer space available to + * write to or read from kernel. + * + * Arguments: + * channel - the handle to the channel that user is acknowledging + * + * send_size - the size in bytes that the user is acknowledging + * + * status - the address that this call will write to for result of this + * call. Value will be 0 for success, and negative otherwise + * + * The return value is equal to send_size if send_size was less than or + * equal to the buffer_size from get buffer call. If send_size was + * greater, then return value is the amount that was actually sent. + */ +//AOCL_MMD_CALL size_t aocl_mmd_hostchannel_ack_buffer(int handle, int channel, size_t send_size, int* status) WEAK; + +/* Program the device + * + * The host will guarantee that no operations are currently executing on the + * device. That means the kernels will be idle and no read/write/copy + * commands are active. Interrupts should be disabled and the FPGA should + * be reprogrammed with the data from user_data which has size size. The host + * will then call aocl_mmd_set_status_handler and aocl_mmd_set_interrupt_handler + * again. At this point interrupts can be enabled. 
+ * + * The new handle to the board after reprogram does not have to be the same as + * the one before. + * + * Arguments: + * user_data - The binary contents of the fpga.bin file created during + * Quartus II compilation. + * size - the size in bytes of user_data + * program_mode - bit field for programming attributes. See + * aocl_mmd_program_mode_t definition + * + * Returns: the new non-negative integer handle for the board, otherwise a + * negative value to indicate error. + */ + +// #ifdef DLA_MMD +// // CoreDLA BSP has removed some stuff that MMD tries to handshake with, so provide a "raw access" function to +// // reprogram the FPGA directly from the sof. Can't call quartus_pgm directly since the MMD still needs to mask +// // the PCIe surprise down error (when full-chip programming the FPGA, the CPU thinks a PCIe device has disappeared). +// // BEWARE: reprogramming will invalidate the handle +// AOCL_MMD_CALL int aocl_mmd_program_sof(int handle, const char* sof_filename) WEAK; +// #else +// AOCL_MMD_CALL int aocl_mmd_program(int handle, void* user_data, size_t size, aocl_mmd_program_mode_t program_mode) WEAK; +// #endif + +/** Error values*/ +#define AOCL_MMD_ERROR_SUCCESS 0 +#define AOCL_MMD_ERROR_INVALID_HANDLE -1 +#define AOCL_MMD_ERROR_OUT_OF_MEMORY -2 +#define AOCL_MMD_ERROR_UNSUPPORTED_ALIGNMENT -3 +#define AOCL_MMD_ERROR_UNSUPPORTED_PROPERTY -4 +#define AOCL_MMD_ERROR_INVALID_POINTER -5 +#define AOCL_MMD_ERROR_INVALID_MIGRATION_SIZE -6 + +/** Memory properties*/ +typedef enum { + /** + * Specifies the name of a global memory that can be found in the + * board_spec.xml file for the BSP. Allocations will be allocated to this + * global memory interface. + */ + AOCL_MMD_MEM_PROPERTIES_GLOBAL_MEMORY = 1, + /** + * Specifies the index of a bank inside the global memory interface that can be found in + * the board_spec.xml file for the BSP. Allocations will be allocated to this + * memory bank. 
It is invalid to specify this property without also specifying + * AOCL_MMD_GLOBAL_MEMORY_INTERFACE. + */ + AOCL_MMD_MEM_PROPERTIES_MEMORY_BANK +} aocl_mmd_mem_properties_t; + +/** + * Host allocations provide memory that is allocated on the host. Host + * allocations are accessible by the host and one or more devices. + * The same pointer to a host allocation may be used on the host and all + * supported devices; they have address equivalence. This memory must be + * deallocated with aocl_mmd_free(); + * + * Once the device has signaled completion through + * aocl_mmd_interrupt_handler_fn() the host can assume it has access to the + * latest contents of the memory, allocated by this call. + * + * @param handles Handles for devices that will need access to this memory + * @param num_devices Number of devices in the handles + * @param size The size of the memory region + * @param alignment The alignment in bytes of the allocation + * @param properties Specifies additional information about the allocated + * memory, described by a property type name and its corresponding value. + * Each property type name is immediately followed by the corresponding + * desired value. The list is terminated with 0. Supported values are + * described above. Example: [<property1>, <value1>, <property2>, <value2>, 0] + * @param error The error code defined by AOCL_MMD_ERROR* + * @return valid pointer, on error NULL + */ +// AOCL_MMD_CALL void* aocl_mmd_host_alloc(int* handles, +// size_t num_devices, +// size_t size, +// size_t alignment, +// aocl_mmd_mem_properties_t* properties, +// int* error) WEAK; + +/** + * Frees memory that has been allocated by MMD + * + * @param mem The pointer to the memory region. Must be a pointer that is + * allocated by the MMD. + * @return AOCL_MMD_ERROR_SUCCESS if success, else error code + */ +// AOCL_MMD_CALL int aocl_mmd_free(void* mem) WEAK; + +/** + * Allocate memory that is owned by the device. 
This pointer can only be + * accessed by the kernel; can't be accessed by the host. The host is able to + * manipulate the pointer (e.g. increment it) just not access the underlying + * data. This memory must be deallocated by aocl_mmd_free(); + * + * @param handle Device that will have access to this memory + * @param size The size of the memory region + * @param alignment The alignment in bytes of the memory region + * @param properties Specifies additional information about the allocated + * memory, described by a property type name and its corresponding value. + * Each property type name is immediately followed by the corresponding + * desired value. The list is terminated with 0. Supported values are + * described above. Example: [<property1>, <value1>, <property2>, <value2>, 0] + * @param error The error code defined by AOCL_MMD_ERROR* + * @return Pointer that can be passed into the kernel. NULL on failure. + */ +// AOCL_MMD_CALL void* aocl_mmd_device_alloc( +// int handle, size_t size, size_t alignment, aocl_mmd_mem_properties_t* properties, int* error) WEAK; + +/** + * Shared allocations may migrate between the host and one or more associated + * device. The same pointer to a shared allocation may be used on the host and + * the supported device; they have address equivalence. + * + * If the device does not support concurrent access to memory allocated by + * aocl_mmd_shared_alloc() then a call must be made to + * aocl_mmd_shared_mem_migrate() to indicate that the shared allocation should + * be migrated to the device before the device accesses this memory. For + * example, a call to aocl_mmd_shared_mem_migrate() should be made before a + * kernel accessing this memory is launched). Conversely, + * aocl_mmd_shared_mem_migrate() should be called again to indicate that the + * shared allocation should be migrated to the host before the host accesses + * this memory again. 
If the device supports concurrent access to memory + * allocated with aocl_mmd_shared_alloc(), then the call to + * aocl_mmd_shared_mem_migrate() is not necessary, but may still be made. In + * the case of concurrent access, it is the responsibility of the MMD to ensure + * both the device and host can access aocl_mmd_shared_alloc() allocations at + * all times. + * + * Memory allocated by aocl_mmd_shared_alloc() must be deallocated with + * aocl_mmd_free(). + * + * @param handle Device that will have access to this memory + * @param size The size of the memory region + * @param alignment The alignment in bytes of the memory region + * @param properties Specifies additional information about the allocated + * memory, described by a property type name and its corresponding value. + * Each property type name is immediately followed by the corresponding + * desired value. The list is terminated with 0. Supported properties are + * listed above and have the prefix AOCL_MMD_MEM_PROPERTIES_. + * Example: [<property1>, <value1>, <property2>, <value2>, 0] + * @param error The error code defined by AOCL_MMD_ERROR* + * @return valid pointer, on error NULL + */ +// AOCL_MMD_CALL void* aocl_mmd_shared_alloc( +// int handle, size_t size, size_t alignment, aocl_mmd_mem_properties_t* properties, int* error) WEAK; + +typedef enum { AOCL_MMD_MIGRATE_TO_HOST = 0, AOCL_MMD_MIGRATE_TO_DEVICE = 1 } aocl_mmd_migrate_t; + +/** + * A call to aocl_mmd_shared_migrate() must be made for non-concurrent shared + * allocations any time the accessor of the allocation changes. For example, + * aocl_mmd_shared_migrate() should be called indicating that the allocation + * should be migrated to the device before a kernel accessing the allocation + * is launched on the device. Similarly, aocl_mmd_shared_migrate() should be + * called indicating that the allocation is migrated to the host before the + * host accesses the memory after kernel completion. 
+ * + * For concurrent allocations this call may be used as a performance hint, but + * is not strictly required for functionality. + * + * @param handle Device that will have access to this memory + * @param shared_ptr Pointer allocated by aocl_mmd_shared_alloc() + * @param size In bytes, the size of the migration. Must be of multiple of a + * page boundary that the BSP supports. + * @param destination The destination of migration + * @return The error code defined by AOCL_MMD_ERROR* + */ +// AOCL_MMD_CALL int aocl_mmd_shared_migrate(int handle, +// void* shared_ptr, +// size_t size, +// aocl_mmd_migrate_t destination) WEAK; + +// CoreDLA modifications +// To support multiple different FPGA boards, anything board specific must be implemented in a +// board-specific MMD instead of the CoreDLA runtime layer. +#ifdef DLA_MMD +// Query functions to get board-specific values +AOCL_MMD_CALL int dla_mmd_get_max_num_instances() WEAK; +AOCL_MMD_CALL uint64_t dla_mmd_get_ddr_size_per_instance() WEAK; +AOCL_MMD_CALL double dla_mmd_get_ddr_clock_freq() WEAK; + +// Wrappers around CSR and DDR reads and writes to abstract away board-specific offsets +AOCL_MMD_CALL int dla_mmd_csr_write(int handle, int instance, uint64_t addr, const uint32_t* data) WEAK; +AOCL_MMD_CALL int dla_mmd_csr_read(int handle, int instance, uint64_t addr, uint32_t* data) WEAK; +AOCL_MMD_CALL int dla_mmd_ddr_write(int handle, int instance, uint64_t addr, uint64_t length, const void* data) WEAK; +AOCL_MMD_CALL int dla_mmd_ddr_read(int handle, int instance, uint64_t addr, uint64_t length, void* data) WEAK; + +#define STREAM_CONTROLLER_ACCESS +#ifdef STREAM_CONTROLLER_ACCESS +AOCL_MMD_CALL bool dla_is_stream_controller_valid(int handle, int instance) WEAK; +AOCL_MMD_CALL int dla_mmd_stream_controller_write(int handle, int instance, uint64_t addr, uint64_t length, const void* data) WEAK; +AOCL_MMD_CALL int dla_mmd_stream_controller_read(int handle, int instance, uint64_t addr, uint64_t length, void* data) 
WEAK; +#endif + +// Get the PLL clock frequency in MHz, returns a negative value if there is an error +AOCL_MMD_CALL double dla_mmd_get_coredla_clock_freq(int handle) WEAK; +#endif + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/python/openvino/runtime/coredla_device/mmd/system_console/CMakeLists.txt b/python/openvino/runtime/coredla_device/mmd/system_console/CMakeLists.txt new file mode 100644 index 0000000..d8be216 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/system_console/CMakeLists.txt @@ -0,0 +1,2 @@ + +add_library(system_console_mmd INTERFACE) diff --git a/python/openvino/runtime/coredla_device/mmd/system_console/mmd_wrapper.cpp b/python/openvino/runtime/coredla_device/mmd/system_console/mmd_wrapper.cpp new file mode 100644 index 0000000..64c6631 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/system_console/mmd_wrapper.cpp @@ -0,0 +1,320 @@ +// Copyright 2020-2024 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. 
+ +#include "mmd_wrapper.h" +#include "dla_dma_constants.h" // DLA_DMA_CSR_OFFSET_*** + +#include <cassert> // assert +#include <cstddef> // size_t +#include <iostream> // std::cerr +#include <stdexcept> // std::runtime_error +#include <string> // std::string + +#include <boost/process.hpp> +#include <boost/filesystem.hpp> +#include <boost/format.hpp> +#include <boost/filesystem/fstream.hpp> +#include <boost/process/environment.hpp> +#include <string> +#include <iostream> +#include <string> +#include <cstdio> +#include <sstream> +#include <ostream> + +#define xstr(s) _str(s) +#define _str(s) #s + +// All board variants must obey the CoreDLA CSR spec, which says that all access must be +// - 32 bits in size +// - address must be 4 byte aligned +// - within the address range, CSR size is 2048 bytes +constexpr uint64_t DLA_CSR_ALIGNMENT = 4; +constexpr uint64_t DLA_CSR_SIZE = 2048; +namespace bp = boost::process; //we will assume this for all further examples + +constexpr auto max_size = std::numeric_limits<std::streamsize>::max(); + +static const boost::filesystem::path system_console_path("/home/pmclean/intelfpga_pro/23.4/qprogrammer/syscon/bin/system-console"); +static boost::filesystem::path temp_file_path; +static boost::filesystem::path tcl_file_path; +static boost::filesystem::path sof_file_path; +static uint32_t enable_pmon; +static bool preserve_temp_files; + +const uint32_t DLA_CSR_BASE_ADDRESS = 0x80000000; +const uint32_t DLA_DDR_BASE_ADDRESS = 0x0; + + +static bp::opstream in; +static bp::ipstream out; +static bp::child subprocess; + +static int capture_till_prompt(bp::ipstream& out, std::ostream& capture) +{ + std::array<char, 4096> line_buffer; + if (out.fail()) { + std::cout << "EOF" << std::endl; + return 1; + } + + do { + out.clear(); + out.getline(&line_buffer[0], (std::streamsize)line_buffer.size(), '%'); + capture.write(&line_buffer[0], out.gcount()); + // If out.getline fills the line buffer without encountering the delimiter + // then the 
failbit of out will be set, causing out.fail() to return true. + // bp::ipstream indirectly inherits std::ios_base::iostate, which defines failbit/badbit + } while (out.fail() && (static_cast<long unsigned int> (out.gcount()) == line_buffer.size()-1)); + + if (out.fail()) { + std::cout << "EOF" << std::endl; + return 1; + } + return 0; +} + +static int wait_for_prompt(bp::ipstream& out) +{ + return capture_till_prompt(out, std::cout); +} + +std::string remove_non_alphanumeric(const std::string& input) { + std::string result = input; + result.erase(std::remove_if(result.begin(), result.end(), [](unsigned char c) { + return !std::isalnum(c); + }), result.end()); + return result; +} + +static void send_command(bp::opstream& in, std::string command) +{ + in << command << "\n"; + in.flush(); +} + +static void write_to_csr(bp::opstream& in, bp::ipstream& out, uint32_t addr, uint32_t data) { + addr += DLA_CSR_BASE_ADDRESS; + send_command(in, "master_write_32 $::g_dla_csr_service " + str( boost::format("0x%|08x| 0x%|08x|") % addr % data)); + if (0 != wait_for_prompt(out)) + { + throw std::runtime_error("Unexpected EOF"); + } +} + +static uint32_t read_from_csr(bp::opstream& in, bp::ipstream& out, uint32_t addr) { + if (addr == DLA_DMA_CSR_OFFSET_INTERRUPT_MASK) + { + return 3; + } + if (addr == DLA_DMA_CSR_OFFSET_LICENSE_FLAG) + { + return 1; + } + addr += DLA_CSR_BASE_ADDRESS; + send_command(in, "master_read_32 $::g_dla_csr_service " + str( boost::format("0x%|08x|") % addr ) + " 1"); + std::basic_stringstream<char> s1; + std::string captured; + do { + if (0 != capture_till_prompt(out, s1)) + { + throw std::runtime_error("Unexpected EOF"); + } + captured = s1.str(); + } while (std::all_of(captured.begin(), captured.end(), [](unsigned char c){return (std::isspace(c) || std::iscntrl(c));})); + std::string trimmed = remove_non_alphanumeric(captured); + + uint32_t data = std::stoul(trimmed, nullptr, 16); + + return data; +} + +static void read_from_ddr(bp::opstream& in, 
bp::ipstream& out, uint64_t addr, uint64_t length, void* data) +{ + if (data == nullptr) + { + throw std::runtime_error("null data"); + } + boost::filesystem::path temp_file_name = boost::filesystem::unique_path(); + boost::filesystem::path temppath = temp_file_path / temp_file_name; + send_command(in, "master_read_to_file $::g_emif_ddr_service " + temppath.generic_string() + str( boost::format(" 0x%|08x| 0x%|08x|") % addr % length ) ); + if (0 != wait_for_prompt(out)) { + throw std::runtime_error("Unexpected EOF"); + } + boost::filesystem::ifstream ifs(temppath, std::ios::in | std::ios::binary); + ifs.read(static_cast<char *>(data), length); + ifs.close(); + + if (!preserve_temp_files) { + try { + boost::filesystem::remove(temppath); + } catch (const boost::filesystem::filesystem_error& ex) { + std::cerr << "Error removing file: " << ex.what() << std::endl; + } + } +} + +static void write_to_ddr(bp::opstream& in, bp::ipstream& out, uint64_t addr, uint64_t length, const void* data) +{ + boost::filesystem::path temp_file_name = boost::filesystem::unique_path(); + boost::filesystem::path temppath = temp_file_path / temp_file_name; + boost::filesystem::ofstream ofs(temppath, std::ios::out | std::ios::binary); + if (ofs.fail()) { + throw std::runtime_error("Failed to access the temporary file " + temppath.generic_string()); + } + ofs.write(static_cast<const char *>(data), length); + ofs.close(); + send_command(in, "master_write_from_file $::g_emif_ddr_service " + temppath.generic_string() + str( boost::format(" 0x%|08x|") % addr ) ); + if (0 != wait_for_prompt(out)) + { + throw std::runtime_error("Unexpected EOF"); + } + + if (!preserve_temp_files) { + try { + boost::filesystem::remove(temppath); + } catch (const boost::filesystem::filesystem_error& ex) { + std::cerr << "Error removing file: " << ex.what() << std::endl; + } + } +} + +MmdWrapper::MmdWrapper() { + // Check for the envrionment variable + auto env = boost::this_process::environment(); + tcl_file_path = 
env.find("DLA_SYSCON_SOURCE_FILE") != env.end() ? + boost::filesystem::path(env["DLA_SYSCON_SOURCE_FILE"].to_string()) : + boost::filesystem::path(xstr(DLA_SYSCON_SOURCE_ROOT)) / "system_console_script.tcl"; + if (!boost::filesystem::exists(tcl_file_path)) { + throw std::runtime_error("Cannot locate " + tcl_file_path.generic_string() + ". Please specify the path of the Tcl setup script by defining the environment variable DLA_SYSCON_SOURCE_FILE\n"); + } else { + std::cout <<"Using the Tcl setup script at "<<tcl_file_path.generic_string()<<std::endl; + } + + temp_file_path = env.find("DLA_TEMP_DIR") != env.end() ? + boost::filesystem::path(env["DLA_TEMP_DIR"].to_string()) : + boost::filesystem::current_path(); + if (!boost::filesystem::exists(temp_file_path)) { + throw std::runtime_error("The temporary file storage directory specified via the environment variable DLA_TEMP_DIR does not exist.\n"); + } else { + std::cout <<"Saving temporary files to "<<temp_file_path.generic_string()<<std::endl; + } + + sof_file_path = env.find("DLA_SOF_PATH") != env.end() ? + boost::filesystem::path(env["DLA_SOF_PATH"].to_string()): + boost::filesystem::current_path() / "top.sof"; + if (!boost::filesystem::exists(sof_file_path)) { + throw std::runtime_error("Cannot find the FPGA bitstream (.sof). Please specify its location via the environment variable DLA_SOF_PATH,"\ + " or copy it as top.sof to the current working directory.\n"); + } else { + std::cout <<"Using the FPGA bitstream at "<<sof_file_path.generic_string()<<" to configure the JTAG connection"<<std::endl; + } + + boost::filesystem::path system_console_path = bp::search_path("system-console"); + if (system_console_path.empty()) { + throw std::runtime_error("Cannot find system-console in system PATH!\n"); + + } + enable_pmon = env.find("DLA_ENABLE_PMON") != env.end() ? 1 : 0; + + preserve_temp_files = env.find("DLA_PRESERVE_TEMP_FILES") != env.end() ? 
true : false; + + subprocess = bp::child(system_console_path, "-cli", bp::std_out > out, bp::std_in < in); + if (wait_for_prompt(out)) + { + throw std::runtime_error("Could not find initial prompt"); + } + send_command(in, "set ::cl(sof) " + sof_file_path.generic_string()); + if (enable_pmon == 1) { + send_command(in, "set ::cl(enable_pmon) 1"); + } + send_command(in, "source " + tcl_file_path.generic_string()); + std::basic_stringstream<char> s1; + if (0 != capture_till_prompt(out, s1)) + { + throw std::runtime_error("Could not find prompt after source"); + } + std::string captured(s1.str()); + + // Reset the IP + write_to_csr(in, out, DLA_DMA_CSR_OFFSET_IP_RESET, 1); + // Constants of the design + maxInstances_ = 1; + ddrSizePerInstance_ = 0x80000000; + // Need to change the frequencies below when their counterparts in the Qsys system are modified + coreDlaClockFreq_ = 200; + ddrClockFreq_ = 200; + // Initialize the handle_ object to a dummy value. It is not relevant to this MMD + handle_ = 0; +} + +MmdWrapper::~MmdWrapper() { + send_command(in, "close_services"); + if (wait_for_prompt(out)) + { + std::cout << "Could not find prompt after attempting to close system console services\n"; + } + send_command(in, "exit"); + try { + subprocess.terminate(); + std::cout << "Successfully closed JTAG services.\n"; + } catch (const boost::process::process_error& e) { + std::cerr << "Failed to terminate the system-console process due to reason: " << e.what() << std::endl; + } +} + +void MmdWrapper::RegisterISR(interrupt_service_routine_signature func, void *data) const { + throw std::runtime_error("System Console plugin requires polling"); +} + +void MmdWrapper::WriteToCsr(int instance, uint32_t addr, uint32_t data) const { + write_to_csr(in, out, addr, data); +} + +uint32_t MmdWrapper::ReadFromCsr(int instance, uint32_t addr) const { + return read_from_csr(in, out, addr); +} + +void MmdWrapper::WriteToDDR(int instance, uint64_t addr, uint64_t length, const void *data) const 
{ + write_to_ddr(in, out, addr, length, data); +} + +void MmdWrapper::ReadFromDDR(int instance, uint64_t addr, uint64_t length, void *data) const { + read_from_ddr(in, out, addr, length, data); +} + +#ifndef STREAM_CONTROLLER_ACCESS +// Stream controller access is not supported by the platform abstraction +bool MmdWrapper::bIsStreamControllerValid(int instance) const { return false; } + +// 32-bit handshake with each Stream Controller CSR +void MmdWrapper::WriteToStreamController(int instance, uint32_t addr, uint64_t length, const void *data) const { + assert(false); +} + +void MmdWrapper::ReadFromStreamController(int instance, uint32_t addr, uint64_t length, void *data) const { + assert(false); +} +#else +// If the mmd layer supports accesses to the Stream Controller +bool MmdWrapper::bIsStreamControllerValid(int instance) const { + return false; +} + +// 32-bit handshake with each Stream Controller CSR +void MmdWrapper::WriteToStreamController(int instance, uint32_t addr, uint64_t length, const void *data) const { +} + +void MmdWrapper::ReadFromStreamController(int instance, uint32_t addr, uint64_t length, void *data) const { +} +#endif diff --git a/python/openvino/runtime/coredla_device/mmd/system_console/system_console_script.tcl b/python/openvino/runtime/coredla_device/mmd/system_console/system_console_script.tcl new file mode 100644 index 0000000..9e0e386 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/system_console/system_console_script.tcl @@ -0,0 +1,79 @@ +# Author: linqiaol +# Purpose: Perform write-read tests on external memory and CoreDLA CSR to make sure the registers can be accessed from host. 
+
+# Declare and initialize CL arguments
+# (defaults used when the C++ MmdWrapper does not set them before sourcing)
+if {![info exists ::cl(sof)]} {
+    set ::cl(sof) "top.sof"
+}
+
+if {![info exists ::cl(enable_pmon)]} {
+    set ::cl(enable_pmon) 0
+}
+
+# Declare global variables
+# Handles of the claimed JTAG master services; empty until initialization runs.
+set ::g_emif_calip_service ""
+set ::g_emif_ddr_service ""
+set ::g_dla_csr_service ""
+set ::g_pmon_service ""
+
+# Declare some constants
+# Address windows claimed on the JTAG master: DDR at 0x0 (2 GB) and the DLA CSR
+# window at 0x080000000 -- must match DLA_CSR_BASE_ADDRESS in mmd_wrapper.cpp.
+set ::g_const_master_offset_emif 0x0
+set ::g_const_master_range_emif 0x080000000
+set ::g_const_master_offset_dla 0x080000000
+set ::g_const_master_range_dla 0x000001000
+
+#{{{ load_sof
+# Program the FPGA with the bitstream named in ::cl(sof).
+proc load_sof {} {
+    puts "loading sof: $::cl(sof)"
+    design_load $::cl(sof)
+}
+#}}}
+
+#{{{claim_emif_ddr_service
+# Claim exclusive access to the DDR window on the first JTAG master found.
+proc claim_emif_ddr_service {} {
+    set all_master_paths [get_service_paths master]
+    set path [lindex $all_master_paths [lsearch -glob $all_master_paths *jtag*master*]]
+    set service [claim_service master $path {} "\{${::g_const_master_offset_emif} ${::g_const_master_range_emif} EXCLUSIVE\}"]
+    return $service
+}
+#}}}
+
+#{{{claim_dla_csr_service
+# Claim exclusive access to the DLA CSR window on the first JTAG master found.
+proc claim_dla_csr_service {} {
+    set all_master_paths [get_service_paths master]
+    set path [lindex $all_master_paths [lsearch -glob $all_master_paths *jtag*master*]]
+    set service [claim_service master $path {} "\{${::g_const_master_offset_dla} ${::g_const_master_range_dla} EXCLUSIVE\}"]
+    return $service
+}
+#}}}
+
+#{{{claim_pmon_service
+# Claim the AXI4 performance-monitor master (only when ::cl(enable_pmon) is 1).
+proc claim_pmon_service {} {
+    set all_master_paths [get_service_paths master]
+    set path [lindex $all_master_paths [lsearch -glob $all_master_paths *pmon*master*]]
+    set service [claim_service master $path {} {{0x0 0x00001000 EXCLUSIVE}}]
+    return $service
+}
+#}}}
+
+# Program the device and claim every service the C++ side will use.
+proc initialization {} {
+    load_sof
+    puts "Claim required services"
+    set ::g_dla_csr_service [claim_dla_csr_service]
+    set ::g_emif_ddr_service [claim_emif_ddr_service]
+    if {$::cl(enable_pmon) == 1} {
+        puts "Claiming JTAG service to the AXI4 performance monitor"
+        set ::g_pmon_service [claim_pmon_service]
+    }
+}
+
+# Release every claimed service; invoked by the MmdWrapper destructor.
+proc close_services {} {
+    close_service master $::g_dla_csr_service
+    if {$::cl(enable_pmon) == 1} {
+        close_service master $::g_pmon_service
+    }
+    close_service master $::g_emif_ddr_service
+    puts "Closed DLA JTAG services"
+}
+
+initialization
\ No newline at end of file diff --git a/python/openvino/runtime/coredla_device/src/coredla_batch_job.cpp b/python/openvino/runtime/coredla_device/src/coredla_batch_job.cpp new file mode 100644 index 0000000..9ac7598 --- /dev/null +++ b/python/openvino/runtime/coredla_device/src/coredla_batch_job.cpp @@ -0,0 +1,125 @@ +// Copyright 2020-2023 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. + +#include "coredla_batch_job.h" //CoreDlaBatchJob +#include "dla_dma_constants.h" //DLA_DMA_CSR_OFFSET_*** +#include "stream_controller_comms.h" + +static constexpr int CONFIG_READER_DATA_BYTES = 8; + +std::unique_ptr<BatchJob> CoreDlaBatchJob::MakeUnique(MmdWrapper* mmdWrapper, + uint64_t totalConfigWords, + uint64_t configBaseAddrDDR, + uint64_t inputAddrDDR, + uint64_t outputAddrDDR, + uint64_t inputSizeDDR, + uint64_t outputSizeDDR, + const bool enableIstream, + const bool enableOstream, + int instance, + std::shared_ptr<StreamControllerComms> spStreamControllerComms) { + return std::unique_ptr<BatchJob>(new CoreDlaBatchJob(mmdWrapper, + totalConfigWords, + configBaseAddrDDR, + inputAddrDDR, + outputAddrDDR, + inputSizeDDR, + outputSizeDDR, + enableIstream, + enableOstream, + instance, + spStreamControllerComms)); +} +CoreDlaBatchJob::CoreDlaBatchJob(MmdWrapper* mmdWrapper, + uint64_t totalConfigWords, + uint64_t configBaseAddrDDR, + uint64_t inputAddrDDR, + uint64_t outputAddrDDR, + uint64_t inputSizeDDR, + uint64_t outputSizeDDR, + const 
bool enableIstream, + const bool enableOstream, + int instance, + std::shared_ptr<StreamControllerComms> spStreamControllerComms) +: mmdWrapper_(mmdWrapper) +, instance_(instance) +, totalConfigWords_(totalConfigWords) +, configBaseAddrDDR_(configBaseAddrDDR) +, inputAddrDDR_(inputAddrDDR) +, outputAddrDDR_(outputAddrDDR) +, inputSizeDDR_(inputSizeDDR) +, outputSizeDDR_(outputSizeDDR) +, enableIstream_(enableIstream) +, enableOstream_(enableOstream) +, lastJobQueueNumber_(0) +, spStreamControllerComms_(spStreamControllerComms) { +} + +// This function must be called by a single thread +// It can be called on a different thread than StartDla or WaitForDla +void CoreDlaBatchJob::LoadInputFeatureToDDR(void* inputArray) { + mmdWrapper_->WriteToDDR(instance_, inputAddrDDR_, inputSizeDDR_, inputArray); + StartDla(); +} + +void CoreDlaBatchJob::ScheduleInputFeature() const { + if (spStreamControllerComms_) { + // Send message to NIOS-V + uint64_t configurationSize64 = (totalConfigWords_ / CONFIG_READER_DATA_BYTES) - 2; + uint32_t configurationBaseAddressDDR = static_cast<uint32_t>(configBaseAddrDDR_); + uint32_t configurationSize = static_cast<uint32_t>(configurationSize64); + uint32_t inputAddressDDR = static_cast<uint32_t>(inputAddrDDR_); + uint32_t outputAddressDDR = static_cast<uint32_t>(outputAddrDDR_); + + Payload<CoreDlaJobPayload> item; + item._configurationBaseAddressDDR = configurationBaseAddressDDR; + item._configurationSize = configurationSize; + item._inputAddressDDR = inputAddressDDR; + item._outputAddressDDR = outputAddressDDR; + + spStreamControllerComms_->ScheduleItems( { item } ); + } +} + +// This function must be called by a single thread +// It can be called on a different thread than WaitForDla or LoadInputFeatureToDDR +void CoreDlaBatchJob::StartDla() { + ////////////////////////////////////// + // Write to CSR to start the FPGA // + ////////////////////////////////////// + + // interrupt mask was already enabled in the DlaDevice constructor + + // 
intermediate buffer address was already set when the graph was loaded + + // base address for config reader + mmdWrapper_->WriteToCsr(instance_, DLA_DMA_CSR_OFFSET_CONFIG_BASE_ADDR, configBaseAddrDDR_); + + // how many words for config reader to read + // hardware wants the number of words minus 2 since the implementation is a down counter which ends at -1, the sign + // bit is used to denote the end of the counter range + mmdWrapper_->WriteToCsr(instance_, DLA_DMA_CSR_OFFSET_CONFIG_RANGE_MINUS_TWO, (totalConfigWords_ / CONFIG_READER_DATA_BYTES) - 2); + + if (enableIstream_ && enableOstream_) { + // Arm the streaming interface. Will continuously load configs. + const unsigned int enable = 1; + mmdWrapper_->WriteToCsr(instance_, DLA_CSR_OFFSET_READY_STREAMING_IFACE, enable); + } else { + // base address for feature reader -- this will trigger one run of DLA + mmdWrapper_->WriteToCsr(instance_, DLA_DMA_CSR_OFFSET_INPUT_OUTPUT_BASE_ADDR, inputAddrDDR_); + } +} + +void CoreDlaBatchJob::ReadOutputFeatureFromDDR(void* outputArray) const { + mmdWrapper_->ReadFromDDR(instance_, outputAddrDDR_, outputSizeDDR_, outputArray); +} diff --git a/python/openvino/runtime/coredla_device/src/coredla_device.cpp b/python/openvino/runtime/coredla_device/src/coredla_device.cpp new file mode 100644 index 0000000..b28d8a2 --- /dev/null +++ b/python/openvino/runtime/coredla_device/src/coredla_device.cpp @@ -0,0 +1,574 @@ +// Copyright 2020-2023 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. 
+// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. + +#include "coredla_device.h" //CoreDlaDevice +#include "coredla_batch_job.h" //CoreDlaBatchJob +#include "coredla_graph_job.h" //CoreDlaBatchJob +#include "dla_dma_constants.h" //DLA_DMA_CSR_OFFSET_*** +#include "stream_controller_comms.h" + +#include <algorithm> //std::count +#include <cassert> //assert +#include <chrono> //std::chrono::seconds +#include <cstddef> //size_t +#include <cstdlib> //std::getenv +#ifndef USE_OLD_COREDLA_DEVICE +#include <cinttypes> //printf formatters +#endif +#include <mutex> //std::mutex +#include <stdexcept> //std::runtime_error +#include <string> //std::string +#include <iostream> //std::cerr +#include <stdint.h> // +#include <thread> +#include <cinttypes> + +std::unique_ptr<Device> Device::MakeUnique(const arch_params* archParams, + uint32_t waitForDlaTimeoutSeconds) { + return std::unique_ptr<Device>(new CoreDlaDevice(waitForDlaTimeoutSeconds)); +} + +void InterruptServiceRoutine(int handle, void* data) { + InterruptServiceRoutineData* isrData = static_cast<InterruptServiceRoutineData*>(data); + // clear interrupt status -- write 1 to clear that bit + constexpr int writeDataToClearInterruptStatus = 3; + const int numInstances = static_cast<int>(isrData->jobsFinished.size()); + for (int i = 0; i < numInstances; i++) { + isrData->mmdWrapper->WriteToCsr(i, DLA_DMA_CSR_OFFSET_INTERRUPT_CONTROL, writeDataToClearInterruptStatus); + } + for (int i = 0; i < numInstances; i++) { + isrData->desc_queue_diag[i] = isrData->mmdWrapper->ReadFromCsr(i, DLA_DMA_CSR_OFFSET_DESC_DIAGNOSTICS); + // ask the csr how many jobs have finished + uint32_t completionCount = isrData->mmdWrapper->ReadFromCsr(i, DLA_DMA_CSR_OFFSET_COMPLETION_COUNT); + // check if the completionCount wraps around (overflow detection) and save this information + if (isrData->prevCount[i] > completionCount) 
+ isrData->base_multiplier[i] ++; + isrData->prevCount[i] = completionCount; + // we add base_multiplier to account for the fact that a wrap around is actually an increment of 1 + std::unique_lock<std::mutex> isrMutexLock(isrData->isrMutex[i]); + isrData->jobsFinished[i] = (uint64_t) isrData->base_multiplier[i] * UINT32_MAX + completionCount + isrData->base_multiplier[i]; + isrData->isrCondVar[i].notify_all(); + } +} + +CoreDlaDevice::CoreDlaDevice(uint32_t waitForDlaTimeoutSeconds) +: waitForDlaTimeoutSeconds_(waitForDlaTimeoutSeconds) { +#ifdef COREDLA_RUNTIME_POLLING + runtimePolling_ = true; +#else + runtimePolling_ = false; +#endif + // mmdWrapper_ ctor runs first, which will open a handle to the MMD. Now determine the number of hardware instances + // by writing a nonzero value to some offset and then reading it back. While trying to enable the interrupt + // mask, test for this. + numInstances_ = 0; + for (int i = 0; i < mmdWrapper_.GetMaxInstances(); i++) { + constexpr uint32_t allInterruptsMask = (1<<DLA_DMA_CSR_INTERRUPT_ERROR_BIT) | (1<<DLA_DMA_CSR_INTERRUPT_DONE_BIT); + // clear any pending interrupts (there may be pending interrupts from last run), then enable mask for instance count + mmdWrapper_.WriteToCsr(i, DLA_DMA_CSR_OFFSET_INTERRUPT_CONTROL, allInterruptsMask); + mmdWrapper_.WriteToCsr(i, DLA_DMA_CSR_OFFSET_INTERRUPT_MASK, allInterruptsMask); + uint32_t readData = mmdWrapper_.ReadFromCsr(i, DLA_DMA_CSR_OFFSET_INTERRUPT_MASK); + if (allInterruptsMask == readData) numInstances_ = i + 1; + } + LOG_AND_PRINT(Logger::INFO, "numInstances_: %d\n", numInstances_); + assert(numInstances_ >= 1); + jobsWaited_.resize(numInstances_, 0); + + uint32_t license = mmdWrapper_.ReadFromCsr(0, DLA_DMA_CSR_OFFSET_LICENSE_FLAG); + if (license == 0) { + DLA_LOG("Using unlicensed IP\n"); + } + else if (license == 1) { + DLA_LOG("Using licensed IP\n"); + } + else { + throw std::runtime_error("Unrecongnized license flag"); + } +#ifndef USE_OLD_COREDLA_DEVICE + 
startClocksActive.resize(numInstances_, 0); + startClockAllJobs.resize(numInstances_, 0); +#endif + startNumInputFeatureMemoryReads.resize(numInstances_, 0); + startNumFilterMemoryReads.resize(numInstances_, 0); + startNumOutputFeatureMemoryWrites.resize(numInstances_, 0); + + // Package up the data that interrupt service routine needs + isrData_.mmdWrapper = &mmdWrapper_; + isrData_.jobsFinished = std::vector<uint64_t>(numInstances_, 0); + isrData_.base_multiplier = std::vector<uint32_t>(numInstances_, 0); + isrData_.prevCount = std::vector<uint32_t>(numInstances_, 0); + isrData_.desc_queue_diag = std::vector<uint32_t>(numInstances_, 0); + isrData_.isrMutex = std::vector<std::mutex>(numInstances_); + isrData_.isrCondVar = std::vector<std::condition_variable>(numInstances_); + + if (runtimePolling_) { + // disable the interrupt mask -- it was originally enabled to determine how many instances were present + for (int i = 0; i < mmdWrapper_.GetMaxInstances(); i++) { + constexpr uint32_t disableInterruptMaskValue = 0; + mmdWrapper_.WriteToCsr(i, DLA_DMA_CSR_OFFSET_INTERRUPT_MASK, disableInterruptMaskValue); + } + } + else { + // register an interrupt handler + mmdWrapper_.RegisterISR(&InterruptServiceRoutine, &isrData_); + } + + // Record the current counters + for(int i=0; i < numInstances_; i++) { +#ifndef USE_OLD_COREDLA_DEVICE + jobsWaited_[i] = mmdWrapper_.ReadFromCsr(i, DLA_DMA_CSR_OFFSET_COMPLETION_COUNT); + isrData_.jobsFinished[i] = jobsWaited_[i]; + + startClocksActive[i] = GetClocksActive(i); + startClockAllJobs[i] = GetClocksAllJobs(i); +#endif + startNumInputFeatureMemoryReads.at(i) = GetNumInputFeatureMemoryReadsTotal(i); + startNumFilterMemoryReads.at(i) = GetNumFilterMemoryReadsTotal(i); + startNumOutputFeatureMemoryWrites.at(i) = GetNumOutputFeatureMemoryWritesTotal(i); + } + + // Allocator needs access to mmd to write to CSR the start address of the shared intermediate buffer allocated in DDR + ddrAllocator_ = 
std::unique_ptr<DeviceMemoryAllocator[]>(new DeviceMemoryAllocator[numInstances_]); + for (int i = 0; i < numInstances_; i++) { + ddrAllocator_[i].Initialize(mmdWrapper_.GetDDRSizePerInstance(), &mmdWrapper_); + } + +// Choose which data pattern you want, all zeros or all ones can also be useful for IP debug purposes +#define DEBUG_RUNTIME_MEMORY_TEST_PATTERN(ADDR, INDEX) ((ADDR * 12345) + (INDEX * 6789)) + //#define DEBUG_RUNTIME_MEMORY_TEST_PATTERN(ADDR,INDEX) (0) + //#define DEBUG_RUNTIME_MEMORY_TEST_PATTERN(ADDR,INDEX) (0xffffffffffffffffULL) + bool run_memory_test = getenv("COREDLA_RUNTIME_MEMORY_TEST") != nullptr; + if (run_memory_test) { + // Ensure host can access all of the device memory that is accessible by all CoreDLA instances + // This is not necessarily the total device memory e.g. only 1 CoreDLA instance but 2 DDR banks + DLA_LOG("starting memory test with %d instances\n", numInstances_); + constexpr uint64_t CHUNK_SIZE = 1ULL << 20; // one address check is 1 MB + const uint64_t ADDR_LIMIT = mmdWrapper_.GetDDRSizePerInstance(); + int mismatch = 0; + uint64_t expected; + uint64_t* data = new uint64_t[CHUNK_SIZE / sizeof(uint64_t)]; + + for (int inst = 0; inst < numInstances_; ++inst) { + // write to entire fpga ddr + for (uint64_t addr = 0; addr < ADDR_LIMIT; addr += CHUNK_SIZE) { + for (uint64_t index = 0; index < CHUNK_SIZE / sizeof(uint64_t); index++) + data[index] = DEBUG_RUNTIME_MEMORY_TEST_PATTERN(addr, index); + mmdWrapper_.WriteToDDR(inst, addr, CHUNK_SIZE, static_cast<const void*>(data)); + } + // read back entire fpga ddr and compare to expected + for (uint64_t addr = 0; addr < ADDR_LIMIT; addr += CHUNK_SIZE) { + mmdWrapper_.ReadFromDDR(inst, addr, CHUNK_SIZE, data); + for (uint64_t index = 0; index < CHUNK_SIZE / sizeof(uint64_t); index++) { + expected = DEBUG_RUNTIME_MEMORY_TEST_PATTERN(addr, index); + if (data[index] != expected) { + if (mismatch < 10) { +#if (!defined(USE_OLD_COREDLA_DEVICE) || defined(_WIN32)) + DLA_LOG("memory test 
mismatch, addr %" PRIu64 ", index %" PRIu64 ", got %" PRIu64 ", expected %" PRIu64 + "\n", + addr, + index, + data[index], + expected); +#else + DLA_LOG("memory test mismatch, addr %lu, index %lu, got %lu, expected %lu\n", + addr, + index, + data[index], + expected); +#endif + } + mismatch++; + } + } + } + } + delete[] data; + DLA_LOG("finished memory test "); + if (mismatch == 0) { + DLA_LOG("SUCCESS\n"); + } else { + DLA_LOG("FAILURE (%d mismatches)\n", mismatch); + } + } +} + +CoreDlaDevice::~CoreDlaDevice() { + // Avoid the scenario where some CoreDLA job has been started but something goes wrong + // in the runtime which causes it to exit, e.g. assertion failure or uncaught exception. + // CoreDLA will still raise an interrupt when the job finishes, yet the runtime will no + // longer be able to deal with it. Better to shut off interurpts. + for (int instance = 0; instance < numInstances_; instance++) { + // MmDWrapper.WriteToCSR might throw exception, and the destructor should not have + // unhandled exception, so we need to handle exceptions internally + try { + mmdWrapper_.WriteToCsr(instance, DLA_DMA_CSR_OFFSET_INTERRUPT_MASK, 0); + } catch (const std::exception& e) { + std::cerr << "Failed to shut off the DMA CSR interrupt mask due to " << e.what() << std::endl; + } + } +} + +GraphJob* CoreDlaDevice::CreateGraphJob(const dla::CompiledResult* compiledResult, +#ifndef USE_OLD_COREDLA_DEVICE + size_t numPipelines, +#else + uint64_t numPipelines, +#endif + int instance, + std::string AES_key, + std::string IV_key, + bool encryption_enabled, + const std::string export_dir, + const std::string parameter_rom_export_dir) { + assert(instance < numInstances_); + (void) export_dir; // unused in HW runtime. CoreDLA utilizes base pointers, which the SW emulator utilizes this variable. We void it here. 
+ allGraphJobs_.push_back(move( + CoreDlaGraphJob::MakeUnique(&ddrAllocator_[instance], &mmdWrapper_, compiledResult, numPipelines, instance, spStreamControllerComms_))); + return (allGraphJobs_.back()).get(); +} + +// This function must be called by a single thread +void CoreDlaDevice::WaitForDla(int instance, size_t threadId, std::function<bool()> isCancelledPredicate) { + // ISR updates jobsFinished, if not enough jobs have finished then sleep until ISR runs again + // it is possible that several hardware jobs could finish around the same time + // by the time software handles the first interrupt, hardware could report that 2 jobs have + // finished, for example the second time that waitForInterrupt runs, software already tracks + // that the second job has finished and therefore don't need to sleep waiting for ISR + std::unique_lock<std::mutex> isrMutexLock(isrData_.isrMutex[instance]); + uint32_t completionCount = 0; + bool timedOut = false; + auto timeoutDuration = std::chrono::seconds(waitForDlaTimeoutSeconds_); + + if (runtimePolling_) { + std::chrono::time_point<std::chrono::system_clock> pollingEndingTime = + std::chrono::system_clock::now() + timeoutDuration; + + while (isrData_.jobsFinished[instance] == jobsWaited_[instance]) { + // Update isrData_.jobsFinished[instance] here (polling) + if (isCancelledPredicate and isCancelledPredicate()) { + break; + } + + completionCount = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_COMPLETION_COUNT); + isrData_.jobsFinished[instance] = completionCount; + if (std::chrono::system_clock::now() > pollingEndingTime) { + timedOut = true; + break; + } + } + } else { + while (isrData_.jobsFinished[instance] == jobsWaited_[instance]) { + // isrData_.jobsFinished[instance] is updated in the ISR + if (std::cv_status::timeout == isrData_.isrCondVar[instance].wait_for(isrMutexLock, timeoutDuration)) { + timedOut = true; + break; + } + } + } + + if (timedOut) { + std::string str_poll_vs_int = "interrupt"; + if 
(runtimePolling_) { + str_poll_vs_int = "polling"; + } + std::string timeoutMsg = "WaitForDla " + str_poll_vs_int + " timeout with threadId_" + std::to_string(threadId) + "\n"; + + // Timeout has happened if we get here + timeoutMsg += "If inference on one batch is expected to take more than " + + std::to_string(waitForDlaTimeoutSeconds_) + + " seconds, then increase WAIT_FOR_DLA_TIMEOUT in dlia_plugin.cpp and " + "recompile the runtime.\n"; + DLA_LOG("%s", timeoutMsg.c_str()); // this should always print, even if logging + // verbosity is too low + LOG(Logger::WARNING, "%s", timeoutMsg.c_str()); + std::string exceptionMsg = "FATAL ERROR: inference on FPGA did not complete"; + exceptionMsg += ", jobs finished " + std::to_string(isrData_.jobsFinished[instance]); + exceptionMsg += ", jobs waited " + std::to_string(jobsWaited_[instance]); + throw std::runtime_error(exceptionMsg); + } + + if ((isrData_.desc_queue_diag[instance] >> DLA_DMA_CSR_DESC_DIAGNOSTICS_OUT_OF_INFERENCES_BIT) & 0x01) { + std::cerr << "ERROR: Out of free inferences on this IP. " << + "The Intel FPGA AI suite cannot continue without a license!" << std::endl; + std::string exceptionMsg = "Inference on FPGA exited with a license error"; + exceptionMsg += ", jobs finished " + std::to_string(isrData_.jobsFinished[instance]); + exceptionMsg += ", jobs waited " + std::to_string(jobsWaited_[instance]); + exceptionMsg += "\nPlease check your license. 
The Intel FPGA AI suite cannot continue without a license!"; + throw std::runtime_error(exceptionMsg); + } + + jobsWaited_[instance]++; +} + +#ifndef USE_OLD_COREDLA_DEVICE +uint64_t CoreDlaDevice::GetClocksActive(int instance) const { + //Important: To satisfy the anti-rollover feature of the 64-bit counters in the DMA CSR + //the host must first read the lower 32-bit of the counter, + //then immediately read the higher 32-bit of the counter + uint32_t clocksActiveLo = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_CLOCKS_ACTIVE_LO); + uint32_t clocksActiveHi = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_CLOCKS_ACTIVE_HI); + return (((uint64_t)clocksActiveHi) << 32) | clocksActiveLo; +} + +double CoreDlaDevice::GetActiveHWTimeMs(int instance) const { + uint64_t clocksActive = GetClocksActive(instance) - startClocksActive[instance]; + // DDR clock freq is in MHz, so dividing by that would give microseconds, multiply by 1000 to get milliseconds + return clocksActive / (1000.0 * mmdWrapper_.GetDDRClockFreq()); +} + +uint64_t CoreDlaDevice::GetClocksAllJobs(int instance) const { + //Important: To satisfy the anti-rollover feature of the 64-bit counters in the DMA CSR + //the host must first read the lower 32-bit of the counter, + //then immediately read the higher 32-bit of the counter + uint32_t clocksAllJobsLo = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_CLOCKS_ALL_JOBS_LO); + uint32_t clocksAllJobsHi = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_CLOCKS_ALL_JOBS_HI); + return (((uint64_t)clocksAllJobsHi) << 32) | clocksAllJobsLo; +} + +double CoreDlaDevice::GetAvgHWTimePerJobMs(uint64_t num_jobs, int instance) const { + uint64_t clocksAllJobs = GetClocksAllJobs(instance) - startClockAllJobs[instance]; + // DDR clock freq is in MHz, so dividing by that would give microseconds, multiply by 1000 to get milliseconds + return clocksAllJobs / (1000.0 * mmdWrapper_.GetDDRClockFreq() * num_jobs); +} +#else +double 
CoreDlaDevice::GetActiveHWTimeMs(int instance) const { + //Important: To satisfy the anti-rollover feature of the 64-bit counters in the DMA CSR + //the host must first read the lower 32-bit of the counter, + //then immediately read the higher 32-bit of the counter + uint32_t clocksActiveLo = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_CLOCKS_ACTIVE_LO); + uint32_t clocksActiveHi = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_CLOCKS_ACTIVE_HI); + uint64_t clocksActive = (((uint64_t)clocksActiveHi) << 32) | clocksActiveLo; + // DDR clock freq is in MHz, so dividing by that would give microseconds, multiply by 1000 to get milliseconds + return clocksActive / (1000.0 * mmdWrapper_.GetDDRClockFreq()); +} + +double CoreDlaDevice::GetAvgHWTimePerJobMs(uint64_t num_jobs, int instance) const { + //Important: To satisfy the anti-rollover feature of the 64-bit counters in the DMA CSR + //the host must first read the lower 32-bit of the counter, + //then immediately read the higher 32-bit of the counter + uint32_t clocksAllJobsLo = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_CLOCKS_ALL_JOBS_LO); + uint32_t clocksAllJobsHi = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_CLOCKS_ALL_JOBS_HI); + uint64_t clocksAllJobs = (((uint64_t)clocksAllJobsHi) << 32) | clocksAllJobsLo; + // DDR clock freq is in MHz, so dividing by that would give microseconds, multiply by 1000 to get milliseconds + return clocksAllJobs / (1000.0 * mmdWrapper_.GetDDRClockFreq() * num_jobs); +} +#endif + +uint64_t CoreDlaDevice::GetNumInputFeatureMemoryReads(int instance) const { + return GetNumInputFeatureMemoryReadsTotal(instance) - startNumInputFeatureMemoryReads.at(instance); +} + +uint64_t CoreDlaDevice::GetNumFilterMemoryReads(int instance) const { + return GetNumFilterMemoryReadsTotal(instance) - startNumFilterMemoryReads.at(instance); +} + +uint64_t CoreDlaDevice::GetNumOutputFeatureMemoryWrites(int instance) const { + return 
GetNumOutputFeatureMemoryWritesTotal(instance) - startNumOutputFeatureMemoryWrites.at(instance); +} + +uint64_t CoreDlaDevice::GetNumInputFeatureMemoryReadsTotal(int instance) const { + //Important: To satisfy the anti-rollover feature of the 64-bit counters in the DMA CSR + //the host must first read the lower 32-bit of the counter, + //then immediately read the higher 32-bit of the counter + uint32_t numIFReadsLo = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_INPUT_FEATURE_READ_COUNT_LO); + uint32_t numIFReadsHi = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_INPUT_FEATURE_READ_COUNT_HI); + uint64_t numIFReads = (((uint64_t) numIFReadsHi) << 32) | ((uint64_t) numIFReadsLo); + return numIFReads; +} + +uint64_t CoreDlaDevice::GetNumFilterMemoryReadsTotal(int instance) const { + //Important: To satisfy the anti-rollover feature of the 64-bit counters in the DMA CSR + //the host must first read the lower 32-bit of the counter, + //then immediately read the higher 32-bit of the counter + uint32_t numWeightReadsLo = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_INPUT_FILTER_READ_COUNT_LO); + uint32_t numWeightReadsHi = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_INPUT_FILTER_READ_COUNT_HI); + uint64_t numWeightReads = (((uint64_t) numWeightReadsHi) << 32) | ((uint64_t) numWeightReadsLo); + return numWeightReads; +} + +uint64_t CoreDlaDevice::GetNumOutputFeatureMemoryWritesTotal(int instance) const { + //Important: To satisfy the anti-rollover feature of the 64-bit counters in the DMA CSR + //the host must first read the lower 32-bit of the counter, + //then immediately read the higher 32-bit of the counter + uint32_t numOFReadsLo = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_OUTPUT_FEATURE_WRITE_COUNT_LO); + uint32_t numOFReadsHi = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_OUTPUT_FEATURE_WRITE_COUNT_HI); + uint64_t numOFReads = (((uint64_t) numOFReadsHi) << 32) | ((uint64_t) numOFReadsLo); + return numOFReads; +} + +// 
Read one 32-bit value from the debug network, return value indicates whether read was successful. A read can fail if +// the module number and address have not been implemented. The debug network is fault tolerant to both read requests +// never being accepted as well as read responses never being produced. +bool CoreDlaDevice::ReadDebugCsr( + uint32_t moduleNum, uint32_t address, int instance, uint32_t& readData, bool verbose) const { + assert(moduleNum <= 0xff); + assert(address <= 0xffffff); + uint32_t addr = ((moduleNum & 0xff) << 24) | (address & 0xffffff); + + // Step 1: send the address that the debug network will use to issue a read request. Writing once to this CSR offset + // will cause the debug network to issue one read request. + mmdWrapper_.WriteToCsr(instance, DLA_DMA_CSR_OFFSET_DEBUG_NETWORK_ADDR, addr); + + // Optional step: read back the value sent to CSR, sanity check that it is correct. Note this is all handled + // internally to the CSR, e.g. the CSR does not go ask the debug network what address it sent. + uint32_t addrCheck = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_DEBUG_NETWORK_ADDR); + if (addr != addrCheck) { + if (verbose) DLA_LOG("ReadDebugCsr addr read back check failed, expected %u, got %u\n", addr, addrCheck); + return false; + } + + // Step 2: the debug network should produce a read response which is cached by the CSR. Poll the corresponding status + // register inside the CSR until this happens, or until the runtime decides to give up and declare the read a failure. + // Do not throw an exception if the read fails, it is allowed to fail if the runtime is trying to figure out which + // external debug-capable modules are attached to the debug network. Once the runtime has determined that a module is + // attached, only then should read failures should cause an exception. 
+ uint32_t isValid = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_DEBUG_NETWORK_VALID); + int retry = 5; + while (!isValid && retry) { + --retry; + isValid = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_DEBUG_NETWORK_VALID); + } + if (!isValid) { + if (verbose) DLA_LOG("ReadDebugCsr failed to read at addr %u\n", addr); + return false; + } + + // Step 3: runtime has confirmed the CSR has a cached the read response from debug network, now go and get the value. + readData = mmdWrapper_.ReadFromCsr(instance, DLA_DMA_CSR_OFFSET_DEBUG_NETWORK_DATA); + if (verbose) DLA_LOG("ReadDebugCsr, addr %u, data %u\n", addr, readData); + return true; +} + +// This is a helper function that throws an exception if runtime fails to read from the debug network. This should only +// be called if the runtime has already confirmed that a module is attached to the debug network i.e. a previous read to +// this module number had succeeded. +void ReadDebugNetworkError(int moduleNum, int address, int instance) { + std::string msg = "ReadDebugNetwork failure, instance " + std::to_string(instance) + + ", failed to read at module number " + std::to_string(moduleNum) + " address " + + std::to_string(address); + throw std::runtime_error(msg); +} + +// Modules attached to the debug network have a ROM to specify the offset and description of the registers. Traverse +// this ROM, then return a map of key/value pairs, where the key is a human readable string describing what kind of +// information the debug register contains, and the value is the data of the debug register. Note that the runtime must +// completely tranverse the ROM before reading any of the debug register values, and the runtime must read the debug +// register values in the order that they occur inside the ROM. Usually profiling counters are 64-bit values, and since +// there is only a 32-bit read available, it takes more than one read to get all the data. 
The counters could still be +// updating when the runtime wants to read them, so typically there is a freeze register which can be activated by +// reading from a special address (hardware will see an incoming read request to this address, that is how it knows to +// freeze the counters). The offset for the freeze register will typically go first in the ROM, even if it is not the +// first offset in the address space. +DebugNetworkData CoreDlaDevice::ReadDebugNetwork(int instance) const { + DebugNetworkData result; + for (uint32_t moduleNum = 0; moduleNum < 256; moduleNum++) { + // Read the ROM to get the offsets and descriptions + std::vector<uint32_t> offset; + std::vector<std::string> description; + uint32_t address = 0, readData = 0; + bool first = true, success = false; + while (1) { + // Parse the offset + success = ReadDebugCsr(moduleNum, address, instance, readData); + if (!success) { + // Failure to read is allowed on the very first time, it is assumed that no external debug-capable module is + // attached to the debug network at this moduleNum + if (first) + break; + else + ReadDebugNetworkError(moduleNum, address, instance); + } + if (!readData) break; // end of list is indicated with offset = 0 + first = false; + address += 4; + offset.push_back(readData); + + // Parse the description string + std::string str; + bool endOfStringSeen = false; + while (!endOfStringSeen) { + success = ReadDebugCsr(moduleNum, address, instance, readData); + if (!success) ReadDebugNetworkError(moduleNum, address, instance); + address += 4; + for (int i = 0; i < 4; i++) { + if (readData & 0xff) { + str += ((char)(readData & 0xff)); + readData >>= 8; + } else { + endOfStringSeen = true; + break; + } + } + } + description.push_back(str); + } + + assert(offset.size() == description.size()); + + // Read the profiling counters + for (size_t i = 0; i < offset.size(); i++) { + address = offset[i]; + success = ReadDebugCsr(moduleNum, address, instance, readData); + if (!success) 
ReadDebugNetworkError(moduleNum, address, instance); + + int descriptionOccurenceCnt = result.count(description[i]); + // Same description name should show up 2 times in maximum + if (descriptionOccurenceCnt == 2) { + throw std::runtime_error("More than 2 profiling counter descriptions are the same."); + } else if (descriptionOccurenceCnt && (address - offset[i - 1] != 4)) { + // same description existed before + // check if the two addresses associatede with the same decription are consecutive (offset by 4) + throw std::runtime_error("Profiling counter addresses with name: " + description[i] + " are not consecutive"); + } else if (std::count(offset.begin(), offset.end(), address) > 1) { + // same address shows up more than once + throw std::runtime_error("Duplicate profiling counter address: " + address); + } + + // Avoid printing special stuff like _Freeze and _Unfreeze + if (description[i].at(0) != '_') { + if (descriptionOccurenceCnt) { + // This key has existed before, concatenate 2 uint32_t into uint64_t + result[description[i]] |= (((uint64_t)readData) << 32); + } else { + result[description[i]] = readData; + } + } + } + } + return result; +} + +int CoreDlaDevice::GetSizeCsrDescriptorQueue() const { return DLA_DMA_CSR_DESCRIPTOR_QUEUE_LOGICAL_SIZE; } + +double CoreDlaDevice::GetCoreDlaClockFreq() const { return mmdWrapper_.GetCoreDlaClockFreq(); } + +std::string CoreDlaDevice::SchedulerGetStatus() const { + if (!spStreamControllerComms_) return ""; + + Payload<StatusMessagePayload> statusPayload = spStreamControllerComms_->GetStatus(); + return spStreamControllerComms_->GetStatusString(statusPayload); +} + +bool CoreDlaDevice::InitializeScheduler(uint32_t sourceBufferSize, + uint32_t dropSourceBuffers, + uint32_t numInferenceRequests, + const std::string source_fifo_file) { + spStreamControllerComms_ = std::make_shared<StreamControllerComms>(); + if (spStreamControllerComms_->IsPresent()) { + bool initOK = 
spStreamControllerComms_->Initialize(sourceBufferSize, dropSourceBuffers, numInferenceRequests); + return initOK; + } else { + spStreamControllerComms_.reset(); + return false; + } +} diff --git a/python/openvino/runtime/coredla_device/src/coredla_graph_job.cpp b/python/openvino/runtime/coredla_device/src/coredla_graph_job.cpp new file mode 100644 index 0000000..c1f349f --- /dev/null +++ b/python/openvino/runtime/coredla_device/src/coredla_graph_job.cpp @@ -0,0 +1,279 @@ +// Copyright 2020-2023 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. 
+ +#include "coredla_graph_job.h" //CoreDlaGraphJob + +#include <cinttypes> +#include <cstdlib> //std::getenv +#include <iomanip> //std::hex +#include <iostream> //std::cerr +#include <sstream> //std::stringstream +#include <string> //std::string + +#define BUILD_VERSION_CSR_OFFSET (ARCH_HASH_SIZE) +#define ARCH_NAME_CSR_OFFSET (ARCH_HASH_SIZE + BUILD_VERSION_SIZE) + +#define FLAG_DISABLE_ARCH_CHECK "DLA_DISABLE_ARCH_CHECK" +#define FLAG_DISABLE_VERSION_CHECK "DLA_DISABLE_VERSION_CHECK" + +std::unique_ptr<GraphJob> CoreDlaGraphJob::MakeUnique(DeviceMemoryAllocator *ddrBufferAllocator, + MmdWrapper *mmdWrapper, + const dla::CompiledResult *compiledResult, + uint64_t numPipelines, + int instance, + std::shared_ptr<StreamControllerComms> spStreamControllerComms) { + return std::unique_ptr<GraphJob>(new CoreDlaGraphJob( + ddrBufferAllocator, mmdWrapper, compiledResult, numPipelines, instance, spStreamControllerComms)); +} + +std::string get_env_var_wrapper(const std::string &env_var) { + const char *env_var_ptr = std::getenv(env_var.c_str()); + if (env_var_ptr == nullptr) { + return ""; + } + + return std::string(env_var_ptr); +} + +std::string arch_hash_to_string(const std::vector<int> &arch_hash) { + std::stringstream s; + for (size_t i = 0; i < ARCH_HASH_WORD_SIZE; ++i) { + s << std::setfill('0') << std::setw(8) << std::hex << std::right << arch_hash[i] << " "; + } + + return s.str(); +} + +std::string read_string_from_bitstream_rom(MmdWrapper *mmdWrapper, + const int instance, + const uint32_t str_word_size_in_bytes, + const uint32_t str_offset_in_rom) { + std::string str_from_rom; + bool done = false; + for (uint32_t i = 0; i < str_word_size_in_bytes && (!done); ++i) { + int chunk = mmdWrapper->ReadFromCsr(instance, str_offset_in_rom + i * 4); + // Parse the int word into chars. Stops at any NUL char. 
+ for (int j = 0; j < 4; ++j) { + char rom_char = (chunk >> (j * 8)) & 0xFF; + if (rom_char == 0) { + done = true; + break; + } else { + str_from_rom.push_back(rom_char); + } + } + } + return str_from_rom; +} + +CoreDlaGraphJob::CoreDlaGraphJob(DeviceMemoryAllocator *ddrBufferAllocator, + MmdWrapper *mmdWrapper, + const dla::CompiledResult *compiledResult, + uint64_t numPipelines, + int instance, + std::shared_ptr<StreamControllerComms> spStreamControllerComms) + : configFilterBiasBufferSizeDDR_(0), + intermediateBufferSizeDDR_(0), + ddrBufferAllocator_(ddrBufferAllocator), + mmdWrapper_(mmdWrapper), + batchJobsRequested_(0), + instance_(instance) { + // First read the arch_md5, build_version_string and arch_name string from + // the metadata stored in the bitstream discovery ROM, then compare them + // against the information present in the compiled result. Fail if it does not match. + + // ARCH_HASH_SIZE bytes for the arch hash. + std::vector<int> bitstream_arch_hash; + DLA_LOG("Read hash from bitstream ROM...\n"); + for (size_t i = 0; i < ARCH_HASH_WORD_SIZE; ++i) { + bitstream_arch_hash.push_back(mmdWrapper_->ReadFromCsr(instance_, i * 4)); + } + + // Next BUILD_VERSION_SIZE bytes are for the build version string + DLA_LOG("Read build version string from bitstream ROM...\n"); + std::string bitstream_build_version = + read_string_from_bitstream_rom(mmdWrapper_, instance_, BUILD_VERSION_WORD_SIZE, BUILD_VERSION_CSR_OFFSET); + + // Next ARCH_NAME_SIZE bytes are for the arch name string + DLA_LOG("Read arch name string from bitstream ROM...\n"); + std::string bitstream_arch_name = + read_string_from_bitstream_rom(mmdWrapper_, instance_, ARCH_NAME_WORD_SIZE, ARCH_NAME_CSR_OFFSET); + + // ************************ Perform all checks ******************************* + // *************************************************************************** + if (get_env_var_wrapper(FLAG_DISABLE_ARCH_CHECK) != "1") { + DLA_LOG("Runtime arch check is enabled. 
Check started...\n"); + + for (size_t i = 0; i < ARCH_HASH_WORD_SIZE; ++i) { + if (compiledResult->get_arch_hash()[i] != bitstream_arch_hash[i]) { + std::cerr << "Arch check failed: " + << "compiledResult arch hash is " << arch_hash_to_string(compiledResult->get_arch_hash()) + << ", compiledResult arch is " << compiledResult->get_arch_name() << ", bitstream arch_hash is " + << arch_hash_to_string(bitstream_arch_hash) << ", bitstream arch is " << bitstream_arch_name + << std::endl; + + std::cerr << "This check can be disabled by setting environment variable " << FLAG_DISABLE_ARCH_CHECK << "=1." + << std::endl; + std::exit(1); + } + } + DLA_LOG("Runtime arch check passed.\n"); + } else { + DLA_ERROR( + "Environment variable %s is set to 1; " + "architecture check will be skipped. " + "This might cause undefined behavior including hanging, " + "and the user should only disable the check if " + "they understand the potential consequences.\n", + FLAG_DISABLE_ARCH_CHECK); + } + + if (get_env_var_wrapper(FLAG_DISABLE_VERSION_CHECK) != "1") { + DLA_LOG( + "Runtime build version check is enabled. " + "Check started...\n"); + if (bitstream_build_version != compiledResult->get_build_version_string()) { + std::cerr << "Build version check failed:" + << "compiledResult build version is " << compiledResult->get_build_version_string() + << ", bitstream build version is " << bitstream_build_version << std::endl; + + std::cerr << "This check can be disabled by setting environment variable " << FLAG_DISABLE_VERSION_CHECK << "=1." + << std::endl; + + std::exit(1); + } + DLA_LOG("Runtime build version check passed.\n"); + } else { + DLA_ERROR( + "Environment variable %s is set to 1; " + "build version check will be skipped. " + "This might cause undefined behavior including hanging, " + "and the user should only disable the check if " + "they understand the potential consequences.\n", + FLAG_DISABLE_VERSION_CHECK); + } + + // Checks completed. 
Allocate buffers and write to DDR + intermediateBufferSizeDDR_ = compiledResult->get_conv_intermediate_size_in_bytes(); + uint64_t totalConfigBytes = compiledResult->get_ddrfree_header().enable_parameter_rom ? + 0 : + compiledResult->get_config_size_in_bytes(); + auto &config_fbs_array = compiledResult->get_config_filter_bias_scale_array(); + auto config_fbs_raw_array = compiledResult->get_ddrfree_header().enable_parameter_rom ? + nullptr : + config_fbs_array[0].data(); + configFilterBiasBufferSizeDDR_ = compiledResult->get_ddrfree_header().enable_parameter_rom ? + 0 : + config_fbs_array[0].size(); + + // TODO: uncomment when buffer_t object is added + // assert(config_filter_bias_graph_buffer_size_ddr == config_filter_bias_buffer->size_in_bytes()); + // Allocate graph buffer (config, filter, bias, io) in DDR + uint64_t inputSizeDDR = compiledResult->get_conv_input_size_in_bytes(); + uint64_t outputSizeDDR = compiledResult->get_conv_output_size_in_bytes(); + + // DMA data path width in bytes for feature and filter data + // TODO: move this into the arch + constexpr uint64_t featureWordSize = 32; + constexpr uint64_t filterWordSize = 64; + + // Sanity check that buffer sizes are sufficiently aligned to ensure address alignment. + // Input, output, and intermediate buffers contain feature words. + assert(inputSizeDDR % featureWordSize == 0); + assert(outputSizeDDR % featureWordSize == 0); + assert(intermediateBufferSizeDDR_ % featureWordSize == 0); + // filter contains filter words, and config must be padded to a filter word size + assert(totalConfigBytes % filterWordSize == 0); + assert(configFilterBiasBufferSizeDDR_ % filterWordSize == 0); + + // Allocate the intermediate buffer. + ddrBufferAllocator_->AllocateSharedBuffer(intermediateBufferSizeDDR_, instance_); + + // Allocate the input/output buffer. + // Output buffer must come immediately after the input buffer, so from an allocation perspective this is one buffer. 
+ // Note there is an input/output buffer pair allocated for each pipeline. The input/output pair must be contiguous for + // each pipeline, but input/output pairs from different pipelines are allowed to have a gap. We could call the + // allocator for each input/output buffer pair, however because everything is sized and aligned to the feature word + // size, we won't get gaps between them due to alignment. Calling the allocator once per pipeline would result in the + // same allocation as calling the allocator just once and using offsets within this big buffer for each pipeline. + uint64_t inputOutputBufferSize = numPipelines * (inputSizeDDR + outputSizeDDR); // how much space to allocate + uint64_t inputOutputBufferAlignment = featureWordSize; // starting address must be aligned to this + uint64_t inputOutputBufferAddr; // where did the allocator place this buffer + ddrBufferAllocator_->AllocatePrivateBuffer(inputOutputBufferSize, inputOutputBufferAlignment, inputOutputBufferAddr); + + // Allocate the config/filter buffer. + // Filter buffer must come immediately after the config buffer, so from an allocation perspective this is one buffer. 
+ uint64_t configFilterBufferSize = configFilterBiasBufferSizeDDR_; + uint64_t configFilterBufferAlignment = filterWordSize; + uint64_t configFilterBufferAddr; + ddrBufferAllocator_->AllocatePrivateBuffer( + configFilterBufferSize, configFilterBufferAlignment, configFilterBufferAddr); + + // Print the allocation results + bool print_allocation_result = getenv("COREDLA_RUNTIME_DEBUG") != nullptr; + ios_base::fmtflags coutFlags = cout.flags(); // printing in both decimal and hex, save cout state to undo it later + if (print_allocation_result) { + DLA_LOG("FPGA DDR allocation results\n"); + // Intermediate buffer address is hardcoded to 0 in device_memory_allocator.cpp, don't bother printing this + DLA_LOG(" Config buffer is at address %" PRIu64, configFilterBufferAddr); + DLA_LOG(" (%#" PRIx64 ")\n", configFilterBufferAddr); + const uint64_t filter_buffer_address = configFilterBufferAddr + totalConfigBytes; + DLA_LOG(" Filter/bias/scale buffer is at address %" PRIu64, filter_buffer_address); + DLA_LOG(" (%#" PRIx64 ")\n", filter_buffer_address); + } + + const bool enable_istream = compiledResult->get_input_configuration().begin()->second.enable_input_streaming; + const bool enable_ostream = compiledResult->get_output_configuration().output_streaming_enabled; + + // Write graph buffer to DDR + if (!compiledResult->get_ddrfree_header().enable_parameter_rom) { + mmdWrapper_->WriteToDDR(instance_, configFilterBufferAddr, configFilterBiasBufferSizeDDR_, config_fbs_raw_array); + } else { + DLA_LOG(" Ddrfree graph constants are not written to DDR.\n"); + } + + for (uint64_t i = 0; i < numPipelines; i++) { + uint64_t inputAddrDDR = inputOutputBufferAddr + i * (inputSizeDDR + outputSizeDDR); + uint64_t outputAddrDDR = inputAddrDDR + inputSizeDDR; + if (print_allocation_result) { + DLA_LOG(" Input buffer %" PRIu64 " is at address %" PRIu64, i, inputAddrDDR); + DLA_LOG(" (%#" PRIx64 ")\n", inputAddrDDR); + DLA_LOG(" Output buffer %" PRIu64 " is at address %" PRIu64, i, 
outputAddrDDR); + DLA_LOG(" (%#" PRIx64 ")\n", outputAddrDDR); + } + batchJobs_.push_back(move(CoreDlaBatchJob::MakeUnique(mmdWrapper_, + totalConfigBytes, + configFilterBufferAddr, + inputAddrDDR, + outputAddrDDR, + inputSizeDDR, + outputSizeDDR, + enable_istream, + enable_ostream, + instance_, + spStreamControllerComms))); + } + cout.flags(coutFlags); // restore the state of cout +} + +BatchJob *CoreDlaGraphJob::GetBatchJob() { + graphJobMutex.lock(); + if (batchJobsRequested_ >= batchJobs_.size()) { + graphJobMutex.unlock(); + return nullptr; + } + auto *batchJob = batchJobs_[batchJobsRequested_].get(); + batchJobsRequested_++; + graphJobMutex.unlock(); + return batchJob; +} diff --git a/python/openvino/runtime/coredla_device/src/device_memory_allocator.cpp b/python/openvino/runtime/coredla_device/src/device_memory_allocator.cpp new file mode 100644 index 0000000..48844f4 --- /dev/null +++ b/python/openvino/runtime/coredla_device/src/device_memory_allocator.cpp @@ -0,0 +1,80 @@ +// Copyright 2020 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. 

#include "device_memory_allocator.h"  //DeviceMemoryAllocator
#include "dla_dma_constants.h"        //DLA_DMA_CSR_OFFSET_***

#include <stdexcept>  //std::runtime_error
#include <string>     //std::string

// Bind this allocator to one device: record the device memory capacity and the
// MMD access layer, and reset both allocation watermarks. The shared
// (intermediate) region starts empty and grows up from address 0; private
// (per-graph) buffers are carved downwards from totalSize.
void DeviceMemoryAllocator::Initialize(uint64_t totalSize, MmdWrapper* mmdWrapper) {
  totalGlobalMemSize_ = totalSize;
  mmdWrapper_ = mmdWrapper;
  currentIntermediateMaxBufferSizeAllocated_ = 0;
  currentStartAddressGraphBufferSpace_ = totalSize;
}

// The intermediate buffer is shared among all graphs. It gets placed at the lowest address
// and grows upwards (if a new graph is added which needs a bigger intermediate buffer).
// Throws std::runtime_error if growing the shared region would collide with the
// private-buffer region growing down from the top of memory.
void DeviceMemoryAllocator::AllocateSharedBuffer(uint64_t bufferSize, int instance) {
  // Only ever grow: a smaller request is already covered by the current high-water mark.
  if (bufferSize > currentIntermediateMaxBufferSizeAllocated_) {
    currentIntermediateMaxBufferSizeAllocated_ = bufferSize;

    // error intermediate buffer grows into the region of memory used for private buffers
    if (currentIntermediateMaxBufferSizeAllocated_ > currentStartAddressGraphBufferSpace_) {
      std::string msg = "FPGA DDR allocation failed, intermediate buffer grew upwards to " +
                        std::to_string(currentIntermediateMaxBufferSizeAllocated_) +
                        ", remaining unallocated space is limited to " +
                        std::to_string(currentStartAddressGraphBufferSpace_);
      throw std::runtime_error(msg);
    }

    // tell the fpga where the intermediate buffer is located. At address 0 now. Will change in future with multiple
    // pe_arrays
    mmdWrapper_->WriteToCsr(instance, DLA_DMA_CSR_OFFSET_INTERMEDIATE_BASE_ADDR, 0);
  }
}

// The config, filter, input, and output buffers are specific to a graph and therefore require
// their own space in device memory. Note that filter must come immediately after config, so the
// allocator allocates both of these together as one buffer. Likewise output must come immediately
// after input. Private buffers are allocated from the highest to lowest address since the size is
// known at allocation time. Hardware requires the address to have some alignment, which is
// specified by the bufferAlignment argument.
// @param bufferSize      bytes to reserve
// @param bufferAlignment required address alignment (power-of-two expected by the modulo below)
// @param bufferAddr      [out] device address chosen for the buffer
// @throws std::runtime_error when the request cannot fit between the two watermarks
void DeviceMemoryAllocator::AllocatePrivateBuffer(uint64_t bufferSize, uint64_t bufferAlignment, uint64_t& bufferAddr) {
  // Worst case the downward alignment correction consumes up to bufferAlignment
  // extra bytes, so the fit check uses the inflated size.
  uint64_t maxInflatedBufferSize = bufferSize + bufferAlignment;  // be conservative for how much space buffer may take

  // error if the graph does not fit in fpga ddr
  if (currentIntermediateMaxBufferSizeAllocated_ + maxInflatedBufferSize > currentStartAddressGraphBufferSpace_) {
    std::string msg =
        "FPGA DDR allocation failed, allocating buffer of size " + std::to_string(maxInflatedBufferSize) +
        " exceeds the remaining space available of size " +
        std::to_string(currentStartAddressGraphBufferSpace_ - currentIntermediateMaxBufferSizeAllocated_) +
        ". This could be caused by the graph being too large or splitting the graph into too many subgraphs. " +
        "Memory requirements for large graphs can be reduced by selecting different folding options, " +
        "reducing batch size or selecting architectures with less padding.";
    throw std::runtime_error(msg);
  }

  currentStartAddressGraphBufferSpace_ -= bufferSize;  // allocate from highest to lowest address
  currentStartAddressGraphBufferSpace_ -=
      (currentStartAddressGraphBufferSpace_ % bufferAlignment);  // correct for alignment
  bufferAddr = currentStartAddressGraphBufferSpace_;
}

// Forget all allocations; device memory contents are not touched.
void DeviceMemoryAllocator::Clear() {
  currentIntermediateMaxBufferSizeAllocated_ = 0;
  currentStartAddressGraphBufferSpace_ = totalGlobalMemSize_;
}

DeviceMemoryAllocator::~DeviceMemoryAllocator() { Clear(); }
diff --git a/python/openvino/runtime/coredla_device/src/mmd_wrapper.cpp b/python/openvino/runtime/coredla_device/src/mmd_wrapper.cpp
new file mode 100644
index 0000000..bbb052a
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/src/mmd_wrapper.cpp
@@ -0,0 +1,172 @@
// Copyright 2020-2023 Intel Corporation.
+// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. + +#include "mmd_wrapper.h" +#include "aocl_mmd.h" // aocl_mmd_*** +#include "dla_dma_constants.h" // DLA_DMA_CSR_OFFSET_*** + +#include <cassert> // assert +#include <cstddef> // size_t +#include <iostream> // std::cerr +#include <stdexcept> // std::runtime_error +#include <string> // std::string + +// All board variants must obey the CoreDLA CSR spec, which says that all access must be +// - 32 bits in size +// - address must be 4 byte aligned +// - within the address range, CSR size is 2048 bytes +constexpr uint64_t DLA_CSR_ALIGNMENT = 4; +constexpr uint64_t DLA_CSR_SIZE = 2048; + +// assert(status == 0) is removed by the c++ processor when compiling in release mode +// this is a handy workaround for suppressing the compiler warning about an unused variable +template <class T> +void suppress_warning_unused_varible(const T &) {} + +MmdWrapper::MmdWrapper() { + // Open the MMD + constexpr size_t MAX_BOARD_NAMES_LEN = 4096; + char name[MAX_BOARD_NAMES_LEN]; + size_t sz; + int status = aocl_mmd_get_offline_info(AOCL_MMD_BOARD_NAMES, MAX_BOARD_NAMES_LEN, name, &sz); + if (status) { + std::string msg = "Failed to query a board name from MMD. 
Perhaps no FPGA device is available?"; + throw std::runtime_error(msg); + } + int handle = aocl_mmd_open(name); + if (handle < 0) { + std::string msg = "Failed to open MMD"; + throw std::runtime_error(msg); + } + handle_ = handle; + + // Query some board-specific information from the MMD. Some values can be hardcoded constants + // where different boards have different constants, e.g. capacity of FPGA DDR. Others values may + // be determined experimentally e.g. start and stop a counter with a known duration in between to + // measure the clk_dla frequency. + maxInstances_ = dla_mmd_get_max_num_instances(); + ddrSizePerInstance_ = dla_mmd_get_ddr_size_per_instance(); + coreDlaClockFreq_ = dla_mmd_get_coredla_clock_freq(handle_); + + // On DE10 Agilex boards with GCC 8.3.0, we noticed that the clock frequency was being read as 0, + // around 50% of the time, and around 10% of the time on GCC 9.2.0, causing failures on perf_est + // tests. This retry loop will recall the function until the coreDlaClockFreq is non zero, or + // it exhausts 10 retries. + // We have no idea why this happens currently, but it typically passes by the second try. + int clockFreqRetries = 10; + while (coreDlaClockFreq_ == 0 && clockFreqRetries > 0) { + coreDlaClockFreq_ = dla_mmd_get_coredla_clock_freq(handle_); + clockFreqRetries--; + } + ddrClockFreq_ = dla_mmd_get_ddr_clock_freq(); +} + +MmdWrapper::~MmdWrapper() { + // Close the MMD + int status = aocl_mmd_close(handle_); + if (status) { + // Avoid throwning an exception from a Destructor. We are ultimately + // part of a (virtual) OpenVINO destructor, so we should follow the + // noexcept(true) that it advertises. Perhaps we can close the mmd + // as a separate step prior to destruction to make signaling errors + // easier? 
+ std::cerr << "Failed to close MMD" << std::endl; + std::cerr << "Error status " << status << std::endl; + std::exit(1); + } +} + +void MmdWrapper::RegisterISR(interrupt_service_routine_signature func, void *data) const { + // register an interrupt handler + int status = aocl_mmd_set_interrupt_handler(handle_, func, data); + if (status) { + std::string msg = "Failed to register an interrupt handler with MMD"; + throw std::runtime_error(msg); + } +} + +void MmdWrapper::WriteToCsr(int instance, uint32_t addr, uint32_t data) const { + assert(instance >= 0 && instance < maxInstances_); + assert(addr + sizeof(uint32_t) <= DLA_CSR_SIZE); + assert(addr % DLA_CSR_ALIGNMENT == 0); + int status = dla_mmd_csr_write(handle_, instance, addr, &data); + assert(status == 0); + suppress_warning_unused_varible(status); +} + +uint32_t MmdWrapper::ReadFromCsr(int instance, uint32_t addr) const { + assert(instance >= 0 && instance < maxInstances_); + assert(addr + sizeof(uint32_t) <= DLA_CSR_SIZE); + assert(addr % DLA_CSR_ALIGNMENT == 0); + uint32_t data; + int status = dla_mmd_csr_read(handle_, instance, addr, &data); + assert(status == 0); + suppress_warning_unused_varible(status); + return data; +} + +void MmdWrapper::WriteToDDR(int instance, uint64_t addr, uint64_t length, const void *data) const { + assert(instance >= 0 && instance < maxInstances_); + assert(addr + length <= ddrSizePerInstance_); + int status = dla_mmd_ddr_write(handle_, instance, addr, length, data); + assert(status == 0); + suppress_warning_unused_varible(status); +} + +void MmdWrapper::ReadFromDDR(int instance, uint64_t addr, uint64_t length, void *data) const { + assert(instance >= 0 && instance < maxInstances_); + assert(addr + length <= ddrSizePerInstance_); + int status = dla_mmd_ddr_read(handle_, instance, addr, length, data); + assert(status == 0); + suppress_warning_unused_varible(status); +} + +#ifndef STREAM_CONTROLLER_ACCESS +// Stream controller access is not supported by the platform abstraction 
+bool MmdWrapper::bIsStreamControllerValid(int instance) const { return false; } + +// 32-bit handshake with each Stream Controller CSR +void MmdWrapper::WriteToStreamController(int instance, uint32_t addr, uint64_t length, const void *data) const { + assert(false); +} + +void MmdWrapper::ReadFromStreamController(int instance, uint32_t addr, uint64_t length, void *data) const { + assert(false); +} +#else +// If the mmd layer supports accesses to the Stream Controller +bool MmdWrapper::bIsStreamControllerValid(int instance) const { + assert(instance >= 0 && instance < maxInstances_); + bool status = dla_is_stream_controller_valid(handle_, instance); + return status; +} + +// 32-bit handshake with each Stream Controller CSR +void MmdWrapper::WriteToStreamController(int instance, uint32_t addr, uint64_t length, const void *data) const { + assert(instance >= 0 && instance < maxInstances_); + assert(addr % sizeof(uint32_t) == 0); + assert(length % sizeof(uint32_t) == 0); + int status = dla_mmd_stream_controller_write(handle_, instance, addr, length, data); + assert(status == 0); + suppress_warning_unused_varible(status); +} + +void MmdWrapper::ReadFromStreamController(int instance, uint32_t addr, uint64_t length, void *data) const { + assert(instance >= 0 && instance < maxInstances_); + assert(addr % sizeof(uint32_t) == 0); + assert(length % sizeof(uint32_t) == 0); + int status = dla_mmd_stream_controller_read(handle_, instance, addr, length, data); + assert(status == 0); + suppress_warning_unused_varible(status); +} +#endif diff --git a/python/openvino/runtime/coredla_device/src/stream_controller_comms.cpp b/python/openvino/runtime/coredla_device/src/stream_controller_comms.cpp new file mode 100644 index 0000000..677f6e4 --- /dev/null +++ b/python/openvino/runtime/coredla_device/src/stream_controller_comms.cpp @@ -0,0 +1,274 @@ +// Copyright 2023 Intel Corporation. 
+// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. + +#include "stream_controller_comms.h" +#include <chrono> +#include <cstring> +#include <iostream> +#include <sstream> +#include <thread> + +// StreamControllerComms provides an interface to the Stream Controller +// microcode running in the NIOS-V + +static const uint32_t messageReadyMagicNumber = 0x55225522; +static constexpr uint32_t mailboxRamSize = 0x1000; + +StreamControllerComms::StreamControllerComms() {} + +bool StreamControllerComms::IsPresent() { + // Check there is an interface to the stream controller + if (!_mmdWrapper.bIsStreamControllerValid(_streamControllerInstance)) { + return false; + } + + // Check that the stream controller responds + bool isPresent = Ping(); + return isPresent; +} + +// Query for the current status +Payload<StatusMessagePayload> StreamControllerComms::GetStatus() { + BusyCheck busyCheck(_busyFlag); + if (!busyCheck) { + return {}; + } + + if (SendMessage(MessageType_GetStatus)) { + if (ReceiveMessage() == MessageType_Status) { + return _receivedStatusMessage; + } + } + + return {}; +} + +// Schedule an inference request with the stream controller +bool StreamControllerComms::ScheduleItems(std::vector<Payload<CoreDlaJobPayload>> items) { + BusyCheck busyCheck(_busyFlag); + if (!busyCheck) { + return false; + } + + bool status = true; + + for (auto& job : items) { + bool thisJobStatus = false; + + if (SendMessage(MessageType_ScheduleItem, job.GetPayload(), 
job.GetSize())) { + if (ReceiveMessage() == MessageType_NoOperation) { + thisJobStatus = true; + } + } + + if (!thisJobStatus) { + status = false; + } + } + + return status; +} + +// Send a ping command to the stream controller and wait for a pong +// response. +bool StreamControllerComms::Ping() { + BusyCheck busyCheck(_busyFlag); + if (!busyCheck) { + return false; + } + + if (SendMessage(MessageType_Ping)) { + return (ReceiveMessage() == MessageType_Pong); + } + + return false; +} + +// Initialize and reset the stream controller +// +// sourceBufferSize: +// The size of the MSGDMA buffers that the stream +// controller will receive from the layout transform +// dropSourceBuffers: +// How many source buffers to drop between each +// processed one. 0 by default unless set in the configuration +// by the app with DLIAPlugin::properties::streaming_drop_source_buffers.name() +// numInferenceRequest: +// A constant value set in the executable network. The +// stream controller will start executing once it has +// received this number of inference requests from OpenVINO +bool StreamControllerComms::Initialize(uint32_t sourceBufferSize, + uint32_t dropSourceBuffers, + uint32_t numInferenceRequests) { + BusyCheck busyCheck(_busyFlag); + if (!busyCheck) { + return false; + } + + Payload<InitializeStreamControllerPayload> initializePayload{}; + initializePayload._sourceBufferSize = sourceBufferSize; + initializePayload._dropSourceBuffers = dropSourceBuffers; + initializePayload._numInferenceRequests = numInferenceRequests; + + if (SendMessage( + MessageType_InitializeStreamController, initializePayload.GetPayload(), initializePayload.GetSize())) { + if (ReceiveMessage() == MessageType_NoOperation) { + return true; + } + } + + return false; +} + +// Receive a message from the stream controller by reading from the +// mailbox memory until the magic number is set to indicate a message is ready. 
+// Only the Status return message has a payload +MessageType StreamControllerComms::ReceiveMessage() { + uint32_t receiveMessageOffset = mailboxRamSize / 2; + MessageHeader* pReceiveMessage = nullptr; + uint32_t messageReadyMagicNumberOffset = receiveMessageOffset; + uint32_t payloadOffset = static_cast<uint32_t>(receiveMessageOffset + (size_t)&pReceiveMessage->_payload); + uint32_t waitCount = 0; + + while (waitCount < 100) { + MessageHeader messageHeader; + _mmdWrapper.ReadFromStreamController( + _streamControllerInstance, receiveMessageOffset, sizeof(messageHeader), &messageHeader); + if (messageHeader._messageReadyMagicNumber == messageReadyMagicNumber) { + MessageType messageType = static_cast<MessageType>(messageHeader._messageType); + uint32_t sequenceId = messageHeader._sequenceID; + + bool ok = false; + + if (messageType == MessageType_Status) { + ok = StatusMessageHandler(payloadOffset); + } else if (messageType == MessageType_Pong) { + ok = true; + } + + if (!ok) { + _numBadMessages++; + } + + _mmdWrapper.WriteToStreamController( + _streamControllerInstance, messageReadyMagicNumberOffset, sizeof(sequenceId), &sequenceId); + _lastReceiveSequenceID = sequenceId; + return messageType; + } + + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + waitCount++; + } + + return MessageType_Invalid; +} + +// Send a message to the stream controller by writing to the mailbox memory, +// and wait for the message to be received/processed +bool StreamControllerComms::SendMessage(MessageType messageType, void* pPayload, size_t payloadSize) { + uint32_t sendMessageOffset = 0; + MessageHeader* pSendMessage = nullptr; + uint32_t messageReadyMagicNumberOffset = 0; + uint32_t messageTypeOffset = static_cast<uint32_t>((size_t)&pSendMessage->_messageType); + uint32_t sequenceIDOffset = static_cast<uint32_t>((size_t)&pSendMessage->_sequenceID); + uint32_t payloadOffset = static_cast<uint32_t>((size_t)&pSendMessage->_payload); + + uint32_t uintMessageType = 
static_cast<uint32_t>(messageType); + + _mmdWrapper.WriteToStreamController( + _streamControllerInstance, messageTypeOffset, sizeof(uintMessageType), &uintMessageType); + _mmdWrapper.WriteToStreamController( + _streamControllerInstance, sequenceIDOffset, sizeof(_sendSequenceID), &_sendSequenceID); + + if (payloadSize > 0) { + _mmdWrapper.WriteToStreamController(_streamControllerInstance, payloadOffset, payloadSize, pPayload); + } + + // Signal the message as ready + _mmdWrapper.WriteToStreamController(_streamControllerInstance, + messageReadyMagicNumberOffset, + sizeof(messageReadyMagicNumber), + &messageReadyMagicNumber); + + // Wait until the message has been processed by looking for the sequence ID + // in the magic number position + uint32_t waitCount = 0; + while (waitCount < 100) { + MessageHeader messageHeader; + _mmdWrapper.ReadFromStreamController( + _streamControllerInstance, sendMessageOffset, sizeof(messageHeader), &messageHeader); + + if (messageHeader._messageReadyMagicNumber == _sendSequenceID) { + _sendSequenceID++; + return true; + } + + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + waitCount++; + } + + return false; +} + +// Read the status message payload +bool StreamControllerComms::StatusMessageHandler(uint32_t payloadOffset) { + _mmdWrapper.ReadFromStreamController( + _streamControllerInstance, payloadOffset, sizeof(_receivedStatusMessage), &_receivedStatusMessage); + return true; +} + +// Parse the status message payload into a string +std::string StreamControllerComms::GetStatusString(Payload<StatusMessagePayload>& statusPayload) { + std::ostringstream stringStream; + stringStream << static_cast<uint32_t>(statusPayload._status) << "," << statusPayload._statusLineNumber << "," + << statusPayload._numReceivedSourceBuffers << "," << statusPayload._numScheduledInferences << "," + << statusPayload._numExecutedJobs; + return stringStream.str(); +} + +/////////////////////////////////////////////////////////////////////////////// + 
+// BusyFlag is used to prevent concurrent access to the stream controller, +// without holding a mutex when sending/receiving commands +using LockGuard = std::lock_guard<std::recursive_mutex>; + +bool BusyFlag::Lock() { + LockGuard lock(_mutex); + if (_busy) { + return false; + } + + _busy = true; + return true; +} + +void BusyFlag::Release() { + LockGuard lock(_mutex); + _busy = false; +} + +BusyCheck::BusyCheck(BusyFlag& busyFlag) : _busyFlag(busyFlag), _haveLocked(false) {} + +BusyCheck::~BusyCheck() { + if (_haveLocked) { + _busyFlag.Release(); + } +} + +BusyCheck::operator bool() { + bool locked = _busyFlag.Lock(); + if (locked) { + _haveLocked = true; + } + return locked; +} diff --git a/python/openvino/runtime/coredla_device/stream_controller/app/dla_registers.h b/python/openvino/runtime/coredla_device/stream_controller/app/dla_registers.h new file mode 100644 index 0000000..d77c5ab --- /dev/null +++ b/python/openvino/runtime/coredla_device/stream_controller/app/dla_registers.h @@ -0,0 +1,45 @@ +// Copyright 2023 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. 
+ +//the numbers below are byte addresses, must be a multiple of 4 since each access is 32 bits +static const uint32_t DLA_DMA_CSR_OFFSET_INTERRUPT_CONTROL = 512; //0x200 +static const uint32_t DLA_DMA_CSR_OFFSET_INTERRUPT_MASK = 516; +static const uint32_t DLA_DMA_CSR_OFFSET_CONFIG_BASE_ADDR = 528; //0x210 +static const uint32_t DLA_DMA_CSR_OFFSET_CONFIG_RANGE_MINUS_TWO = 532; +static const uint32_t DLA_DMA_CSR_OFFSET_INPUT_OUTPUT_BASE_ADDR = 536; +static const uint32_t DLA_DMA_CSR_OFFSET_DESC_DIAGNOSTICS = 540; +static const uint32_t DLA_DMA_CSR_OFFSET_INTERMEDIATE_BASE_ADDR = 544; //0x220 +static const uint32_t DLA_DMA_CSR_OFFSET_COMPLETION_COUNT = 548; +static const uint32_t DLA_DMA_CSR_OFFSET_CLOCKS_ACTIVE_LO = 576; //0x240 +static const uint32_t DLA_DMA_CSR_OFFSET_CLOCKS_ACTIVE_HI = 580; +static const uint32_t DLA_DMA_CSR_OFFSET_CLOCKS_ALL_JOBS_LO = 584; +static const uint32_t DLA_DMA_CSR_OFFSET_CLOCKS_ALL_JOBS_HI = 588; +static const uint32_t DLA_DMA_CSR_OFFSET_DEBUG_NETWORK_ADDR = 592; //0x250 +static const uint32_t DLA_DMA_CSR_OFFSET_DEBUG_NETWORK_VALID = 596; +static const uint32_t DLA_DMA_CSR_OFFSET_DEBUG_NETWORK_DATA = 600; + +//bit positions in interrupt control and mask +static const uint32_t DLA_DMA_CSR_INTERRUPT_ERROR_BIT = 0; +static const uint32_t DLA_DMA_CSR_INTERRUPT_DONE_BIT = 1; + +//bit positions in descriptor diagnostic +static const uint32_t DLA_DMA_CSR_DESC_DIAGNOSTICS_OVERFLOW_BIT = 0; +static const uint32_t DLA_DMA_CSR_DESC_DIAGNOSTICS_ALMOST_FULL_BIT = 1; +static const uint32_t DLA_DMA_CSR_DESC_DIAGNOSTICS_OUT_OF_INFERENCES_BIT = 2; + +//descriptor queue +//runtime knows how many jobs it has enqueued and how many jobs have finished +//runtime is responsible for not overflowing the descriptor queue, it must limit the number of outstanding jobs queued in hardware +static const uint32_t DLA_DMA_CSR_DESCRIPTOR_QUEUE_LOGICAL_SIZE = 64; //max number of jobs that runtime can enqueue +static const uint32_t 
DLA_DMA_CSR_DESCRIPTOR_QUEUE_WORDS_PER_JOB = 8; //how many words in the queue are needed to enqueue 1 job +static const uint32_t DLA_DMA_CSR_DESCRIPTOR_QUEUE_PHYSICAL_SIZE = DLA_DMA_CSR_DESCRIPTOR_QUEUE_LOGICAL_SIZE * DLA_DMA_CSR_DESCRIPTOR_QUEUE_WORDS_PER_JOB; //number of words in the hardware queue diff --git a/python/openvino/runtime/coredla_device/stream_controller/app/message_handlers.c b/python/openvino/runtime/coredla_device/stream_controller/app/message_handlers.c new file mode 100644 index 0000000..1a12def --- /dev/null +++ b/python/openvino/runtime/coredla_device/stream_controller/app/message_handlers.c @@ -0,0 +1,80 @@ +// Copyright 2023 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. 
+
+#include "message_handlers.h"
+#include "stream_controller_messages.h"
+
+// Handlers for host->Nios mailbox messages. Each handler decodes the raw
+// payload words, acts on the StreamController, and sends a reply message so
+// the host knows the command was consumed. All handlers return true.
+
+// (Re)configure buffer size, drop policy and total job count.
+bool InitializeStreamControllerMessageHandler(StreamController* this, volatile uint32_t* pPayload)
+{
+  InitializeStreamControllerPayload* pInitializePayload = (InitializeStreamControllerPayload*)pPayload;
+  this->InitializeStreamController(this,
+                                   pInitializePayload->_sourceBufferSize,
+                                   pInitializePayload->_dropSourceBuffers,
+                                   pInitializePayload->_numInferenceRequests);
+  this->SendMessage(this, MessageType_NoOperation, NULL, 0);
+  return true;
+}
+
+// The host has prepared one inference request (job payload).
+bool ScheduleItemMessageHandler(StreamController* this, volatile uint32_t* pPayload)
+{
+  volatile CoreDlaJobPayload* pCoreDlaJobPayload = (volatile CoreDlaJobPayload*)pPayload;
+  this->NewInferenceRequestReceived(this, pCoreDlaJobPayload);
+  this->SendMessage(this, MessageType_NoOperation, NULL, 0);
+  return true;
+}
+
+// Liveness check: answered with Pong, payload ignored.
+bool PingMessageHandler(StreamController* this, volatile uint32_t* pPayload)
+{
+  this->SendMessage(this, MessageType_Pong, NULL, 0);
+  return true;
+}
+
+// Snapshot the controller status and counters into a Status reply.
+bool GetStatusMessageHandler(StreamController* this, volatile uint32_t* pPayload)
+{
+  StatusMessagePayload statusMessagePayload;
+  statusMessagePayload._status = this->_status;
+  statusMessagePayload._statusLineNumber = this->_statusLineNumber;
+  statusMessagePayload._numReceivedSourceBuffers = this->_numReceivedSourceBuffers;
+  statusMessagePayload._numScheduledInferences = this->_numScheduledInferences;
+  statusMessagePayload._numExecutedJobs = this->_numExecutedJobs;
+  this->SendMessage(this, MessageType_Status, &statusMessagePayload, sizeof(statusMessagePayload));
+  return true;
+}
+
+// Debug command: arm a single DMA capture into the scratch _debugJob.
+// NOTE(review): this overwrites _sourceBufferSize, which the normal
+// streaming path also uses -- confirm it is only issued in debug sessions.
+bool ManualArmDmaTransferMessageHandler(StreamController* this, volatile uint32_t* pPayload)
+{
+  ManualArmDmaTransferPayload* pManualArmDmaTransferPayload = (ManualArmDmaTransferPayload*)pPayload;
+  CoreDlaJobItem emptyJob = {};
+  this->_debugJob = emptyJob;
+  this->_debugJob._payload._inputAddressDDR = pManualArmDmaTransferPayload->_inputAddressDDR;
+  this->_sourceBufferSize = pManualArmDmaTransferPayload->_sourceBufferSize;
+  bool fromHPS = (pManualArmDmaTransferPayload->_fromHPS != 0);
+  this->ArmDmaTransfer(this, &this->_debugJob, fromHPS);
+  this->SendMessage(this, MessageType_NoOperation, NULL, 0);
+  return true;
+}
+
+// Debug command: enqueue a single job with the DLA using the scratch _debugJob.
+bool ManualScheduleDlaInferenceMessageHandler(StreamController* this, volatile uint32_t* pPayload)
+{
+  ManualScheduleDlaInferencePayload* pManualScheduleDlaInferencePayload = (ManualScheduleDlaInferencePayload*)pPayload;
+  CoreDlaJobItem emptyJob = {};
+  this->_debugJob = emptyJob;
+  this->_debugJob._payload._configurationBaseAddressDDR = pManualScheduleDlaInferencePayload->_configurationBaseAddressDDR;
+  this->_debugJob._payload._configurationSize = pManualScheduleDlaInferencePayload->_configurationSize;
+  this->_debugJob._payload._inputAddressDDR = pManualScheduleDlaInferencePayload->_inputAddressDDR;
+  this->ScheduleDlaInference(this, &this->_debugJob);
+  this->SendMessage(this, MessageType_NoOperation, NULL, 0);
+  return true;
+}
+
+
diff --git a/python/openvino/runtime/coredla_device/stream_controller/app/message_handlers.h b/python/openvino/runtime/coredla_device/stream_controller/app/message_handlers.h
new file mode 100644
index 0000000..a7e5187
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/stream_controller/app/message_handlers.h
@@ -0,0 +1,22 @@
+// Copyright 2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#pragma once
+#include "stream_controller.h"
+
+// One handler per host->Nios mailbox MessageType; main() installs these as
+// function pointers on StreamController. Each handler consumes the payload,
+// sends a reply message and returns true.
+extern bool InitializeStreamControllerMessageHandler(StreamController* this, volatile uint32_t* pPayload);
+extern bool ScheduleItemMessageHandler(StreamController* this, volatile uint32_t* pPayload);
+extern bool PingMessageHandler(StreamController* this, volatile uint32_t* pPayload);
+extern bool GetStatusMessageHandler(StreamController* this, volatile uint32_t* pPayload);
+extern bool ManualArmDmaTransferMessageHandler(StreamController* this, volatile uint32_t* pPayload);
+extern bool ManualScheduleDlaInferenceMessageHandler(StreamController* this, volatile uint32_t* pPayload);
diff --git a/python/openvino/runtime/coredla_device/stream_controller/app/stream_controller.c b/python/openvino/runtime/coredla_device/stream_controller/app/stream_controller.c
new file mode 100644
index 0000000..ad8b372
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/stream_controller/app/stream_controller.c
@@ -0,0 +1,426 @@
+// Copyright 2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+ +#include "stream_controller.h" +#include "message_handlers.h" +#include "sys/alt_cache.h" +#include "dla_registers.h" +#include <string.h> + +static const uint32_t messageReadyMagicNumber = 0x55225522; +static const uint32_t mailboxBaseAddress = 0x40000; +static const uint32_t mailboxSize = 0x1000; +static const uint32_t dlaBaseAddress = 0x30000; + +static void Start(StreamController* this); +static void Reset(StreamController* this); +static bool InitializeMsgDma(StreamController* this); +static bool ArmDmaTransfer(StreamController* this, CoreDlaJobItem* pFillJob, bool fromHPS); +static void RunEventLoop(StreamController* this); +static void WriteToDlaCsr(StreamController* this, uint32_t addr, uint32_t data); +static void InitializeStreamController(StreamController* this, uint32_t sourceBufferSize, uint32_t dropSourceBuffers, uint32_t numInferenceRequests); +static void SetStatus(StreamController* this, NiosStatusType statusType, uint32_t lineNumber); +static MessageType ReceiveMessage(StreamController* this, volatile MessageHeader* pReceiveMessage); +static bool SendMessage(StreamController* this, + MessageType messageType, + void* pPayload, + size_t payloadSize); +static void NewSourceBuffer(StreamController* this); +static void ScheduleDlaInference(StreamController* this, CoreDlaJobItem* pJob); +static void NewInferenceRequestReceived(StreamController* this, volatile CoreDlaJobPayload* pJobPayload); +static void MsgDmaIsr(void* pContext); + +int main() +{ + StreamController streamController = {}; + StreamController* this = &streamController; + + this->Start = Start; + this->Reset = Reset; + this->InitializeMsgDma = InitializeMsgDma; + this->ArmDmaTransfer = ArmDmaTransfer; + this->RunEventLoop = RunEventLoop; + this->WriteToDlaCsr = WriteToDlaCsr; + this->InitializeStreamController = InitializeStreamController; + this->SetStatus = SetStatus; + this->ReceiveMessage = ReceiveMessage; + this->SendMessage = SendMessage; + this->NewSourceBuffer = NewSourceBuffer; 
+ this->ScheduleDlaInference = ScheduleDlaInference; + this->NewInferenceRequestReceived = NewInferenceRequestReceived; + + // Message handlers + this->GetStatusMessageHandler = GetStatusMessageHandler; + this->ScheduleItemMessageHandler = ScheduleItemMessageHandler; + this->PingMessageHandler = PingMessageHandler; + this->InitializeStreamControllerMessageHandler = InitializeStreamControllerMessageHandler; + this->ManualArmDmaTransferMessageHandler = ManualArmDmaTransferMessageHandler; + this->ManualScheduleDlaInferenceMessageHandler = ManualScheduleDlaInferenceMessageHandler; + + this->Reset(this); + this->Start(this); + + return 0; +} + +static void Start(StreamController* this) +{ + // Clear the mailbox memory + uint8_t* pMailbox = (uint8_t*)(mailboxBaseAddress); + memset(pMailbox, 0, mailboxSize); + + if (this->InitializeMsgDma(this)) + { + // Run the main event loop + this->RunEventLoop(this); + } +} + +static bool InitializeMsgDma(StreamController* this) +{ + this->_pMsgDevice = alt_msgdma_open(DLA_MSGDMA_0_CSR_NAME); + if (this->_pMsgDevice) + { + alt_msgdma_register_callback(this->_pMsgDevice, MsgDmaIsr, 0, this); + alt_dcache_flush_all(); + return true; + } + else + { + this->SetStatus(this, NiosStatusType_MsgDmaFailed, __LINE__); + return false; + } +} + +static bool ArmDmaTransfer(StreamController* this, CoreDlaJobItem* pFillJob, bool fromHPS) +{ + this->_pFillingImageJob = pFillJob; + + alt_u32* pWriteBuffer = (alt_u32*)this->_pFillingImageJob->_payload._inputAddressDDR; + alt_u32 length = this->_sourceBufferSize; + alt_u32 control = ALTERA_MSGDMA_DESCRIPTOR_CONTROL_TRANSFER_COMPLETE_IRQ_MASK; + + int r = 0; + if (fromHPS) + { + r = alt_msgdma_construct_extended_st_to_mm_descriptor(this->_pMsgDevice, + &this->_msgdmaDescriptor, + pWriteBuffer, + length, + control, + 0, + 0, + 1); + } + else + { + r = alt_msgdma_construct_extended_mm_to_st_descriptor(this->_pMsgDevice, + &this->_msgdmaDescriptor, + pWriteBuffer, + length, + control, + 0, + 0, + 1); + } + 
+ if (r == 0) + { + r = alt_msgdma_extended_descriptor_async_transfer(this->_pMsgDevice, &this->_msgdmaDescriptor); + if (r != 0) + { + this->SetStatus(this, NiosStatusType_AsyncTransferFailed, __LINE__); + } + } + else + { + this->SetStatus(this, NiosStatusType_BadDescriptor, __LINE__); + } + + return (r == 0); +} + +static void RunEventLoop(StreamController* this) +{ + volatile MessageHeader* pReceiveMessage = (MessageHeader*)(mailboxBaseAddress); + + uint32_t previousIsrCount = this->_isrCount; + + while (true) + { + uint32_t isrCount = this->_isrCount; + + if (isrCount != previousIsrCount) + { + this->NewSourceBuffer(this); + } + + if (pReceiveMessage->_messageReadyMagicNumber == messageReadyMagicNumber) + { + this->ReceiveMessage(this, pReceiveMessage); + } + + previousIsrCount = isrCount; + } +} + +static MessageType ReceiveMessage(StreamController* this, volatile MessageHeader* pReceiveMessage) +{ + MessageType messageType = pReceiveMessage->_messageType; + uint32_t sequenceId = pReceiveMessage->_sequenceID; + this->_commandCounter++; + + bool ok = false; + + volatile uint32_t* pPayload = &pReceiveMessage->_payload; + + if (messageType == MessageType_GetStatus) + ok = this->GetStatusMessageHandler(this, pPayload); + else if (messageType == MessageType_ScheduleItem) + ok = this->ScheduleItemMessageHandler(this, pPayload); + else if (messageType == MessageType_Ping) + ok = this->PingMessageHandler(this, pPayload); + else if (messageType == MessageType_InitializeStreamController) + ok = this->InitializeStreamControllerMessageHandler(this, pPayload); + else if (messageType == MessageType_ManualArmDmaTransfer) + ok = this->ManualArmDmaTransferMessageHandler(this, pPayload); + else if (messageType == MessageType_ManualScheduleDlaInference) + ok = this->ManualScheduleDlaInferenceMessageHandler(this, pPayload); + + if (!ok) + this->SetStatus(this, NiosStatusType_BadMessage, __LINE__); + + pReceiveMessage->_messageReadyMagicNumber = sequenceId; + + if 
((this->_lastReceiveSequenceID != 0) && ((this->_lastReceiveSequenceID + 1) != sequenceId)) + { + // If the DLA plugin has restarted, the first message will be InitializeStreamController + // with a sequence ID of 0 + if ((sequenceId != 0) || (messageType != MessageType_InitializeStreamController)) + this->SetStatus(this, NiosStatusType_BadMessageSequence, __LINE__); + } + + this->_lastReceiveSequenceID = sequenceId; + return messageType; +} + +static bool SendMessage(StreamController* this, + MessageType messageType, + void *pPayload, + size_t payloadSize) +{ + uint32_t mailboxSendAddress = mailboxBaseAddress + (mailboxSize / 2); + uint32_t* pMailbox = (uint32_t*)mailboxSendAddress; + MessageHeader* pSendMessage = (MessageHeader*)(pMailbox); + void* pPayloadDestination = &pSendMessage->_payload; + + pSendMessage->_messageType = messageType; + pSendMessage->_sequenceID = this->_sendSequenceID; + + if (payloadSize > 0) + memcpy(pPayloadDestination, pPayload, payloadSize); + + // Signal the message as ready + pSendMessage->_messageReadyMagicNumber = messageReadyMagicNumber; + + this->_sendSequenceID++; + return true; +} + +// We have received a new source buffer via the msgdma +static void NewSourceBuffer(StreamController* this) +{ + // Read the response to flush the buffer + CoreDlaJobItem* pJustFilledJob = this->_pFillingImageJob; + CoreDlaJobItem* pNextFillJob = NULL; + + uint32_t bufferSequence = this->_numReceivedSourceBuffers; + this->_numReceivedSourceBuffers++; + + // Have we just captured a manually armed DMA transfer? 
+ if (pJustFilledJob == &this->_debugJob) + return; + + if (this->_dropSourceBuffers > 0) + { + // If _dropSourceBuffers = 1, we process 1, drop 1 etc + // if _dropSourceBuffers = 2, we process 1, drop 2, process 1, drop 2 etc + if (bufferSequence % (this->_dropSourceBuffers + 1) != 0) + { + // Drop this buffer, capture the next one in its place + this->ArmDmaTransfer(this, pJustFilledJob, true); + return; + } + } + + pJustFilledJob->_hasSourceBuffer = true; + + if (pJustFilledJob->_pNextJob->_hasSourceBuffer) + { + // No space in the next job, so keep filling the same job + pNextFillJob = pJustFilledJob; + + // It already has a buffer but we have to + // consider this as dropped as we will write another + // in its place + pNextFillJob->_hasSourceBuffer = false; + } + else + { + pNextFillJob = pJustFilledJob->_pNextJob; + } + + // Re-arm the DMA transfer + this->ArmDmaTransfer(this, pNextFillJob, true); + + // If there are less than two scheduled buffers, then we can schedule another one + // _pNextInferenceRequestJob is the executing job if it is marked as scheduled + + uint32_t nScheduled = 0; + if (this->_pNextInferenceRequestJob->_scheduledWithDLA) + nScheduled++; + if (this->_pNextInferenceRequestJob->_pNextJob->_scheduledWithDLA) + nScheduled++; + + if (nScheduled < 2) + this->ScheduleDlaInference(this, pJustFilledJob); +} + +static void NewInferenceRequestReceived(StreamController* this, volatile CoreDlaJobPayload* pJobPayload) +{ + // Once we have received all '_totalNumInferenceRequests' inference requests, + // we set the state to running and can now capture the input dma's + bool wasRunning = this->_running; + this->_numInferenceRequests++; + this->_running = (this->_numInferenceRequests >= this->_totalNumInferenceRequests); + + CoreDlaJobItem* pThisJob = this->_pNextInferenceRequestJob; + + // Store the job details and move to the next + uint32_t previousAddress = pThisJob->_payload._inputAddressDDR; + pThisJob->_payload = *pJobPayload; + + // This job 
has just completed so clear its state + pThisJob->_scheduledWithDLA = false; + pThisJob->_hasSourceBuffer = false; + + // The jobs are recycled by the DLA plugin so the inputAddrDDR should + // stay the same for each _jobs[n] + if ((pThisJob->_payload._inputAddressDDR != previousAddress) && (previousAddress != 0)) + this->SetStatus(this, NiosStatusType_Error, __LINE__); + + this->_pNextInferenceRequestJob = this->_pNextInferenceRequestJob->_pNextJob; + + if (wasRunning) + { + this->_numExecutedJobs++; + + // Check if we have any jobs ready to be scheduled. Maximum of 2 can have _scheduledWithDLA set + if (!this->_pNextInferenceRequestJob->_scheduledWithDLA && this->_pNextInferenceRequestJob->_hasSourceBuffer) + { + this->ScheduleDlaInference(this, this->_pNextInferenceRequestJob); + } + else if (!this->_pNextInferenceRequestJob->_pNextJob->_scheduledWithDLA && this->_pNextInferenceRequestJob->_pNextJob->_hasSourceBuffer) + { + this->ScheduleDlaInference(this, this->_pNextInferenceRequestJob->_pNextJob); + } + } + else if (this->_running) + { + // We have just started running + // Arm the DMA transfer to start receiving source buffers + this->ArmDmaTransfer(this, &this->_jobs[0], true); + } +} + +static void ScheduleDlaInference(StreamController* this, CoreDlaJobItem* pJob) +{ + // The DLA has an input FIFO. 
By setting the base address register, + // we add this request to the FIFO + pJob->_scheduledWithDLA = true; + this->_numScheduledInferences++; + + CoreDlaJobPayload* pJobPayload = &pJob->_payload; + this->WriteToDlaCsr(this, DLA_DMA_CSR_OFFSET_CONFIG_BASE_ADDR, pJobPayload->_configurationBaseAddressDDR); + this->WriteToDlaCsr(this, DLA_DMA_CSR_OFFSET_CONFIG_RANGE_MINUS_TWO, pJobPayload->_configurationSize); + this->WriteToDlaCsr(this, DLA_DMA_CSR_OFFSET_INPUT_OUTPUT_BASE_ADDR, pJobPayload->_inputAddressDDR); +} + +static void SetStatus(StreamController* this, NiosStatusType statusType, uint32_t lineNumber) +{ + this->_status = statusType; + this->_statusLineNumber = lineNumber; +} + +static void InitializeStreamController(StreamController* this, + uint32_t sourceBufferSize, + uint32_t dropSourceBuffers, + uint32_t numInferenceRequests) +{ + // This is called once when the inference app is run, + // so acts like a reset + this->_sourceBufferSize = sourceBufferSize; + this->_dropSourceBuffers = dropSourceBuffers; + this->_totalNumInferenceRequests = numInferenceRequests; + this->_jobs = malloc(sizeof(CoreDlaJobItem) * this->_totalNumInferenceRequests); + + // Reset any previous state + this->Reset(this); +} + +static void Reset(StreamController* this) +{ + CoreDlaJobItem emptyJob = {}; + uint32_t lastIndex = this->_totalNumInferenceRequests - 1; + + // Set up the circular job buffers + for (uint32_t i = 0; i < this->_totalNumInferenceRequests; i++) + { + this->_jobs[i] = emptyJob; + this->_jobs[i]._index = i; + uint32_t previousIndex = (i == 0) ? lastIndex : i - 1; + uint32_t nextIndex = (i == lastIndex) ? 
0 : i + 1; + this->_jobs[i]._pPreviousJob = &this->_jobs[previousIndex]; + this->_jobs[i]._pNextJob = &this->_jobs[nextIndex]; + } + + this->_pNextInferenceRequestJob = &this->_jobs[0]; + this->_pFillingImageJob = &this->_jobs[0]; + this->_status = NiosStatusType_OK; + this->_statusLineNumber = 0; + this->_commandCounter = 0; + this->_numInferenceRequests = 0; + this->_numExecutedJobs = 0; + this->_numScheduledInferences = 0; + this->_lastReceiveSequenceID = 0; + this->_sendSequenceID = 0; + this->_running = false; + this->_isrCount = 0; + this->_numReceivedSourceBuffers = 0; +} + +static void WriteToDlaCsr(StreamController* this, uint32_t addr, uint32_t data) +{ + uint32_t* pRegister = (uint32_t*)(dlaBaseAddress + addr); + pRegister[0] = data; +} + +// Incrementing the ISR count here will result in NewSourceBuffer above being called +// in the event loop +static void MsgDmaIsr(void* pContext) +{ + StreamController* this = (StreamController*)pContext; + this->_isrCount++; +} + + diff --git a/python/openvino/runtime/coredla_device/stream_controller/app/stream_controller.h b/python/openvino/runtime/coredla_device/stream_controller/app/stream_controller.h new file mode 100644 index 0000000..8b19066 --- /dev/null +++ b/python/openvino/runtime/coredla_device/stream_controller/app/stream_controller.h @@ -0,0 +1,86 @@ +// Copyright 2023 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. 
+
+#pragma once
+
+#include <stddef.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+#include "altera_msgdma.h"
+#include "system.h"
+#include "stream_controller_messages.h"
+
+// One slot of the circular job ring: the job payload received from the host
+// plus flags recording whether the slot holds a captured source buffer and
+// whether it has been queued with the DLA.
+typedef struct CoreDlaJobItem
+{
+  uint32_t _index;                       // position in the _jobs array
+  bool _hasSourceBuffer;                 // a DMA'd source buffer is waiting in this slot
+  bool _scheduledWithDLA;                // this slot has been written to the DLA CSR queue
+  CoreDlaJobPayload _payload;
+  struct CoreDlaJobItem* _pPreviousJob;  // circular links, built in Reset()
+  struct CoreDlaJobItem* _pNextJob;
+} CoreDlaJobItem;
+
+// Object-style C: all behavior is reached through function pointers that
+// main() binds in stream_controller.c.
+// NOTE(review): the parameter name `this` keeps these headers C-only (it is
+// a keyword in C++) -- confirm no C++ translation unit includes them.
+typedef struct StreamController
+{
+  void (*Start)(struct StreamController* this);
+  void (*Reset)(struct StreamController* this);
+  bool (*InitializeMsgDma)(struct StreamController* this);
+  bool (*ArmDmaTransfer)(struct StreamController* this, CoreDlaJobItem* pFillJob, bool fromHPS);
+  void (*RunEventLoop)(struct StreamController* this);
+  void (*WriteToDlaCsr)(struct StreamController* this, uint32_t addr, uint32_t data);
+  void (*InitializeStreamController)(struct StreamController* this,
+                                     uint32_t sourceBufferSize,
+                                     uint32_t dropSourceBuffers,
+                                     uint32_t numInferenceRequests);
+  void (*SetStatus)(struct StreamController* this,
+                    NiosStatusType statusType, uint32_t lineNumber);
+  MessageType (*ReceiveMessage)(struct StreamController *this, volatile MessageHeader* pReceiveMessage);
+  bool (*SendMessage)(struct StreamController* this,
+                      MessageType messageType,
+                      void* pPayload,
+                      size_t payloadSize);
+  void (*NewSourceBuffer)(struct StreamController* this);
+  void (*ScheduleDlaInference)(struct StreamController* this, CoreDlaJobItem* pJob);
+  void (*NewInferenceRequestReceived)(struct StreamController* this, volatile CoreDlaJobPayload* pJob);
+
+  // Message handlers
+  bool (*GetStatusMessageHandler)(struct StreamController* this, volatile uint32_t* pPayload);
+  bool (*ScheduleItemMessageHandler)(struct StreamController* this, volatile uint32_t* pPayload);
+  bool (*PingMessageHandler)(struct StreamController* this, volatile uint32_t* pPayload);
+  bool (*InitializeStreamControllerMessageHandler)(struct StreamController* this, volatile uint32_t* pPayload);
+  bool (*ManualArmDmaTransferMessageHandler)(struct StreamController* this, volatile uint32_t* pPayload);
+  bool (*ManualScheduleDlaInferenceMessageHandler)(struct StreamController* this, volatile uint32_t* pPayload);
+
+  CoreDlaJobItem* _jobs;                     // heap array of _totalNumInferenceRequests ring slots
+  CoreDlaJobItem* _pNextInferenceRequestJob; // slot the next host request payload goes into
+  CoreDlaJobItem* _pFillingImageJob;         // slot the msgdma is currently filling
+  CoreDlaJobItem _debugJob;                  // scratch job used by the Manual* debug handlers
+  NiosStatusType _status;
+  uint32_t _statusLineNumber;                // __LINE__ where _status was last set
+  uint32_t _commandCounter;
+  uint32_t _sourceBufferSize;
+  uint32_t _dropSourceBuffers;
+  uint32_t _totalNumInferenceRequests;
+  uint32_t _numInferenceRequests;
+  uint32_t _numExecutedJobs;
+  uint32_t _numScheduledInferences;
+  uint32_t _lastReceiveSequenceID;
+  uint32_t _sendSequenceID;
+  bool _running;
+  uint32_t _numReceivedSourceBuffers;
+  volatile uint32_t _isrCount;               // bumped by MsgDmaIsr, polled by the event loop
+  alt_msgdma_dev* _pMsgDevice;
+  alt_msgdma_extended_descriptor _msgdmaDescriptor;
+} StreamController;
diff --git a/python/openvino/runtime/coredla_device/stream_controller/app/stream_controller_messages.h b/python/openvino/runtime/coredla_device/stream_controller/app/stream_controller_messages.h
new file mode 100644
index 0000000..3891326
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/stream_controller/app/stream_controller_messages.h
@@ -0,0 +1,90 @@
+// Copyright 2023 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+#pragma once
+#include <stdint.h>
+
+// Wire protocol for the host<->Nios mailbox.
+// NOTE(review): these layouts must match the host-side sender byte-for-byte
+// -- confirm against the plugin's StreamControllerComms implementation.
+typedef enum
+{
+  MessageType_Invalid,
+  MessageType_NoOperation,
+  MessageType_GetStatus,
+  MessageType_Status,
+  MessageType_ScheduleItem,
+  MessageType_Ping,
+  MessageType_Pong,
+  MessageType_InitializeStreamController,
+  MessageType_ManualArmDmaTransfer,
+  MessageType_ManualScheduleDlaInference
+} MessageType;
+
+// Status codes reported back via StatusMessagePayload (values start at 1000,
+// presumably to distinguish them from zeroed mailbox memory -- confirm).
+typedef enum
+{
+  NiosStatusType_OK = 1000,
+  NiosStatusType_Error,
+  NiosStatusType_BadMessage,
+  NiosStatusType_BadMessageSequence,
+  NiosStatusType_BadDescriptor,
+  NiosStatusType_AsyncTransferFailed,
+  NiosStatusType_MsgDmaFailed,
+  NiosStatusType_InvalidParameter
+} NiosStatusType;
+
+typedef struct
+{
+  uint32_t _messageReadyMagicNumber; // magic while pending; overwritten with the sequence ID as the ack
+  uint32_t _messageType;             // a MessageType value
+  uint32_t _sequenceID;
+  uint32_t _payload;                 // first payload word; larger payloads extend past this field in the mailbox
+} MessageHeader;
+
+// Message payloads:
+
+// ScheduleItem / debug inference: DDR addresses/sizes for one job.
+typedef struct
+{
+  uint32_t _configurationBaseAddressDDR;
+  uint32_t _configurationSize;
+  uint32_t _inputAddressDDR;
+  uint32_t _outputAddressDDR;
+} CoreDlaJobPayload;
+
+// InitializeStreamController: stream configuration from the host.
+typedef struct
+{
+  uint32_t _sourceBufferSize;
+  uint32_t _dropSourceBuffers;
+  uint32_t _numInferenceRequests;
+} InitializeStreamControllerPayload;
+
+// Status reply: controller state and counters.
+typedef struct
+{
+  NiosStatusType _status;
+  uint32_t _statusLineNumber;
+  uint32_t _numReceivedSourceBuffers;
+  uint32_t _numScheduledInferences;
+  uint32_t _numExecutedJobs;
+} StatusMessagePayload;
+
+// ManualArmDmaTransfer debug command.
+typedef struct
+{
+  uint32_t _sourceBufferSize;
+  uint32_t _inputAddressDDR;
+  uint32_t _fromHPS;  // nonzero selects the st->mm (capture) direction
+} ManualArmDmaTransferPayload;
+
+// ManualScheduleDlaInference debug command.
+typedef struct
+{
+  uint32_t _configurationBaseAddressDDR;
+  uint32_t _configurationSize;
+  uint32_t _inputAddressDDR;
+} ManualScheduleDlaInferencePayload;
+
diff --git a/python/openvino/runtime/coredla_device/stream_controller/build.sh b/python/openvino/runtime/coredla_device/stream_controller/build.sh
new file mode 100755
index 0000000..2d22c5e
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/stream_controller/build.sh
@@ -0,0 +1,54 @@
+#!
/bin/bash +# Run in Nios V Command Shell, Quartus Prime 22.4 or later + +quartus_project=$1 +qsys_file=$2 +hex_file=$3 + +usage() +{ + echo "Usage:" + echo " build.sh <quartus_project_file> <qsys_file> <destination_hex_file>" +} + +if [ -z "$quartus_project" ]; then + usage + exit 1 +fi + +if [ -z "$qsys_file" ]; then + usage + exit 1 +fi + +if [ -z "$hex_file" ]; then + usage + exit 1 +fi + +if [ ! -f "$quartus_project" ]; then + echo Quartus project file not found "$quartus_project" + usage + exit 1 +fi + +if [ ! -f "$qsys_file" ]; then + echo qsys file not found "$qsys_file" + usage + exit 1 +fi + +# Export the bsp folder from the Quartus project, create the +# CMakeFiles.txt for the application, build the app, then +# build the stream_controller.hex binary, in the 'build' folder + +niosv-bsp -c --quartus-project=$quartus_project --qsys=$qsys_file --type=hal bsp/settings.bsp +niosv-app --bsp-dir=bsp --app-dir=app --srcs=app --elf-name=stream_controller.elf + +# cmake dependency, version 3.14.10 or later. https://cmake.org/download/ +cmake -B build -DCMAKE_BUILD_TYPE=Release app +cmake --build build +elf2hex build/stream_controller.elf -b 0x0 -w 32 -e 0x1ffff -r 4 -o build/stream_controller.hex +cp build/stream_controller.hex $hex_file + +exit 0 |
