diff options
Diffstat (limited to 'python/openvino/runtime/coredla_device/mmd/dcp_a10_pac')
24 files changed, 5643 insertions, 0 deletions
diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/.gitignore b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/.gitignore new file mode 100644 index 0000000..66e06bf --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/.gitignore @@ -0,0 +1,18 @@ +*~ +*# +*.marks +release_build/ +build/ +example_designs/mem_bandwidth/bin/ +example_designs/mem_bandwidth/simulation.tar.gz +example_designs/mem_bandwidth/temp_simulation/ +linux64/lib/ +linux64/libexec/diagnose +linux64/libexec/program +ase/mpf_src +*.pyc +*.swp +*.kwlp +*.kwps +temp_simulation/ +simulation.tar.gz diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/CMakeLists.txt b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/CMakeLists.txt new file mode 100644 index 0000000..28dcfa4 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/CMakeLists.txt @@ -0,0 +1,63 @@ +# (C) 2017 Intel Corporation. All rights reserved. +# Your use of Intel Corporation's design tools, logic functions and other +# software and tools, and its AMPP partner logic functions, and any output +# files any of the foregoing (including device programming or simulation +# files), and any associated documentation or information are expressly subject +# to the terms and conditions of the Intel Program License Subscription +# Agreement, Intel MegaCore Function License Agreement, or other applicable +# license agreement, including, without limitation, that your use is for the +# sole purpose of programming logic devices manufactured by Intel and sold by +# Intel or its authorized distributors. Please refer to the applicable +# agreement for further details. 
+ +cmake_minimum_required(VERSION 2.8.12) +project(mmd) + +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules") + +find_package(OPAE REQUIRED) +find_package(NUMA REQUIRED) + +# DLA specific modifications made to the MMD +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDLA_MMD") + +enable_language(C ASM) + +set(ASM_OPTIONS "-x assembler-with-cpp") +if(${CMAKE_C_COMPILER_ID} STREQUAL "Clang") + set(ASM_OPTIONS "${ASM_OPTIONS} -no-integrated-as") +endif() + +set(CMAKE_ASM_FLAGS "${CFLAGS} ${ASM_OPTIONS}") + +set(MMD_SRC + ./host/ccip_mmd.cpp + ./host/ccip_mmd_device.cpp + ./host/dma_work_thread.cpp + ./host/fpga_dma.c + ./host/kernel_interrupt.cpp + ./host/mmd_dma.cpp + ./host/memcpy_s_fast.c + ./host/x86-sse2.S +) + +# Add a shared library target called intel_opae_mmd +# and build it from the MMD_SRC files +add_library(intel_opae_mmd SHARED ${MMD_SRC}) + +# Specify the include directories to be used when compiling intel_opae_mmd library +target_include_directories(intel_opae_mmd PUBLIC + ${CMAKE_CURRENT_SOURCE_DIR}/include + ) + +# Specify libraries needed when liking the intel_opae_mmd library +target_link_libraries(intel_opae_mmd + libopae-c + libnuma +) + +# Set the installation rules for the project +install(TARGETS intel_opae_mmd + LIBRARY DESTINATION lib + COMPONENT intel_opae_mmd +) diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/cmake/modules/FindNUMA.cmake b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/cmake/modules/FindNUMA.cmake new file mode 100644 index 0000000..c981150 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/cmake/modules/FindNUMA.cmake @@ -0,0 +1,34 @@ +# - Try to find libnuma +# Once done will define: +# +# NUMA_FOUND - system has libnuma +# NUMA_INCLUDE_DIRS - include directory with numa.h +# NUMA_LIBRARIES - link with this for libnuma + +find_path(NUMA_INCLUDE_DIRS + NAMES numa.h + PATHS + ${LIBNUMA_ROOT}/include + /usr/include + 
/p/psg/swip/dla/resources/numactl/2.0.16/include + + ) + +find_library(NUMA_LIBRARIES + NAMES numa + PATHS + ${LIBNUMA_ROOT}/lib + ${LIBNUMA_ROOT}/lib64 + /usr/lib + /usr/lib64 + /p/psg/swip/dla/resources/numactl/2.0.16/lib + + ) + +FIND_PACKAGE_HANDLE_STANDARD_ARGS(NUMA + REQUIRED_VARS NUMA_INCLUDE_DIRS NUMA_LIBRARIES) + +add_library(libnuma IMPORTED SHARED) +set_target_properties(libnuma PROPERTIES + IMPORTED_LOCATION ${NUMA_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES ${NUMA_INCLUDE_DIRS}) diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/cmake/modules/FindOPAE.cmake b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/cmake/modules/FindOPAE.cmake new file mode 100644 index 0000000..6395d7c --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/cmake/modules/FindOPAE.cmake @@ -0,0 +1,44 @@ +# - Try to find libintelfpga +# Once done, this will define +# +# libopae-c_FOUND - system has libopae-c +# libopae-c_INCLUDE_DIRS - the libopae-c include directories +# libopae-c_LIBRARIES - link these to use libopae-c + +find_package(PkgConfig) +pkg_check_modules(PC_OPAE QUIET opae-c) + +# Use pkg-config to get hints about paths +execute_process(COMMAND pkg-config --cflags opae-c --silence-errors + COMMAND cut -d I -f 2 + OUTPUT_VARIABLE OPAE-C_PKG_CONFIG_INCLUDE_DIRS) +set(OPAE-C_PKG_CONFIG_INCLUDE_DIRS "${OPAE-C_PKG_CONFIG_INCLUDE_DIRS}" CACHE STRING "Compiler flags for OPAE-C library") + +# Include dir +find_path(libopae-c_INCLUDE_DIRS + NAMES opae/fpga.h + PATHS ${LIBOPAE-C_ROOT}/include + ${OPAE-C_PKG_CONFIG_INCLUDE_DIRS} + /usr/local/include + /usr/include + ${CMAKE_EXTRA_INCLUDES}) + +# The library itself +find_library(libopae-c_LIBRARIES + NAMES opae-c + PATHS ${LIBOPAE-C_ROOT}/lib + ${LIBOPAE-C_ROOT}/lib64 + /usr/local/lib + /usr/lib + /lib + /usr/lib/x86_64-linux-gnu + ${CMAKE_EXTRA_LIBS}) + +FIND_PACKAGE_HANDLE_STANDARD_ARGS(OPAE + REQUIRED_VARS libopae-c_LIBRARIES libopae-c_INCLUDE_DIRS) + +add_library(libopae-c IMPORTED 
SHARED) +set_target_properties(libopae-c PROPERTIES + IMPORTED_LOCATION ${libopae-c_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES ${libopae-c_INCLUDE_DIRS}) + diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/.gitignore b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/.gitignore new file mode 100644 index 0000000..1530978 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/.gitignore @@ -0,0 +1 @@ +*.o
\ No newline at end of file diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/afu_bbb_util.h b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/afu_bbb_util.h new file mode 100644 index 0000000..6d8f9fa --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/afu_bbb_util.h @@ -0,0 +1,123 @@ +/* (C) 1992-2017 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. */ + +/** + * \fpga_dma.h + * \brief FPGA DMA BBB API Header + * + * Known Limitations + * - Driver does not support Address Span Extender + * - Implementation is not optimized for performance. 
+ * User buffer data is copied into a DMA-able buffer before the transfer + * - Supports only synchronous (blocking) transfers + */ + +#ifndef AFU_BBB_UTIL_H__ +#define AFU_BBB_UTIL_H__ + +#include <assert.h> +#include <opae/fpga.h> +#include <uuid/uuid.h> + +#define DFH_FEATURE_EOL(dfh) (((dfh >> 40) & 1) == 1) +#define DFH_FEATURE(dfh) ((dfh >> 60) & 0xf) +#define DFH_FEATURE_IS_PRIVATE(dfh) (DFH_FEATURE(dfh) == 3) +#define DFH_FEATURE_IS_BBB(dfh) (DFH_FEATURE(dfh) == 2) +#define DFH_FEATURE_IS_AFU(dfh) (DFH_FEATURE(dfh) == 1) +#define DFH_FEATURE_NEXT(dfh) ((dfh >> 16) & 0xffffff) + +static bool find_dfh_by_guid(fpga_handle afc_handle, + uint64_t find_id_l, + uint64_t find_id_h, + uint64_t *result_offset = NULL, + uint64_t *result_next_offset = NULL) { + assert(find_id_l); + assert(find_id_h); + + uint64_t offset = 0; + if (result_offset) { + offset = *result_offset; + } + uint64_t dfh = 0; + + // Limit the maximum number of DFH search iterations to avoid getting stuck + // in an infinte loop in case the DFH_FEATURE_EOL is not found. Limit of + // 5000 is very conservaitve. In practice search should terminate in 3 or + // fewer iterations. 
+ int MAX_DFH_SEARCHES = 5000; + int dfh_search_iterations = 0; + + do { + fpgaReadMMIO64(afc_handle, 0, offset, &dfh); + + int is_bbb = DFH_FEATURE_IS_BBB(dfh); + int is_afu = DFH_FEATURE_IS_AFU(dfh); + + if (is_afu || is_bbb) { + uint64_t id_l = 0; + uint64_t id_h = 0; + fpgaReadMMIO64(afc_handle, 0, offset + 8, &id_l); + fpgaReadMMIO64(afc_handle, 0, offset + 16, &id_h); + + if (find_id_l == id_l && find_id_h == id_h) { + if (result_offset) *result_offset = offset; + if (result_next_offset) *result_next_offset = DFH_FEATURE_NEXT(dfh); + return true; + } + } + offset += DFH_FEATURE_NEXT(dfh); + + dfh_search_iterations++; + if (dfh_search_iterations > MAX_DFH_SEARCHES) { + return false; + } + } while (!DFH_FEATURE_EOL(dfh)); + + return false; +} + +static bool find_dfh_by_guid(fpga_handle afc_handle, + const char *guid_str, + uint64_t *result_offset = NULL, + uint64_t *result_next_offset = NULL) { + fpga_guid guid; + + if (uuid_parse(guid_str, guid) < 0) return 0; + + uint32_t i; + uint32_t s; + + uint64_t find_id_l = 0; + uint64_t find_id_h = 0; + + // The API expects the MSB of the GUID at [0] and the LSB at [15]. + s = 64; + for (i = 0; i < 8; ++i) { + s -= 8; + find_id_h = ((find_id_h << 8) | (0xff & guid[i])); + } + + s = 64; + for (i = 0; i < 8; ++i) { + s -= 8; + find_id_l = ((find_id_l << 8) | (0xff & guid[8 + i])); + } + + return find_dfh_by_guid(afc_handle, find_id_l, find_id_h, result_offset, result_next_offset); +} + +#endif // AFU_BBB_UTIL_H__ diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/ccip_mmd.cpp b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/ccip_mmd.cpp new file mode 100644 index 0000000..b7cd06a --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/ccip_mmd.cpp @@ -0,0 +1,655 @@ +/* (C) 1992-2017 Intel Corporation. 
*/ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. */ + +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <zlib.h> + +#include <cassert> +#include <iomanip> +#include <iostream> +#include <map> +#include <sstream> + +#ifdef DLA_MMD +#include <chrono> +#include <thread> +#endif + +#include <safe_string/safe_string.h> +#include "memcpy_s_fast.h" + +#include "aocl_mmd.h" +#include "ccip_mmd_device.h" + +using namespace intel_opae_mmd; + +#define ACL_DCP_ERROR_IF(COND, NEXT, ...) \ + do { \ + if (COND) { \ + printf("\nMMD ERROR: " __VA_ARGS__); \ + fflush(stdout); \ + NEXT; \ + } \ + } while (0) + +#define ACL_PKG_SECTION_DCP_GBS_GZ ".acl.gbs.gz" + +// If the MMD is loaded dynamically, destructors in the MMD will execute before the destructors in the runtime +// upon program termination. 
The DeviceMapManager guards accesses to the device/handle maps to make sure +// the runtime doesn't get to reference them after MMD destructors have been called. +// Destructor makes sure that all devices are closed at program termination regardless of what the runtime does. +// Implemented as a singleton. +class DeviceMapManager final { + public: + typedef std::map<int, CcipDevice*> t_handle_to_dev_map; + typedef std::map<uint64_t, int> t_id_to_handle_map; + + static const int SUCCESS = 0; + static const int FAILURE = -1; + + // Returns handle and device pointer to the device with the specified name + // Creates a new entry for this device if it doesn't already exist + // Return 0 on success, -1 on failure + int get_or_create_device(const char* board_name, int* handle, CcipDevice** device); + + // Return obj id based on BSP name. + uint64_t id_from_name(const char* board_name); + + // Return MMD handle based on obj id. Returned value is negative if board doesn't exist + inline int handle_from_id(uint64_t obj_id); + + // Return pointer to CCIP device based on MMD handle. 
Returned value is null if board doesn't exist + CcipDevice* device_from_handle(int handle); + + // Closes specified device if it exists + void close_device_if_exists(int handle); + + // Returns a reference to the class singleton + static DeviceMapManager& get_instance() { + static DeviceMapManager instance; + return instance; + } + + DeviceMapManager(DeviceMapManager const&) = delete; + void operator=(DeviceMapManager const&) = delete; + ~DeviceMapManager() { + // delete all allocated CcipDevice* entries + while (handle_to_dev_map->size() > 0) { + int handle = handle_to_dev_map->begin()->first; + aocl_mmd_close(handle); + } + delete handle_to_dev_map; + delete id_to_handle_map; + handle_to_dev_map = nullptr; + id_to_handle_map = nullptr; + } + + private: + DeviceMapManager() { + handle_to_dev_map = new t_handle_to_dev_map(); + id_to_handle_map = new t_id_to_handle_map(); + } + t_handle_to_dev_map* handle_to_dev_map = nullptr; + t_id_to_handle_map* id_to_handle_map = nullptr; +}; +static DeviceMapManager& device_manager = DeviceMapManager::get_instance(); + +int DeviceMapManager::get_or_create_device(const char* board_name, int* handle, CcipDevice** device) { + int _handle = CCIP_MMD_INVALID_PARAM; + CcipDevice* _device = nullptr; + + if (id_to_handle_map == nullptr || handle_to_dev_map == nullptr) { + return DeviceMapManager::FAILURE; + } + + uint64_t obj_id = id_from_name(board_name); + if (id_to_handle_map->count(obj_id) == 0) { + try { + _device = new CcipDevice(obj_id); + _handle = _device->get_mmd_handle(); + id_to_handle_map->insert({obj_id, _handle}); + handle_to_dev_map->insert({_handle, _device}); + } catch (std::runtime_error& e) { + LOG_ERR("%s\n", e.what()); + delete _device; + return DeviceMapManager::FAILURE; + } + } else { + _handle = id_to_handle_map->at(obj_id); + _device = handle_to_dev_map->at(_handle); + } + + (*handle) = _handle; + (*device) = _device; + return DeviceMapManager::SUCCESS; +} + +uint64_t DeviceMapManager::id_from_name(const char* 
board_name) { + uint64_t obj_id = 0; + if (CcipDevice::parse_board_name(board_name, obj_id)) { + return obj_id; + } else { + // TODO: add error hanlding for DeviceMapManager (make sure 0 is marked as invalid device) + return 0; + } +} + +inline int DeviceMapManager::handle_from_id(uint64_t obj_id) { + int handle = CCIP_MMD_INVALID_PARAM; + if (id_to_handle_map) { + auto it = id_to_handle_map->find(obj_id); + if (it != id_to_handle_map->end()) { + handle = it->second; + } + } + return handle; +} + +CcipDevice* DeviceMapManager::device_from_handle(int handle) { + CcipDevice* dev = nullptr; + if (handle_to_dev_map) { + auto it = handle_to_dev_map->find(handle); + if (it != handle_to_dev_map->end()) { + return it->second; + } + } + return dev; +} + +void DeviceMapManager::close_device_if_exists(int handle) { + if (handle_to_dev_map) { + if (handle_to_dev_map->count(handle) > 0) { + CcipDevice* dev = handle_to_dev_map->at(handle); + uint64_t obj_id = dev->get_fpga_obj_id(); + delete dev; + handle_to_dev_map->erase(handle); + id_to_handle_map->erase(obj_id); + } + } +} + +// Interface for checking if AFU has BSP loaded +bool ccip_mmd_bsp_loaded(const char* name) { + uint64_t obj_id = device_manager.id_from_name(name); + if (!obj_id) { + return false; + } + + int handle = device_manager.handle_from_id(obj_id); + if (handle > 0) { + CcipDevice* dev = device_manager.device_from_handle(handle); + if (dev) + return dev->bsp_loaded(); + else + return false; + } else { + bool bsp_loaded = false; + try { + CcipDevice dev(obj_id); + bsp_loaded = dev.bsp_loaded(); + } catch (std::runtime_error& e) { + LOG_ERR("%s\n", e.what()); + return false; + } + return bsp_loaded; + } +} + +static int get_offline_num_acl_boards(bool bsp_only = true) { + fpga_guid dcp_guid; + fpga_result res = FPGA_OK; + uint32_t num_matches = 0; + bool ret_err = false; + fpga_properties filter = NULL; + + if (uuid_parse(DCP_OPENCL_BSP_AFU_ID, dcp_guid) < 0) { + LOG_ERR("Error parsing guid '%s'\n", 
DCP_OPENCL_BSP_AFU_ID); + ret_err = true; + goto out; + } + + res = fpgaGetProperties(NULL, &filter); + if (res != FPGA_OK) { + LOG_ERR("Error creating properties object: %s\n", fpgaErrStr(res)); + ret_err = true; + goto out; + } + + if (bsp_only) { + res = fpgaPropertiesSetGUID(filter, dcp_guid); + if (res != FPGA_OK) { + LOG_ERR("Error setting GUID: %s\n", fpgaErrStr(res)); + ret_err = true; + goto out; + } + } + + res = fpgaPropertiesSetObjectType(filter, FPGA_ACCELERATOR); + if (res != FPGA_OK) { + LOG_ERR("Error setting object type: %s\n", fpgaErrStr(res)); + ret_err = true; + goto out; + } + + res = fpgaEnumerate(&filter, 1, NULL, 0, &num_matches); + if (res != FPGA_OK) { + LOG_ERR("Error enumerating AFCs: %s\n", fpgaErrStr(res)); + ret_err = true; + goto out; + } + +out: + if (filter) fpgaDestroyProperties(&filter); + + if (ret_err) { + return CCIP_MMD_AOCL_ERR; + } else { + return num_matches; + } +} + +bool static get_offline_board_names(std::string& boards, bool bsp_only = true) { + fpga_guid dcp_guid; + fpga_result res = FPGA_OK; + uint32_t num_matches = 0; + fpga_properties filter = nullptr; + fpga_properties prop = nullptr; + std::ostringstream board_name; + fpga_token* toks = nullptr; + uint64_t obj_id; + bool success = true; + + if (uuid_parse(DCP_OPENCL_BSP_AFU_ID, dcp_guid) < 0) { + LOG_ERR("Error parsing guid '%s'\n", DCP_OPENCL_BSP_AFU_ID); + success = false; + goto cleanup; + } + + res = fpgaGetProperties(NULL, &filter); + if (res != FPGA_OK) { + LOG_ERR("Error creating properties object: %s\n", fpgaErrStr(res)); + success = false; + goto cleanup; + } + + res = fpgaPropertiesSetObjectType(filter, FPGA_ACCELERATOR); + if (res != FPGA_OK) { + LOG_ERR("Error setting object type: %s\n", fpgaErrStr(res)); + success = false; + goto cleanup; + } + + if (bsp_only) { + res = fpgaPropertiesSetGUID(filter, dcp_guid); + if (res != FPGA_OK) { + LOG_ERR("Error setting GUID: %s\n", fpgaErrStr(res)); + success = false; + goto cleanup; + } + } + res = 
fpgaEnumerate(&filter, 1, NULL, 0, &num_matches); + if (res != FPGA_OK) { + LOG_ERR("Error enumerating AFCs: %s\n", fpgaErrStr(res)); + success = false; + goto cleanup; + } + + toks = static_cast<fpga_token*>(calloc(num_matches, sizeof(fpga_token))); + if (toks == NULL) { + LOG_ERR("Error allocating memory\n"); + success = false; + goto cleanup; + } + + res = fpgaEnumerate(&filter, 1, toks, num_matches, &num_matches); + if (res != FPGA_OK) { + LOG_ERR("Error enumerating AFCs: %s\n", fpgaErrStr(res)); + success = false; + goto cleanup; + } + + for (unsigned int i = 0; i < num_matches; i++) { + if (prop) fpgaDestroyProperties(&prop); + res = fpgaGetProperties(toks[i], &prop); + if (res == FPGA_OK) { + res = fpgaPropertiesGetObjectID(prop, &obj_id); + if (res != FPGA_OK) { + LOG_ERR("Error reading object ID: %s\n", fpgaErrStr(res)); + success = false; + break; + } + boards.append(CcipDevice::get_board_name(BSP_NAME, obj_id)); + if (i < num_matches - 1) boards.append(";"); + } else { + success = false; + LOG_ERR("Error reading properties: %s\n", fpgaErrStr(res)); + } + } + +cleanup: + if (prop) { + fpgaDestroyProperties(&prop); + } + if (filter) { + fpgaDestroyProperties(&filter); + } + if (toks) { + for (unsigned i = 0; i < num_matches; i++) { + if (toks[i]) { + fpgaDestroyToken(&toks[i]); + } + } + free(toks); + } + + return success; +} + +int aocl_mmd_yield(int handle) { + DEBUG_PRINT("* Called: aocl_mmd_yield\n"); + YIELD_DELAY(); + + CcipDevice* dev = device_manager.device_from_handle(handle); + assert(dev); + if (dev) { + return dev->yield(); + } + + return 0; +} + +// Macros used for acol_mmd_get_offline_info and aocl_mmd_get_info +#define RESULT_INT(X) \ + { \ + *((int*)param_value) = X; \ + if (param_size_ret) *param_size_ret = sizeof(int); \ + } +#define RESULT_STR(X) \ + do { \ + unsigned Xlen = strlen(X) + 1; \ + unsigned Xcpylen = (param_value_size <= Xlen) ? 
param_value_size : Xlen; \ + memcpy_s_fast((void*)param_value, param_value_size, X, Xcpylen); \ + if (param_size_ret) *param_size_ret = Xcpylen; \ + } while (0) + +int aocl_mmd_get_offline_info(aocl_mmd_offline_info_t requested_info_id, + size_t param_value_size, + void* param_value, + size_t* param_size_ret) { + // aocl_mmd_get_offline_info can be called many times by the runtime + // and it is expensive to query the system. Only compute values first + // time aocl_mmd_get_offline_info called future iterations use saved results + static bool initialized = false; + static int mem_type_info; + static int num_acl_boards; + static std::string boards; + static bool success; + + if (!initialized) { + mem_type_info = (int)AOCL_MMD_PHYSICAL_MEMORY; + num_acl_boards = get_offline_num_acl_boards(); + success = get_offline_board_names(boards, true); + initialized = true; + } + + switch (requested_info_id) { + case AOCL_MMD_VERSION: + RESULT_STR(AOCL_MMD_VERSION_STRING); + break; + case AOCL_MMD_NUM_BOARDS: { + if (num_acl_boards >= 0) { + RESULT_INT(num_acl_boards); + } else { + return CCIP_MMD_AOCL_ERR; + } + break; + } + case AOCL_MMD_VENDOR_NAME: + RESULT_STR("Intel Corp"); + break; + case AOCL_MMD_BOARD_NAMES: { + if (success) { + RESULT_STR(boards.c_str()); + } else { + return CCIP_MMD_AOCL_ERR; + } + break; + } + case AOCL_MMD_VENDOR_ID: + RESULT_INT(0); + break; + case AOCL_MMD_USES_YIELD: + RESULT_INT(KernelInterrupt::yield_is_enabled()); + break; + case AOCL_MMD_MEM_TYPES_SUPPORTED: + RESULT_INT(mem_type_info); + break; + } + + return 0; +} + +int ccip_mmd_get_offline_board_names(size_t param_value_size, void* param_value, size_t* param_size_ret) { + std::string boards; + bool success = get_offline_board_names(boards, false); + if (success) { + RESULT_STR(boards.c_str()); + } else { + RESULT_INT(-1); + } + + return 0; +} + +int aocl_mmd_get_info( + int handle, aocl_mmd_info_t requested_info_id, size_t param_value_size, void* param_value, size_t* param_size_ret) { + 
DEBUG_PRINT("called aocl_mmd_get_info\n"); + CcipDevice* dev = device_manager.device_from_handle(handle); + if (dev == NULL) return 0; + + assert(param_value); + switch (requested_info_id) { + case AOCL_MMD_BOARD_NAME: { + std::ostringstream board_name; + board_name << "Intel PAC Platform" + << " (" << dev->get_dev_name() << ")"; + RESULT_STR(board_name.str().c_str()); + break; + } + case AOCL_MMD_NUM_KERNEL_INTERFACES: + RESULT_INT(1); + break; + case AOCL_MMD_KERNEL_INTERFACES: + RESULT_INT(AOCL_MMD_KERNEL); + break; +#ifdef SIM + case AOCL_MMD_PLL_INTERFACES: + RESULT_INT(-1); + break; +#else + case AOCL_MMD_PLL_INTERFACES: + RESULT_INT(-1); + break; +#endif + case AOCL_MMD_MEMORY_INTERFACE: + RESULT_INT(AOCL_MMD_MEMORY); + break; + case AOCL_MMD_PCIE_INFO: { + RESULT_STR(dev->get_bdf().c_str()); + break; + } + case AOCL_MMD_BOARD_UNIQUE_ID: + RESULT_INT(0); + break; + case AOCL_MMD_TEMPERATURE: { + if (param_value_size == sizeof(float)) { + float* ptr = static_cast<float*>(param_value); + *ptr = dev->get_temperature(); + if (param_size_ret) *param_size_ret = sizeof(float); + } + break; + } + case AOCL_MMD_CONCURRENT_READS: + RESULT_INT(1); + break; + case AOCL_MMD_CONCURRENT_WRITES: + RESULT_INT(1); + break; + case AOCL_MMD_CONCURRENT_READS_OR_WRITES: + RESULT_INT(2); + break; + } + return 0; +} + +#undef RESULT_INT +#undef RESULT_STR + +int aocl_mmd_set_interrupt_handler(int handle, aocl_mmd_interrupt_handler_fn fn, void* user_data) { + CcipDevice* dev = device_manager.device_from_handle(handle); + if (dev) { + dev->set_kernel_interrupt(fn, user_data); + } else { + return CCIP_MMD_AOCL_ERR; + } + return 0; +} + +int aocl_mmd_set_status_handler(int handle, aocl_mmd_status_handler_fn fn, void* user_data) { + CcipDevice* dev = device_manager.device_from_handle(handle); + if (dev) dev->set_status_handler(fn, user_data); + // TODO: handle error condition if dev null + return 0; +} + +// Host to device-global-memory write +int aocl_mmd_write(int handle, 
aocl_mmd_op_t op, size_t len, const void* src, int mmd_interface, size_t offset) { + DCP_DEBUG_MEM("\n- aocl_mmd_write: %d\t %p\t %lu\t %p\t %d\t %lu\n", handle, op, len, src, mmd_interface, offset); + CcipDevice* dev = device_manager.device_from_handle(handle); + if (dev) + return dev->write_block(op, mmd_interface, src, offset, len); + else + return -1; + // TODO: handle error condition if dev null +} + +int aocl_mmd_read(int handle, aocl_mmd_op_t op, size_t len, void* dst, int mmd_interface, size_t offset) { + DCP_DEBUG_MEM("\n+ aocl_mmd_read: %d\t %p\t %lu\t %p\t %d\t %lu\n", handle, op, len, dst, mmd_interface, offset); + CcipDevice* dev = device_manager.device_from_handle(handle); + if (dev) + return dev->read_block(op, mmd_interface, dst, offset, len); + else + return -1; + // TODO: handle error condition if dev null +} + +int aocl_mmd_open(const char* name) { + DEBUG_PRINT("Opening device: %s\n", name); + + uint64_t obj_id = device_manager.id_from_name(name); + if (!obj_id) { + return CCIP_MMD_INVALID_PARAM; + } + + int handle; + CcipDevice* dev = nullptr; + if (device_manager.get_or_create_device(name, &handle, &dev) != DeviceMapManager::SUCCESS) { + delete dev; + return CCIP_MMD_AOCL_ERR; + } + + assert(dev); + if (dev->bsp_loaded()) { + if (!dev->initialize_bsp()) { + LOG_ERR("Error initializing bsp\n"); + return CCIP_MMD_BSP_INIT_FAILED; + } + } else { + return CCIP_MMD_BSP_NOT_LOADED; + } + + return handle; +} + +int aocl_mmd_close(int handle) { + device_manager.close_device_if_exists(handle); + + return 0; +} + +// CoreDLA modifications +// To support multiple different FPGA boards, anything board specific must be implemented in a +// board-specific MMD instead of the CoreDLA runtime layer. 
+#ifdef DLA_MMD +// Query functions to get board-specific values +AOCL_MMD_CALL int dla_mmd_get_max_num_instances() { return 2; } +AOCL_MMD_CALL uint64_t dla_mmd_get_ddr_size_per_instance() { return 1ULL << 32; } +AOCL_MMD_CALL double dla_mmd_get_ddr_clock_freq() { return 266.666667; } // MHz + +// Helper functions for the wrapper functions around CSR and DDR +uint64_t dla_get_raw_csr_address(int instance, uint64_t addr) { return 0x38000 + (0x1000 * instance) + addr; } +uint64_t dla_get_raw_ddr_address(int instance, uint64_t addr) { return (1ULL << 32) * instance + addr; } + +// Wrappers around CSR and DDR reads and writes to abstract away board-specific offsets +AOCL_MMD_CALL int dla_mmd_csr_write(int handle, int instance, uint64_t addr, const uint32_t* data) { + return aocl_mmd_write(handle, NULL, sizeof(uint32_t), data, AOCL_MMD_KERNEL, dla_get_raw_csr_address(instance, addr)); +} +AOCL_MMD_CALL int dla_mmd_csr_read(int handle, int instance, uint64_t addr, uint32_t* data) { + return aocl_mmd_read(handle, NULL, sizeof(uint32_t), data, AOCL_MMD_KERNEL, dla_get_raw_csr_address(instance, addr)); +} +AOCL_MMD_CALL int dla_mmd_ddr_write(int handle, int instance, uint64_t addr, uint64_t length, const void* data) { + return aocl_mmd_write(handle, NULL, length, data, AOCL_MMD_MEMORY, dla_get_raw_ddr_address(instance, addr)); +} +AOCL_MMD_CALL int dla_mmd_ddr_read(int handle, int instance, uint64_t addr, uint64_t length, void* data) { + return aocl_mmd_read(handle, NULL, length, data, AOCL_MMD_MEMORY, dla_get_raw_ddr_address(instance, addr)); +} + +// Get the PLL clock frequency in MHz, returns a negative value if there is an error +AOCL_MMD_CALL double dla_mmd_get_coredla_clock_freq(int handle) { + constexpr uint64_t hw_timer_address = 0x37000; + const uint32_t start_bit = 1; + const uint32_t stop_bit = 2; + + // Send the start command to the hardware counter + std::chrono::high_resolution_clock::time_point time_before = std::chrono::high_resolution_clock::now(); + int 
status = aocl_mmd_write(handle, NULL, sizeof(uint32_t), &start_bit, AOCL_MMD_KERNEL, hw_timer_address); + assert(status == 0); + + // Unlikely to sleep for exactly 10 milliseconds, but it doesn't matter since we use a high resolution clock to + // determine the amount of time between the start and stop commands for the hardware counter + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + + // Send the stop command to the hardware counter + std::chrono::high_resolution_clock::time_point time_after = std::chrono::high_resolution_clock::now(); + status = aocl_mmd_write(handle, NULL, sizeof(uint32_t), &stop_bit, AOCL_MMD_KERNEL, hw_timer_address); + assert(status == 0); + + // Read back the value of the counter + uint32_t counter = 0; + status = aocl_mmd_read(handle, NULL, sizeof(uint32_t), &counter, AOCL_MMD_KERNEL, hw_timer_address); + assert(status == 0); + + // Calculate the clock frequency of the counter, which is running on clk_dla + double elapsed_seconds = std::chrono::duration_cast<std::chrono::duration<double>>(time_after - time_before).count(); + return 1.0e-6 * counter / elapsed_seconds; // 1.0e-6 is to convert to MHz +} + +#endif diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/ccip_mmd_device.cpp b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/ccip_mmd_device.cpp new file mode 100644 index 0000000..9bc055a --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/ccip_mmd_device.cpp @@ -0,0 +1,579 @@ +/* (C) 1992-2017 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. 
See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. */ + +#include <assert.h> +#include <numa.h> + +#include <unistd.h> +#include <fstream> +#include <iomanip> +#include <iostream> +#include <limits> +#include <sstream> + +#include <safe_string/safe_string.h> +#include "memcpy_s_fast.h" + +#include "ccip_mmd_device.h" + +// TODO: better encapsulation of afu_bbb_util functions +#include "afu_bbb_util.h" + +#define MMD_COPY_BUFFER_SIZE (1024 * 1024) + +#define MEM_WINDOW_BBB_GUID "72347537-7821-4125-442a-472d4b615064" +#define MEM_WINDOW_BBB_SIZE 8192 + +#define MSGDMA_BBB_GUID "ef82def7-f6ec-40fc-a914-9a35bace01ea" +#define MSGDMA_BBB_SIZE 256 + +#define NULL_DFH_BBB_GUID "da1182b1-b344-4e23-90fe-6aab12a0132f" +#define BSP_AFU_GUID "96ef4230-dafa-cb5f-18b7-9ffa2ee54aa0" + +using namespace intel_opae_mmd; + +int CcipDevice::next_mmd_handle{1}; + +std::string CcipDevice::get_board_name(std::string prefix, uint64_t obj_id) { + std::ostringstream stream; + stream << prefix << std::setbase(16) << obj_id; + return stream.str(); +} + +CcipDevice::CcipDevice(uint64_t obj_id) + : fpga_obj_id(obj_id), + 
kernel_interrupt_thread(NULL), + event_update(NULL), + event_update_user_data(NULL), + enable_set_numa(false), + fme_sysfs_temp_initialized(false), + bus(0), + device(0), + function(0), + afu_initialized(false), + bsp_initialized(false), + mmio_is_mapped(false), + afc_handle(NULL), + filter(NULL), + afc_token(NULL), + dma_ch0_dfh_offset(0), + dma_ch1_dfh_offset(0), + dma_ase_dfh_offset(0), + dma_host_to_fpga(NULL), + dma_fpga_to_host(NULL), + mmd_copy_buffer(NULL) { + // Note that this constructor is not thread-safe because next_mmd_handle + // is shared between all class instances + mmd_handle = next_mmd_handle; + if (next_mmd_handle == std::numeric_limits<int>::max()) + next_mmd_handle = 1; + else + next_mmd_handle++; + + mmd_copy_buffer = (char *)malloc(MMD_COPY_BUFFER_SIZE); + if (mmd_copy_buffer == NULL) { + throw std::runtime_error(std::string("malloc failed for mmd_copy_buffer")); + } + + fpga_result res = FPGA_OK; + uint32_t num_matches; + + res = fpgaGetProperties(NULL, &filter); + if (res != FPGA_OK) { + throw std::runtime_error(std::string("Error creating properties object: ") + std::string(fpgaErrStr(res))); + } + + res = fpgaPropertiesSetObjectType(filter, FPGA_ACCELERATOR); + if (res != FPGA_OK) { + throw std::runtime_error(std::string("Error setting object type: ") + std::string(fpgaErrStr(res))); + } + + res = fpgaPropertiesSetObjectID(filter, obj_id); + if (res != FPGA_OK) { + throw std::runtime_error(std::string("Error setting object ID: ") + std::string(fpgaErrStr(res))); + } + + res = fpgaEnumerate(&filter, 1, &afc_token, 1, &num_matches); + if (res != FPGA_OK) { + throw std::runtime_error(std::string("Error enumerating AFCs: ") + std::string(fpgaErrStr(res))); + } + + if (num_matches < 1) { + res = fpgaDestroyProperties(&filter); + throw std::runtime_error("AFC not found"); + } + + res = fpgaOpen(afc_token, &afc_handle, 0); + if (res != FPGA_OK) { + throw std::runtime_error(std::string("Error opening AFC: ") + std::string(fpgaErrStr(res))); + } 
+
+  fpga_properties prop = nullptr;
+  res = fpgaGetProperties(afc_token, &prop);
+  if (res != FPGA_OK) {
+    throw std::runtime_error(std::string("Error reading properties: ") + std::string(fpgaErrStr(res)));
+  }
+
+  if (prop) {
+    res = fpgaPropertiesGetBus(prop, &bus);
+    if (res != FPGA_OK) {
+      throw std::runtime_error(std::string("Error reading bus: ") + std::string(fpgaErrStr(res)));
+    }
+    res = fpgaPropertiesGetDevice(prop, &device);
+    if (res != FPGA_OK) {
+      throw std::runtime_error(std::string("Error reading device: ") + std::string(fpgaErrStr(res)));
+    }
+    res = fpgaPropertiesGetFunction(prop, &function);
+    if (res != FPGA_OK) {
+      throw std::runtime_error(std::string("Error reading function: ") + std::string(fpgaErrStr(res)));
+    }
+    fpgaDestroyProperties(&prop);
+  }
+
+  initialize_fme_sysfs();
+
+  mmd_dev_name = get_board_name(BSP_NAME, obj_id);
+  afu_initialized = true;
+}
+
+// Return true if board name parses correctly, false if it does not
+// Return the parsed object_id in obj_id as an [out] parameter
+bool CcipDevice::parse_board_name(const char *board_name_str, uint64_t &obj_id) {
+  std::string prefix(BSP_NAME);
+  std::string board_name(board_name_str);
+
+  obj_id = 0;
+  // The name is invalid if it is too short to carry an object-ID suffix OR if
+  // it does not begin with the expected BSP prefix. The original code joined
+  // these checks with '&&', which let a sufficiently long name with the wrong
+  // prefix slip through and feed arbitrary text to std::stol below.
+  if (board_name.length() <= prefix.length() || board_name.compare(0, prefix.length(), prefix) != 0) {
+    LOG_ERR("Error parsing device name '%s'\n", board_name_str);
+    return false;
+  }
+
+  std::string device_num_str = board_name.substr(prefix.length());
+  obj_id = std::stol(device_num_str, 0, 16);
+
+  // Assume that OPAE does not use 0 as a valid object ID. This is true for now
+  // but relies somewhat on an implementation dependent feature.
+  assert(obj_id > 0);
+  return true;
+}
+
+// Read information directly from sysfs. This is non-portable and relies on
+// paths set in driver (will not interoperate between DFH driver in up-stream
+// kernel and Intel driver distributed with PAC cards).
In the future hopefully +// OPAE can provide SDK to read this information +void CcipDevice::initialize_fme_sysfs() { + const int MAX_LEN = 250; + char temp_fmepath[MAX_LEN]; + char numa_path[MAX_LEN]; + + // HACK: currently ObjectID is constructed using its lower 20 bits + // as the device minor number. The device minor number also matches + // the device ID in sysfs. This is a simple way to construct a path + // to the device FME using information that is already available (object_id). + // Eventually this code should be replaced with a direct call to OPAE C API, + // but API does not currently expose the device temperature. + int dev_num = 0xFFFFF & fpga_obj_id; + + // Path to temperature value + snprintf(temp_fmepath, + MAX_LEN, + "/sys/class/fpga/intel-fpga-dev.%d/intel-fpga-fme.%d/thermal_mgmt/temperature", + dev_num, + dev_num); + // Path to NUMA node + snprintf(numa_path, MAX_LEN, "/sys/class/fpga/intel-fpga-dev.%d/device/numa_node", dev_num); + + // Try to open the sysfs file. If open succeeds then set as initialized + // to be able to read temperature in future. If open fails then not + // initalized and skip attempt to read temperature in future. + FILE *tmp; + tmp = fopen(temp_fmepath, "r"); + if (tmp) { + fme_sysfs_temp_path = std::string(temp_fmepath); + fme_sysfs_temp_initialized = true; + fclose(tmp); + } + + // Read NUMA node and set value for future use. 
If not available set to -1 + // and disable use of NUMA setting + std::ifstream sysfs_numa_node(numa_path, std::ifstream::in); + if (sysfs_numa_node.is_open()) { + sysfs_numa_node >> fpga_numa_node; + sysfs_numa_node.close(); + if (std::stoi(fpga_numa_node) >= 0) { + enable_set_numa = true; + } else { + enable_set_numa = false; + } + } else { + enable_set_numa = false; + fpga_numa_node = "-1"; + } +} + +bool CcipDevice::find_dma_dfh_offsets() { + uint64_t dfh_offset = 0; + uint64_t next_dfh_offset = 0; + if (find_dfh_by_guid(afc_handle, MSGDMA_BBB_GUID, &dfh_offset, &next_dfh_offset)) { + dma_ch0_dfh_offset = dfh_offset; + DEBUG_PRINT("DMA CH1 offset: 0x%lX\t GUID: %s\n", dma_ch0_dfh_offset, MSGDMA_BBB_GUID); + } else { + fprintf(stderr, "Error initalizing DMA: Cannot find DMA channel 0 DFH offset\n"); + return false; + } + + dfh_offset += next_dfh_offset; + if (find_dfh_by_guid(afc_handle, MSGDMA_BBB_GUID, &dfh_offset, &next_dfh_offset)) { + dma_ch1_dfh_offset = dfh_offset; + DEBUG_PRINT("DMA CH2 offset: 0x%lX\t GUID: %s\n", dma_ch1_dfh_offset, MSGDMA_BBB_GUID); + } else { + fprintf(stderr, "Error initalizing DMA. Cannot find DMA channel 2 DFH offset\n"); + return false; + } + + dfh_offset = 0; + if (find_dfh_by_guid(afc_handle, MEM_WINDOW_BBB_GUID, &dfh_offset, &next_dfh_offset)) { + dma_ase_dfh_offset = dfh_offset; + DEBUG_PRINT("DMA ASE offset: 0x%lX\t GUID: %s\n", dma_ase_dfh_offset, MEM_WINDOW_BBB_GUID); + } else { + fprintf(stderr, "Error initalizing DMA. 
Cannot find ASE DFH offset\n"); + return false; + } + + assert(dma_ch0_dfh_offset != 0); + assert(dma_ch1_dfh_offset != 0); + assert(dma_ase_dfh_offset != 0); + assert(dma_ch0_dfh_offset != dma_ch1_dfh_offset); + + return true; +} + +bool CcipDevice::initialize_bsp() { + if (bsp_initialized) { + return true; + } + + fpga_result res = fpgaMapMMIO(afc_handle, 0, NULL); + if (res != FPGA_OK) { + LOG_ERR("Error mapping MMIO space: %s\n", fpgaErrStr(res)); + return false; + } + mmio_is_mapped = true; + + /* Reset AFC */ + res = fpgaReset(afc_handle); + if (res != FPGA_OK) { + LOG_ERR("Error resetting AFC: %s\n", fpgaErrStr(res)); + return false; + } + AFU_RESET_DELAY(); + + // DMA performance is heavily dependent on the memcpy operation that transfers + // data from user allocated buffer to the pinned buffer that is used for + // DMA. On some machines with multiple NUMA nodes it is critical for performance + // that the pinned buffer is located on the NUMA node as the threads that + // performs the DMA operation. + // + // The performance also improves slighlty if the DMA threads are on the same + // NUMA node as the FPGA PCI device. + // + // This code pins memory allocation to occur from FPGA NUMA node prior to + // initializing the DMA buffers. It also pins all threads in the process + // to run on this same node. 
+ struct bitmask *mask = NULL; + if (enable_set_numa) { + mask = numa_parse_nodestring(fpga_numa_node.c_str()); + numa_set_membind(mask); + int ret = numa_run_on_node_mask_all(mask); + if (ret < 0) { + fprintf(stderr, " Error setting NUMA node mask\n"); + } + } + + find_dma_dfh_offsets(); + + const int dma_ch0_interrupt_num = 0; // DMA channel 0 hardcoded to interrupt 0 + dma_host_to_fpga = new mmd_dma(afc_handle, mmd_handle, dma_ch0_dfh_offset, dma_ase_dfh_offset, dma_ch0_interrupt_num); + if (!dma_host_to_fpga->initialized()) { + LOG_ERR("Error initializing mmd dma\n"); + delete dma_host_to_fpga; + return false; + } + + const int dma_ch1_interrupt_num = 2; // DMA channel 1 hardcoded to interrupt 2 + dma_fpga_to_host = new mmd_dma(afc_handle, mmd_handle, dma_ch1_dfh_offset, dma_ase_dfh_offset, dma_ch1_interrupt_num); + if (!dma_fpga_to_host->initialized()) { + fprintf(stderr, "Error initializing mmd dma\n"); + return false; + } + + // Turn off membind restriction in order to allow future allocation to + // occur on different NUMA nodes if needed. Hypothesis is that only + // the pinned buffers are performance critical for the memcpy. Other + // allocations in the process can occur on other NUMA nodes if needed. 
+ if (enable_set_numa) { + numa_set_membind(numa_nodes_ptr); + numa_free_nodemask(mask); + } + + kernel_interrupt_thread = new KernelInterrupt(afc_handle, mmd_handle); + + if (!kernel_interrupt_thread->initialized()) { + LOG_ERR("Error initializing kernel interrupts\n"); + delete kernel_interrupt_thread; + return false; + } + + bsp_initialized = true; + return bsp_initialized; +} + +CcipDevice::~CcipDevice() { + int num_errors = 0; + if (mmd_copy_buffer) { + free(mmd_copy_buffer); + mmd_copy_buffer = NULL; + } + + if (kernel_interrupt_thread) { + delete kernel_interrupt_thread; + kernel_interrupt_thread = NULL; + } + + if (dma_host_to_fpga) { + delete dma_host_to_fpga; + dma_host_to_fpga = NULL; + } + + if (dma_fpga_to_host) { + delete dma_fpga_to_host; + dma_fpga_to_host = NULL; + } + + if (mmio_is_mapped) { + if (fpgaUnmapMMIO(afc_handle, 0)) num_errors++; + } + + if (afc_handle) { + if (fpgaClose(afc_handle) != FPGA_OK) num_errors++; + } + + if (afc_token) { + if (fpgaDestroyToken(&afc_token) != FPGA_OK) num_errors++; + } + + if (filter) { + if (fpgaDestroyProperties(&filter) != FPGA_OK) num_errors++; + } + + if (num_errors > 0) { + DEBUG_PRINT("Error freeing resources in destructor\n"); + } +} + +int CcipDevice::yield() { + if (kernel_interrupt_thread) kernel_interrupt_thread->yield(); + return 0; +} + +bool CcipDevice::bsp_loaded() { + fpga_guid dcp_guid; + fpga_guid afu_guid; + fpga_properties prop; + fpga_result res; + + if (uuid_parse(DCP_OPENCL_BSP_AFU_ID, dcp_guid) < 0) { + LOG_ERR("Error parsing guid '%s'\n", DCP_OPENCL_BSP_AFU_ID); + return false; + } + + res = fpgaGetProperties(afc_token, &prop); + if (res != FPGA_OK) { + LOG_ERR("Error reading properties: %s\n", fpgaErrStr(res)); + fpgaDestroyProperties(&prop); + return false; + } + + res = fpgaPropertiesGetGUID(prop, &afu_guid); + if (res != FPGA_OK) { + LOG_ERR("Error reading GUID\n"); + fpgaDestroyProperties(&prop); + return false; + } + + fpgaDestroyProperties(&prop); + if (uuid_compare(dcp_guid, 
afu_guid) == 0) { + return true; + } else { + return false; + } +} + +std::string CcipDevice::get_bdf() { + std::ostringstream bdf; + bdf << std::setfill('0') << std::setw(2) << unsigned(bus) << ":" << std::setfill('0') << std::setw(2) + << unsigned(device) << "." << unsigned(function); + + return bdf.str(); +} + +float CcipDevice::get_temperature() { + float temp = 0; + if (fme_sysfs_temp_initialized) { + std::ifstream sysfs_temp(fme_sysfs_temp_path, std::ifstream::in); + sysfs_temp >> temp; + sysfs_temp.close(); + } + return temp; +} + +void CcipDevice::set_kernel_interrupt(aocl_mmd_interrupt_handler_fn fn, void *user_data) { + if (kernel_interrupt_thread) { + kernel_interrupt_thread->set_kernel_interrupt(fn, user_data); + } +} + +void CcipDevice::set_status_handler(aocl_mmd_status_handler_fn fn, void *user_data) { + event_update = fn; + event_update_user_data = user_data; + dma_host_to_fpga->set_status_handler(fn, user_data); + dma_fpga_to_host->set_status_handler(fn, user_data); +} + +void CcipDevice::event_update_fn(aocl_mmd_op_t op, int status) { + event_update(mmd_handle, event_update_user_data, op, status); +} + +int CcipDevice::read_block(aocl_mmd_op_t op, int mmd_interface, void *host_addr, size_t offset, size_t size) { + fpga_result res; + + // The mmd_interface is defined as the base address of the MMIO write. Access + // to memory requires special functionality. 
Otherwise do direct MMIO read of + // base address + offset + if (mmd_interface == AOCL_MMD_MEMORY) { + res = dma_fpga_to_host->read_memory(op, static_cast<uint64_t *>(host_addr), offset, size); + } else { + res = read_mmio(host_addr, mmd_interface + offset, size); + + if (op) { + // TODO: check what status value should really be instead of just using 0 + // Also handle case when op is NULL + this->event_update_fn(op, 0); + } + } + + if (res != FPGA_OK) { + LOG_ERR("fpgaReadMMIO error: %s\n", fpgaErrStr(res)); + return -1; + } else { + return 0; + } +} + +int CcipDevice::write_block(aocl_mmd_op_t op, int mmd_interface, const void *host_addr, size_t offset, size_t size) { + fpga_result res; + + // The mmd_interface is defined as the base address of the MMIO write. Access + // to memory requires special functionality. Otherwise do direct MMIO write + if (mmd_interface == AOCL_MMD_MEMORY) { + res = dma_host_to_fpga->write_memory(op, static_cast<const uint64_t *>(host_addr), offset, size); + } else { + res = write_mmio(host_addr, mmd_interface + offset, size); + + if (op) { + // TODO: check what 'status' value should really be. Right now just + // using 0 as was done in previous CCIP MMD. 
Also handle case if op is NULL + this->event_update_fn(op, 0); + } + } + + // TODO: check what status values aocl wants and also parse the result + if (res != FPGA_OK) { + LOG_ERR("fpgaWriteMMIO error: %s\n", fpgaErrStr(res)); + return -1; + } else { + return 0; + } +} + +fpga_result CcipDevice::read_mmio(void *host_addr, size_t mmio_addr, size_t size) { + fpga_result res = FPGA_OK; + + DCP_DEBUG_MEM("read_mmio start: %p\t %lx\t %lu\n", host_addr, mmio_addr, size); + + // HACK: need extra delay for opencl sw reset + if (mmio_addr == KERNEL_SW_RESET_BASE) OPENCL_SW_RESET_DELAY(); + + uint64_t *host_addr64 = static_cast<uint64_t *>(host_addr); + while (size >= 8) { + res = fpgaReadMMIO64(afc_handle, 0, mmio_addr, host_addr64); + if (res != FPGA_OK) return res; + host_addr64 += 1; + mmio_addr += 8; + size -= 8; + } + + uint32_t *host_addr32 = reinterpret_cast<uint32_t *>(host_addr64); + while (size >= 4) { + res = fpgaReadMMIO32(afc_handle, 0, mmio_addr, host_addr32); + if (res != FPGA_OK) return res; + host_addr32 += 1; + mmio_addr += 4; + size -= 4; + } + + if (size > 0) { + uint32_t read_data; + res = fpgaReadMMIO32(afc_handle, 0, mmio_addr, &read_data); + if (res != FPGA_OK) return res; + memcpy_s_fast(host_addr32, size, &read_data, size); + } + + return res; +} + +fpga_result CcipDevice::write_mmio(const void *host_addr, size_t mmio_addr, size_t size) { + fpga_result res = FPGA_OK; + + DEBUG_PRINT("write_mmio\n"); + + // HACK: need extra delay for opencl sw reset + if (mmio_addr == KERNEL_SW_RESET_BASE) OPENCL_SW_RESET_DELAY(); + + const uint64_t *host_addr64 = static_cast<const uint64_t *>(host_addr); + while (size >= 8) { + res = fpgaWriteMMIO64(afc_handle, 0, mmio_addr, *host_addr64); + if (res != FPGA_OK) return res; + host_addr64 += 1; + mmio_addr += 8; + size -= 8; + } + + const uint32_t *host_addr32 = reinterpret_cast<const uint32_t *>(host_addr64); + while (size > 0) { + uint32_t tmp_data32 = 0; + size_t chunk_size = (size >= 4) ? 
4 : size; + memcpy_s_fast(&tmp_data32, sizeof(tmp_data32), host_addr32, chunk_size); + res = fpgaWriteMMIO32(afc_handle, 0, mmio_addr, tmp_data32); + if (res != FPGA_OK) return res; + host_addr32 += 1; + mmio_addr += chunk_size; + size -= chunk_size; + } + + return res; +} diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/ccip_mmd_device.h b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/ccip_mmd_device.h new file mode 100644 index 0000000..f8088ac --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/ccip_mmd_device.h @@ -0,0 +1,187 @@ +/* (C) 1992-2017 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. 
*/ + +#ifndef _CCIP_MMD_DEVICE_H +#define _CCIP_MMD_DEVICE_H + +#include <limits.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include <string> + +#pragma push_macro("_GNU_SOURCE") +#undef _GNU_SOURCE +#define _GNU_SOURCE +#include <sched.h> +#pragma pop_macro("_GNU_SOURCE") + +#include <opae/fpga.h> +#include <uuid/uuid.h> + +#include "aocl_mmd.h" +#include "kernel_interrupt.h" +#include "mmd_dma.h" + +// Tune delay for simulation or HW. Eventually delay +// should be removed for HW, may still be needed for ASE simulation +#ifdef SIM +#define DELAY_MULTIPLIER 100 +#else +#define DELAY_MULTIPLIER 1 +#endif + +// Most AOCL_MMD_CALL functions return negative number in case of error, +// CCIP_MMD_AOCL_ERR is used to indicate an error from the MMD that is being +// returned to the runtime. Simply set to -2 for now since neither interface +// defines a meaning to return codes for errors. +#define CCIP_MMD_AOCL_ERR -1 + +// NOTE: some of the code relies on invalid handle returning -1 +// future TODO eliminate dependency on specific error values +#define CCIP_MMD_INVALID_PARAM -1 + +// Our diagnostic script relies on handle values < -1 to determine when +// a valid device is present but a functioning BSP is not loaded. +#define CCIP_MMD_BSP_NOT_LOADED -2 +#define CCIP_MMD_BSP_INIT_FAILED -3 + +// Delay settings +// TODO: Figure out why these delays are needed and +// have requirement removed (at least for HW) +#define MMIO_DELAY() +#define YIELD_DELAY() usleep(1 * DELAY_MULTIPLIER) +#define OPENCL_SW_RESET_DELAY() usleep(5000 * DELAY_MULTIPLIER) +#define AFU_RESET_DELAY() usleep(20000 * DELAY_MULTIPLIER) + +#define KERNEL_SW_RESET_BASE (AOCL_MMD_KERNEL + 0x30) + +#define DCP_OPENCL_BSP_AFU_ID "63B3779B-8BDD-4F03-9CEB-0301181D6AEF" + +#define BSP_NAME "pac_" + +// LOG ERRORS +#define CCIP_MMD_ERR_LOGGING 1 +#ifdef CCIP_MMD_ERR_LOGGING +#define LOG_ERR(...) 
fprintf(stderr, __VA_ARGS__) +#else +#define LOG_ERR(...) +#endif + +// debugging +#ifdef DEBUG +#define DEBUG_PRINT(...) fprintf(stderr, __VA_ARGS__) +#else +#define DEBUG_PRINT(...) +#endif + +#ifdef DEBUG_MEM +#define DCP_DEBUG_MEM(...) fprintf(stderr, __VA_ARGS__) +#else +#define DCP_DEBUG_MEM(...) +#endif + +enum { +#ifndef DLA_MMD // IRQ offsets no longer exist in DLA hardware (removed from board.qsys) + AOCL_IRQ_POLLING_BASE = 0x0100, // CSR to polling interrupt status + AOCL_IRQ_MASKING_BASE = 0x0108, // CSR to set/unset interrupt mask + AOCL_MMD_KERNEL = 0x4000, /* Control interface into kernel interface */ +#else + AOCL_MMD_KERNEL = 0, // CoreDLA completely removes the Opencl kernel interface, repurposed for CSRs +#endif + AOCL_MMD_MEMORY = 0x100000 /* Data interface to device memory */ +}; + +enum AfuStatu { CCIP_MMD_INVALID_ID = 0, CCIP_MMD_BSP, CCIP_MMD_AFU }; + +class CcipDevice final { + public: + CcipDevice(uint64_t); + CcipDevice(const CcipDevice &) = delete; + CcipDevice &operator=(const CcipDevice &) = delete; + ~CcipDevice(); + + static std::string get_board_name(std::string prefix, uint64_t obj_id); + static bool parse_board_name(const char *board_name, uint64_t &obj_id); + + int get_mmd_handle() { return mmd_handle; } + uint64_t get_fpga_obj_id() { return fpga_obj_id; } + std::string get_dev_name() { return mmd_dev_name; } + std::string get_bdf(); + float get_temperature(); + bool initialize_bsp(); + void set_kernel_interrupt(aocl_mmd_interrupt_handler_fn fn, void *user_data); + void set_status_handler(aocl_mmd_status_handler_fn fn, void *user_data); + int yield(); + void event_update_fn(aocl_mmd_op_t op, int status); + bool bsp_loaded(); + + int read_block(aocl_mmd_op_t op, int mmd_interface, void *host_addr, size_t dev_addr, size_t size); + + int write_block(aocl_mmd_op_t op, int mmd_interface, const void *host_addr, size_t dev_addr, size_t size); + + private: + static int next_mmd_handle; + + int mmd_handle; + uint64_t fpga_obj_id; + 
std::string mmd_dev_name; + intel_opae_mmd::KernelInterrupt *kernel_interrupt_thread; + aocl_mmd_status_handler_fn event_update; + void *event_update_user_data; + + // HACK: use the sysfs path to read temperature value and NUMA node + // this should be replaced with OPAE call once that is + // available + std::string fme_sysfs_temp_path; + std::string fpga_numa_node; + bool enable_set_numa; + bool fme_sysfs_temp_initialized; + void initialize_fme_sysfs(); + + void initialize_local_cpus_sysfs(); + + bool find_dma_dfh_offsets(); + + uint8_t bus; + uint8_t device; + uint8_t function; + + bool afu_initialized; + bool bsp_initialized; + bool mmio_is_mapped; + + fpga_handle afc_handle; + fpga_properties filter; + fpga_token afc_token; + uint64_t dma_ch0_dfh_offset; + uint64_t dma_ch1_dfh_offset; + uint64_t dma_ase_dfh_offset; + intel_opae_mmd::mmd_dma *dma_host_to_fpga; + intel_opae_mmd::mmd_dma *dma_fpga_to_host; + + char *mmd_copy_buffer; + + // Helper functions + fpga_result read_mmio(void *host_addr, size_t dev_addr, size_t size); + fpga_result write_mmio(const void *host_addr, size_t dev_addr, size_t size); +}; + +#endif // _CCIP_MMD_DEVICE_H diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/dma_work_thread.cpp b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/dma_work_thread.cpp new file mode 100644 index 0000000..30113eb --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/dma_work_thread.cpp @@ -0,0 +1,151 @@ +/* (C) 1992-2017 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. 
See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. */ + +#include "dma_work_thread.h" +#include <assert.h> +#include <poll.h> +#include <stdlib.h> +#include <string.h> +#include <cstdint> +#include <iostream> +#include <thread> +#include "ccip_mmd_device.h" +#include "eventfd_wrapper.h" +#include "mmd_dma.h" + +using namespace intel_opae_mmd; + +dma_work_thread::dma_work_thread(mmd_dma &mmd_dma_arg) + : m_initialized(false), + m_thread_wake_event(NULL), + m_thread(NULL), + m_work_queue_mutex(), + m_work_queue(), + m_mmd_dma(mmd_dma_arg) { + m_thread_wake_event = new eventfd_wrapper(); + if (!m_thread_wake_event->initialized()) return; + + m_thread = new std::thread(work_thread, std::ref(*this)); + + m_initialized = true; +} + +dma_work_thread::~dma_work_thread() { + // kill the thread + if (m_thread) { + // send message to thread to end it + m_thread_wake_event->notify(UINT64_MAX - 1); + + // join with thread until it ends + m_thread->join(); + + delete m_thread; + m_thread = NULL; + } + + if (m_thread_wake_event) { + delete m_thread_wake_event; + m_thread_wake_event = NULL; + } + + m_initialized = false; +} + 
+// Worker-thread entry point. Blocks on the eventfd until enqueue_dma()
+// signals new work (or the destructor signals shutdown with UINT64_MAX - 1),
+// then drains the requested number of items from the shared queue and runs
+// each DMA transfer outside the queue lock.
+void dma_work_thread::work_thread(dma_work_thread &obj) {
+  int res;
+
+  // get eventfd handle
+  int thread_signal_fd = obj.m_thread_wake_event->get_fd();
+
+  struct pollfd pollfd_setup;
+  while (1) {
+    pollfd_setup.fd = thread_signal_fd;
+    pollfd_setup.events = POLLIN;
+    pollfd_setup.revents = 0;
+    res = poll(&pollfd_setup, 1, -1);
+    if (res < 0) {
+      fprintf(stderr, "Poll error errno = %s\n", strerror(errno));
+    } else if (res > 0 && pollfd_setup.revents == POLLIN) {
+      uint64_t count_work_items = 0;
+      ssize_t bytes_read = read(thread_signal_fd, &count_work_items, sizeof(count_work_items));
+      if (bytes_read > 0) {
+        // Fix: the original referenced an undeclared variable 'count' here,
+        // which fails to compile whenever DEBUG is defined.
+        DEBUG_PRINT("Poll success. Return=%d count=%lu\n", res, count_work_items);
+      } else {
+        // TODO: the MMD should not exit. But I have a different branch
+        // I'm working on that will change synchronization to use
+        // condition variable instead of eventfd in synchronization
+        // within the same process. Will remove this exit() call at
+        // when PR for that change is submitted.
+        fprintf(stderr, "Error: poll failed: %s\n", bytes_read < 0 ? strerror(errno) : "zero bytes read");
+        exit(-1);
+      }
+
+      // Ensure count is in proper range
+      const unsigned long MAX_WORK_ITEMS = 1000000000;
+      if (count_work_items > MAX_WORK_ITEMS && count_work_items != (UINT64_MAX - 1)) {
+        fprintf(stderr, "Error: poll value is out of range");
+        exit(-1);
+      }
+
+      obj.m_work_queue_mutex.lock();
+      if (obj.m_work_queue.empty() && count_work_items == UINT64_MAX - 1) {
+        // The maximum value of count is set when there is no work left
+        // The work queue must also be empty
+        // This thread can break out of the loop
+        obj.m_work_queue_mutex.unlock();
+        break;
+      }
+
+      std::queue<dma_work_item> items;
+      for (uint64_t i = 0; i < count_work_items; i++) {
+        // Check if there are enough jobs in the work queue as requested (count)
+        if (obj.m_work_queue.empty()) {
+          fprintf(stderr, "Poll error. Not enough tasks in queue.");
+          exit(-1);
+        }
+        dma_work_item item = obj.m_work_queue.front();
+        items.push(item);
+        obj.m_work_queue.pop();
+      }
+      obj.m_work_queue_mutex.unlock();
+
+      // Perform the transfers after dropping the queue lock so enqueue_dma()
+      // is never blocked behind an in-flight DMA.
+      while (!items.empty()) {
+        dma_work_item item = items.front();
+        obj.do_dma(item);
+        items.pop();
+      }
+    }
+  }
+}
+
+// Queue an asynchronous request (non-NULL op) for the worker thread and wake
+// it, or run a blocking request (NULL op) inline on the caller's thread.
+// Returns 0 for queued work, otherwise the do_dma() result.
+int dma_work_thread::enqueue_dma(dma_work_item &item) {
+  if (item.op) {
+    m_work_queue_mutex.lock();
+    m_work_queue.push(item);
+    m_work_queue_mutex.unlock();
+    // send message to thread to wake it
+    // setting count to 1 as only 1 job is pushed to the work queue
+    m_thread_wake_event->notify(1);
+    return 0;
+  } else {
+    // if op is not specified, it is a blocking operation and we don't use
+    // the thread
+    return do_dma(item);
+  }
+}
+
+// Forward a single work item to the owning mmd_dma object.
+int dma_work_thread::do_dma(dma_work_item &item) { return m_mmd_dma.do_dma(item); }
diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/dma_work_thread.h b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/dma_work_thread.h
new file mode 100644
index 0000000..0afb036
--- /dev/null
+++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/dma_work_thread.h
@@ -0,0 +1,73 @@
+/* (C) 1992-2017 Intel Corporation. */
+/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */
+/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */
+/* and/or other countries. Other marks and brands may be claimed as the property */
+/* of others.
See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. */ + +#ifndef _DMA_WORK_THREAD_H +#define _DMA_WORK_THREAD_H + +#include <opae/fpga.h> + +#include <mutex> +#include <queue> +#include <thread> + +#include "aocl_mmd.h" + +namespace intel_opae_mmd { + +// forward class definitions +class eventfd_wrapper; +class mmd_dma; + +class dma_work_item { + public: + aocl_mmd_op_t op; + uint64_t *rd_host_addr; + const uint64_t *wr_host_addr; + size_t dev_addr; + size_t size; +}; + +class dma_work_thread final { + public: + dma_work_thread(mmd_dma &mmd_dma_arg); + ~dma_work_thread(); + + bool initialized() { return m_initialized; } + + int enqueue_dma(dma_work_item &item); + int do_dma(dma_work_item &item); + + private: + static void work_thread(dma_work_thread &obj); + + bool m_initialized; + + eventfd_wrapper *m_thread_wake_event; + std::thread *m_thread; + std::mutex m_work_queue_mutex; + std::queue<dma_work_item> m_work_queue; + + mmd_dma &m_mmd_dma; + + // not used and not implemented + dma_work_thread(dma_work_thread &other); + dma_work_thread &operator=(const dma_work_thread &other); +}; // class dma_work_thread + 
+}; // namespace intel_opae_mmd + +#endif // _DMA_WORK_THREAD_H diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/eventfd_wrapper.h b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/eventfd_wrapper.h new file mode 100644 index 0000000..2de3f74 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/eventfd_wrapper.h @@ -0,0 +1,74 @@ +/* (C) 1992-2017 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. 
*/ + +#ifndef _EVENTFD_WRAPPER_H +#define _EVENTFD_WRAPPER_H + +#include <sys/eventfd.h> +#include <unistd.h> + +namespace intel_opae_mmd { + +// simple wrapper class for managing eventfd objects +class eventfd_wrapper final { + public: + eventfd_wrapper() { + m_initialized = false; + // Note: EFD_SEMAPHORE and EFD_NONBLOCK are not set + // The implementation of functions using eventfd assumes that + m_fd = eventfd(0, 0); + if (m_fd < 0) { + fprintf(stderr, "eventfd : %s", strerror(errno)); + return; + } + + m_initialized = true; + } + + ~eventfd_wrapper() { + if (m_initialized) { + if (close(m_fd) < 0) { + fprintf(stderr, "eventfd : %s", strerror(errno)); + } + } + } + + bool notify(uint64_t count) { + ssize_t res = write(m_fd, &count, sizeof(count)); + if (res < 0) { + fprintf(stderr, "eventfd : %s", strerror(errno)); + return false; + } + return true; + } + + int get_fd() { return m_fd; } + bool initialized() { return m_initialized; } + + private: + // not used and not implemented + eventfd_wrapper(eventfd_wrapper& other); + eventfd_wrapper& operator=(const eventfd_wrapper& other); + + // member varaibles + int m_fd; + int m_initialized; +}; // class eventfd_wrapper + +}; // namespace intel_opae_mmd + +#endif // _EVENTFD_WRAPPER_H diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/fpga_dma.c b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/fpga_dma.c new file mode 100644 index 0000000..6c8df30 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/fpga_dma.c @@ -0,0 +1,1313 @@ +// Copyright 2018-2020 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). 
Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. + +// This is derived from OPAE + OpenCL PAC BSP + +/** + * \fpga_dma.c + * \brief FPGA DMA User-mode driver + */ + +#include "fpga_dma.h" +#include <assert.h> +#include <errno.h> +#include <opae/fpga.h> +#include <poll.h> +#include <safe_string/safe_string.h> +#include <stdbool.h> +#include <stddef.h> +#include <stdlib.h> +#include <string.h> +#include <sys/mman.h> +#include <unistd.h> +#include "fpga_dma_internal.h" +#include "memcpy_s_fast.h" + +#ifdef SIM +#define USE_ASE +#else +// TODO: Need this until we can adequately sync MMIO R/W with pointer accesses. +// Causes module to use fpgaMMIORead32() instead of foo = *ptr; +#define USE_ASE +#endif + +#ifdef FPGA_DMA_DEBUG +static int err_cnt = 0; +#endif + +#ifdef CHECK_DELAYS +double poll_wait_count = 0; +double buf_full_count = 0; +#endif + +/* + * macro for checking return codes + */ +#define ON_ERR_GOTO(res, label, desc) \ + do { \ + if ((res) != FPGA_OK) { \ + error_print("Error %s: %s\n", (desc), fpgaErrStr(res)); \ + goto label; \ + } \ + } while (0) + +#define ON_ERR_RETURN(res, desc) \ + do { \ + if ((res) != FPGA_OK) { \ + error_print("Error %s: %s\n", (desc), fpgaErrStr(res)); \ + return (res); \ + } \ + } while (0) + +// Internal Functions + +/** + * MMIOWrite64Blk + * + * @brief Writes a block of 64-bit values to FPGA MMIO space + * @param[in] dma Handle to the FPGA DMA object + * @param[in] device FPGA address + * @param[in] host Host buffer address + * @param[in] count Size in bytes + * @return fpga_result FPGA_OK on success, return code otherwise + * + */ +static fpga_result MMIOWrite64Blk(fpga_dma_handle dma_h, uint64_t 
device, uint64_t host, uint64_t bytes) { + assert(IS_ALIGNED_QWORD(device)); + assert(IS_ALIGNED_QWORD(bytes)); + + uint64_t *haddr = (uint64_t *)host; + uint64_t i; + fpga_result res = FPGA_OK; + +#ifndef USE_ASE + volatile uint64_t *dev_addr = HOST_MMIO_64_ADDR(dma_h, device); +#endif + + debug_print("copying %lld bytes from 0x%p to 0x%p\n", (long long int)bytes, haddr, (void *)device); + for (i = 0; i < bytes / sizeof(uint64_t); i++) { +#ifdef USE_ASE + res = fpgaWriteMMIO64(dma_h->fpga_h, dma_h->mmio_num, device, *haddr); + ON_ERR_RETURN(res, "fpgaWriteMMIO64"); + haddr++; + device += sizeof(uint64_t); +#else + *dev_addr++ = *haddr++; +#endif + } + return res; +} + +/** + * MMIOWrite32Blk + * + * @brief Writes a block of 32-bit values to FPGA MMIO space + * @param[in] dma Handle to the FPGA DMA object + * @param[in] device FPGA address + * @param[in] host Host buffer address + * @param[in] count Size in bytes + * @return fpga_result FPGA_OK on success, return code otherwise + * + */ +static fpga_result MMIOWrite32Blk(fpga_dma_handle dma_h, uint64_t device, uint64_t host, uint64_t bytes) { + assert(IS_ALIGNED_DWORD(device)); + assert(IS_ALIGNED_DWORD(bytes)); + + uint32_t *haddr = (uint32_t *)host; + uint64_t i; + fpga_result res = FPGA_OK; + +#ifndef USE_ASE + volatile uint32_t *dev_addr = HOST_MMIO_32_ADDR(dma_h, device); +#endif + + debug_print("copying %lld bytes from 0x%p to 0x%p\n", (long long int)bytes, haddr, (void *)device); + for (i = 0; i < bytes / sizeof(uint32_t); i++) { +#ifdef USE_ASE + res = fpgaWriteMMIO32(dma_h->fpga_h, dma_h->mmio_num, device, *haddr); + ON_ERR_RETURN(res, "fpgaWriteMMIO32"); + haddr++; + device += sizeof(uint32_t); +#else + *dev_addr++ = *haddr++; +#endif + } + return res; +} + +/** + * MMIORead64Blk + * + * @brief Reads a block of 64-bit values from FPGA MMIO space + * @param[in] dma Handle to the FPGA DMA object + * @param[in] device FPGA address + * @param[in] host Host buffer address + * @param[in] count Size in bytes + * 
@return fpga_result FPGA_OK on success, return code otherwise + * + */ +static fpga_result MMIORead64Blk(fpga_dma_handle dma_h, uint64_t device, uint64_t host, uint64_t bytes) { + assert(IS_ALIGNED_QWORD(device)); + assert(IS_ALIGNED_QWORD(bytes)); + + uint64_t *haddr = (uint64_t *)host; + uint64_t i; + fpga_result res = FPGA_OK; + +#ifndef USE_ASE + volatile uint64_t *dev_addr = HOST_MMIO_64_ADDR(dma_h, device); +#endif + + debug_print("copying %lld bytes from 0x%p to 0x%p\n", (long long int)bytes, (void *)device, haddr); + for (i = 0; i < bytes / sizeof(uint64_t); i++) { +#ifdef USE_ASE + res = fpgaReadMMIO64(dma_h->fpga_h, dma_h->mmio_num, device, haddr); + ON_ERR_RETURN(res, "fpgaReadMMIO64"); + haddr++; + device += sizeof(uint64_t); +#else + *haddr++ = *dev_addr++; +#endif + } + return res; +} + +/** + * MMIORead32Blk + * + * @brief Reads a block of 32-bit values from FPGA MMIO space + * @param[in] dma Handle to the FPGA DMA object + * @param[in] device FPGA address + * @param[in] host Host buffer address + * @param[in] count Size in bytes + * @return fpga_result FPGA_OK on success, return code otherwise + * + */ +static fpga_result MMIORead32Blk(fpga_dma_handle dma_h, uint64_t device, uint64_t host, uint64_t bytes) { + assert(IS_ALIGNED_DWORD(device)); + assert(IS_ALIGNED_DWORD(bytes)); + + uint32_t *haddr = (uint32_t *)host; + uint64_t i; + fpga_result res = FPGA_OK; + +#ifndef USE_ASE + volatile uint32_t *dev_addr = HOST_MMIO_32_ADDR(dma_h, device); +#endif + + debug_print("copying %lld bytes from 0x%p to 0x%p\n", (long long int)bytes, (void *)device, haddr); + for (i = 0; i < bytes / sizeof(uint32_t); i++) { +#ifdef USE_ASE + res = fpgaReadMMIO32(dma_h->fpga_h, dma_h->mmio_num, device, haddr); + ON_ERR_RETURN(res, "fpgaReadMMIO32"); + haddr++; + device += sizeof(uint32_t); +#else + *haddr++ = *dev_addr++; +#endif + } + return res; +} + +// Feature type is BBB +static inline bool fpga_dma_feature_is_bbb(uint64_t dfh) { + // BBB is type 2 + return ((dfh >> 
AFU_DFH_TYPE_OFFSET) & 0xf) == FPGA_DMA_BBB; +} + +/** + * _switch_to_ase_page + * + * @brief Updates the current page of ASE to the address given + * @param[in] dma_h Handle to the FPGA DMA object + * @param[in] addr Address to which the ASE page should be switched + * @return Nothing. Side-effect is to update the current page in the DMA handle. + * + */ +static inline void _switch_to_ase_page(fpga_dma_handle dma_h, uint64_t addr) { + uint64_t requested_page = addr & ~DMA_ADDR_SPAN_EXT_WINDOW_MASK; + + if (requested_page != dma_h->cur_ase_page) { + MMIOWrite64Blk(dma_h, ASE_CNTL_BASE(dma_h), (uint64_t)&requested_page, sizeof(requested_page)); + dma_h->cur_ase_page = requested_page; + } +} + +/** + * _send_descriptor + * + * @brief Queues a DMA descriptor to the FPGA + * @param[in] dma_h Handle to the FPGA DMA object + * @param[in] desc Pointer to a descriptor structure to send + * @return fpga_result FPGA_OK on success, return code otherwise + * + */ +static fpga_result _send_descriptor(fpga_dma_handle dma_h, msgdma_ext_desc_t *desc) { + fpga_result res = FPGA_OK; + msgdma_status_t status = {0}; + + debug_print("desc.rd_address = %x\n", desc->rd_address); + debug_print("desc.wr_address = %x\n", desc->wr_address); + debug_print("desc.len = %x\n", desc->len); + debug_print("desc.wr_burst_count = %x\n", desc->wr_burst_count); + debug_print("desc.rd_burst_count = %x\n", desc->rd_burst_count); + debug_print("desc.wr_stride %x\n", desc->wr_stride); + debug_print("desc.rd_stride %x\n", desc->rd_stride); + debug_print("desc.rd_address_ext %x\n", desc->rd_address_ext); + debug_print("desc.wr_address_ext %x\n", desc->wr_address_ext); + + debug_print("SGDMA_CSR_BASE = %lx SGDMA_DESC_BASE=%lx\n", dma_h->dma_csr_base, dma_h->dma_desc_base); + +#ifdef CHECK_DELAYS + bool first = true; +#endif + do { + res = MMIORead32Blk(dma_h, CSR_STATUS(dma_h), (uint64_t)&status.reg, sizeof(status.reg)); + ON_ERR_GOTO(res, out, "MMIORead32Blk"); +#ifdef CHECK_DELAYS + if (first && 
status.st.desc_buf_full) { + buf_full_count++; + first = false; + } +#endif + } while (status.st.desc_buf_full); + + res = MMIOWrite64Blk(dma_h, dma_h->dma_desc_base, (uint64_t)desc, sizeof(*desc)); + ON_ERR_GOTO(res, out, "MMIOWrite64Blk"); + +out: + return res; +} + +/** + * _do_dma + * + * @brief Performs a DMA transaction with the FPGA + * @param[in] dma_h Handle to the FPGA DMA object + * @param[in] dst Pointer to a host or FPGA buffer to send or retrieve + * @param[in] src Pointer to a host or FPGA buffer to send or retrieve + * @param[in] count Number of bytes + * @param[in] is_last_desc True if this is the last buffer of a batch + * @param[in] type Direction of transfer + * @param[in] intr_en True means to ask for an interrupt from the FPGA + * @return fpga_result FPGA_OK on success, return code otherwise + * + */ +static fpga_result _do_dma(fpga_dma_handle dma_h, + uint64_t dst, + uint64_t src, + int count, + int is_last_desc, + fpga_dma_transfer_t type, + bool intr_en) { + msgdma_ext_desc_t desc = {0}; + fpga_result res = FPGA_OK; + int alignment_offset = 0; + int segment_size = 0; + + // src, dst and count must be 64-byte aligned + if (dst % FPGA_DMA_ALIGN_BYTES != 0 || src % FPGA_DMA_ALIGN_BYTES != 0 || count % FPGA_DMA_ALIGN_BYTES != 0) { + return FPGA_INVALID_PARAM; + } + // these fields are fixed for all DMA transfers + desc.seq_num = 0; + desc.wr_stride = 1; + desc.rd_stride = 1; + + desc.control.go = 1; + if (intr_en) + desc.control.transfer_irq_en = 1; + else + desc.control.transfer_irq_en = 0; + + // Enable "earlyreaddone" in the control field of the descriptor except the last. + // Setting early done causes the read logic to move to the next descriptor + // before the previous descriptor completes. + // This elminates a few hundred clock cycles of waiting between transfers. 
+ if (!is_last_desc) + desc.control.early_done_en = 1; + else + desc.control.early_done_en = 0; + + if (type == FPGA_TO_FPGA_MM) { + desc.rd_address = src & FPGA_DMA_MASK_32_BIT; + desc.wr_address = dst & FPGA_DMA_MASK_32_BIT; + desc.len = count; + desc.wr_burst_count = 4; + desc.rd_burst_count = 4; + desc.rd_address_ext = (src >> 32) & FPGA_DMA_MASK_32_BIT; + desc.wr_address_ext = (dst >> 32) & FPGA_DMA_MASK_32_BIT; + + res = _send_descriptor(dma_h, &desc); + ON_ERR_GOTO(res, out, "_send_descriptor"); + } + // either FPGA to Host or Host to FPGA transfer so we need to make sure the DMA transaction is aligned to the burst + // size (CCIP restriction) + else { + // need to determine if the CCIP (host) address is aligned to 4CL (256B). When 0 the CCIP address is aligned. + alignment_offset = + (type == HOST_TO_FPGA_MM) ? (src % (4 * FPGA_DMA_ALIGN_BYTES)) : (dst % (4 * FPGA_DMA_ALIGN_BYTES)); + + // not aligned to 4CL so performing a short transfer to get aligned + if (alignment_offset != 0) { + desc.rd_address = src & FPGA_DMA_MASK_32_BIT; + desc.wr_address = dst & FPGA_DMA_MASK_32_BIT; + desc.wr_burst_count = 1; + desc.rd_burst_count = 1; + desc.rd_address_ext = (src >> 32) & FPGA_DMA_MASK_32_BIT; + desc.wr_address_ext = (dst >> 32) & FPGA_DMA_MASK_32_BIT; + + // count isn't large enough to hit next 4CL boundary + if (((4 * FPGA_DMA_ALIGN_BYTES) - alignment_offset) >= count) { + segment_size = count; + count = 0; // only had to transfer count amount of data to reach the end of the provided buffer + } else { + segment_size = (4 * FPGA_DMA_ALIGN_BYTES) - alignment_offset; + src += segment_size; + dst += segment_size; + count -= segment_size; // subtract the segment size from count since the transfer below will bring us into 4CL + // alignment + desc.control.transfer_irq_en = 0; + } + + // will post short transfer to align to a 4CL (256 byte) boundary + desc.len = segment_size; + + res = _send_descriptor(dma_h, &desc); + ON_ERR_GOTO(res, out, "_send_descriptor"); + } 
+ // at this point we are 4CL (256 byte) aligned + // if there is at least 4CL (256 bytes) of data to transfer, post bursts of 4 + if (count >= (4 * FPGA_DMA_ALIGN_BYTES)) { + desc.rd_address = src & FPGA_DMA_MASK_32_BIT; + desc.wr_address = dst & FPGA_DMA_MASK_32_BIT; + desc.wr_burst_count = 4; + desc.rd_burst_count = 4; + desc.rd_address_ext = (src >> 32) & FPGA_DMA_MASK_32_BIT; + desc.wr_address_ext = (dst >> 32) & FPGA_DMA_MASK_32_BIT; + + // buffer ends on 4CL boundary + if ((count % (4 * FPGA_DMA_ALIGN_BYTES)) == 0) { + segment_size = count; + count = 0; // transfer below will move the remainder of the buffer + } + // buffers do not end on 4CL boundary so transfer only up to the last 4CL boundary leaving a segment at the end to + // finish later + else { + segment_size = count - (count % (4 * FPGA_DMA_ALIGN_BYTES)); // round count down to the nearest multiple of 4CL + src += segment_size; + dst += segment_size; + count -= segment_size; + desc.control.transfer_irq_en = 0; + } + + desc.len = segment_size; + + res = _send_descriptor(dma_h, &desc); + ON_ERR_GOTO(res, out, "_send_descriptor"); + } + // at this point we have posted all the bursts of length 4 we can but there might be 64, 128, or 192 bytes of data + // to transfer still if buffer did not end on 4CL (256 byte) boundary post short transfer to handle the remainder + if (count > 0) { + desc.rd_address = src & FPGA_DMA_MASK_32_BIT; + desc.wr_address = dst & FPGA_DMA_MASK_32_BIT; + desc.len = count; + desc.wr_burst_count = 1; + desc.rd_burst_count = 1; + desc.rd_address_ext = (src >> 32) & FPGA_DMA_MASK_32_BIT; + desc.wr_address_ext = (dst >> 32) & FPGA_DMA_MASK_32_BIT; + if (intr_en) desc.control.transfer_irq_en = 1; + // will post short transfer to move the remainder of the buffer + res = _send_descriptor(dma_h, &desc); + ON_ERR_GOTO(res, out, "_send_descriptor"); + } + + } // end of FPGA --> Host or Host --> FPGA transfer + +out: + return res; +} + +fpga_result fpgaDmaChannelOpen(fpga_handle fpga, 
uint64_t dfh_offset, int interrupt_num, fpga_dma_handle *dma_p) { + fpga_result res = FPGA_OK; + fpga_dma_handle dma_h = NULL; + int i = 0; + if (!fpga) { + return FPGA_INVALID_PARAM; + } + if (!dma_p) { + return FPGA_INVALID_PARAM; + } + // init the dma handle + dma_h = (fpga_dma_handle)malloc(sizeof(struct _dma_handle_t)); + if (!dma_h) { + return FPGA_NO_MEMORY; + } + dma_h->fpga_h = fpga; + for (i = 0; i < FPGA_DMA_MAX_BUF; i++) dma_h->dma_buf_ptr[i] = NULL; + dma_h->mmio_num = 0; + dma_h->cur_ase_page = 0xffffffffffffffffUll; + + // Discover DMA BBB by traversing the device feature list + bool dma_found = false; + +#ifndef USE_ASE + res = fpgaMapMMIO(dma_h->fpga_h, 0, (uint64_t **)&dma_h->mmio_va); + ON_ERR_GOTO(res, out, "fpgaMapMMIO"); +#endif + + dfh_feature_t dfh = {0}; + res = MMIORead64Blk(dma_h, dfh_offset, (uint64_t)&dfh, sizeof(dfh)); + ON_ERR_GOTO(res, out, "MMIORead64Blk"); + + if (fpga_dma_feature_is_bbb(dfh.dfh) && (dfh.feature_uuid_lo == FPGA_DMA_UUID_L) && + (dfh.feature_uuid_hi == FPGA_DMA_UUID_H)) { + dma_h->dma_base = dfh_offset; + dma_h->dma_csr_base = dma_h->dma_base + FPGA_DMA_CSR; + dma_h->dma_desc_base = dma_h->dma_base + FPGA_DMA_DESC; + dma_h->dma_ase_cntl_base = dma_h->dma_base + FPGA_DMA_ADDR_SPAN_EXT_CNTL; + dma_h->dma_ase_data_base = dma_h->dma_base + FPGA_DMA_ADDR_SPAN_EXT_DATA; + dma_found = true; + *dma_p = dma_h; + res = FPGA_OK; + } else { + *dma_p = NULL; + res = FPGA_NOT_FOUND; + goto out; + } + + // Buffer size must be page aligned for prepareBuffer + for (i = 0; i < FPGA_DMA_MAX_BUF; i++) { + res = fpgaPrepareBuffer( + dma_h->fpga_h, FPGA_DMA_BUF_SIZE, (void **)&(dma_h->dma_buf_ptr[i]), &dma_h->dma_buf_wsid[i], 0); + ON_ERR_GOTO(res, out, "fpgaPrepareBuffer"); + + // Make sure it's actually allocated + dma_h->dma_buf_ptr[i][0] = 0xff; + madvise((void *)dma_h->dma_buf_ptr[i], FPGA_DMA_BUF_SIZE, MADV_SEQUENTIAL); + + res = fpgaGetIOAddress(dma_h->fpga_h, dma_h->dma_buf_wsid[i], &dma_h->dma_buf_iova[i]); + ON_ERR_GOTO(res, 
rel_buf, "fpgaGetIOAddress"); + } + + // Allocate magic number buffer + res = fpgaPrepareBuffer(dma_h->fpga_h, FPGA_DMA_ALIGN_BYTES, (void **)&(dma_h->magic_buf), &dma_h->magic_wsid, 0); + ON_ERR_GOTO(res, out, "fpgaPrepareBuffer"); + + dma_h->magic_buf[0] = 0xff; + + res = fpgaGetIOAddress(dma_h->fpga_h, dma_h->magic_wsid, &dma_h->magic_iova); + ON_ERR_GOTO(res, rel_buf, "fpgaGetIOAddress"); + memset((void *)dma_h->magic_buf, 0, FPGA_DMA_ALIGN_BYTES); + + // turn on global interrupts + msgdma_ctrl_t ctrl = {0}; + ctrl.ct.global_intr_en_mask = 1; + res = MMIOWrite32Blk(dma_h, CSR_CONTROL(dma_h), (uint64_t)&ctrl.reg, sizeof(ctrl.reg)); + ON_ERR_GOTO(res, rel_buf, "MMIOWrite32Blk"); + + // register interrupt event handle + res = fpgaCreateEventHandle(&dma_h->eh); + ON_ERR_GOTO(res, rel_buf, "fpgaCreateEventHandle"); + + res = fpgaRegisterEvent(dma_h->fpga_h, FPGA_EVENT_INTERRUPT, dma_h->eh, interrupt_num /*vector id */); + ON_ERR_GOTO(res, destroy_eh, "fpgaRegisterEvent"); + + return FPGA_OK; + +destroy_eh: + res = fpgaDestroyEventHandle(&dma_h->eh); + ON_ERR_GOTO(res, rel_buf, "fpgaDestroyEventHandle"); + +rel_buf: + for (i = 0; i < FPGA_DMA_MAX_BUF; i++) { + res = fpgaReleaseBuffer(dma_h->fpga_h, dma_h->dma_buf_wsid[i]); + ON_ERR_GOTO(res, out, "fpgaReleaseBuffer"); + } +out: + if (!dma_found) { + free(dma_h); + } + return res; +} + +/** + * _read_memory_mmio_unaligned + * + * @brief Performs a unaligned read(address not 4/8/64 byte aligned) from FPGA address(device address). + * @param[in] dma Handle to the FPGA DMA object + * @param[in] dev_addr FPGA address + * @param[in] host_addr Host buffer address + * @param[in] count Size in bytes, always less than 8bytes. 
+ * @return fpga_result FPGA_OK on success, return code otherwise + * + */ +static fpga_result _read_memory_mmio_unaligned(fpga_dma_handle dma_h, + uint64_t dev_addr, + uint64_t host_addr, + uint64_t count) { + fpga_result res = FPGA_OK; + + assert(count < QWORD_BYTES); + + if (0 == count) return res; + + uint64_t shift = dev_addr % QWORD_BYTES; + debug_print("shift = %08lx , count = %08lx \n", shift, count); + + _switch_to_ase_page(dma_h, dev_addr); + uint64_t dev_aligned_addr = (dev_addr - shift) & DMA_ADDR_SPAN_EXT_WINDOW_MASK; + + // read data from device memory + uint64_t read_tmp = 0; + res = MMIORead64Blk(dma_h, ASE_DATA_BASE(dma_h) + dev_aligned_addr, (uint64_t)&read_tmp, sizeof(read_tmp)); + if (res != FPGA_OK) return res; + + // overlay our data + memcpy_s_fast((void *)host_addr, count, ((char *)(&read_tmp)) + shift, count); + + return res; +} + +/** + * _write_memory_mmio_unaligned + * + * @brief Performs an unaligned write(address not 4/8/64 byte aligned) to FPGA address(device address). + * @param[in] dma Handle to the FPGA DMA object + * @param[in] dev_addr FPGA address + * @param[in] host_addr Host buffer address + * @param[in] count Size in bytes, always less than 8bytes. 
+ * @return fpga_result FPGA_OK on success, return code otherwise + * + */ +static fpga_result _write_memory_mmio_unaligned(fpga_dma_handle dma_h, + uint64_t dev_addr, + uint64_t host_addr, + uint64_t count) { + fpga_result res = FPGA_OK; + + assert(count < QWORD_BYTES); + + if (0 == count) return res; + + uint64_t shift = dev_addr % QWORD_BYTES; + debug_print("shift = %08lx , count = %08lx \n", shift, count); + + _switch_to_ase_page(dma_h, dev_addr); + uint64_t dev_aligned_addr = (dev_addr - (dev_addr % QWORD_BYTES)) & DMA_ADDR_SPAN_EXT_WINDOW_MASK; + + // read data from device memory + uint64_t read_tmp = 0; + res = MMIORead64Blk(dma_h, ASE_DATA_BASE(dma_h) + dev_aligned_addr, (uint64_t)&read_tmp, sizeof(read_tmp)); + if (res != FPGA_OK) return res; + + // overlay our data + memcpy_s_fast(((char *)(&read_tmp)) + shift, count, (void *)host_addr, count); + + // write back to device + res = MMIOWrite64Blk(dma_h, ASE_DATA_BASE(dma_h) + dev_aligned_addr, (uint64_t)&read_tmp, sizeof(read_tmp)); + if (res != FPGA_OK) return res; + + return res; +} + +/** + * _write_memory_mmio + * + * @brief Writes to a DWORD/QWORD aligned memory address(FPGA address). + * @param[in] dma Handle to the FPGA DMA object + * @param[in/out] dst_ptr Pointer to the FPGA address + * @param[in/out] src_ptr Pointer to the Host buffer address + * @param[in/out] count Pointer to the Size in bytes + * @return fpga_result FPGA_OK on success, return code otherwise. 
Updates src, dst, and count + * + */ +static fpga_result _write_memory_mmio(fpga_dma_handle dma_h, uint64_t *dst_ptr, uint64_t *src_ptr, uint64_t *count) { + fpga_result res = FPGA_OK; + + if (*count < DWORD_BYTES) return res; + + assert(*count >= DWORD_BYTES); + assert(IS_ALIGNED_DWORD(*dst_ptr)); + if (!IS_ALIGNED_DWORD(*dst_ptr)) // If QWORD aligned, this will be true + return FPGA_EXCEPTION; + + uint64_t src = *src_ptr; + uint64_t dst = *dst_ptr; + uint64_t align_bytes = *count; + uint64_t offset = 0; + + if (!IS_ALIGNED_QWORD(dst)) { + // Write out a single DWORD to get QWORD aligned + _switch_to_ase_page(dma_h, dst); + offset = dst & DMA_ADDR_SPAN_EXT_WINDOW_MASK; + res = MMIOWrite32Blk(dma_h, ASE_DATA_BASE(dma_h) + offset, (uint64_t)src, DWORD_BYTES); + ON_ERR_RETURN(res, "MMIOWrite32Blk"); + src += DWORD_BYTES; + dst += DWORD_BYTES; + align_bytes -= DWORD_BYTES; + } + + if (0 == align_bytes) return res; + + assert(IS_ALIGNED_QWORD(dst)); + + // Write out blocks of 64-bit values + while (align_bytes >= QWORD_BYTES) { + uint64_t left_in_page = DMA_ADDR_SPAN_EXT_WINDOW; + left_in_page -= dst & DMA_ADDR_SPAN_EXT_WINDOW_MASK; + uint64_t size_to_copy = min(left_in_page, (align_bytes & ~(QWORD_BYTES - 1))); + if (size_to_copy < QWORD_BYTES) break; + _switch_to_ase_page(dma_h, dst); + offset = dst & DMA_ADDR_SPAN_EXT_WINDOW_MASK; + res = MMIOWrite64Blk(dma_h, ASE_DATA_BASE(dma_h) + offset, (uint64_t)src, size_to_copy); + ON_ERR_RETURN(res, "MMIOWrite64Blk"); + src += size_to_copy; + dst += size_to_copy; + align_bytes -= size_to_copy; + } + + if (align_bytes >= DWORD_BYTES) { + // Write out remaining DWORD + _switch_to_ase_page(dma_h, dst); + offset = dst & DMA_ADDR_SPAN_EXT_WINDOW_MASK; + res = MMIOWrite32Blk(dma_h, ASE_DATA_BASE(dma_h) + offset, (uint64_t)src, DWORD_BYTES); + ON_ERR_RETURN(res, "MMIOWrite32Blk"); + src += DWORD_BYTES; + dst += DWORD_BYTES; + align_bytes -= DWORD_BYTES; + } + + assert(align_bytes < DWORD_BYTES); + + *src_ptr = src; + *dst_ptr = 
dst; + *count = align_bytes; + return res; +} + +/** + * _ase_host_to_fpga + * + * @brief Tx "count" bytes from HOST to FPGA using Address span expander(ASE)- will internally make + * calls to handle unaligned and aligned MMIO writes. + * @param[in] dma Handle to the FPGA DMA object + * @param[in/out] dst_ptr Pointer to the FPGA address + * @param[in/out] src_ptr Pointer to the Host buffer address + * @param[in] count Size in bytes + * @return fpga_result FPGA_OK on success, return code otherwise. Updates src and dst + * + */ +static fpga_result _ase_host_to_fpga(fpga_dma_handle dma_h, uint64_t *dst_ptr, uint64_t *src_ptr, uint64_t count) { + fpga_result res = FPGA_OK; + uint64_t dst = *dst_ptr; + uint64_t src = *src_ptr; + uint64_t count_left = count; + uint64_t unaligned_size = 0; + + debug_print("dst_ptr = %08lx , count = %08lx, src = %08lx \n", *dst_ptr, count, *src_ptr); + + // Aligns address to 8 byte using dst masking method + if (!IS_ALIGNED_DWORD(dst) && !IS_ALIGNED_QWORD(dst)) { + unaligned_size = QWORD_BYTES - (dst % QWORD_BYTES); + if (unaligned_size > count_left) unaligned_size = count_left; + res = _write_memory_mmio_unaligned(dma_h, dst, src, unaligned_size); + if (res != FPGA_OK) return res; + count_left -= unaligned_size; + src += unaligned_size; + dst += unaligned_size; + } + // Handles 8/4 byte MMIO transfer + res = _write_memory_mmio(dma_h, &dst, &src, &count_left); + if (res != FPGA_OK) return res; + + // Left over unaligned count bytes are transfered using dst masking method + unaligned_size = QWORD_BYTES - (dst % QWORD_BYTES); + if (unaligned_size > count_left) unaligned_size = count_left; + + res = _write_memory_mmio_unaligned(dma_h, dst, src, unaligned_size); + if (res != FPGA_OK) return res; + + count_left -= unaligned_size; + + *dst_ptr = dst + unaligned_size; + *src_ptr = src + unaligned_size; + + return FPGA_OK; +} + +/** + * _read_memory_mmio + * + * @brief Reads a DWORD/QWORD aligned memory address(FPGA address). 
+ * @param[in] dma Handle to the FPGA DMA object + * @param[in/out] dst_ptr Pointer to the Host Buffer Address + * @param[in/out] src_ptr Pointer to the FPGA address + * @param[in/out] count Pointer to the size in bytes + * @return fpga_result FPGA_OK on success, return code otherwise. Updates src, dst, and count + * + */ +static fpga_result _read_memory_mmio(fpga_dma_handle dma_h, uint64_t *src_ptr, uint64_t *dst_ptr, uint64_t *count) { + fpga_result res = FPGA_OK; + + if (*count < DWORD_BYTES) return res; + + assert(*count >= DWORD_BYTES); + assert(IS_ALIGNED_DWORD(*src_ptr)); + if (!IS_ALIGNED_DWORD(*src_ptr)) // If QWORD aligned, this will be true + return FPGA_EXCEPTION; + + uint64_t src = *src_ptr; + uint64_t dst = *dst_ptr; + uint64_t align_bytes = *count; + uint64_t offset = 0; + + if (!IS_ALIGNED_QWORD(src)) { + // Read a single DWORD to get QWORD aligned + _switch_to_ase_page(dma_h, src); + offset = src & DMA_ADDR_SPAN_EXT_WINDOW_MASK; + res = MMIORead32Blk(dma_h, ASE_DATA_BASE(dma_h) + offset, (uint64_t)dst, DWORD_BYTES); + ON_ERR_RETURN(res, "MMIORead32Blk"); + src += DWORD_BYTES; + dst += DWORD_BYTES; + align_bytes -= DWORD_BYTES; + } + + if (0 == align_bytes) return res; + + assert(IS_ALIGNED_QWORD(src)); + + // Read blocks of 64-bit values + while (align_bytes >= QWORD_BYTES) { + uint64_t left_in_page = DMA_ADDR_SPAN_EXT_WINDOW; + left_in_page -= src & DMA_ADDR_SPAN_EXT_WINDOW_MASK; + uint64_t size_to_copy = min(left_in_page, (align_bytes & ~(QWORD_BYTES - 1))); + if (size_to_copy < QWORD_BYTES) break; + _switch_to_ase_page(dma_h, src); + offset = src & DMA_ADDR_SPAN_EXT_WINDOW_MASK; + res = MMIORead64Blk(dma_h, ASE_DATA_BASE(dma_h) + offset, (uint64_t)dst, size_to_copy); + ON_ERR_RETURN(res, "MMIORead64Blk"); + src += size_to_copy; + dst += size_to_copy; + align_bytes -= size_to_copy; + } + + if (align_bytes >= DWORD_BYTES) { + // Read remaining DWORD + _switch_to_ase_page(dma_h, src); + offset = src & DMA_ADDR_SPAN_EXT_WINDOW_MASK; + res = 
MMIORead32Blk(dma_h, ASE_DATA_BASE(dma_h) + offset, (uint64_t)dst, DWORD_BYTES); + ON_ERR_RETURN(res, "MMIORead32Blk"); + src += DWORD_BYTES; + dst += DWORD_BYTES; + align_bytes -= DWORD_BYTES; + } + + assert(align_bytes < DWORD_BYTES); + + *src_ptr = src; + *dst_ptr = dst; + *count = align_bytes; + return res; +} + +/** + * _ase_fpga_to_host + * + * @brief Tx "count" bytes from FPGA to HOST using Address span expander(ASE)- will internally make + * calls to handle unaligned and aligned MMIO writes. + * @param[in] dma Handle to the FPGA DMA object + * @param[in/out] dst_ptr Pointer to the Host Buffer Address + * @param[in/out] src_ptr Pointer to the FPGA address + * @param[in/out] count Size in bytes + * @return fpga_result FPGA_OK on success, return code otherwise. Updates src and dst + * + */ +static fpga_result _ase_fpga_to_host(fpga_dma_handle dma_h, uint64_t *src_ptr, uint64_t *dst_ptr, uint64_t count) { + fpga_result res = FPGA_OK; + uint64_t src = *src_ptr; + uint64_t dst = *dst_ptr; + uint64_t count_left = count; + uint64_t unaligned_size = 0; + + debug_print("dst_ptr = %08lx , count = %08lx, src = %08lx \n", *dst_ptr, count, *src_ptr); + + // Aligns address to 8 byte using src masking method + if (!IS_ALIGNED_DWORD(src) && !IS_ALIGNED_QWORD(src)) { + unaligned_size = QWORD_BYTES - (src % QWORD_BYTES); + if (unaligned_size > count_left) unaligned_size = count_left; + res = _read_memory_mmio_unaligned(dma_h, src, dst, unaligned_size); + if (res != FPGA_OK) return res; + count_left -= unaligned_size; + dst += unaligned_size; + src += unaligned_size; + } + // Handles 8/4 byte MMIO transfer + res = _read_memory_mmio(dma_h, &src, &dst, &count_left); + if (res != FPGA_OK) return res; + + // Left over unaligned count bytes are transfered using src masking method + unaligned_size = QWORD_BYTES - (src % QWORD_BYTES); + if (unaligned_size > count_left) unaligned_size = count_left; + + res = _read_memory_mmio_unaligned(dma_h, src, dst, unaligned_size); + if (res != 
FPGA_OK) return res; + + count_left -= unaligned_size; + + *dst_ptr = dst + unaligned_size; + *src_ptr = src + unaligned_size; + + return FPGA_OK; +} + +static fpga_result clear_interrupt(fpga_dma_handle dma_h) { + // clear interrupt by writing 1 to IRQ bit in status register + msgdma_status_t status = {0}; + status.st.irq = 1; + + return MMIOWrite32Blk(dma_h, CSR_STATUS(dma_h), (uint64_t)&status.reg, sizeof(status.reg)); +} + +static fpga_result poll_interrupt(fpga_dma_handle dma_h) { + struct pollfd pfd = {0}; + msgdma_status_t status = { 0 }; + fpga_result res = FPGA_OK; + int poll_res; + + res = fpgaGetOSObjectFromEventHandle(dma_h->eh, &pfd.fd); + ON_ERR_GOTO(res, out, "fpgaGetOSObjectFromEventHandle failed\n"); + + pfd.events = POLLIN; + +#ifdef CHECK_DELAYS + if (0 == poll(&pfd, 1, 0)) poll_wait_count++; +#endif + poll_res = poll(&pfd, 1, FPGA_DMA_TIMEOUT_MSEC); + MMIORead32Blk(dma_h, CSR_STATUS(dma_h), (uint64_t)& status.reg, sizeof(status.reg)); + if (poll_res < 0) { + fprintf(stderr, "Poll error errno = %s DMA status reg: 0x%x\n", strerror(errno), status.reg); + res = FPGA_EXCEPTION; + goto out; + } else if (poll_res == 0) { + fprintf(stderr, "Poll(interrupt) timeout DMA status reg: 0x%x\n", status.reg); + res = FPGA_EXCEPTION; + } else { + uint64_t count = 0; + ssize_t bytes_read = read(pfd.fd, &count, sizeof(count)); + if (bytes_read > 0) { + debug_print("Poll success. 
Return = %d, count = %d\n", poll_res, (int)count); + res = FPGA_OK; + } else { + fprintf(stderr, "Error: poll failed read: zero bytes read"); + res = FPGA_EXCEPTION; + } + } + +out: + clear_interrupt(dma_h); + return res; +} + +static fpga_result _issue_magic(fpga_dma_handle dma_h) { + fpga_result res = FPGA_OK; + *(dma_h->magic_buf) = 0x0ULL; + + res = _do_dma(dma_h, + dma_h->magic_iova | FPGA_DMA_WF_HOST_MASK, + FPGA_DMA_WF_ROM_MAGIC_NO_MASK, + 64, + 1, + FPGA_TO_HOST_MM, + FPGA2HOST_IRQ_REQ /*intr_en */); + return res; +} + +static void _wait_magic(fpga_dma_handle dma_h) { +#ifndef SKIP_FPGA2HOST_IRQ + poll_interrupt(dma_h); +#endif + while (*(dma_h->magic_buf) != FPGA_DMA_WF_MAGIC_NO) + ; + *(dma_h->magic_buf) = 0x0ULL; +} + +fpga_result transferHostToFpga( + fpga_dma_handle dma_h, uint64_t dst, uint64_t src, size_t count, fpga_dma_transfer_t type) { + fpga_result res = FPGA_OK; + uint64_t i = 0; + uint64_t count_left = count; + uint64_t aligned_addr = 0; + uint64_t align_bytes = 0; + int issued_intr = 0; + debug_print("Host To Fpga ----------- src = %08lx, dst = %08lx \n", src, dst); + if (!IS_DMA_ALIGNED(dst)) { + if (count_left < FPGA_DMA_ALIGN_BYTES) { + res = _ase_host_to_fpga(dma_h, &dst, &src, count_left); + ON_ERR_GOTO(res, out, "HOST_TO_FPGA_MM Transfer failed\n"); + return res; + } else { + aligned_addr = ((dst / FPGA_DMA_ALIGN_BYTES) + 1) * FPGA_DMA_ALIGN_BYTES; + align_bytes = aligned_addr - dst; + res = _ase_host_to_fpga(dma_h, &dst, &src, align_bytes); + ON_ERR_GOTO(res, out, "HOST_TO_FPGA_MM Transfer failed\n"); + count_left = count_left - align_bytes; + } + } + if (count_left) { + uint32_t dma_chunks = count_left / FPGA_DMA_BUF_SIZE; + count_left -= (dma_chunks * FPGA_DMA_BUF_SIZE); + debug_print( + "DMA TX : dma chuncks = %d, count_left = %08lx, dst = %08lx, src = %08lx \n", dma_chunks, count_left, dst, src); + + for (i = 0; i < dma_chunks; i++) { + // constant size transfer, no length check required for memcpy + 
memcpy_s_fast(dma_h->dma_buf_ptr[i % FPGA_DMA_MAX_BUF], + FPGA_DMA_BUF_SIZE, + (void *)(src + i * FPGA_DMA_BUF_SIZE), + FPGA_DMA_BUF_SIZE); + // The value of FPGA_DMA_MAX_BUF is 2. Thus FPGA_DMA_MAX_BUF/2 -- 1, so the comparison + // is always i % 1 == 0, which will always be true. This means that the i == (dma_chunks -1) + // portion of the conditional will never be reached. However, for clarity and in case + // FPGA_DMA_MAX_BUF changes, I will leave the conditional as is and apply a coverity supression + // coverity[deadcode:FALSE] + if ((i % (FPGA_DMA_MAX_BUF / 2) == (FPGA_DMA_MAX_BUF / 2) - 1) || i == (dma_chunks - 1) /*last descriptor */) { + if (i == (FPGA_DMA_MAX_BUF / 2) - 1) { + res = _do_dma(dma_h, + (dst + i * FPGA_DMA_BUF_SIZE), + dma_h->dma_buf_iova[i % FPGA_DMA_MAX_BUF] | FPGA_DMA_HOST_MASK, + FPGA_DMA_BUF_SIZE, + 0, + type, + true); + } else { + if (issued_intr) poll_interrupt(dma_h); + res = _do_dma(dma_h, + (dst + i * FPGA_DMA_BUF_SIZE), + dma_h->dma_buf_iova[i % FPGA_DMA_MAX_BUF] | FPGA_DMA_HOST_MASK, + FPGA_DMA_BUF_SIZE, + 0, + type, + true /*intr_en */); + } + issued_intr = 1; + } else { + res = _do_dma(dma_h, + (dst + i * FPGA_DMA_BUF_SIZE), + dma_h->dma_buf_iova[i % FPGA_DMA_MAX_BUF] | FPGA_DMA_HOST_MASK, + FPGA_DMA_BUF_SIZE, + 0, + type, + false /*intr_en */); + } + } + if (issued_intr) { + poll_interrupt(dma_h); + issued_intr = 0; + } + if (count_left) { + uint64_t dma_tx_bytes = (count_left / FPGA_DMA_ALIGN_BYTES) * FPGA_DMA_ALIGN_BYTES; + if (dma_tx_bytes != 0) { + debug_print("dma_tx_bytes = %08lx was transfered using DMA\n", dma_tx_bytes); + if (dma_tx_bytes > FPGA_DMA_BUF_SIZE) { + res = FPGA_NO_MEMORY; + ON_ERR_GOTO(res, out, "Illegal transfer size\n"); + } + + memcpy_s_fast( + dma_h->dma_buf_ptr[0], dma_tx_bytes, (void *)(src + dma_chunks * FPGA_DMA_BUF_SIZE), dma_tx_bytes); + res = _do_dma(dma_h, + (dst + dma_chunks * FPGA_DMA_BUF_SIZE), + dma_h->dma_buf_iova[0] | FPGA_DMA_HOST_MASK, + dma_tx_bytes, + 1, + type, + true /*intr_en */); 
+ ON_ERR_GOTO(res, out, "HOST_TO_FPGA_MM Transfer failed\n"); + poll_interrupt(dma_h); + } + count_left -= dma_tx_bytes; + if (count_left) { + dst = dst + dma_chunks * FPGA_DMA_BUF_SIZE + dma_tx_bytes; + src = src + dma_chunks * FPGA_DMA_BUF_SIZE + dma_tx_bytes; + res = _ase_host_to_fpga(dma_h, &dst, &src, count_left); + ON_ERR_GOTO(res, out, "HOST_TO_FPGA_MM Transfer failed\n"); + } + } + } +out: + return res; +} + +fpga_result transferFpgaToHost( + fpga_dma_handle dma_h, uint64_t dst, uint64_t src, size_t count, fpga_dma_transfer_t type) { + fpga_result res = FPGA_OK; + uint64_t i = 0; + uint64_t j = 0; + uint64_t count_left = count; + uint64_t aligned_addr = 0; + uint64_t align_bytes = 0; + int wf_issued = 0; + + debug_print("FPGA To Host ----------- src = %08lx, dst = %08lx \n", src, dst); + if (!IS_DMA_ALIGNED(src)) { + if (count_left < FPGA_DMA_ALIGN_BYTES) { + res = _ase_fpga_to_host(dma_h, &src, &dst, count_left); + ON_ERR_GOTO(res, out, "FPGA_TO_HOST_MM Transfer failed"); + return res; + } else { + aligned_addr = ((src / FPGA_DMA_ALIGN_BYTES) + 1) * FPGA_DMA_ALIGN_BYTES; + align_bytes = aligned_addr - src; + res = _ase_fpga_to_host(dma_h, &src, &dst, align_bytes); + ON_ERR_GOTO(res, out, "FPGA_TO_HOST_MM Transfer failed"); + count_left = count_left - align_bytes; + } + } + if (count_left) { + uint32_t dma_chunks = count_left / FPGA_DMA_BUF_SIZE; + count_left -= (dma_chunks * FPGA_DMA_BUF_SIZE); + debug_print( + "DMA TX : dma chunks = %d, count_left = %08lx, dst = %08lx, src = %08lx \n", dma_chunks, count_left, dst, src); + uint64_t pending_buf = 0; + for (i = 0; i < dma_chunks; i++) { + res = _do_dma(dma_h, + dma_h->dma_buf_iova[i % (FPGA_DMA_MAX_BUF)] | FPGA_DMA_HOST_MASK, + (src + i * FPGA_DMA_BUF_SIZE), + FPGA_DMA_BUF_SIZE, + 1, + type, + false /*intr_en */); + ON_ERR_GOTO(res, out, "FPGA_TO_HOST_MM Transfer failed"); + + const int num_pending = i - pending_buf + 1; + if (num_pending == (FPGA_DMA_MAX_BUF / 2)) { // Enters this loop only once,after first 
batch of descriptors. + res = _issue_magic(dma_h); + ON_ERR_GOTO(res, out, "Magic number issue failed"); + wf_issued = 1; + } + if (num_pending > (FPGA_DMA_MAX_BUF - 1) || i == (dma_chunks - 1) /*last descriptor */) { + if (wf_issued) { + _wait_magic(dma_h); + for (j = 0; j < (FPGA_DMA_MAX_BUF / 2); j++) { + // constant size transfer; no length check required + memcpy_s_fast((void *)(dst + pending_buf * FPGA_DMA_BUF_SIZE), + FPGA_DMA_BUF_SIZE, + dma_h->dma_buf_ptr[pending_buf % (FPGA_DMA_MAX_BUF)], + FPGA_DMA_BUF_SIZE); + pending_buf++; + } + wf_issued = 0; + } + res = _issue_magic(dma_h); + ON_ERR_GOTO(res, out, "Magic number issue failed"); + wf_issued = 1; + } + } + + if (wf_issued) _wait_magic(dma_h); + + // clear out final dma memcpy operations + while (pending_buf < dma_chunks) { + // constant size transfer; no length check required + memcpy_s_fast((void *)(dst + pending_buf * FPGA_DMA_BUF_SIZE), + FPGA_DMA_BUF_SIZE, + dma_h->dma_buf_ptr[pending_buf % (FPGA_DMA_MAX_BUF)], + FPGA_DMA_BUF_SIZE); + pending_buf++; + } + if (count_left > 0) { + uint64_t dma_tx_bytes = (count_left / FPGA_DMA_ALIGN_BYTES) * FPGA_DMA_ALIGN_BYTES; + if (dma_tx_bytes != 0) { + debug_print("dma_tx_bytes = %08lx was transfered using DMA\n", dma_tx_bytes); + res = _do_dma(dma_h, + dma_h->dma_buf_iova[0] | FPGA_DMA_HOST_MASK, + (src + dma_chunks * FPGA_DMA_BUF_SIZE), + dma_tx_bytes, + 1, + type, + false /*intr_en */); + ON_ERR_GOTO(res, out, "FPGA_TO_HOST_MM Transfer failed"); + res = _issue_magic(dma_h); + ON_ERR_GOTO(res, out, "Magic number issue failed"); + _wait_magic(dma_h); + if (dma_tx_bytes > FPGA_DMA_BUF_SIZE) { + res = FPGA_NO_MEMORY; + ON_ERR_GOTO(res, out, "Illegal transfer size\n"); + } + memcpy_s_fast( + (void *)(dst + dma_chunks * FPGA_DMA_BUF_SIZE), dma_tx_bytes, dma_h->dma_buf_ptr[0], dma_tx_bytes); + } + count_left -= dma_tx_bytes; + if (count_left) { + dst = dst + dma_chunks * FPGA_DMA_BUF_SIZE + dma_tx_bytes; + src = src + dma_chunks * FPGA_DMA_BUF_SIZE + dma_tx_bytes; 
+ res = _ase_fpga_to_host(dma_h, &src, &dst, count_left); + ON_ERR_GOTO(res, out, "FPGA_TO_HOST_MM Transfer failed"); + } + } + } +out: + return res; +} + +fpga_result transferFpgaToFpga( + fpga_dma_handle dma_h, uint64_t dst, uint64_t src, size_t count, fpga_dma_transfer_t type) { + fpga_result res = FPGA_OK; + uint64_t i = 0; + uint64_t count_left = count; + uint64_t *tmp_buf = NULL; + if (IS_DMA_ALIGNED(dst) && IS_DMA_ALIGNED(src) && IS_DMA_ALIGNED(count_left)) { + uint32_t dma_chunks = count_left / FPGA_DMA_BUF_SIZE; + count_left -= (dma_chunks * FPGA_DMA_BUF_SIZE); + debug_print("!!!FPGA to FPGA!!! TX :dma chunks = %d, count = %08lx, dst = %08lx, src = %08lx \n", + dma_chunks, + count_left, + dst, + src); + + for (i = 0; i < dma_chunks; i++) { + res = _do_dma(dma_h, + (dst + i * FPGA_DMA_BUF_SIZE), + (src + i * FPGA_DMA_BUF_SIZE), + FPGA_DMA_BUF_SIZE, + 0, + type, + false /*intr_en */); + ON_ERR_GOTO(res, out, "FPGA_TO_FPGA_MM Transfer failed"); + if ((i + 1) % FPGA_DMA_MAX_BUF == 0 || i == (dma_chunks - 1) /*last descriptor */) { + res = _issue_magic(dma_h); + ON_ERR_GOTO(res, out, "Magic number issue failed"); + _wait_magic(dma_h); + } + } + if (count_left > 0) { + debug_print("Count_left = %08lx was transfered using DMA\n", count_left); + res = _do_dma(dma_h, + (dst + dma_chunks * FPGA_DMA_BUF_SIZE), + (src + dma_chunks * FPGA_DMA_BUF_SIZE), + count_left, + 1, + type, + false /*intr_en */); + ON_ERR_GOTO(res, out, "FPGA_TO_FPGA_MM Transfer failed"); + res = _issue_magic(dma_h); + ON_ERR_GOTO(res, out, "Magic number issue failed"); + _wait_magic(dma_h); + } + } else { + if ((src < dst) && (src + count_left >= dst)) { + debug_print("Overlapping addresses, Provide correct dst address\n"); + return FPGA_NOT_SUPPORTED; + } + uint32_t tx_chunks = count_left / FPGA_DMA_BUF_ALIGN_SIZE; + count_left -= (tx_chunks * FPGA_DMA_BUF_ALIGN_SIZE); + debug_print("!!!FPGA to FPGA TX!!! 
: tx chunks = %d, count = %08lx, dst = %08lx, src = %08lx \n", + tx_chunks, + count_left, + dst, + src); + tmp_buf = (uint64_t *)malloc(FPGA_DMA_BUF_ALIGN_SIZE); + for (i = 0; i < tx_chunks; i++) { + res = transferFpgaToHost( + dma_h, (uint64_t)tmp_buf, (src + i * FPGA_DMA_BUF_ALIGN_SIZE), FPGA_DMA_BUF_ALIGN_SIZE, FPGA_TO_HOST_MM); + ON_ERR_GOTO(res, out_spl, "FPGA_TO_FPGA_MM Transfer failed"); + res = transferHostToFpga( + dma_h, (dst + i * FPGA_DMA_BUF_ALIGN_SIZE), (uint64_t)tmp_buf, FPGA_DMA_BUF_ALIGN_SIZE, HOST_TO_FPGA_MM); + ON_ERR_GOTO(res, out_spl, "FPGA_TO_FPGA_MM Transfer failed"); + } + if (count_left > 0) { + res = transferFpgaToHost( + dma_h, (uint64_t)tmp_buf, (src + tx_chunks * FPGA_DMA_BUF_ALIGN_SIZE), count_left, FPGA_TO_HOST_MM); + ON_ERR_GOTO(res, out_spl, "FPGA_TO_FPGA_MM Transfer failed"); + res = transferHostToFpga( + dma_h, (dst + tx_chunks * FPGA_DMA_BUF_ALIGN_SIZE), (uint64_t)tmp_buf, count_left, HOST_TO_FPGA_MM); + ON_ERR_GOTO(res, out_spl, "FPGA_TO_FPGA_MM Transfer failed"); + } + free(tmp_buf); + } +out: + return res; +out_spl: + free(tmp_buf); + return res; +} + +fpga_result fpgaDmaTransferSync( + fpga_dma_handle dma_h, uint64_t dst, uint64_t src, size_t count, fpga_dma_transfer_t type) { + fpga_result res = FPGA_OK; + + if (!dma_h) return FPGA_INVALID_PARAM; + + if (type >= FPGA_MAX_TRANSFER_TYPE) return FPGA_INVALID_PARAM; + + if (!dma_h->fpga_h) return FPGA_INVALID_PARAM; + + if (type == HOST_TO_FPGA_MM) { + res = transferHostToFpga(dma_h, dst, src, count, HOST_TO_FPGA_MM); + } else if (type == FPGA_TO_HOST_MM) { + res = transferFpgaToHost(dma_h, dst, src, count, FPGA_TO_HOST_MM); + } else if (type == FPGA_TO_FPGA_MM) { + res = transferFpgaToFpga(dma_h, dst, src, count, FPGA_TO_FPGA_MM); + } else { + // Should not be possible, since we have handled all fpga_dma_transfer_t types + assert(0); + } + + return res; +} + +fpga_result fpgaDmaTransferAsync(fpga_dma_handle dma, + uint64_t dst, + uint64_t src, + size_t count, + 
fpga_dma_transfer_t type, + fpga_dma_transfer_cb cb, + void *context) { + // TODO + return FPGA_NOT_SUPPORTED; +} + +fpga_result fpgaDmaClose(fpga_dma_handle dma_h) { + fpga_result res = FPGA_OK; + int i = 0; + if (!dma_h) { + res = FPGA_INVALID_PARAM; + goto out; + } + + if (!dma_h->fpga_h) { + res = FPGA_INVALID_PARAM; + goto out; + } + + for (i = 0; i < FPGA_DMA_MAX_BUF; i++) { + res = fpgaReleaseBuffer(dma_h->fpga_h, dma_h->dma_buf_wsid[i]); + ON_ERR_GOTO(res, out, "fpgaReleaseBuffer failed"); + } + + res = fpgaReleaseBuffer(dma_h->fpga_h, dma_h->magic_wsid); + ON_ERR_GOTO(res, out, "fpgaReleaseBuffer"); + + fpgaUnregisterEvent(dma_h->fpga_h, FPGA_EVENT_INTERRUPT, dma_h->eh); + fpgaDestroyEventHandle(&dma_h->eh); + + // turn off global interrupts + msgdma_ctrl_t ctrl = {0}; + ctrl.ct.global_intr_en_mask = 0; + res = MMIOWrite32Blk(dma_h, CSR_CONTROL(dma_h), (uint64_t)&ctrl.reg, sizeof(ctrl.reg)); + ON_ERR_GOTO(res, out, "MMIOWrite32Blk"); + +out: + free((void *)dma_h); + return res; +} diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/fpga_dma.h b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/fpga_dma.h new file mode 100644 index 0000000..e382696 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/fpga_dma.h @@ -0,0 +1,141 @@ +// Copyright 2017-2020 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. 
+ +// This is derived from OPAE + OpenCL PAC BSP + +/** + * \fpga_dma.h + * \brief FPGA DMA BBB API Header + * + * Known Limitations + * - Supports only synchronous (blocking) transfers + */ + +#ifndef __FPGA_DMA_H__ +#define __FPGA_DMA_H__ + +#include <opae/fpga.h> + +//#define DEBUG_MEM 1 +//#define FPGA_DMA_DEBUG 1 +#define SKIP_FPGA2HOST_IRQ 1 +#ifdef SKIP_FPGA2HOST_IRQ +#define FPGA2HOST_IRQ_REQ false +#else +#define FPGA2HOST_IRQ_REQ true +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * The DMA driver supports host to FPGA, FPGA to host and FPGA + * to FPGA transfers. The FPGA interface can be streaming + * or memory-mapped. Streaming interfaces are not currently + * supported. + */ +typedef enum { + HOST_TO_FPGA_MM = 0, // Memory mapped FPGA interface + FPGA_TO_HOST_MM, // Memory mapped FPGA interface + FPGA_TO_FPGA_MM, // Memory mapped FPGA interface + FPGA_MAX_TRANSFER_TYPE, +} fpga_dma_transfer_t; + +typedef struct _dma_handle_t *fpga_dma_handle; + +// Callback for asynchronous DMA transfers +typedef void (*fpga_dma_transfer_cb)(void *context); + +/** + * fpgaDmaOpen + * + * @brief Open a handle to DMA BBB. + * Scans the device feature chain looking for a DMA BBB. + * + * @param[in] fpga Handle to the FPGA AFU object obtained via fpgaOpen() + * @param[in] dma_base to DMA channel DFH + * @param[in] interrupt_num interrupt number assigned to DMA channel + * @param[out] dma DMA object handle + * @returns FPGA_OK on success, return code otherwise + */ +fpga_result fpgaDmaChannelOpen(fpga_handle fpga, uint64_t dma_base, int interrupt_num, fpga_dma_handle *dma); + +/** + * fpgaDmaTransferSync + * + * @brief Perform a blocking copy of 'count' bytes from memory area pointed + * by src to memory area pointed by dst where fpga_dma_transfer_t specifies the + * type of memory transfer. 
+ * @param[in] dma Handle to the FPGA DMA object + * @param[in] dst Address of the destination buffer + * @param[in] src Address of the source buffer + * @param[in] count Size in bytes + * @param[in] type Must be one of the following values: + * HOST_TO_FPGA_MM - Copy data from host memory to memory mapped FPGA interface. + * User must specify valid src and dst. + * FPGA_TO_HOST_MM - Copy data from memory mapped FPGA interface to host memory + * User must specify valid src and dst. + * FPGA_TO_FPGA_MM - Copy data between memory mapped FPGA interfaces + * User must specify valid src and dst. + * @return fpga_result FPGA_OK on success, return code otherwise + * + */ +fpga_result fpgaDmaTransferSync( + fpga_dma_handle dma, uint64_t dst, uint64_t src, size_t count, fpga_dma_transfer_t type); + +/** + * fpgaDmaTransferAsync (Not supported) + * + * @brief Perform a non-blocking copy of 'count' bytes from memory area pointed + * by src to memory area pointed by dst where fpga_dma_transfer_t specifies the + * type of memory transfer. + * @param[in] dma Handle to the FPGA DMA object + * @param[in] dst Address of the destination buffer + * @param[in] src Address of the source buffer + * @param[in] count Size in bytes + * @param[in] type Must be one of the following values: + * HOST_TO_FPGA_MM - Copy data from host memory to memory mapped FPGA interface. + * User must specify valid src and dst. + * FPGA_TO_HOST_MM - Copy data from memory mapped FPGA interface to host memory + * User must specify valid src and dst. + * FPGA_TO_FPGA_MM - Copy data between memory mapped FPGA interfaces + * User must specify valid src and dst. 
+ * @param[in] cb Callback to invoke when DMA transfer is complete + * @param[in] context Pointer to define user-defined context + * @return fpga_result FPGA_OK on success, return code otherwise + * + */ +fpga_result fpgaDmaTransferAsync(fpga_dma_handle dma, + uint64_t dst, + uint64_t src, + size_t count, + fpga_dma_transfer_t type, + fpga_dma_transfer_cb cb, + void *context); + +/** + * fpgaDmaClose + * + * @brief Close the DMA BBB handle. + * + * @param[in] dma DMA object handle + * @returns FPGA_OK on success, return code otherwise + */ +fpga_result fpgaDmaClose(fpga_dma_handle dma); + +#ifdef __cplusplus +} +#endif + +#endif // __FPGA_DMA_H__ diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/fpga_dma_internal.h b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/fpga_dma_internal.h new file mode 100644 index 0000000..e4c8373 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/fpga_dma_internal.h @@ -0,0 +1,289 @@ +// Copyright 2018-2020 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. + +// This is derived from OPAE + OpenCL PAC BSP + +/** + * \fpga_dma_internal.h + * \brief FPGA DMA BBB Internal Header + */ + +#ifndef __FPGA_DMA_INT_H__ +#define __FPGA_DMA_INT_H__ + +#include <opae/fpga.h> +#include "x86-sse2.h" + +#ifdef CHECK_DELAYS +#pragma message "Compiled with -DCHECK_DELAYS. 
Not to be used in production" +#endif + +#ifdef FPGA_DMA_DEBUG +#pragma message "Compiled with -DFPGA_DMA_DEBUG. Not to be used in production" +#endif + +#ifndef max +#define max(a, b) \ + ({ \ + __typeof__(a) _a = (a); \ + __typeof__(b) _b = (b); \ + _a > _b ? _a : _b; \ + }) +#endif + +#ifndef min +#define min(a, b) \ + ({ \ + __typeof__(a) _a = (a); \ + __typeof__(b) _b = (b); \ + _a < _b ? _a : _b; \ + }) +#endif + +#define FPGA_DMA_TIMEOUT_MSEC (5000) + +#define QWORD_BYTES 8 +#define DWORD_BYTES 4 +#define IS_ALIGNED_DWORD(addr) (addr % 4 == 0) +#define IS_ALIGNED_QWORD(addr) (addr % 8 == 0) + +#define FPGA_DMA_UUID_H 0xef82def7f6ec40fc +#define FPGA_DMA_UUID_L 0xa9149a35bace01ea +#define FPGA_DMA_WF_MAGIC_NO 0x5772745F53796E63ULL +#define FPGA_DMA_HOST_MASK 0x2000000000000 +#define FPGA_DMA_WF_HOST_MASK 0x3000000000000 +#define FPGA_DMA_WF_ROM_MAGIC_NO_MASK 0x1000000000000 + +#define AFU_DFH_REG 0x0 +#define AFU_DFH_NEXT_OFFSET 16 +#define AFU_DFH_EOL_OFFSET 40 +#define AFU_DFH_TYPE_OFFSET 60 + +// BBB Feature ID (refer CCI-P spec) +#define FPGA_DMA_BBB 0x2 + +// Feature ID for DMA BBB +#define FPGA_DMA_BBB_FEATURE_ID 0x765 + +// DMA Register offsets from base +#define FPGA_DMA_CSR 0x40 +#define FPGA_DMA_DESC 0x60 +#define FPGA_DMA_ADDR_SPAN_EXT_CNTL 0x200 +#define FPGA_DMA_ADDR_SPAN_EXT_DATA 0x1000 + +#define DMA_ADDR_SPAN_EXT_WINDOW (4 * 1024) +#define DMA_ADDR_SPAN_EXT_WINDOW_MASK ((uint64_t)(DMA_ADDR_SPAN_EXT_WINDOW - 1)) + +#define FPGA_DMA_MASK_32_BIT 0xFFFFFFFF + +#define FPGA_DMA_CSR_BUSY (1 << 0) +#define FPGA_DMA_DESC_BUFFER_EMPTY 0x2 +#define FPGA_DMA_DESC_BUFFER_FULL 0x4 + +#define FPGA_DMA_ALIGN_BYTES 64 +#define IS_DMA_ALIGNED(addr) (addr % FPGA_DMA_ALIGN_BYTES == 0) + +#define CSR_BASE(dma_handle) ((uint64_t)dma_handle->dma_csr_base) +#define ASE_DATA_BASE(dma_handle) ((uint64_t)dma_handle->dma_ase_data_base) +#define ASE_CNTL_BASE(dma_handle) ((uint64_t)dma_handle->dma_ase_cntl_base) +#define HOST_MMIO_32_ADDR(dma_handle, offset) \ + 
((volatile uint32_t *)((uint64_t)(dma_handle)->mmio_va + (uint64_t)(offset))) +#define HOST_MMIO_64_ADDR(dma_handle, offset) \ + ((volatile uint64_t *)((uint64_t)(dma_handle)->mmio_va + (uint64_t)(offset))) +#define HOST_MMIO_32(dma_handle, offset) (*HOST_MMIO_32_ADDR(dma_handle, offset)) +#define HOST_MMIO_64(dma_handle, offset) (*HOST_MMIO_64_ADDR(dma_handle, offset)) + +#define CSR_STATUS(dma_h) (CSR_BASE(dma_h) + offsetof(msgdma_csr_t, status)) +#define CSR_CONTROL(dma_h) (CSR_BASE(dma_h) + offsetof(msgdma_csr_t, ctrl)) + +// Granularity of DMA transfer (maximum bytes that can be packed +// in a single descriptor).This value must match configuration of +// the DMA IP. Larger transfers will be broken down into smaller +// transactions. +#define FPGA_DMA_BUF_SIZE (1024 * 1024 * 2UL) +#define FPGA_DMA_BUF_ALIGN_SIZE FPGA_DMA_BUF_SIZE + +// Convenience macros + +#ifdef FPGA_DMA_DEBUG +#define debug_print(fmt, ...) \ + do { \ + if (FPGA_DMA_DEBUG) { \ + fprintf(stderr, "%s (%d) : ", __FUNCTION__, __LINE__); \ + fprintf(stderr, fmt, ##__VA_ARGS__); \ + } \ + } while (0) +#define error_print(fmt, ...) \ + do { \ + fprintf(stderr, "%s (%d) : ", __FUNCTION__, __LINE__); \ + fprintf(stderr, fmt, ##__VA_ARGS__); \ + err_cnt++; \ + } while (0) +#else +#define debug_print(...) +#define error_print(...) 
+#endif + +#define FPGA_DMA_MAX_BUF 2 + +typedef struct __attribute__((__packed__)) { + uint64_t dfh; + uint64_t feature_uuid_lo; + uint64_t feature_uuid_hi; +} dfh_feature_t; + +typedef union { + uint64_t reg; + struct { + uint64_t feature_type : 4; + uint64_t reserved_8 : 8; + uint64_t afu_minor : 4; + uint64_t reserved_7 : 7; + uint64_t end_dfh : 1; + uint64_t next_dfh : 24; + uint64_t afu_major : 4; + uint64_t feature_id : 12; + } bits; +} dfh_reg_t; + +struct _dma_handle_t { + fpga_handle fpga_h; + uint32_t mmio_num; + uint64_t mmio_va; + uint64_t cur_ase_page; + uint64_t dma_base; + uint64_t dma_offset; + uint64_t dma_csr_base; + uint64_t dma_desc_base; + uint64_t dma_ase_cntl_base; + uint64_t dma_ase_data_base; + // Interrupt event handle + fpga_event_handle eh; + // magic number buffer + volatile uint64_t *magic_buf; + uint64_t magic_iova; + uint64_t magic_wsid; + uint64_t *dma_buf_ptr[FPGA_DMA_MAX_BUF]; + uint64_t dma_buf_wsid[FPGA_DMA_MAX_BUF]; + uint64_t dma_buf_iova[FPGA_DMA_MAX_BUF]; +}; + +typedef union { + uint32_t reg; + struct { + uint32_t tx_channel : 8; + uint32_t generate_sop : 1; + uint32_t generate_eop : 1; + uint32_t park_reads : 1; + uint32_t park_writes : 1; + uint32_t end_on_eop : 1; + uint32_t reserved_1 : 1; + uint32_t transfer_irq_en : 1; + uint32_t early_term_irq_en : 1; + uint32_t trans_error_irq_en : 8; + uint32_t early_done_en : 1; + uint32_t reserved_2 : 6; + uint32_t go : 1; + }; +} msgdma_desc_ctrl_t; + +typedef struct __attribute__((__packed__)) { + // 0x0 + uint32_t rd_address; + // 0x4 + uint32_t wr_address; + // 0x8 + uint32_t len; + // 0xC + uint16_t seq_num; + uint8_t rd_burst_count; + uint8_t wr_burst_count; + // 0x10 + uint16_t rd_stride; + uint16_t wr_stride; + // 0x14 + uint32_t rd_address_ext; + // 0x18 + uint32_t wr_address_ext; + // 0x1c + msgdma_desc_ctrl_t control; +} msgdma_ext_desc_t; + +typedef union { + uint32_t reg; + struct { + uint32_t busy : 1; + uint32_t desc_buf_empty : 1; + uint32_t desc_buf_full : 1; + 
uint32_t rsp_buf_empty : 1; + uint32_t rsp_buf_full : 1; + uint32_t stopped : 1; + uint32_t resetting : 1; + uint32_t stopped_on_errror : 1; + uint32_t stopped_on_early_term : 1; + uint32_t irq : 1; + uint32_t reserved : 22; + } st; +} msgdma_status_t; + +typedef union { + uint32_t reg; + struct { + uint32_t stop_dispatcher : 1; + uint32_t reset_dispatcher : 1; + uint32_t stop_on_error : 1; + uint32_t stopped_on_early_term : 1; + uint32_t global_intr_en_mask : 1; + uint32_t stop_descriptors : 1; + uint32_t rsvd : 22; + } ct; +} msgdma_ctrl_t; + +typedef union { + uint32_t reg; + struct { + uint32_t rd_fill_level : 16; + uint32_t wr_fill_level : 16; + } fl; +} msgdma_fill_level_t; + +typedef union { + uint32_t reg; + struct { + uint32_t rsp_fill_level : 16; + uint32_t rsvd : 16; + } rsp; +} msgdma_rsp_level_t; + +typedef union { + uint32_t reg; + struct { + uint32_t rd_seq_num : 16; + uint32_t wr_seq_num : 16; + } seq; +} msgdma_seq_num_t; + +typedef struct __attribute__((__packed__)) { + // 0x0 + msgdma_status_t status; + // 0x4 + msgdma_ctrl_t ctrl; + // 0x8 + msgdma_fill_level_t fill_level; + // 0xc + msgdma_rsp_level_t rsp; + // 0x10 + msgdma_seq_num_t seq_num; +} msgdma_csr_t; + +#endif // __FPGA_DMA_INT_H__ diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/kernel_interrupt.cpp b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/kernel_interrupt.cpp new file mode 100644 index 0000000..206b98a --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/kernel_interrupt.cpp @@ -0,0 +1,278 @@ +/* (C) 1992-2017 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. 
See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. */ + +#include <poll.h> +#include <stdlib.h> + +#include <thread> + +#include "ccip_mmd_device.h" +#include "eventfd_wrapper.h" +#include "kernel_interrupt.h" + +using namespace intel_opae_mmd; + +// if ENABLE_OPENCL_KERNEL_INTERRUPTS is set at compile time, interrupts will +// be enabled. 
+#define ENABLE_OPENCL_KERNEL_INTERRUPTS + +// if ENABLE_OPENCL_KERNEL_POLLING_THREAD is set at compile time, a thread will +// replace yield and the thread will call runtime call back + +// DLA runtime assumes interrupt service routing will run on its own (instead of runtime yielding to MMD) when hardware +// interrupts +#ifdef DLA_MMD +#define ENABLE_OPENCL_KERNEL_POLLING_THREAD +#endif + +// ccip interrupt line that is used for kernel +#define MMD_KERNEL_INTERRUPT_LINE_NUM 1 + +KernelInterrupt::KernelInterrupt(fpga_handle fpga_handle_arg, int mmd_handle) + : m_initialized(false), + m_eventfd_wrapper(NULL), + m_thread(NULL), + m_kernel_interrupt_fn(NULL), + m_kernel_interrupt_user_data(NULL), + m_fpga_handle(fpga_handle_arg), + m_mmd_handle(mmd_handle), + m_event_handle(0) { + enable_interrupts(); +} + +KernelInterrupt::~KernelInterrupt() { disable_interrupts(); } + +void KernelInterrupt::disable_interrupts() { + // kill the thread + if (m_thread) { + // send message to thread to end it + m_eventfd_wrapper->notify(1); + + // join with thread until it ends + m_thread->join(); + + delete m_thread; + m_thread = NULL; + } + + if (m_eventfd_wrapper) { + delete m_eventfd_wrapper; + m_eventfd_wrapper = NULL; + } + + if (m_event_handle) { + fpga_result res; +#ifdef ENABLE_OPENCL_KERNEL_INTERRUPTS + res = fpgaUnregisterEvent(m_fpga_handle, FPGA_EVENT_INTERRUPT, m_event_handle); + if (res != FPGA_OK) { + fprintf(stderr, "error fpgaUnregisterEvent"); + } +#endif + + res = fpgaDestroyEventHandle(&m_event_handle); + if (res != FPGA_OK) { + fprintf(stderr, "error fpgaDestroyEventHandle"); + } + } + + // disable opencl kernel interrupts +#ifndef DLA_MMD // IRQ offsets no longer exist in DLA hardware (removed from board.qsys) + set_interrupt_mask(0x00000000); +#endif + + m_initialized = false; +} + +void KernelInterrupt::enable_interrupts() { + m_eventfd_wrapper = new eventfd_wrapper(); + if (!m_eventfd_wrapper->initialized()) return; + +#ifdef 
ENABLE_OPENCL_KERNEL_POLLING_THREAD + m_thread = new std::thread(interrupt_polling_thread, std::ref(*this)); +#endif + + fpga_result res; + // Create event + res = fpgaCreateEventHandle(&m_event_handle); + if (res != FPGA_OK) { + fprintf(stderr, "error creating event handle"); + return; + } + +#ifdef ENABLE_OPENCL_KERNEL_INTERRUPTS + // Register user interrupt with event handle + res = fpgaRegisterEvent(m_fpga_handle, FPGA_EVENT_INTERRUPT, m_event_handle, MMD_KERNEL_INTERRUPT_LINE_NUM); + if (res != FPGA_OK) { + fprintf(stderr, "error registering event"); + res = fpgaDestroyEventHandle(&m_event_handle); + return; + } + + // enable opencl kernel interrupts +#ifndef DLA_MMD // IRQ offsets no longer exist in DLA hardware (removed from board.qsys) + set_interrupt_mask(0x00000001); +#endif +#endif + + m_initialized = true; +} + +#ifndef DLA_MMD // IRQ offsets no longer exist in DLA hardware (removed from board.qsys) +void KernelInterrupt::set_interrupt_mask(uint32_t intr_mask) { + fpga_result res; + res = fpgaWriteMMIO32(m_fpga_handle, 0, AOCL_IRQ_MASKING_BASE, intr_mask); + if (res != FPGA_OK) { + fprintf(stderr, "Error fpgaWriteMMIO32: %d\n", res); + return; + } +} +#endif + +void KernelInterrupt::interrupt_polling_thread(KernelInterrupt& obj) { + bool thread_is_active = true; + while (thread_is_active) { +#ifdef ENABLE_OPENCL_KERNEL_INTERRUPTS + const int timeout = -1; +#else + const int timeout = 0; + usleep(100); +#endif + thread_is_active = obj.poll_interrupt(timeout); + } +} + +bool KernelInterrupt::poll_interrupt(int poll_timeout_arg) { + fpga_result fpga_res; + + int res; + // get eventfd handles + int intr_fd; + fpga_res = fpgaGetOSObjectFromEventHandle(m_event_handle, &intr_fd); + if (fpga_res != FPGA_OK) { + fprintf(stderr, "error getting event file handle"); + return false; + } + int thread_signal_fd = m_eventfd_wrapper->get_fd(); + + struct pollfd pollfd_arr[2]; + pollfd_arr[0].fd = intr_fd; + pollfd_arr[0].events = POLLIN; + pollfd_arr[0].revents = 0; + 
pollfd_arr[1].fd = thread_signal_fd;
+  pollfd_arr[1].events = POLLIN;
+  pollfd_arr[1].revents = 0;
+  res = poll(pollfd_arr, 2, poll_timeout_arg);
+  if (res < 0) {
+    fprintf(stderr, "Poll error errno = %s\n", strerror(errno));
+    return false;
+  } else if (res > 0 && (pollfd_arr[0].revents & POLLIN)) {
+    // Fix: revents is a bitmask — POLLHUP/POLLERR may be set alongside
+    // POLLIN, so test the POLLIN bit instead of comparing the whole word.
+    // Hardware interrupt fired: drain the eventfd counter so the next
+    // poll() does not return immediately.
+    uint64_t count;
+    ssize_t bytes_read = read(intr_fd, &count, sizeof(count));
+    if (bytes_read > 0) {
+      DEBUG_PRINT("Poll success. Return=%d count=%lu\n", res, count);
+    } else {
+      fprintf(stderr, "Error: poll failed: %s\n", bytes_read < 0 ? strerror(errno) : "zero bytes read");
+      // TODO: remove exit call. Revist this when fixing kernel interrupts
+      exit(-1);
+    }
+  } else if (res > 0 && (pollfd_arr[1].revents & POLLIN)) {
+    // Fix (same as above): bit-test revents rather than equality-compare.
+    // Shutdown notification from disable_interrupts(): consume it and tell
+    // the polling thread to stop by returning false.
+    uint64_t count;
+    ssize_t bytes_read = read(thread_signal_fd, &count, sizeof(count));
+    if (bytes_read > 0) {
+      DEBUG_PRINT("Poll success. Return=%d count=%lu\n", res, count);
+    } else {
+      fprintf(stderr, "Error: poll failed: %s\n", bytes_read < 0 ? strerror(errno) : "zero bytes read");
+      // TODO: remove exit call. 
Revist this when fixing kernel interrupts + exit(-1); + } + return false; + } else { + // no event fd event happened +#ifdef ENABLE_OPENCL_KERNEL_INTERRUPTS + return false; +#endif + } + +#ifdef DLA_MMD + run_kernel_interrupt_fn(); +#else // IRQ offsets no longer exist in DLA hardware (removed from board.qsys) + + // probobly not required for interrupt polling but we poll the interrupt + // csr line to make sure an interrupt was actually triggered + uint32_t irqval = 0; + fpga_res = fpgaReadMMIO32(m_fpga_handle, 0, AOCL_IRQ_POLLING_BASE, &irqval); + if (fpga_res != FPGA_OK) { + fprintf(stderr, "Error fpgaReadMMIO32: %d\n", fpga_res); + return false; + } + + DEBUG_PRINT("irqval: %u\n", irqval); + if (irqval) run_kernel_interrupt_fn(); + +#ifdef ENABLE_OPENCL_KERNEL_INTERRUPTS + // workaround for fb:530016 + // check if irq line is still high and generate another interrupt event + fpga_res = fpgaReadMMIO32(m_fpga_handle, 0, AOCL_IRQ_POLLING_BASE, &irqval); + if (fpga_res != FPGA_OK) { + fprintf(stderr, "Error fpgaReadMMIO32: %d\n", fpga_res); + return false; + } + + // signal intr event fd + if (irqval) { + DEBUG_PRINT("CRITICAL WARNING: irqval has not been cleared by aocl runtime\n"); + uint64_t count = 1; + ssize_t res = write(intr_fd, &count, sizeof(count)); + if (res < 0) { + fprintf(stderr, "eventfd : %s", strerror(errno)); + return false; + } + } +#endif +#endif + + return true; +} + +bool KernelInterrupt::yield_is_enabled() { +#ifdef ENABLE_OPENCL_KERNEL_POLLING_THREAD + return false; +#else + return true; +#endif +} + +void KernelInterrupt::yield() { +#ifdef ENABLE_OPENCL_KERNEL_POLLING_THREAD + usleep(0); +#else + poll_interrupt(0); +#endif +} + +void KernelInterrupt::run_kernel_interrupt_fn() { + if (m_kernel_interrupt_fn) { + m_kernel_interrupt_fn(m_mmd_handle, m_kernel_interrupt_user_data); + } else { + fprintf(stderr, "m_kernel_interrupt_fn is NULL. 
No interrupt handler set!\n"); + } +} + +void KernelInterrupt::set_kernel_interrupt(aocl_mmd_interrupt_handler_fn fn, void* user_data) { + m_kernel_interrupt_fn = fn; + m_kernel_interrupt_user_data = user_data; +} diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/kernel_interrupt.h b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/kernel_interrupt.h new file mode 100644 index 0000000..44e9b50 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/kernel_interrupt.h @@ -0,0 +1,75 @@ +/* (C) 1992-2017 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. 
*/ + +#ifndef _KERNEL_INTERRUPT_H +#define _KERNEL_INTERRUPT_H + +#include <opae/fpga.h> + +#include <atomic> +#include <thread> + +#include "aocl_mmd.h" + +namespace intel_opae_mmd { + +class eventfd_wrapper; + +class KernelInterrupt final { + public: + KernelInterrupt(fpga_handle fpga_handle_arg, int mmd_handle); + ~KernelInterrupt(); + + bool initialized() { return m_initialized; } + + void set_kernel_interrupt(aocl_mmd_interrupt_handler_fn fn, void* user_data); + void yield(); + static bool yield_is_enabled(); + + void enable_interrupts(); + void disable_interrupts(); + + private: +#ifndef DLA_MMD // IRQ offsets no longer exist in DLA hardware (removed from board.qsys) + void set_interrupt_mask(uint32_t intr_mask); +#endif + void run_kernel_interrupt_fn(); + bool poll_interrupt(int poll_timeout_arg); + + static void interrupt_polling_thread(KernelInterrupt& obj); + + bool m_initialized; + eventfd_wrapper* m_eventfd_wrapper; + + std::thread* m_thread; + + aocl_mmd_interrupt_handler_fn m_kernel_interrupt_fn; + void* m_kernel_interrupt_user_data; + + fpga_handle m_fpga_handle; + int m_mmd_handle; + + fpga_event_handle m_event_handle; + + // not used and not implemented + KernelInterrupt(KernelInterrupt& other); + KernelInterrupt& operator=(const KernelInterrupt& other); +}; // class KernelInterrupt + +}; // namespace intel_opae_mmd + +#endif // _KERNEL_INTERRUPT_H diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/memcpy_s_fast.c b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/memcpy_s_fast.c new file mode 100644 index 0000000..65d7f1a --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/memcpy_s_fast.c @@ -0,0 +1,133 @@ +// Copyright 2018-2020 Intel Corporation. +// +// This software and the related documents are Intel copyrighted materials, +// and your use of them is governed by the express license under which they +// were provided to you ("License"). 
Unless the License provides otherwise, +// you may not use, modify, copy, publish, distribute, disclose or transmit +// this software or the related documents without Intel's prior written +// permission. +// +// This software and the related documents are provided as is, with no express +// or implied warranties, other than those that are expressly stated in the +// License. + +// This is derived from OPAE + OpenCL PAC BSP + +#pragma push_macro("_GNU_SOURCE") +#undef _GNU_SOURCE +#define _GNU_SOURCE + +#include <assert.h> +#include <safe_string/safe_string.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> + +#include "memcpy_s_fast.h" +#include "x86-sse2.h" + +#pragma pop_macro("_GNU_SOURCE") + +static void *memcpy_setup(void *dst, size_t max, const void *src, size_t n); + +memcpy_fn_t p_memcpy = memcpy_setup; // Initial value points to setup routine + +/** + * SSE2_memcpy + * + * @brief memcpy using SSE2 or REP MOVSB + * @param[in] dst Pointer to the destination memory + * @param[in] max Size in bytes of destination + * @param[in] src Pointer to the source memory + * @param[in] n Size in bytes to copy + * @return dst + * + */ +static void *SSE2_memcpy(void *dst, size_t max, const void *src, size_t n) { + assert(n <= max); + + void *ldst = dst; + void *lsrc = (void *)src; + if (IS_CL_ALIGNED(src) && IS_CL_ALIGNED(dst)) // 64-byte aligned + { + if (n >= MIN_SSE2_SIZE) // Arbitrary crossover performance point + { + debug_print("copying 0x%lx bytes with SSE2\n", (uint64_t)ALIGN_TO_CL(n)); + aligned_block_copy_sse2((int64_t * __restrict) dst, (int64_t * __restrict) src, ALIGN_TO_CL(n)); + ldst = (void *)((uint64_t)dst + ALIGN_TO_CL(n)); + lsrc = (void *)((uint64_t)src + ALIGN_TO_CL(n)); + n -= ALIGN_TO_CL(n); + } + } else { + if (n >= MIN_SSE2_SIZE) // Arbitrary crossover performance point + { + debug_print("copying 0x%lx bytes (unaligned) with SSE2\n", (uint64_t)ALIGN_TO_CL(n)); + unaligned_block_copy_sse2((int64_t * __restrict) dst, (int64_t * 
__restrict) src, ALIGN_TO_CL(n)); + ldst = (void *)((uint64_t)dst + ALIGN_TO_CL(n)); + lsrc = (void *)((uint64_t)src + ALIGN_TO_CL(n)); + n -= ALIGN_TO_CL(n); + } + } + + if (n) { + register unsigned long int dummy; + debug_print("copying 0x%lx bytes with REP MOVSB\n", n); + __asm__ __volatile__("rep movsb\n" + : "=&D"(ldst), "=&S"(lsrc), "=&c"(dummy) + : "0"(ldst), "1"(lsrc), "2"(n) + : "memory"); + } + + return dst; +} + +/** + * memcpy_wrap + * + * @brief Trampoline for memcpy + * @param[in] dst Pointer to the destination memory + * @param[in] max Size in bytes of destination + * @param[in] src Pointer to the source memory + * @param[in] n Size in bytes to copy + * @return dst + * + */ + +#ifdef ENABLE_MEMCPY_ENV_VAR_CHECK +static void *memcpy_wrap(void *dst, size_t max, const void *src, size_t n) { return memcpy(dst, src, n); } +#endif // ENABLE_MEMCPY_ENV_VAR_CHECK + +/** + * memcpy_setup + * Will be called on the first memcpy_s_fast invocation only. + * + * @brief Set up which memcpy routine will be used at runtime + * @param[in] dst Pointer to the destination memory + * @param[in] max Size in bytes of destination + * @param[in] src Pointer to the source memory + * @param[in] n Size in bytes to copy + * @return dst + * + */ + +static void *memcpy_setup(void *dst, size_t max, const void *src, size_t n) { + // Default to SSE2_memcpy + p_memcpy = SSE2_memcpy; + +// +#ifdef ENABLE_MEMCPY_ENV_VAR_CHECK + char *pmemcpy = getenv(USE_MEMCPY_ENV); + + if (pmemcpy) { + if (!strcasecmp(pmemcpy, "libc")) { + p_memcpy = memcpy_wrap; + } else if (!strcasecmp(pmemcpy, "sse2")) { + p_memcpy = SSE2_memcpy; + } else if (!strcasecmp(pmemcpy, "memcpy_s")) { + p_memcpy = (memcpy_fn_t)memcpy_s; + } + } +#endif // #ifdef ENABLE_MEMCPY_ENV_VAR_CHECK + + return p_memcpy(dst, max, src, n); +} diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/memcpy_s_fast.h b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/memcpy_s_fast.h new file mode 100644 index 
0000000..08056d3 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/memcpy_s_fast.h @@ -0,0 +1,69 @@ +/* (C) 1992-2017 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. */ + +#ifndef MEMCPY_S_FAST_H_ +#define MEMCPY_S_FAST_H_ + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// Constants needed in memcpy routines +// Arbitrary crossover point for using SSE2 over rep movsb +#define MIN_SSE2_SIZE 4096 + +// TODO: hidden environment variables to experiment with performance +// in production software are not a good idea in my opinion. Commenting out +// for now but hopefully can remove this code completely in the long term. 
+//#define USE_MEMCPY_ENV "PAC_MEMCPY" + +#define CACHE_LINE_SIZE 64 +#define ALIGN_TO_CL(x) ((uint64_t)(x) & ~(CACHE_LINE_SIZE - 1)) +#define IS_CL_ALIGNED(x) (((uint64_t)(x) & (CACHE_LINE_SIZE - 1)) == 0) + +// Convenience macros +#ifdef DEBUG_MEM +#define debug_print(fmt, ...) \ + do { \ + if (FPGA_DMA_DEBUG) { \ + fprintf(stderr, "%s (%d) : ", __FUNCTION__, __LINE__); \ + fprintf(stderr, fmt, ##__VA_ARGS__); \ + } \ + } while (0) + +#define error_print(fmt, ...) \ + do { \ + fprintf(stderr, "%s (%d) : ", __FUNCTION__, __LINE__); \ + fprintf(stderr, fmt, ##__VA_ARGS__); \ + err_cnt++; \ + } while (0) +#else +#define debug_print(...) +#define error_print(...) +#endif + +typedef void *(*memcpy_fn_t)(void *dst, size_t max, const void *src, size_t len); + +extern memcpy_fn_t p_memcpy; + +#define memcpy_s_fast(a, b, c, d) p_memcpy(a, b, c, d) + +#ifdef __cplusplus +} +#endif // __cplusplus + +#endif // MEMCPY_S_FAST_H diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/mmd_dma.cpp b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/mmd_dma.cpp new file mode 100644 index 0000000..92337a3 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/mmd_dma.cpp @@ -0,0 +1,434 @@ +/* (C) 1992-2017 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. 
See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. */ + +#include <assert.h> +#include <stdlib.h> +#include <string.h> +#include <sys/mman.h> + +#include <safe_string/safe_string.h> +#include "memcpy_s_fast.h" + +#include "ccip_mmd_device.h" +#include "mmd_dma.h" + +using namespace intel_opae_mmd; + +// disable dma and only use mmio. this is very slow. +//#define DISABLE_DMA + +// Each MSGDMA_BBB DFH is now 0x100 instead of 0x2_0000 (it needed to be 0x2_0000 previously because +// the ASE component was within the msgdma_bbb.qsys). +// Original addressing: +// board_afu_dfh: 0x0-0x3f. +// msgdma_bbb_csr: 0x2_0000-0x2_1fff. +// Original range at board.ddr_board.msgdma_bbb: 0x2_0000- 0x2_1fff. +// DFH : 0x0-0x3f. +// ASE.cntl : 0x200-0x207. +// ASE.windowed_slave : 0x1000-0x1fff. 
+// Current addressing (with ASE removed from the msgdma_bbb and now living on its own in ddr_board.qsys): +// From top-level board.qsys (base address 0x0): +// board | dfh : 0x0_0000 - 0x0_003f +// board | ddr_board.ase : 0x1_0000 - 0x1_1fff +// board | ddr_board.msgdma_bbb_0 : 0x2_0000 - 0x2_007f +// board | ddr_board.msgdma_bbb_1 : 0x2_0100 - 0x2_017f +// board | ddr_board.null_dfh : 0x2_0200 - 0x2_023f +// From ase.qsys (base address: 0x1_0000): +// board.ddr_board.ase.dfh_csr : 0x0-0x3f +// board.ddr_board.ase.ASE.cntl : 0x200-0x207 +// board.ddr_board.ase.ASE.windowed_slave : 0x1000-0x1fff +// From msgdma_bbb.qsys inst0 (base address: 0x2_0000) +// board.ddr_board.msgdma_bbb_inst_0.dfh_csr : 0x0-0x3f +// board.ddr_board.msgdma_bbb_inst_0.modular_sgdma_dispatcher.CSR : 0x40-0x5f +// board.ddr_board.msgdma_bbb_inst_0.modular_sgdma_dispatcher.Descriptor_slave : 0x60-0x7f +// From msgdma_bbb.qsys inst1 (base address: 0x2_0100) +// board.ddr_board.msgdma_bbb_inst_1.dfh_csr : 0x0-0x3f +// board.ddr_board.msgdma_bbb_inst_1.modular_sgdma_dispatcher.CSR : 0x40-0x5f +// board.ddr_board.msgdma_bbb_inst_1.modular_sgdma_dispatcher.Descriptor_slave : 0x60-0x7f + +#define MEM_WINDOW_CRTL 0x200 +#define MEM_WINDOW_MEM 0x1000 +#define MEM_WINDOW_SPAN (4 * 1024) +#define MEM_WINDOW_SPAN_MASK ((long)(MEM_WINDOW_SPAN - 1)) +#define MINIMUM_DMA_SIZE 256 +#define DMA_ALIGNMENT 256 + +#ifdef DEBUG_MEM +#define DCP_DEBUG_DMA(...) fprintf(stderr, __VA_ARGS__) +#else +#define DCP_DEBUG_DMA(...) 
+#endif + +mmd_dma::mmd_dma(fpga_handle fpga_handle_arg, + int mmd_handle, + uint64_t dfh_offset_arg, + uint64_t ase_bbb_addr_arg, + int interrupt_num_arg) + : m_initialized(false), + m_dma_op_mutex(), + m_status_handler_fn(NULL), + m_status_handler_user_data(NULL), + m_fpga_handle(fpga_handle_arg), + m_mmd_handle(mmd_handle), + dfh_offset(dfh_offset_arg), + interrupt_num(interrupt_num_arg), + dma_h(NULL), + msgdma_bbb_base_addr(0), + ase_bbb_base_addr(ase_bbb_addr_arg) { +#ifndef DISABLE_DMA + + fpga_result res; + res = fpgaDmaChannelOpen(m_fpga_handle, dfh_offset, interrupt_num, &dma_h); + if (res != FPGA_OK) { + m_dma_work_thread = NULL; + fprintf(stderr, "Error initializing DMA: %s\n", fpgaErrStr(res)); + return; + } +#endif // DISABLE_DMA + + m_dma_work_thread = new dma_work_thread(*this); + if (!m_dma_work_thread->initialized()) { + return; + } + + m_initialized = true; +} + +mmd_dma::~mmd_dma() { + // kill the thread + if (m_dma_work_thread) { + delete m_dma_work_thread; + m_dma_work_thread = NULL; + } + + if (dma_h) { + if (fpgaDmaClose(dma_h) != FPGA_OK) fprintf(stderr, "Error closing DMA\n"); + } + m_initialized = false; +} + +void mmd_dma::reinit_dma() { + if (!m_initialized) return; + + if (dma_h) { + m_initialized = false; + + fpga_result res; + res = fpgaDmaClose(dma_h); + dma_h = NULL; + if (res != FPGA_OK) { + fprintf(stderr, "Error closing DMA\n"); + return; + } + + res = fpgaDmaChannelOpen(m_fpga_handle, dfh_offset, interrupt_num, &dma_h); + if (res != FPGA_OK) { + fprintf(stderr, "Error initializing DMA: %s\n", fpgaErrStr(res)); + return; + } + + m_initialized = true; + } +} + +void mmd_dma::set_status_handler(aocl_mmd_status_handler_fn fn, void *user_data) { + m_status_handler_fn = fn; + m_status_handler_user_data = user_data; +} + +void mmd_dma::event_update_fn(aocl_mmd_op_t op, int status) { + m_status_handler_fn(m_mmd_handle, m_status_handler_user_data, op, status); +} + +fpga_result mmd_dma::do_dma(dma_work_item &item) { + // main dma 
function needs to be thread safe because dma csr operations + // are not thread safe + std::lock_guard<std::mutex> lock(m_dma_op_mutex); + + fpga_result res = FPGA_OK; + assert(item.rd_host_addr != NULL || item.wr_host_addr != NULL); + + // Tell the kernel we'll need these and they're sequential + uint64_t addr = item.rd_host_addr ? (uint64_t)item.rd_host_addr : (uint64_t)item.wr_host_addr; + addr = addr & ~((uint64_t)getpagesize() - 1); // Align to page boundary + size_t remainder = ((size_t)getpagesize() - (addr & getpagesize())) & ~(getpagesize() - 1); + madvise((void *)addr, item.size + remainder, MADV_SEQUENTIAL); + + if (item.rd_host_addr) { + res = read_memory(item.rd_host_addr, item.dev_addr, item.size); + } else { + assert(item.wr_host_addr); + res = write_memory(item.wr_host_addr, item.dev_addr, item.size); + } + + if (item.op) { + // TODO: check what 'status' value should really be. Right now just + // using 0 as was done in previous CCIP MMD. Also handle case if op is NULL + event_update_fn(item.op, 0); + } + + return res; +} + +fpga_result mmd_dma::enqueue_dma(dma_work_item &item) { + return static_cast<fpga_result>(m_dma_work_thread->enqueue_dma(item)); +} + +fpga_result mmd_dma::read_memory(aocl_mmd_op_t op, uint64_t *host_addr, size_t dev_addr, size_t size) { + assert(host_addr); + dma_work_item item; + item.op = op; + item.rd_host_addr = host_addr; + item.wr_host_addr = NULL; + item.dev_addr = dev_addr; + item.size = size; + + return enqueue_dma(item); +} + +fpga_result mmd_dma::write_memory(aocl_mmd_op_t op, const uint64_t *host_addr, size_t dev_addr, size_t size) { + assert(host_addr); + dma_work_item item; + item.op = op; + item.rd_host_addr = NULL; + item.wr_host_addr = host_addr; + item.dev_addr = dev_addr; + item.size = size; + + return enqueue_dma(item); +} + +fpga_result mmd_dma::read_memory(uint64_t *host_addr, size_t dev_addr, size_t size) { + DCP_DEBUG_DMA("DCP DEBUG: read_memory %p %lx %ld\n", host_addr, dev_addr, size); + fpga_result 
res = FPGA_OK; + + // check for alignment + if (dev_addr % DMA_ALIGNMENT != 0) { + // check for mmio alignment + uint64_t mmio_shift = dev_addr % 8; + if (mmio_shift != 0) { + size_t unaligned_size = 8 - mmio_shift; + if (unaligned_size > size) unaligned_size = size; + + read_memory_mmio_unaligned(host_addr, dev_addr, unaligned_size); + + if (size > unaligned_size) + res = read_memory( + (uint64_t *)(((char *)host_addr) + unaligned_size), dev_addr + unaligned_size, size - unaligned_size); + return res; + } + + // TODO: need to do a shift here + return read_memory_mmio(host_addr, dev_addr, size); + } + + // check size + if (size < MINIMUM_DMA_SIZE) return read_memory_mmio(host_addr, dev_addr, size); + + size_t remainder = (size % DMA_ALIGNMENT); + size_t dma_size = size - remainder; + +#ifdef DISABLE_DMA + res = read_memory_mmio(host_addr, dev_addr, dma_size); +#else + res = fpgaDmaTransferSync(dma_h, (uint64_t)host_addr /*dst*/, dev_addr /*src*/, dma_size, FPGA_TO_HOST_MM); +#endif + if (res != FPGA_OK) return res; + + if (remainder) res = read_memory_mmio(host_addr + dma_size / 8, dev_addr + dma_size, remainder); + + if (res != FPGA_OK) return res; + + DCP_DEBUG_DMA("DCP DEBUG: host_addr=%p, dev_addr=%lx, size=%ld\n", host_addr, dev_addr, size); + DCP_DEBUG_DMA("DCP DEBUG: remainder=%ld, dma_size=%ld, size=%ld\n", remainder, dma_size, size); + + DCP_DEBUG_DMA("DCP DEBUG: mmd_dma::read_memory done!\n"); + return FPGA_OK; +} + +fpga_result mmd_dma::read_memory_mmio_unaligned(void *host_addr, size_t dev_addr, size_t size) { + DCP_DEBUG_DMA("DCP DEBUG: read_memory_mmio_unaligned %p %lx %ld\n", host_addr, dev_addr, size); + fpga_result res = FPGA_OK; + + uint64_t shift = dev_addr % 8; + + assert(size + shift <= 8); + + uint64_t cur_mem_page = dev_addr & ~MEM_WINDOW_SPAN_MASK; + res = fpgaWriteMMIO64(m_fpga_handle, 0, ase_bbb_base_addr + MEM_WINDOW_CRTL, cur_mem_page); + if (res != FPGA_OK) return res; + + uint64_t dev_aligned_addr = dev_addr - shift; + + // read data 
from device memory + uint64_t read_tmp; + res = fpgaReadMMIO64( + m_fpga_handle, 0, (ase_bbb_base_addr + MEM_WINDOW_MEM) + ((dev_aligned_addr)&MEM_WINDOW_SPAN_MASK), &read_tmp); + if (res != FPGA_OK) return res; + // overlay our data + memcpy_s_fast(host_addr, size, ((char *)(&read_tmp)) + shift, size); + + return FPGA_OK; +} + +fpga_result mmd_dma::read_memory_mmio(uint64_t *host_addr, size_t dev_addr, size_t size) { + DCP_DEBUG_DMA("DCP DEBUG: read_memory_mmio %p %lx %ld\n", host_addr, dev_addr, size); + + fpga_result res = FPGA_OK; + uint64_t cur_mem_page = dev_addr & ~MEM_WINDOW_SPAN_MASK; + res = fpgaWriteMMIO64(m_fpga_handle, 0, ase_bbb_base_addr + MEM_WINDOW_CRTL, cur_mem_page); + if (res != FPGA_OK) return res; + DCP_DEBUG_DMA("DCP DEBUG: set page %08lx\n", cur_mem_page); + for (size_t i = 0; i < size / 8; i++) { + uint64_t mem_page = dev_addr & ~MEM_WINDOW_SPAN_MASK; + if (mem_page != cur_mem_page) { + cur_mem_page = mem_page; + res = fpgaWriteMMIO64(m_fpga_handle, 0, ase_bbb_base_addr + MEM_WINDOW_CRTL, cur_mem_page); + if (res != FPGA_OK) return res; + DCP_DEBUG_DMA("DCP DEBUG: set page %08lx\n", cur_mem_page); + } + DCP_DEBUG_DMA("DCP DEBUG: read data %8p %08lx %16p\n", host_addr, dev_addr, host_addr); + res = fpgaReadMMIO64( + m_fpga_handle, 0, (ase_bbb_base_addr + MEM_WINDOW_MEM) + (dev_addr & MEM_WINDOW_SPAN_MASK), host_addr); + if (res != FPGA_OK) return res; + + host_addr += 1; + dev_addr += 8; + } + + if (size % 8 != 0) { + res = read_memory_mmio_unaligned(host_addr, dev_addr, size % 8); + if (res != FPGA_OK) return res; + } + + DCP_DEBUG_DMA("DCP DEBUG: mmd_dma::read_memory_mmio done!\n"); + return FPGA_OK; +} + +fpga_result mmd_dma::write_memory(const uint64_t *host_addr, size_t dev_addr, size_t size) { + DCP_DEBUG_DMA("DCP DEBUG: write_memory %p %lx %ld\n", host_addr, dev_addr, size); + fpga_result res = FPGA_OK; + + // check for alignment + if (dev_addr % DMA_ALIGNMENT != 0) { + // check for mmio alignment + uint64_t mmio_shift = dev_addr % 8; 
+ if (mmio_shift != 0) { + size_t unaligned_size = 8 - mmio_shift; + if (unaligned_size > size) unaligned_size = size; + + DCP_DEBUG_DMA("DCP DEBUG: write_memory %ld %ld %ld\n", mmio_shift, unaligned_size, size); + write_memory_mmio_unaligned(host_addr, dev_addr, unaligned_size); + + if (size > unaligned_size) + res = write_memory( + (uint64_t *)(((char *)host_addr) + unaligned_size), dev_addr + unaligned_size, size - unaligned_size); + return res; + } + + // TODO: need to do a shift here + return write_memory_mmio(host_addr, dev_addr, size); + } + + // check size + if (size < MINIMUM_DMA_SIZE) return write_memory_mmio(host_addr, dev_addr, size); + + size_t remainder = (size % DMA_ALIGNMENT); + size_t dma_size = size - remainder; + +// TODO: make switch for MMIO +#ifdef DISABLE_DMA + res = write_memory_mmio(host_addr, dev_addr, dma_size); +#else + res = fpgaDmaTransferSync(dma_h, dev_addr /*dst*/, (uint64_t)host_addr /*src*/, dma_size, HOST_TO_FPGA_MM); +#endif + if (res != FPGA_OK) return res; + + if (remainder) res = write_memory(host_addr + dma_size / 8, dev_addr + dma_size, remainder); + + if (res != FPGA_OK) return res; + + DCP_DEBUG_DMA("DCP DEBUG: host_addr=%p, dev_addr=%lx, size=%ld\n", host_addr, dev_addr, size); + DCP_DEBUG_DMA("DCP DEBUG: remainder=%ld, dma_size=%ld, size=%ld\n", remainder, dma_size, size); + + DCP_DEBUG_DMA("DCP DEBUG: mmd_dma::write_memory done!\n"); + return FPGA_OK; +} + +fpga_result mmd_dma::write_memory_mmio_unaligned(const uint64_t *host_addr, size_t dev_addr, size_t size) { + DCP_DEBUG_DMA("DCP DEBUG: write_memory_mmio_unaligned %p %lx %ld\n", host_addr, dev_addr, size); + fpga_result res = FPGA_OK; + + uint64_t shift = dev_addr % 8; + + assert(size + shift <= 8); + + uint64_t cur_mem_page = dev_addr & ~MEM_WINDOW_SPAN_MASK; + res = fpgaWriteMMIO64(m_fpga_handle, 0, ase_bbb_base_addr + MEM_WINDOW_CRTL, cur_mem_page); + if (res != FPGA_OK) return res; + + uint64_t dev_aligned_addr = dev_addr - shift; + + // read data from device 
memory + uint64_t read_tmp; + res = fpgaReadMMIO64( + m_fpga_handle, 0, (ase_bbb_base_addr + MEM_WINDOW_MEM) + ((dev_aligned_addr)&MEM_WINDOW_SPAN_MASK), &read_tmp); + if (res != FPGA_OK) return res; + // overlay our data + memcpy_s_fast(((char *)(&read_tmp)) + shift, size, host_addr, size); + + // write back to device + res = fpgaWriteMMIO64( + m_fpga_handle, 0, (ase_bbb_base_addr + MEM_WINDOW_MEM) + (dev_aligned_addr & MEM_WINDOW_SPAN_MASK), read_tmp); + if (res != FPGA_OK) return res; + + return FPGA_OK; +} + +fpga_result mmd_dma::write_memory_mmio(const uint64_t *host_addr, size_t dev_addr, size_t size) { + DCP_DEBUG_DMA("DCP DEBUG: write_memory_mmio %p %lx %ld\n", host_addr, dev_addr, size); + + fpga_result res = FPGA_OK; + uint64_t cur_mem_page = dev_addr & ~MEM_WINDOW_SPAN_MASK; + res = fpgaWriteMMIO64(m_fpga_handle, 0, ase_bbb_base_addr + MEM_WINDOW_CRTL, cur_mem_page); + if (res != FPGA_OK) return res; + DCP_DEBUG_DMA("DCP DEBUG: set page %08lx\n", cur_mem_page); + for (size_t i = 0; i < size / 8; i++) { + uint64_t mem_page = dev_addr & ~MEM_WINDOW_SPAN_MASK; + if (mem_page != cur_mem_page) { + cur_mem_page = mem_page; + res = fpgaWriteMMIO64(m_fpga_handle, 0, ase_bbb_base_addr + MEM_WINDOW_CRTL, cur_mem_page); + if (res != FPGA_OK) return res; + DCP_DEBUG_DMA("DCP DEBUG: set page %08lx\n", cur_mem_page); + } + DCP_DEBUG_DMA("DCP DEBUG: write data %8p %08lx %016lx\n", host_addr, dev_addr, *host_addr); + res = fpgaWriteMMIO64( + m_fpga_handle, 0, (ase_bbb_base_addr + MEM_WINDOW_MEM) + (dev_addr & MEM_WINDOW_SPAN_MASK), *host_addr); + if (res != FPGA_OK) return res; + + host_addr += 1; + dev_addr += 8; + } + + if (size % 8 != 0) { + res = write_memory_mmio_unaligned(host_addr, dev_addr, size % 8); + if (res != FPGA_OK) return res; + } + + DCP_DEBUG_DMA("DCP DEBUG: aocl_mmd_write done!\n"); + return FPGA_OK; +} diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/mmd_dma.h 
b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/mmd_dma.h new file mode 100644 index 0000000..ff33aed --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/mmd_dma.h @@ -0,0 +1,97 @@ +/* (C) 1992-2017 Intel Corporation. */ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. 
*/ + +#ifndef _MMD_DMA_H +#define _MMD_DMA_H + +#pragma push_macro("_GNU_SOURCE") +#undef _GNU_SOURCE +#define _GNU_SOURCE +#include <sched.h> +#pragma pop_macro("_GNU_SOURCE") + +#include <opae/fpga.h> + +#include <mutex> + +#include "aocl_mmd.h" +#include "dma_work_thread.h" +#include "fpga_dma.h" + +namespace intel_opae_mmd { + +class eventfd_wrapper; + +class mmd_dma final { + public: + mmd_dma(fpga_handle fpga_handle_arg, + int mmd_handle, + uint64_t dfh_offset_arg, + uint64_t ase_bbb_addr_arg, + int interrupt_num_arg); + ~mmd_dma(); + + bool initialized() { return m_initialized; } + + fpga_result read_memory(aocl_mmd_op_t op, uint64_t *host_addr, size_t dev_addr, size_t size); + fpga_result write_memory(aocl_mmd_op_t op, const uint64_t *host_addr, size_t dev_addr, size_t size); + fpga_result do_dma(dma_work_item &item); + + void set_status_handler(aocl_mmd_status_handler_fn fn, void *user_data); + + // used after reconfigation + void reinit_dma(); + + void bind_to_node(void); + + private: + // Helper functions + fpga_result enqueue_dma(dma_work_item &item); + fpga_result read_memory(uint64_t *host_addr, size_t dev_addr, size_t size); + fpga_result write_memory(const uint64_t *host_addr, size_t dev_addr, size_t size); + fpga_result read_memory_mmio(uint64_t *host_addr, size_t dev_addr, size_t size); + fpga_result write_memory_mmio(const uint64_t *host_addr, size_t dev_addr, size_t size); + fpga_result write_memory_mmio_unaligned(const uint64_t *host_addr, size_t dev_addr, size_t size); + fpga_result read_memory_mmio_unaligned(void *host_addr, size_t dev_addr, size_t size); + + void event_update_fn(aocl_mmd_op_t op, int status); + + bool m_initialized; + + dma_work_thread *m_dma_work_thread; + std::mutex m_dma_op_mutex; + + aocl_mmd_status_handler_fn m_status_handler_fn; + void *m_status_handler_user_data; + + fpga_handle m_fpga_handle; + int m_mmd_handle; + + uint64_t dfh_offset; + int interrupt_num; + fpga_dma_handle dma_h; + uint64_t msgdma_bbb_base_addr; + 
uint64_t ase_bbb_base_addr; + + // not used and not implemented + mmd_dma(mmd_dma &other); + mmd_dma &operator=(const mmd_dma &other); +}; // class mmd_dma + +}; // namespace intel_opae_mmd + +#endif // _MMD_DMA_H diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/x86-sse2.S b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/x86-sse2.S new file mode 100644 index 0000000..e1fb5d3 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/x86-sse2.S @@ -0,0 +1,269 @@ +// From TinyMembench v0.4, with slight modifications for Windows. +/* + * Copyright © 2011 Siarhei Siamashka <siarhei.siamashka@gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#if defined(__i386__) || defined(__amd64__) + +.intel_syntax noprefix +.text + +#define PREFETCH_DISTANCE 256 + +.macro asm_function_helper function_name + .global \function_name +.func \function_name +\function_name: +#ifdef __amd64__ + #ifdef _WIN64 + .set DST, rcx + .set SRC, rdx + .set SIZE, r8 + #else + .set DST, rdi + .set SRC, rsi + .set SIZE, rdx + #endif +#else + mov eax, [esp + 4] + mov ecx, [esp + 8] + mov edx, [esp + 12] + .set DST, eax + .set SRC, ecx + .set SIZE, edx +#endif +.endm + +.macro asm_function function_name +#if defined(_WIN32) && !defined(_WIN64) + asm_function_helper _\function_name +#else + asm_function_helper \function_name +#endif +.endm + +.macro push3 a, b, c + push \a + push \b + push \c +.endm + +.macro pop3 a, b, c + pop \c + pop \b + pop \a +.endm + +/*****************************************************************************/ + +asm_function aligned_block_copy_movsb +0: +#ifdef __amd64__ + push3 rdi rsi rcx + push3 DST SRC SIZE + pop3 rdi rsi rcx + rep movsb + pop3 rdi rsi rcx +#else + push3 edi esi ecx + push3 DST SRC SIZE + pop3 edi esi ecx + rep movsb + pop3 edi esi ecx +#endif + ret +.endfunc + +asm_function aligned_block_copy_movsd +0: +#ifdef __amd64__ + push3 rdi rsi rcx + push3 DST SRC SIZE + pop3 rdi rsi rcx + sar rcx, 2 + rep movsd + pop3 rdi rsi rcx +#else + push3 edi esi ecx + push3 DST SRC SIZE + pop3 edi esi ecx + sar ecx, 2 + rep movsd + pop3 edi esi ecx +#endif + ret +.endfunc + +asm_function unaligned_block_copy_sse2 +0: + movdqu xmm0, [SRC + 0] + movdqu xmm1, [SRC + 16] + movdqu xmm2, [SRC + 32] + movdqu xmm3, [SRC + 48] + movdqu [DST + 0], xmm0 + movdqu [DST + 16], xmm1 + movdqu [DST + 32], xmm2 + movdqu [DST + 48], xmm3 + add SRC, 64 + add DST, 64 + sub SIZE, 64 + jg 0b + ret +.endfunc + +asm_function aligned_block_copy_sse2 +0: + movdqa xmm0, [SRC + 0] + movdqa xmm1, [SRC + 16] + movdqa xmm2, [SRC + 32] + movdqa xmm3, [SRC + 48] + movdqa [DST + 0], xmm0 + movdqa [DST + 16], xmm1 + movdqa [DST + 
32], xmm2 + movdqa [DST + 48], xmm3 + add SRC, 64 + add DST, 64 + sub SIZE, 64 + jg 0b + ret +.endfunc + +asm_function aligned_block_copy_nt_sse2 +0: + movdqa xmm0, [SRC + 0] + movdqa xmm1, [SRC + 16] + movdqa xmm2, [SRC + 32] + movdqa xmm3, [SRC + 48] + movntdq [DST + 0], xmm0 + movntdq [DST + 16], xmm1 + movntdq [DST + 32], xmm2 + movntdq [DST + 48], xmm3 + add SRC, 64 + add DST, 64 + sub SIZE, 64 + jg 0b + ret +.endfunc + +asm_function aligned_block_copy_pf32_sse2 +0: + prefetchnta [SRC + PREFETCH_DISTANCE] + prefetchnta [SRC + PREFETCH_DISTANCE + 32] + movdqa xmm0, [SRC + 0] + movdqa xmm1, [SRC + 16] + movdqa xmm2, [SRC + 32] + movdqa xmm3, [SRC + 48] + movdqa [DST + 0], xmm0 + movdqa [DST + 16], xmm1 + movdqa [DST + 32], xmm2 + movdqa [DST + 48], xmm3 + add SRC, 64 + add DST, 64 + sub SIZE, 64 + jg 0b + ret +.endfunc + +asm_function aligned_block_copy_nt_pf32_sse2 +0: + prefetchnta [SRC + PREFETCH_DISTANCE] + prefetchnta [SRC + PREFETCH_DISTANCE + 32] + movdqa xmm0, [SRC + 0] + movdqa xmm1, [SRC + 16] + movdqa xmm2, [SRC + 32] + movdqa xmm3, [SRC + 48] + movntdq [DST + 0], xmm0 + movntdq [DST + 16], xmm1 + movntdq [DST + 32], xmm2 + movntdq [DST + 48], xmm3 + add SRC, 64 + add DST, 64 + sub SIZE, 64 + jg 0b + ret +.endfunc + +asm_function aligned_block_copy_pf64_sse2 +0: + prefetchnta [SRC + PREFETCH_DISTANCE] + movdqa xmm0, [SRC + 0] + movdqa xmm1, [SRC + 16] + movdqa xmm2, [SRC + 32] + movdqa xmm3, [SRC + 48] + movdqa [DST + 0], xmm0 + movdqa [DST + 16], xmm1 + movdqa [DST + 32], xmm2 + movdqa [DST + 48], xmm3 + add SRC, 64 + add DST, 64 + sub SIZE, 64 + jg 0b + ret +.endfunc + +asm_function aligned_block_copy_nt_pf64_sse2 +0: + prefetchnta [SRC + PREFETCH_DISTANCE] + movdqa xmm0, [SRC + 0] + movdqa xmm1, [SRC + 16] + movdqa xmm2, [SRC + 32] + movdqa xmm3, [SRC + 48] + movntdq [DST + 0], xmm0 + movntdq [DST + 16], xmm1 + movntdq [DST + 32], xmm2 + movntdq [DST + 48], xmm3 + add SRC, 64 + add DST, 64 + sub SIZE, 64 + jg 0b + ret +.endfunc + +asm_function 
aligned_block_fill_sse2 + movdqa xmm0, [SRC + 0] +0: + movdqa [DST + 0], xmm0 + movdqa [DST + 16], xmm0 + movdqa [DST + 32], xmm0 + movdqa [DST + 48], xmm0 + add DST, 64 + sub SIZE, 64 + jg 0b + ret +.endfunc + +asm_function aligned_block_fill_nt_sse2 + movdqa xmm0, [SRC + 0] +0: + movntdq [DST + 0], xmm0 + movntdq [DST + 16], xmm0 + movntdq [DST + 32], xmm0 + movntdq [DST + 48], xmm0 + add DST, 64 + sub SIZE, 64 + jg 0b + ret +.endfunc + +/*****************************************************************************/ + +#endif diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/x86-sse2.h b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/x86-sse2.h new file mode 100644 index 0000000..6ebe2ef --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/host/x86-sse2.h @@ -0,0 +1,54 @@ +// From TinyMembench v0.4, with slight modifications for Windows. +/* + * Copyright © 2011 Siarhei Siamashka <siarhei.siamashka@gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#ifndef __X86_SSE2_H__ +#define __X86_SSE2_H__ + +#include <stdint.h> + +#ifdef __cplusplus +extern "C" { +#endif + +void aligned_block_copy_movsb(int64_t* __restrict dst, int64_t* __restrict src, int size); +void aligned_block_copy_movsd(int64_t* __restrict dst, int64_t* __restrict src, int size); + +void aligned_block_copy_sse2(int64_t* __restrict dst, int64_t* __restrict src, int size); +void unaligned_block_copy_sse2(int64_t* __restrict dst, int64_t* __restrict src, int size); +void aligned_block_copy_nt_sse2(int64_t* __restrict dst, int64_t* __restrict src, int size); + +void aligned_block_copy_pf32_sse2(int64_t* __restrict dst, int64_t* __restrict src, int size); +void aligned_block_copy_pf64_sse2(int64_t* __restrict dst, int64_t* __restrict src, int size); + +void aligned_block_copy_nt_pf32_sse2(int64_t* __restrict dst, int64_t* __restrict src, int size); +void aligned_block_copy_nt_pf64_sse2(int64_t* __restrict dst, int64_t* __restrict src, int size); + +void aligned_block_fill_sse2(int64_t* __restrict dst, int64_t* __restrict src, int size); + +void aligned_block_fill_nt_sse2(int64_t* __restrict dst, int64_t* __restrict src, int size); + +#ifdef __cplusplus +} +#endif +#endif diff --git a/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/include/aocl_mmd.h b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/include/aocl_mmd.h new file mode 100644 index 0000000..edb46c7 --- /dev/null +++ b/python/openvino/runtime/coredla_device/mmd/dcp_a10_pac/include/aocl_mmd.h @@ -0,0 +1,489 @@ +#ifndef AOCL_MMD_H +#define AOCL_MMD_H + +/* (C) 1992-2019 Intel Corporation. 
*/ +/* Intel, the Intel logo, Intel, MegaCore, NIOS II, Quartus and TalkBack words */ +/* and logos are trademarks of Intel Corporation or its subsidiaries in the U.S. */ +/* and/or other countries. Other marks and brands may be claimed as the property */ +/* of others. See Trademarks on intel.com for full list of Intel trademarks or */ +/* the Trademarks & Brands Names Database (if Intel) or See www.Intel.com/legal (if Altera) */ +/* Your use of Intel Corporation's design tools, logic functions and other */ +/* software and tools, and its AMPP partner logic functions, and any output */ +/* files any of the foregoing (including device programming or simulation */ +/* files), and any associated documentation or information are expressly subject */ +/* to the terms and conditions of the Altera Program License Subscription */ +/* Agreement, Intel MegaCore Function License Agreement, or other applicable */ +/* license agreement, including, without limitation, that your use is for the */ +/* sole purpose of programming logic devices manufactured by Intel and sold by */ +/* Intel or its authorized distributors. Please refer to the applicable */ +/* agreement for further details. */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* Support for memory mapped ACL devices. + * + * Typical API lifecycle, from the perspective of the caller. + * + * 1. aocl_mmd_open must be called first, to provide a handle for further + * operations. + * + * 2. The interrupt and status handlers must be set. + * + * 3. Read and write operations are performed. + * + * 4. aocl_mmd_close may be called to shut down the device. No further + * operations are permitted until a subsequent aocl_mmd_open call. + * + * aocl_mmd_get_offline_info can be called anytime including before + * open. aocl_mmd_get_info can be called anytime between open and close. 
+ */ + +#ifndef AOCL_MMD_CALL +#if defined(_WIN32) +#define AOCL_MMD_CALL __declspec(dllimport) +#else +#define AOCL_MMD_CALL __attribute__((visibility ("default"))) +#endif +#endif + +#ifndef WEAK +#if defined(_WIN32) +#define WEAK +#else +/* This normally comes with "__attribute__((weak))" but for reasons not presently + * understood, the shared library is not properly loaded on Ubuntu18 when the functions + * are weak. + */ +#define WEAK +#endif +#endif + +#include <cstddef> //size_t + +/* The MMD API's version - the runtime expects this string when + * AOCL_MMD_VERSION is queried. This changes only if the API has changed */ +#define AOCL_MMD_VERSION_STRING "18.1" + +/* Memory types that can be supported - bitfield. Other than physical memory + * these types closely align with the OpenCL SVM types. + * + * AOCL_MMD_PHYSICAL_MEMORY - The vendor interface includes IP to communicate + * directly with physical memory such as DDR, QDR, etc. + * + * AOCL_MMD_SVM_COARSE_GRAIN_BUFFER - The vendor interface includes support for + * caching SVM pointer data andy requires explicit function calls from the user + * to sychronize the cache between the host processor and the FPGA. This level + * of SVM is not currently supported by Altera except as a subset of + * SVM_FINE_GAIN_SYSTEM support. + * + * AOCL_MMD_SVM_FINE_GRAIN_BUFFER - The vendor interface includes support for + * caching SVM pointer data and requires additional information from the user + * and/or host runtime that can be collected during pointer allocation in order + * to sychronize the cache between the host processor and the FPGA. Once this + * additional data is provided for an SVM pointer, the vendor interface handles + * cache synchronization between the host processor & the FPGA automatically. + * This level of SVM is not currently supported by Altera except as a subset + * of SVM_FINE_GRAIN_SYSTEM support. 
+ * + * AOCL_MMD_SVM_FINE_GRAIN_SYSTEM - The vendor interface includes support for + * caching SVM pointer data and does not require any additional information to + * sychronize the cache between the host processor and the FPGA. The vendor + * interface handles cache synchronization between the host processor & the + * FPGA automatically for all SVM pointers. This level of SVM support is + * currently under development by Altera and some features may not be fully + * supported. + */ +#define AOCL_MMD_PHYSICAL_MEMORY (1 << 0) +#define AOCL_MMD_SVM_COARSE_GRAIN_BUFFER (1 << 1) +#define AOCL_MMD_SVM_FINE_GRAIN_BUFFER (1 << 2) +#define AOCL_MMD_SVM_FINE_GRAIN_SYSTEM (1 << 3) + +/* program modes - bitfield + * + * AOCL_MMD_PROGRAM_PRESERVE_GLOBAL_MEM - preserve contents of global memory + * when this bit is is set to 1. If programming can't occur without preserving + * global memory contents, the program function must fail, in which case the + * runtime may re-invoke program with this bit set to 0, allowing programming + * to occur even if doing so destroys global memory contents. + * + * more modes are reserved for stacking on in the future + */ +#define AOCL_MMD_PROGRAM_PRESERVE_GLOBAL_MEM (1 << 0) +typedef int aocl_mmd_program_mode_t; + +typedef void* aocl_mmd_op_t; + +typedef struct { + unsigned lo; /* 32 least significant bits of time value. */ + unsigned hi; /* 32 most significant bits of time value. */ +} aocl_mmd_timestamp_t; + +/* Defines the set of characteristics that can be probed about the board before + * opening a device. The type of data returned by each is specified in + * parentheses in the adjacent comment. + * + * AOCL_MMD_NUM_BOARDS and AOCL_MMD_BOARD_NAMES + * These two fields can be used to implement multi-device support. The MMD + * layer may have a list of devices it is capable of interacting with, each + * identified with a unique name. 
The length of the list should be returned + * in AOCL_MMD_NUM_BOARDS, and the names of these devices returned in + * AOCL_MMD_BOARD_NAMES. The OpenCL runtime will try to call aocl_mmd_open + * for each board name returned in AOCL_MMD_BOARD_NAMES. + * + * */ +typedef enum { + AOCL_MMD_VERSION = 0, /* Version of MMD (char*)*/ + AOCL_MMD_NUM_BOARDS = 1, /* Number of candidate boards (int)*/ + AOCL_MMD_BOARD_NAMES = 2, /* Names of boards available delimiter=; (char*)*/ + AOCL_MMD_VENDOR_NAME = 3, /* Name of vendor (char*) */ + AOCL_MMD_VENDOR_ID = 4, /* An integer ID for the vendor (int) */ + AOCL_MMD_USES_YIELD = 5, /* 1 if yield must be called to poll hw (int) */ + /* The following can be combined in a bit field: + * AOCL_MMD_PHYSICAL_MEMORY, AOCL_MMD_SVM_COARSE_GRAIN_BUFFER, AOCL_MMD_SVM_FINE_GRAIN_BUFFER, + * AOCL_MMD_SVM_FINE_GRAIN_SYSTEM. Prior to 14.1, all existing devices supported physical memory and no types of SVM + * memory, so this is the default when this operation returns '0' for board MMDs with a version prior to 14.1 + */ + AOCL_MMD_MEM_TYPES_SUPPORTED = 6, +} aocl_mmd_offline_info_t; + +/* Defines the set of characteristics that can be probed about the board after + * opening a device. This can involve communication to the device + * + * AOCL_MMD_NUM_KERNEL_INTERFACES - The number of kernel interfaces, usually 1 + * + * AOCL_MMD_KERNEL_INTERFACES - the handle for each kernel interface. + * param_value will have size AOCL_MMD_NUM_KERNEL_INTERFACES * sizeof int + * + * AOCL_MMD_PLL_INTERFACES - the handle for each pll associated with each + * kernel interface. 
If a kernel interface is not clocked by acl_kernel_clk + * then return -1 + * + * */ +typedef enum { + AOCL_MMD_NUM_KERNEL_INTERFACES = 1, /* Number of Kernel interfaces (int) */ + AOCL_MMD_KERNEL_INTERFACES = 2, /* Kernel interface (int*) */ + AOCL_MMD_PLL_INTERFACES = 3, /* Kernel clk handles (int*) */ + AOCL_MMD_MEMORY_INTERFACE = 4, /* Global memory handle (int) */ + AOCL_MMD_TEMPERATURE = 5, /* Temperature measurement (float) */ + AOCL_MMD_PCIE_INFO = 6, /* PCIe information (char*) */ + AOCL_MMD_BOARD_NAME = 7, /* Name of board (char*) */ + AOCL_MMD_BOARD_UNIQUE_ID = 8, /* Unique ID of board (int) */ + AOCL_MMD_CONCURRENT_READS = 9, /* # of parallel reads; 1 is serial*/ + AOCL_MMD_CONCURRENT_WRITES = 10, /* # of parallel writes; 1 is serial*/ + AOCL_MMD_CONCURRENT_READS_OR_WRITES = 11 /* total # of concurent operations read + writes*/ +} aocl_mmd_info_t; + +typedef struct { + unsigned long long int exception_type; + void* user_private_info; + size_t user_cb; +} aocl_mmd_interrupt_info; + +typedef void (*aocl_mmd_interrupt_handler_fn)(int handle, void* user_data); +typedef void (*aocl_mmd_device_interrupt_handler_fn)(int handle, aocl_mmd_interrupt_info* data_in, void* user_data); +typedef void (*aocl_mmd_status_handler_fn)(int handle, void* user_data, aocl_mmd_op_t op, int status); + +/* Get information about the board using the enum aocl_mmd_offline_info_t for + * offline info (called without a handle), and the enum aocl_mmd_info_t for + * info specific to a certain board. + * Arguments: + * + * requested_info_id - a value from the aocl_mmd_offline_info_t enum + * + * param_value_size - size of the param_value field in bytes. This should + * match the size of the return type expected as indicated in the enum + * definition. For example, the AOCL_MMD_TEMPERATURE returns a float, so + * the param_value_size should be set to sizeof(float) and you should + * expect the same number of bytes returned in param_size_ret. 
+ * + * param_value - pointer to the variable that will receive the returned info + * + * param_size_ret - receives the number of bytes of data actually returned + * + * Returns: a negative value to indicate error. + */ +AOCL_MMD_CALL int aocl_mmd_get_offline_info(aocl_mmd_offline_info_t requested_info_id, + size_t param_value_size, + void* param_value, + size_t* param_size_ret) WEAK; + +AOCL_MMD_CALL int aocl_mmd_get_info(int handle, + aocl_mmd_info_t requested_info_id, + size_t param_value_size, + void* param_value, + size_t* param_size_ret) WEAK; + +/* Open and initialize the named device. + * + * The name is typically one specified by the AOCL_MMD_BOARD_NAMES offline + * info. + * + * Arguments: + * name - open the board with this name (provided as a C-style string, + * i.e. NUL terminated ASCII.) + * + * Returns: the non-negative integer handle for the board, otherwise a + * negative value to indicate error. Upon receiving the error, the OpenCL + * runtime will proceed to open other known devices, hence the MMD mustn't + * exit the application if an open call fails. + */ +AOCL_MMD_CALL int aocl_mmd_open(const char* name) WEAK; + +/* Close an opened device, by its handle. + * Returns: 0 on success, negative values on error. + */ +AOCL_MMD_CALL int aocl_mmd_close(int handle) WEAK; + +/* Set the interrupt handler for the opened device. + * The interrupt handler is called whenever the client needs to be notified + * of an asynchronous event signalled by the device internals. + * For example, the kernel has completed or is stalled. + * + * Important: Interrupts from the kernel must be ignored until this handler is + * set + * + * Arguments: + * fn - the callback function to invoke when a kernel interrupt occurs + * user_data - the data that should be passed to fn when it is called. 
+ * + * Returns: 0 if successful, negative on error + */ +AOCL_MMD_CALL int aocl_mmd_set_interrupt_handler(int handle, aocl_mmd_interrupt_handler_fn fn, void* user_data) WEAK; + +/* Set the device interrupt handler for the opened device. + * The device interrupt handler is called whenever the client needs to be notified + * of a device event signalled by the device internals. + * For example, an ECC error has been reported. + * + * Important: Interrupts from the device must be ignored until this handler is + * set + * + * Arguments: + * fn - the callback function to invoke when a device interrupt occurs + * user_data - the data that should be passed to fn when it is called. + * + * Returns: 0 if successful, negative on error + */ +AOCL_MMD_CALL int aocl_mmd_set_device_interrupt_handler(int handle, + aocl_mmd_device_interrupt_handler_fn fn, + void* user_data) WEAK; + +/* Set the operation status handler for the opened device. + * The operation status handler is called with + * status 0 when the operation has completed successfully. + * status negative when the operation completed with errors. + * + * Arguments: + * fn - the callback function to invoke when a status update is to be + * performed. + * user_data - the data that should be passed to fn when it is called. + * + * Returns: 0 if successful, negative on error + */ +AOCL_MMD_CALL int aocl_mmd_set_status_handler(int handle, aocl_mmd_status_handler_fn fn, void* user_data) WEAK; + +/* If AOCL_MMD_USES_YIELD is 1, this function is called when the host is idle + * and hence possibly waiting for events to be processed by the device. + * If AOCL_MMD_USES_YIELD is 0, this function is never called and the MMD is + * assumed to provide status/event updates via some other execution thread + * such as through an interrupt handler. 
+ * + * Returns: non-zero if the yield function performed useful work such as + * processing DMA transactions, 0 if there is no useful work to be performed + * + * NOTE: yield may be called continuously as long as it reports that it has useful work + */ +AOCL_MMD_CALL int aocl_mmd_yield(int handle) WEAK; + +/* Read, write and copy operations on a single interface. + * If op is NULL + * - Then these calls must block until the operation is complete. + * - The status handler is not called for this operation. + * + * If op is non-NULL, then: + * - These may be non-blocking calls + * - The status handler must be called upon completion, with status 0 + * for success, and a negative value for failure. + * + * Arguments: + * op - the operation object used to track this operations progress + * + * len - the size in bytes to transfer + * + * src - the host buffer being read from + * + * dst - the host buffer being written to + * + * mmd_interface - the handle to the interface being accessed. E.g. To + * access global memory this handle will be whatever is returned by + * aocl_mmd_get_info when called with AOCL_MMD_MEMORY_INTERFACE. + * + * offset/src_offset/dst_offset - the byte offset within the interface that + * the transfer will begin at. + * + * The return value is 0 if the operation launch was successful, and + * negative otherwise. + */ +AOCL_MMD_CALL int aocl_mmd_read( + int handle, aocl_mmd_op_t op, size_t len, void* dst, int mmd_interface, size_t offset) WEAK; +AOCL_MMD_CALL int aocl_mmd_write( + int handle, aocl_mmd_op_t op, size_t len, const void* src, int mmd_interface, size_t offset) WEAK; +AOCL_MMD_CALL int aocl_mmd_copy( + int handle, aocl_mmd_op_t op, size_t len, int mmd_interface, size_t src_offset, size_t dst_offset) WEAK; + +/* Host Channel create operation + * Opens channel between host and kernel. + * + * Arguments: + * channel_name - name of channel to initialize. 
Same name as used in board_spec.xml + * + * queue_depth - the size in bytes of pinned memory queue in system memory + * + * direction - the direction of the channel + * + * The return value is negative if initialization was unsuccessful, and + * positive otherwise. Positive return value is handle to the channel to be used for + * subsequent calls for the channel. + */ +AOCL_MMD_CALL int aocl_mmd_hostchannel_create(int handle, char* channel_name, size_t queue_depth, int direction) WEAK; + +/* Host Channel destroy operation + * Closes channel between host and kernel. + * + * Arguments: + * channel - the handle to the channel to close, that was obtained with + * create channel + * + * The return value is 0 if the destroy was successful, and negative + * otherwise. + */ +AOCL_MMD_CALL int aocl_mmd_hostchannel_destroy(int handle, int channel) WEAK; + +/* Host Channel get buffer operation + * Provide host with pointer to buffer they can access to to write or + * read from kernel, along with space or data available in the buffer + * in bytes. + * + * Arguments: + * channel - the handle to the channel to get the buffer for + * + * buffer_size - the address that this call will write the amount of + * space or data that's available in the buffer, + * depending on direction of the channel, in bytes + * + * status - the address that this call will write to for result of this + * call. Value will be 0 for success, and negative otherwise + * + * The return value is the pointer to the buffer that host can write + * to or read from. NULL if the status is negative. + */ +AOCL_MMD_CALL void* aocl_mmd_hostchannel_get_buffer(int handle, int channel, size_t* buffer_size, int* status) WEAK; + +/* Host Channel acknowledge buffer operation + * Acknowledge to the channel that the user has written or read data from + * it. This will make the data or additional buffer space available to + * write to or read from kernel. 
+ * + * Arguments: + * channel - the handle to the channel that user is acknowledging + * + * send_size - the size in bytes that the user is acknowledging + * + * status - the address that this call will write to for result of this + * call. Value will be 0 for success, and negative otherwise + * + * The return value is equal to send_size if send_size was less than or + * equal to the buffer_size from get buffer call. If send_size was + * greater, then return value is the amount that was actually sent. + */ +AOCL_MMD_CALL size_t aocl_mmd_hostchannel_ack_buffer(int handle, int channel, size_t send_size, int* status) WEAK; + +/* Program the device + * + * The host will guarantee that no operations are currently executing on the + * device. That means the kernels will be idle and no read/write/copy + * commands are active. Interrupts should be disabled and the FPGA should + * be reprogrammed with the data from user_data which has size size. The host + * will then call aocl_mmd_set_status_handler and aocl_mmd_set_interrupt_handler + * again. At this point interrupts can be enabled. + * + * The new handle to the board after reprogram does not have to be the same as + * the one before. + * + * Arguments: + * user_data - The binary contents of the fpga.bin file created during + * Quartus II compilation. + * size - the size in bytes of user_data + * program_mode - bit field for programming attributes. See + * aocl_mmd_program_mode_t definition + * + * Returns: the new non-negative integer handle for the board, otherwise a + * negative value to indicate error. + */ +AOCL_MMD_CALL int aocl_mmd_program(int handle, void* user_data, size_t size, aocl_mmd_program_mode_t program_mode) WEAK; + +/* Shared memory allocator + * Allocates memory that is shared between the host and the FPGA. The + * host will access this memory using the pointer returned by + * aocl_mmd_shared_mem_alloc, while the FPGA will access the shared memory + * using device_ptr_out. 
If shared memory is not supported this should return + * NULL. + * + * Shared memory survives FPGA reprogramming if the CPU is not rebooted. + * + * Arguments: + * size - the size of the shared memory to allocate + * device_ptr_out - will receive the pointer value used by the FPGA (the device) + * to access the shared memory. Cannot be NULL. The type is + * unsigned long long to handle the case where the host has a + * smaller pointer size than the device. + * + * Returns: The pointer value to be used by the host to access the shared + * memory if successful, otherwise NULL. + */ +AOCL_MMD_CALL void* aocl_mmd_shared_mem_alloc(int handle, size_t size, unsigned long long* device_ptr_out) WEAK; + +/* Shared memory de-allocator + * Frees previously allocated shared memory. If shared memory is not supported, + * this function should do nothing. + * + * Arguments: + * host_ptr - the host pointer that points to the shared memory, as returned by + * aocl_mmd_shared_mem_alloc + * size - the size of the shared memory to free. Must match the size + * originally passed to aocl_mmd_shared_mem_alloc + */ +AOCL_MMD_CALL void aocl_mmd_shared_mem_free(int handle, void* host_ptr, size_t size) WEAK; + +/* DEPRECATED. Use aocl_mmd_program instead + * This reprogram API is only for mmd version previous than 18.1 + */ +AOCL_MMD_CALL int aocl_mmd_reprogram(int handle, void* user_data, size_t size) WEAK; + +// CoreDLA modifications +// To support multiple different FPGA boards, anything board specific must be implemented in a +// board-specific MMD instead of the CoreDLA runtime layer. 
+#ifdef DLA_MMD +#include <cstdint> +// Query functions to get board-specific values +AOCL_MMD_CALL int dla_mmd_get_max_num_instances() WEAK; +AOCL_MMD_CALL uint64_t dla_mmd_get_ddr_size_per_instance() WEAK; +AOCL_MMD_CALL double dla_mmd_get_ddr_clock_freq() WEAK; + +// Wrappers around CSR and DDR reads and writes to abstract away board-specific offsets +AOCL_MMD_CALL int dla_mmd_csr_write(int handle, int instance, uint64_t addr, const uint32_t* data) WEAK; +AOCL_MMD_CALL int dla_mmd_csr_read(int handle, int instance, uint64_t addr, uint32_t* data) WEAK; +AOCL_MMD_CALL int dla_mmd_ddr_write(int handle, int instance, uint64_t addr, uint64_t length, const void* data) WEAK; +AOCL_MMD_CALL int dla_mmd_ddr_read(int handle, int instance, uint64_t addr, uint64_t length, void* data) WEAK; + +// Get the clk_dla PLL clock frequency in MHz, returns a negative value if there is an error +AOCL_MMD_CALL double dla_mmd_get_coredla_clock_freq(int handle) WEAK; +#endif + +#ifdef __cplusplus +} +#endif + +#endif |
