// Copyright 2024 Intel Corporation.
//
// This software and the related documents are Intel copyrighted materials,
// and your use of them is governed by the express license under which they
// were provided to you ("License"). Unless the License provides otherwise,
// you may not use, modify, copy, publish, distribute, disclose or transmit
// this software or the related documents without Intel's prior written
// permission.
//
// This software and the related documents are provided as is, with no express
// or implied warranties, other than those that are expressly stated in the
// License.

/**
 * dla_input_streamer.sv
 *
 * FPGA AI Suite input streaming is handled here. This provides an AXI interface to the top-level entity,
 * and is responsible for clock-crossing from AXI to DLA clock domains. This module is also responsible for
 * applying the layout transform to incoming data.
 *
 * For now, enabling streaming implies enabling the hardware layout transform module.
 * The layout transform assumes input tensors in HWC format. The input bus width is arbitrary: width
 * conversion is done in the layout transform, whose output is always CVEC*sizeof(fp16).
 *
 * Flow control is handled by the configuration of the stream-buffer writer. Back-pressure from the SB
 * writer is propagated to this AXI connection to avoid the need to configure the transfer frame sizes
 * in this module.
 *
 */

/**
TODO (arooney):
    - Consider behaviour when a frame is done, it gets consumed by the SB, and the LT can accept a
      few frames before backpressuring. But then the producer is a few packets into the transmission.
      Maybe it's best to only accept data when the layout transform is done AND the SB is ready.
    - Implement strobe signal handling.
    - Remove unused AXI signals.
*/

`resetall
`undefineall
`default_nettype none

// Input streamer: receives an AXI-Stream on clk_axi, passes it through the layout
// transform, and hands the result to the clk_dla domain through a dual-clock FIFO.
// Acceptance of incoming transfers is gated by both the streamer FSM and the
// layout transform.
module dla_input_streamer #(
  parameter int                   TDATA_WIDTH,
  parameter int                   FIFO_DEPTH,
  parameter int                   TID_WIDTH,
  parameter int                   TDEST_WIDTH,
  parameter int                   TUSER_WIDTH,
  parameter dla_lt_pkg::lt_arch_t LT_ARCH,
  parameter int                   OUTPUT_WIDTH
) (
  input  wire                              clk_dla,
  input  wire                              clk_ddr,

  // AMBA AXI-Stream signals
  input  wire                              clk_axi,
  input  wire                              i_resetn_async,

  // layout transform configuration
  input  wire [LT_ARCH.CONFIG_BYTES*8-1:0] i_config_data,
  input  wire                              i_config_valid,
  output logic                             o_config_ready,

  input  wire                              i_streaming_enable,

  input  wire                              i_tvalid,  // transmitter is driving a valid transfer
  output logic                             o_tready,  // receiver can accept a transfer
  input  wire [TDATA_WIDTH-1:0]            i_tdata,   // primary payload of the interface
  input  wire [TDATA_WIDTH/8-1:0]          i_tstrb,   // (NOT USED) byte qualifier: data byte vs. position byte
  input  wire [TDATA_WIDTH/8-1:0]          i_tkeep,   // (NOT USED) byte qualifier: byte is part of the data stream
  input  wire                              i_tlast,   // (NOT USED) packet boundary
  input  wire [TID_WIDTH-1:0]              i_tid,     // (NOT USED) data stream identifier
  input  wire [TDEST_WIDTH-1:0]            i_tdest,   // (NOT USED) routing information for the data stream
  input  wire [TUSER_WIDTH-1:0]            i_tuser,   // (NOT USED) user-defined sideband information
  input  wire                              i_twakeup, // (AXI5-S ONLY, NOT USED) activity indicator for the AXI-S interface

  // output
  output logic [OUTPUT_WIDTH-1:0]          o_istream_data,
  output logic                             o_istream_valid,
  input  wire                              i_istream_ready,      // from input feeder
  output logic                             o_reading_first_word, // for CSR active-jobs counter
  output logic                             o_param_error
);

  // ---------------------------------------------------------------------------
  // Constants
  // ---------------------------------------------------------------------------

  // reset parameterization, shared by both reset handlers below
  localparam int RESET_USE_SYNCHRONIZER    = 1;
  localparam int RESET_PIPE_DEPTH          = 3;
  localparam int RESET_NUM_COPIES          = 1;

  // almost-full threshold passed to the clock-crossing FIFO
  localparam int DCFIFO_ALMOST_FULL_CUTOFF = 0;

  // ---------------------------------------------------------------------------
  // Internal signals
  // ---------------------------------------------------------------------------

  // resets
  logic sclrn_axi;  // reset handler output on clk_axi
  logic sclrn_dla;  // reset handler output on clk_dla (has no reader inside this module)

  // input-side flow control
  logic fsm_stream_ready;  // state-based ready from the FSM, accounts for inter-frame back-pressure
  logic lt_in_ready;       // ready from the layout transform, accounts for intra-frame back-pressure
  logic lt_in_valid;       // valid presented to the layout transform

  // layout transform -> clock-crossing FIFO
  logic [OUTPUT_WIDTH-1:0] lt_out_data;
  logic                    lt_out_valid;
  logic                    lt_frame_done;
  logic                    cc_wr_stall;

  // read side of the clock-crossing FIFO
  logic cc_rd_empty;

  // parameter error flag as produced on clk_axi
  logic param_error_axi;

  // ---------------------------------------------------------------------------
  // AXI-side handshake
  // ---------------------------------------------------------------------------

  // Accept new data only when the FSM and the layout transform are both ready. This should
  // translate to only accepting data when we're prepared to accept a whole image (as opposed
  // to accepting a couple of transfers until the LT is full, then waiting for the previous
  // inference, then accepting the rest, since this would probably complicate frame dropping).
  assign o_tready    = fsm_stream_ready & lt_in_ready;
  assign lt_in_valid = i_tvalid & fsm_stream_ready;

  // ---------------------------------------------------------------------------
  // DLA-side handshake
  // ---------------------------------------------------------------------------

  // output is valid whenever the clock-crossing FIFO has something to read
  assign o_istream_valid = ~cc_rd_empty;

  // ---------------------------------------------------------------------------
  // Reset handling
  // ---------------------------------------------------------------------------

  dla_reset_handler_simple #(
    .USE_SYNCHRONIZER (RESET_USE_SYNCHRONIZER),
    .PIPE_DEPTH       (RESET_PIPE_DEPTH),
    .NUM_COPIES       (RESET_NUM_COPIES)
  ) istream_reset_synchronizer (
    .clk      (clk_axi),
    .i_resetn (i_resetn_async),
    .o_sclrn  (sclrn_axi)
  );

  dla_reset_handler_simple #(
    .USE_SYNCHRONIZER (RESET_USE_SYNCHRONIZER),
    .PIPE_DEPTH       (RESET_PIPE_DEPTH),
    .NUM_COPIES       (RESET_NUM_COPIES)
  ) istream_clk_dla_reset_synchronizer (
    .clk      (clk_dla),
    .i_resetn (i_resetn_async),
    .o_sclrn  (sclrn_dla)
  );

  // ---------------------------------------------------------------------------
  // Streamer state machine
  // ---------------------------------------------------------------------------

  // Produces the state-based ready used to gate the AXI input, and the
  // first-word indication exported for the CSR.
  dla_streamer_fsm streamer_fsm (
    .clk_dla              (clk_dla),
    .clk_axi              (clk_axi),
    .i_resetn_axi         (sclrn_axi),
    .i_resetn_async       (i_resetn_async),
    .i_dla_ready          (i_istream_ready),
    .i_lt_ready           (lt_in_ready),
    .i_streaming_enable   (i_streaming_enable),
    .i_lt_done_frame      (lt_frame_done),
    .i_tvalid             (i_tvalid),
    .o_stream_ready       (fsm_stream_ready),
    .o_reading_first_word (o_reading_first_word)
  );

  // ---------------------------------------------------------------------------
  // Layout transform (clk_axi domain)
  // ---------------------------------------------------------------------------

  dla_layout_transform #(
    .CNT_BITS            (20),
    .DDR_BYTES           (TDATA_WIDTH/8),
    .CONFIG_DATA_BYTES   (LT_ARCH.CONFIG_BYTES),
    .DATA_ELEMENT_WIDTH  (LT_ARCH.DATA_ELEMENT_WIDTH),
    .MAX_CHANNELS        (LT_ARCH.MAX_CHANNELS),
    .MAX_FEATURE_HEIGHT  (LT_ARCH.MAX_FEATURE_HEIGHT),
    .MAX_FEATURE_WIDTH   (LT_ARCH.MAX_FEATURE_WIDTH),
    .MAX_FEATURE_DEPTH   (LT_ARCH.MAX_FEATURE_DEPTH),
    .MAX_STRIDE_HEIGHT   (LT_ARCH.MAX_STRIDE_HEIGHT),
    .MAX_STRIDE_WIDTH    (LT_ARCH.MAX_STRIDE_WIDTH),
    .MAX_STRIDE_DEPTH    (LT_ARCH.MAX_STRIDE_DEPTH),
    .CVEC                (LT_ARCH.CVEC),
    .MAX_PAD_FRONT       (LT_ARCH.MAX_PAD_FRONT),
    .MAX_PAD_LEFT        (LT_ARCH.MAX_PAD_LEFT),
    .MAX_PAD_TOP         (LT_ARCH.MAX_PAD_TOP),
    .MAX_FILTER_WIDTH    (LT_ARCH.MAX_FILTER_WIDTH),
    .MAX_FILTER_HEIGHT   (LT_ARCH.MAX_FILTER_HEIGHT),
    .MAX_FILTER_DEPTH    (LT_ARCH.MAX_FILTER_DEPTH),
    .MAX_DILATION_WIDTH  (LT_ARCH.MAX_DILATION_WIDTH),
    .MAX_DILATION_HEIGHT (LT_ARCH.MAX_DILATION_HEIGHT),
    .MAX_DILATION_DEPTH  (LT_ARCH.MAX_DILATION_DEPTH),
    .DO_U8_CONV          (LT_ARCH.DO_U8_CONV),
    .DEVICE              (LT_ARCH.DEVICE)
  ) reader_layout_transform (
    .clk            (clk_axi),
    .i_rstn         (sclrn_axi),
    .i_config_data  (i_config_data),
    .i_config_valid (i_config_valid),
    .o_config_ready (o_config_ready),
    .i_data         (i_tdata),
    .i_valid        (lt_in_valid),
    .o_ready        (lt_in_ready),
    .o_data         (lt_out_data),
    .o_valid        (lt_out_valid),
    .i_stall        (cc_wr_stall),
    .o_last         (lt_frame_done),
    .o_param_error  (param_error_axi)
  );

  // ---------------------------------------------------------------------------
  // Clock crossing: clk_axi (write) -> clk_dla (read)
  // ---------------------------------------------------------------------------

  dla_acl_dcfifo #(
    .WIDTH              (OUTPUT_WIDTH),
    .DEPTH              (FIFO_DEPTH),
    .ALMOST_FULL_CUTOFF (DCFIFO_ALMOST_FULL_CUTOFF)
  ) clock_crosser (
    .async_resetn   (i_resetn_async), // reset synchronization is handled internally

    // write side
    .wr_clock       (clk_axi),
    .wr_req         (lt_out_valid),
    .wr_data        (lt_out_data),
    .wr_almost_full (cc_wr_stall),    // stalls the layout transform output

    // read side
    .rd_clock       (clk_dla),
    .rd_empty       (cc_rd_empty),
    .rd_data        (o_istream_data),
    .rd_ack         (i_istream_ready)
  );

  // ---------------------------------------------------------------------------
  // Parameter error flag: clk_axi -> clk_ddr
  // ---------------------------------------------------------------------------

  // Both asynchronous reset inputs of the crosser are tied inactive (1'b1), and
  // the source-side readback output is left unconnected.
  dla_clock_cross_full_sync cc_param_error (
    .clk_src            (clk_axi),
    .i_src_async_resetn (1'b1),
    .i_src_data         (param_error_axi),
    .o_src_data         (),

    .clk_dst            (clk_ddr),
    .i_dst_async_resetn (1'b1),
    .o_dst_data         (o_param_error)
  );

endmodule