// Copyright 2020-2024 Intel Corporation.
//
// This software and the related documents are Intel copyrighted materials,
// and your use of them is governed by the express license under which they
// were provided to you ("License"). Unless the License provides otherwise,
// you may not use, modify, copy, publish, distribute, disclose or transmit
// this software or the related documents without Intel's prior written
// permission.
//
// This software and the related documents are provided as is, with no express
// or implied warranties, other than those that are expressly stated in the
// License.

// Description of functionality:
// This module is responsible for receiving DLA data from the cross bar (essentially coming from PE array)
// in HWC format and dispatches it to an AXI stream interface
// This module is fed by width adapter that converts between xbar_k_vec and the AXI bus width (specified at the arch file
// through the output_stream_interface:bus_width)
// Once data is converted to the correct interface width, it gets stored in a dual-clock FIFO which acts
// as a clock crosser between the clk_dla and the clk_axi. It also acts as a conversion from the simple
// ready-valid protocol in DLA to an AXI stream protocol (most of the signals won't be used)
// Some control logic is used to specify which bytes are valid (through the tstrb signal), which will be used
// when the number of output channels is not a multiple of k_vec.
// For example, if k_vec = c_vec = 8 and output_channels (O_C) = 12, we will have two transfers of size 8:
// the first transfer will have 8 valid outputs, but the second transfer will have 4 valid outputs and
// 4 invalid outputs (zeros), so we use t_strb to indicate which bytes (essentially which FP16 elements)
// are valid, and it is expected to be consumed by the downstream blocks (receiver of the AXI signals)

`resetall
`undefineall
`default_nettype none

`include "dla_acl_parameter_assert.svh"

module dla_output_streamer import dla_common_pkg::*, dla_output_streamer_pkg::*; #(
  // DLA (input data) side parameters
  parameter int CONFIG_WIDTH = 32,

  // AXI side parameters
  parameter int TDATA_WIDTH = 128, // an integer number of bits (typically a power of 2 from 8 - 1024)
  parameter int TID_WIDTH   = 8,   // recommended to be no more than 8.
  parameter int TDEST_WIDTH = 8,   // recommended to be no more than 8.

  // Data DC FIFO Depth
  parameter int FIFO_DEPTH = 1024,

  parameter int INPUT_WIDTH_ELEMENTS = 1,
  parameter int INPUT_ELEMENT_WIDTH  = 1,

  // Decide if width adaptation resides before or after the data CDC FIFO
  localparam int INPUT_DATA_BITS       = INPUT_WIDTH_ELEMENTS * INPUT_ELEMENT_WIDTH,
  localparam int OUTPUT_WIDTH_ELEMENTS = TDATA_WIDTH / INPUT_ELEMENT_WIDTH,
  localparam int WA_BEFORE_CDC         = INPUT_WIDTH_ELEMENTS < OUTPUT_WIDTH_ELEMENTS,

  // DLA (input data) side derived parameters

  // AXI side derived parameters
  localparam int TSTRB_WIDTH = TDATA_WIDTH / 8,
  localparam int TUSER_WIDTH = TDATA_WIDTH / 8
) (
  // Master/driver (DLA) signals
  input  wire                        clk_dla,
  input  wire                        i_aresetn,

  // config input for output streaming
  input  wire  [CONFIG_WIDTH-1:0]    i_config_data,
  input  wire                        i_config_valid,
  output logic                       o_config_ready,

  // input data
  output logic                       o_ready,              // backpressure to xbar
  input  wire                        i_valid,              // valid from xbar
  input  wire  [INPUT_DATA_BITS-1:0] i_data,               // data from xbar after width adaptation
  input  wire                        i_data_done,          // data from xbar sent was the last one (the actual last data comes after WA)
  output logic                       o_last_data_received, // got the last data

  // config input for flush handling
  input  wire  [CONFIG_WIDTH-1:0]    i_config_flush_data,
  input  wire                        i_config_flush_valid,
  output logic                       o_config_flush_ready,

  // input signals for flush generation
  output wire                        o_input_done,         // xbar input for a layer is done and received

  // Receiver (AXI) signals
  input  wire                        clk_axi,
  // NOTE(review): i_axi_aresetn is not referenced anywhere in this module; the AXI-side reset
  // synchronizer below is fed from i_aresetn. Confirm this is intentional.
  input  wire                        i_axi_aresetn,
  output logic                       o_axi_t_valid,        // indicates the Transmitter is driving a valid transfer
  input  wire                        i_axi_t_ready,        // indicates that a Receiver can accept a transfer.
  output wire                        o_axi_t_last,         // indicates the boundary of a packet
  output wire  [TDATA_WIDTH-1:0]     o_axi_t_data,         // the primary payload used to provide the data that is passing across the interface
  output wire  [TSTRB_WIDTH-1:0]     o_axi_t_strb,         // the byte qualifier that indicates whether the content of the associated byte of TDATA is valid
  // NOTE(review): the five outputs below are declared but never driven inside this module.
  output wire  [TSTRB_WIDTH-1:0]     o_axi_t_keep,         // Unused
  output wire  [TID_WIDTH-1:0]       o_axi_t_id,           // Unused - data stream identifier
  output wire  [TDEST_WIDTH-1:0]     o_axi_t_dest,         // Unused - provides routing information for the data stream
  output wire  [TUSER_WIDTH-1:0]     o_axi_t_user,         // Unused - user-defined sideband information that can be transmitted along the data stream.
  output wire                        o_axi_t_wakeup        // Unused - identifies any activity associated with AXI-Stream interface
);

  // reset parameterization
  localparam int RESET_USE_SYNCHRONIZER = 1;
  localparam int RESET_PIPE_DEPTH       = 3;
  localparam int RESET_NUM_COPIES       = 1;

  //////////////////////////////////////////
  //  Reset Synchronization onto DLA clk  //
  //////////////////////////////////////////
  logic [RESET_NUM_COPIES-1:0] sync_dla_resetn;
  dla_reset_handler_simple #(
    .USE_SYNCHRONIZER (RESET_USE_SYNCHRONIZER),
    .PIPE_DEPTH       (RESET_PIPE_DEPTH),
    .NUM_COPIES       (RESET_NUM_COPIES)
  ) dla_resetn_synchronizer (
    .clk      (clk_dla),
    .i_resetn (i_aresetn),
    .o_sclrn  (sync_dla_resetn)
  );

  //////////////////////////////////////////
  //  Reset Synchronization onto AXI clk  //
  //////////////////////////////////////////
  logic [RESET_NUM_COPIES-1:0] sync_axi_resetn;
  dla_reset_handler_simple #(
    .USE_SYNCHRONIZER (RESET_USE_SYNCHRONIZER),
    .PIPE_DEPTH       (RESET_PIPE_DEPTH),
    .NUM_COPIES       (RESET_NUM_COPIES)
  ) axi_resetn_synchronizer (
    .clk      (clk_axi),
    .i_resetn (i_aresetn),
    .o_sclrn  (sync_axi_resetn)
  );

  // Last-data tracking (clk_dla domain).
  // i_data_done arms xbar_sent_last_data; the next accepted input beat (i_valid & o_ready) then
  // produces a single-cycle pulse on received_last_data and disarms the flag.
  logic received_last_data;
  logic xbar_sent_last_data;
  always_ff @(posedge clk_dla) begin
    received_last_data <= 1'b0; // default: pulse lasts one cycle
    if (i_data_done) begin
      xbar_sent_last_data <= 1'b1;
    end
    if (xbar_sent_last_data & i_valid & o_ready) begin
      received_last_data  <= 1'b1;
      xbar_sent_last_data <= 1'b0;
    end
    // synchronous reset has the highest priority (last assignment wins)
    if (~sync_dla_resetn[0]) begin
      received_last_data  <= 1'b0;
      xbar_sent_last_data <= 1'b0;
    end
  end
  assign o_last_data_received = received_last_data;

  logic w_flush;
  // Instantiate the flush generation block
  dla_output_streamer_flush_handler #(
    .CONFIG_WIDTH (CONFIG_WIDTH)
  ) flush_generator (
    .clk_dla        (clk_dla),
    .i_aresetn      (sync_dla_resetn[0]),
    .i_config_data  (i_config_flush_data),
    .i_config_valid (i_config_flush_valid),
    .o_config_ready (o_config_flush_ready),
    .i_ready        (o_ready),
    .i_valid        (i_valid),
    .o_flush        (w_flush),
    .o_input_done   (o_input_done)
  );

  // Handle config data and strb generation
  // Writing side: clk_dla
  logic config_is_loaded;
  logic cfg_rd_empty, cfg_rd_almost_empty, cfg_wr_almost_full;
  logic [CONFIG_WIDTH-1:0] cfg_rd_data;
  dla_acl_dcfifo #(
    .DEPTH              (32),
    .WIDTH              (CONFIG_WIDTH),
    .ALMOST_FULL_CUTOFF (2)
  ) dla_acl_fifo_inst_cfg (
    .async_resetn    (i_aresetn),           // dcfifo will synchronize the reset internally
    .wr_clock        (clk_dla),
    .wr_req          (i_config_valid),
    .wr_data         (i_config_data),
    .wr_almost_full  (cfg_wr_almost_full),  // early indication to upstream that soon fifo may no longer be able to accept data, threshold controlled by ALMOST_FULL_CUTOFF
    .wr_full         (),                    // inform upstream that we cannot accept data
    .rd_clock        (clk_axi),
    .rd_empty        (cfg_rd_empty),        // advertise to downstream that fifo is empty, a read only occurs when ~rd_empty & rd_ack
    .rd_ack          (~config_is_loaded),   // read acknowledge from downstream, ignored when fifo is empty -- this is like an active low backpressure from downstream
    .rd_data         (cfg_rd_data),
    .rd_almost_empty (cfg_rd_almost_empty)  // early indication to downstream that soon fifo may no longer be able to supply data, threshold controlled by ALMOST_EMPTY_CUTOFF
  );
  assign o_config_ready = ~cfg_wr_almost_full;

  // Reading side: AXI clock
  logic [CONFIG_WIDTH-1:0] config_offset;
  output_streamer_config_t cfg;
  localparam int NUM_CONFIG_OFFSETS = divCeil($bits(cfg), CONFIG_WIDTH);
  // For now, ensure size of config is exact multiple of CONFIG_WIDTH
  `DLA_ACL_PARAMETER_ASSERT($bits(cfg) == NUM_CONFIG_OFFSETS * CONFIG_WIDTH);

  logic [CONFIG_WIDTH-1:0] config_total_transfers;          // total number of axi_data transfers for a layer
  logic [CONFIG_WIDTH-1:0] config_total_transfers_adjusted; // total number of axi_data transfers for a layer minus any invalid last transactions
  logic [CONFIG_WIDTH-1:0] config_transfers_per_hw_pixel;   // Decides the total number of transfers needed to send a full set of output channels
                                                            // for a single width/height pixel given a specific data_width for the axi interface.
  logic [CONFIG_WIDTH-1:0] config_valid_bytes_stream_width; // Determines how many elements of the last transfer are valid.
  logic [CONFIG_WIDTH-1:0] config_last_index;               // Determines index of last valid transaction per height/width.
  logic [CONFIG_WIDTH-1:0] config_last_stream;              // Determines if this stream is the last stream to generate tlast
  logic [CONFIG_WIDTH-1:0] channel_chunks_counter;          // Counter for the config_transfers_per_hw_pixel
  logic [CONFIG_WIDTH-1:0] total_counter_out;               // Number of transfers popped so far for the current config
  logic ostreamer_downstream_ready;

  assign config_total_transfers          = cfg.total_transfers;
  assign config_total_transfers_adjusted = cfg.total_transfers_adjusted;
  assign config_transfers_per_hw_pixel   = cfg.transfers_per_hw_pixel;
  assign config_valid_bytes_stream_width = cfg.valid_bytes_stream_width;
  assign config_last_index               = cfg.last_index;
  assign config_last_stream              = cfg.last_stream;

  logic output_valid;       // a data beat is available at the output of the data path
  logic output_tx_received; // a data beat is popped from the data path this cycle

  always_ff @(posedge clk_axi) begin
    // config state machine
    if (~config_is_loaded & ~cfg_rd_empty) begin
      // update progress in accepting NUM_CONFIG_OFFSETS transactions
      if (config_offset == NUM_CONFIG_OFFSETS-1) begin
        config_offset    <= '0;
        config_is_loaded <= 1'b1;
      end else begin
        config_offset <= config_offset + 1'b1;
      end
      // shift the new config word in at the top of cfg, moving earlier words towards the LSBs
      cfg <= (cfg_rd_data[CONFIG_WIDTH-1:0] << ($bits(cfg) - CONFIG_WIDTH)) | (cfg >> CONFIG_WIDTH);
    end else begin
      // keep track of how many transactions are read by AXI to drive t_strb
      if (config_is_loaded & output_tx_received) begin
        total_counter_out <= total_counter_out + 1;
        if (total_counter_out == (config_total_transfers - 1)) begin
          // all transfers of this config are done: release it so the next one can be loaded
          config_is_loaded  <= 1'b0;
          total_counter_out <= '0;
        end
        channel_chunks_counter <= channel_chunks_counter + 1; // increment counter
        if (channel_chunks_counter == (config_transfers_per_hw_pixel - 1)) begin
          channel_chunks_counter <= '0;
        end
      end
    end
    // resetn
    if (~sync_axi_resetn[0]) begin
      config_is_loaded       <= 1'b0;
      channel_chunks_counter <= '0;
      total_counter_out      <= '0;
      config_offset          <= '0;
    end
  end

  logic [TDATA_WIDTH-1:0] ostreamer_output_data;
  localparam int FIFO_CUTOFF = 0; // No need for slack cycles as the full goes back and gets handled in the same cycle in the width adapter

  if (!WA_BEFORE_CDC) begin : GEN_WA_AFTER_DC_FIFO
    // In this situation we want the width adaptation to happen in the slow clock domain
    // so that the upstream IP can continue producing data that goes straight into the fifo
    localparam int XBAR_WIDTH_BITS = INPUT_WIDTH_ELEMENTS * INPUT_ELEMENT_WIDTH;
    logic [XBAR_WIDTH_BITS-1:0] fifo_data;
    logic fifo_rd_empty, fifo_downstream_ready;
    logic wr_full;
    dla_acl_dcfifo #(
      .DEPTH              (FIFO_DEPTH),
      .WIDTH              (XBAR_WIDTH_BITS),
      .ALMOST_FULL_CUTOFF (FIFO_CUTOFF)
    ) dla_acl_fifo_inst (
      .async_resetn (i_aresetn),             // dcfifo will synchronize the reset internally
      .wr_clock     (clk_dla),
      .wr_req       (i_valid && o_ready),
      .wr_data      (i_data),
      .wr_full      (wr_full),               // inform upstream that we cannot accept data
      .rd_clock     (clk_axi),
      .rd_empty     (fifo_rd_empty),         // advertise to downstream that fifo is empty, a read only occurs when ~rd_empty & rd_ack
      .rd_ack       (fifo_downstream_ready), // read acknowledge from downstream, ignored when fifo is empty -- this is like an active low backpressure from downstream
      .rd_data      (fifo_data)
    );

    logic adapted_valid;
    // Instantiate a width adapter to convert from xbar_k_vec width to AXI width
    dla_width_adapter #(
      .GROUP_NUM                     ( 1 ), // hardcoded
      .GROUP_DELAY                   ( 0 ),
      .INPUT_DATA_WIDTH_IN_ELEMENTS  ( INPUT_WIDTH_ELEMENTS ),
      .OUTPUT_DATA_WIDTH_IN_ELEMENTS ( OUTPUT_WIDTH_ELEMENTS ),
      .ELEMENT_WIDTH                 ( INPUT_ELEMENT_WIDTH ),
      .FLUSH_ENABLE                  ( 0 )
    ) wa_output_stream_inst (
      .clock        ( clk_axi ),
      .i_aresetn    ( i_aresetn ),
      .i_flush      ( 1'b0 ),
      .o_din_ready  ( fifo_downstream_ready ),
      .i_din_valid  ( ~fifo_rd_empty ),
      .i_din_data   ( fifo_data ),
      .i_dout_ready ( ostreamer_downstream_ready ), // to be received from output streamer
      .o_dout_valid ( adapted_valid ),
      .o_dout_data  ( ostreamer_output_data )
    );
    assign output_tx_received = adapted_valid & ostreamer_downstream_ready;
    assign output_valid       = adapted_valid;

    // In this configuration the upstream is backpressured only when the data fifo is full
    assign o_ready = ~wr_full;
  end else begin: GEN_WA_BEFORE_DC_FIFO
    logic adapted_valid;
    logic [TDATA_WIDTH-1:0] adapted_data;
    logic wr_full; // dc fifo for data after width adaptation
    // Instantiate a width adapter to convert from xbar_k_vec width to AXI width
    dla_width_adapter #(
      .GROUP_NUM                     ( 1 ), // hardcoded
      .GROUP_DELAY                   ( 0 ),
      .INPUT_DATA_WIDTH_IN_ELEMENTS  ( INPUT_WIDTH_ELEMENTS ),
      .OUTPUT_DATA_WIDTH_IN_ELEMENTS ( OUTPUT_WIDTH_ELEMENTS ),
      .ELEMENT_WIDTH                 ( INPUT_ELEMENT_WIDTH ),
      .FLUSH_ENABLE                  ( 1 )
    ) wa_output_stream_inst (
      .clock        ( clk_dla ),
      .i_aresetn    ( i_aresetn ),
      .i_flush      ( w_flush ),  // flush only activated with an incoming valid transaction
      .o_din_ready  ( o_ready ),
      .i_din_valid  ( i_valid ),
      .i_din_data   ( i_data ),
      .i_dout_ready ( ~wr_full ), // to be received from output streamer
      .o_dout_valid ( adapted_valid ),
      .o_dout_data  ( adapted_data )
    );

    logic rd_empty;
    // Instantiate the output FIFO to perform clock domain crossing
    dla_acl_dcfifo #(
      .DEPTH              (FIFO_DEPTH),
      .WIDTH              (TDATA_WIDTH),
      .ALMOST_FULL_CUTOFF (FIFO_CUTOFF)
    ) dla_acl_fifo_inst (
      .async_resetn   (i_aresetn),                  // dcfifo will synchronize the reset internally
      .wr_clock       (clk_dla),
      .wr_req         (adapted_valid && !wr_full),
      .wr_data        (adapted_data),
      .wr_almost_full (),                           // early indication to upstream that soon fifo may no longer be able to accept data, threshold controlled by ALMOST_FULL_CUTOFF
      .wr_full        (wr_full),                    // inform upstream that we cannot accept data
      .rd_clock       (clk_axi),
      .rd_empty       (rd_empty),                   // advertise to downstream that fifo is empty, a read only occurs when ~rd_empty & rd_ack
      .rd_ack         (ostreamer_downstream_ready), // read acknowledge from downstream, ignored when fifo is empty -- this is like an active low backpressure from downstream
      .rd_data        (ostreamer_output_data)
    );
    assign output_tx_received = ~rd_empty & ostreamer_downstream_ready;
    assign output_valid       = ~rd_empty;
  end

  // Convert FIFO interface to AXI signals
  logic flush_exit_fifo; // signal used to flush the fifo out of any leftover invalid transactions
  logic [TSTRB_WIDTH-1:0] strb_signal_normal, strb_signal_last;

  // A beat is presented on AXI only when a config is loaded, at least one byte is valid,
  // and we are not draining leftover invalid beats.
  assign o_axi_t_valid = output_valid & config_is_loaded & ((o_axi_t_strb != '0)) & ~flush_exit_fifo;
  // While flushing, beats are popped unconditionally (independent of i_axi_t_ready).
  assign ostreamer_downstream_ready = (i_axi_t_ready & config_is_loaded) || (flush_exit_fifo);
  assign o_axi_t_data = ostreamer_output_data;
  assign o_axi_t_last = (total_counter_out == (config_total_transfers_adjusted - 1)) & output_valid &
                        config_is_loaded & ostreamer_downstream_ready & config_last_stream;

  // Control logic to produce the o_axi_t_strb signal
  assign strb_signal_normal = '1;
  assign strb_signal_last   = ((1 << config_valid_bytes_stream_width) - 1);
  assign o_axi_t_strb = flush_exit_fifo                             ? '0 :
                        channel_chunks_counter >  config_last_index ? '0 :
                        channel_chunks_counter == config_last_index ? strb_signal_last :
                                                                      strb_signal_normal;

  //
  // state machine to decide if output streamer is producing output from the exit fifo
  // or flushing (emptying) the exit fifo
  // we need the ability to flush the exit fifo in some situations where part of the last transactions from the xbar
  // do not carry actual data (all zeros). In these situations, we need the tlast to come out with the
  // actual last valid transaction with valid data, and these invalid transactions to be removed from the exit fifo
  //
  // for example, if cvec=32 elements, axi=8 elements (128 bits), and channels=6, each cvec produces 4 axi transactions
  // In the last cvec transaction (4 axi transactions), we would have the first with valid data
  // and the remaining three with zeros
  // Up until 2024.3 release, we would produce tlast at the last axi transaction (#4), but this is
  // not efficient since the last valid transaction happens three transactions earlier, TX number 1
  // out of the last 4 transactions. With the state machine, we produce tlast alongside the first
  // transaction of the last four. Finally we enter a flush state and flush the exit fifo
  // to empty it out of these last three invalid transactions
  //
  typedef enum logic {
    ACTIVE = 1'b0,
    FLUSH  = 1'b1
  } state_t;
  state_t state, state_next;

  always_ff @(posedge clk_axi) begin
    if (~sync_axi_resetn[0]) begin
      state <= ACTIVE;
    end else begin
      state <= state_next;
    end
  end

  logic no_exit_fifo_flush_needed;
  assign no_exit_fifo_flush_needed = config_is_loaded && (config_total_transfers == config_total_transfers_adjusted);

  // flush_exit_fifo is a pure decode of the current state. It is kept in its own block so the
  // next-state logic below (which reads output_tx_received, itself a function of flush_exit_fifo)
  // does not read a signal that the same always_comb writes.
  always_comb begin
    case (state)
      FLUSH:   flush_exit_fifo = 1'b1;
      default: flush_exit_fifo = 1'b0;
    endcase
  end

  // Next-state logic.
  // Both transitions are qualified with output_tx_received so that the state only advances when the
  // beat in question has actually been popped:
  //  - ACTIVE -> FLUSH only once the last valid beat (the one carrying tlast) has been transferred.
  //    Without this qualifier the FSM would enter FLUSH as soon as the counter reached the threshold,
  //    and a beat stalled by i_axi_t_ready low (or not yet available) would be discarded by the flush.
  //  - FLUSH -> ACTIVE only once the final leftover beat has been drained.
  always_comb begin
    state_next = state;
    case (state)
      ACTIVE: begin
        if (!no_exit_fifo_flush_needed && config_is_loaded && output_tx_received &&
            (total_counter_out == config_total_transfers_adjusted - 1)) begin
          state_next = FLUSH;
        end
      end
      FLUSH: begin
        if (config_is_loaded && output_tx_received &&
            (total_counter_out == config_total_transfers - 1)) begin
          state_next = ACTIVE;
        end
      end
      default: state_next = ACTIVE; // Default state
    endcase
  end

endmodule