diff options
| author | Eric Dao <eric@erickhangdao.com> | 2025-03-10 17:54:31 -0400 |
|---|---|---|
| committer | Eric Dao <eric@erickhangdao.com> | 2025-03-10 17:54:31 -0400 |
| commit | ab224e2e6ba65f5a369ec392f99cd8845ad06c98 (patch) | |
| tree | a1e757e9341863ed52b8ad4c5a1c45933aab9da4 /python/openvino/demo/ip/intel_ai_ip/verilog/dla_output_streamer.sv | |
| parent | 40da1752f2c8639186b72f6838aa415e854d0b1d (diff) | |
| download | thesis-master.tar.gz thesis-master.tar.bz2 thesis-master.zip | |
Diffstat (limited to 'python/openvino/demo/ip/intel_ai_ip/verilog/dla_output_streamer.sv')
| -rw-r--r-- | python/openvino/demo/ip/intel_ai_ip/verilog/dla_output_streamer.sv | 432 |
1 files changed, 432 insertions, 0 deletions
diff --git a/python/openvino/demo/ip/intel_ai_ip/verilog/dla_output_streamer.sv b/python/openvino/demo/ip/intel_ai_ip/verilog/dla_output_streamer.sv
new file mode 100644
index 0000000..90b7fb8
--- /dev/null
+++ b/python/openvino/demo/ip/intel_ai_ip/verilog/dla_output_streamer.sv
@@ -0,0 +1,432 @@
+// Copyright 2020-2024 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+// Description of functionality:
+// This module is responsible for receiving DLA data from the cross bar (essentially coming from PE array)
+// in HWC format and dispatches it to an AXI stream interface
+// This module is fed by width adapter that converts between xbar_k_vec and the AXI bus width (specified at the arch file
+// through the output_stream_interface:bus_width)
+// Once data is converted to the correct interface width, it gets stored in a dual-clock FIFO which acts
+// as a clock crosser between the clk_dla and the clk_axi. It also acts as a conversion from the simple
+// ready-valid protocol in DLA to an AXI stream protocol (most of the signals won't be used)
+// Some control logic is used to specify which bytes are valid (through the tstrb signal), which will be used
+// when the number of output channels is not a multiple of k_vec. For example, if k_vec = c_vec = 8,
+// output_channels (O_C) = 12, we will have two transfers of size 8, the first transfer will have 8 valid outputs,
+// but the second transfer will have 4 valid outputs and 4 invalid outputs (zeros), so we use the t_strb to indicate
+// which bytes (essentially which FP16 elements) are valid, and it's expected to be consumed by the downstream blocks
+// (receiver of the AXI signals)
+
+`resetall
+`undefineall
+`default_nettype none
+`include "dla_acl_parameter_assert.svh"
+
+module dla_output_streamer import dla_common_pkg::*, dla_output_streamer_pkg::*; #(
+  // DLA (input data) side parameters
+  parameter int CONFIG_WIDTH = 32,
+  // AXI side parameters
+  parameter int TDATA_WIDTH = 128, // an integer number of bits (typically a power of 2 from 8 - 1024)
+  parameter int TID_WIDTH = 8, // recommended to be no more than 8.
+  parameter int TDEST_WIDTH = 8, // recommended to be no more than 8.
+
+  // Data DC FIFO Depth
+  parameter int FIFO_DEPTH = 1024,
+
+
+  parameter int INPUT_WIDTH_ELEMENTS = 1,
+  parameter int INPUT_ELEMENT_WIDTH = 1,
+
+  // Decide if Width adaptation resides before or after the data CDC FIFO
+  localparam int INPUT_DATA_BITS = INPUT_WIDTH_ELEMENTS * INPUT_ELEMENT_WIDTH,
+  localparam int OUTPUT_WIDTH_ELEMENTS = TDATA_WIDTH / INPUT_ELEMENT_WIDTH,
+  localparam int WA_BEFORE_CDC = INPUT_WIDTH_ELEMENTS < OUTPUT_WIDTH_ELEMENTS,
+
+  // DLA (input data) side derived parameters
+
+  // AXI side derived parameters
+  localparam int TSTRB_WIDTH = TDATA_WIDTH / 8,
+  localparam int TUSER_WIDTH = TDATA_WIDTH / 8
+) (
+  // Master/driver (DLA) signals
+  input wire clk_dla,
+  input wire i_aresetn,
+
+  // config input for output streaming
+  input wire [CONFIG_WIDTH-1:0] i_config_data,
+  input wire i_config_valid,
+  output logic o_config_ready,
+
+  // input data
+  output logic o_ready, // backpressure to xbar
+  input wire i_valid, // valid from xbar
+  input wire [INPUT_DATA_BITS-1:0] i_data, // data from xbar after width adaptation
+  input wire i_data_done, // data from xbar sent was the last one (the actual last data comes after WA)
+  output logic o_last_data_received, // got the last data
+
+  // config input for flush handling
+  input wire [CONFIG_WIDTH-1:0] i_config_flush_data,
+  input wire i_config_flush_valid,
+  output logic o_config_flush_ready,
+
+  // input signals for flush generation
+  output wire o_input_done, // xbar input for a layer is done and received
+
+  // Receiver (AXI) signals
+  input wire clk_axi,
+  input wire i_axi_aresetn, // NOTE(review): not referenced inside this module; the AXI-side reset is derived from i_aresetn
+  output logic o_axi_t_valid, // indicates the Transmitter is driving a valid transfer
+  input wire i_axi_t_ready, // indicates that a Receiver can accept a transfer.
+  output wire o_axi_t_last, // indicates the boundary of a packet (driven from the transfer counter and cfg.last_stream)
+  output wire [TDATA_WIDTH-1:0] o_axi_t_data, // the primary payload used to provide the data that is passing across the interface
+  output wire [TSTRB_WIDTH-1:0] o_axi_t_strb, // the byte qualifier that indicates whether the content of the associated byte of TDATA is valid
+  output wire [TSTRB_WIDTH-1:0] o_axi_t_keep, // Unused (NOTE(review): never driven in this module)
+  output wire [TID_WIDTH-1:0] o_axi_t_id, // Unused (never driven in this module) - data stream identifier
+  output wire [TDEST_WIDTH-1:0] o_axi_t_dest, // Unused (never driven in this module) - provides routing information for the data stream
+  output wire [TUSER_WIDTH-1:0] o_axi_t_user, // Unused (never driven in this module) - user-defined sideband information that can be transmitted along the data stream.
+  output wire o_axi_t_wakeup // Unused (never driven in this module) - identifies any activity associated with AXI-Stream interface
+);
+  //reset parameterization
+  localparam int RESET_USE_SYNCHRONIZER = 1;
+  localparam int RESET_PIPE_DEPTH = 3;
+  localparam int RESET_NUM_COPIES = 1;
+
+  //////////////////////////////////////////
+  // Reset Synchronization onto DLA clk //
+  /////////////////////////////////////////
+
+  logic [RESET_NUM_COPIES-1:0] sync_dla_resetn;
+
+  dla_reset_handler_simple #(
+    .USE_SYNCHRONIZER (RESET_USE_SYNCHRONIZER),
+    .PIPE_DEPTH       (RESET_PIPE_DEPTH),
+    .NUM_COPIES       (RESET_NUM_COPIES)
+  ) dla_resetn_synchronizer (
+    .clk      (clk_dla),
+    .i_resetn (i_aresetn),
+    .o_sclrn  (sync_dla_resetn)
+  );
+
+  //////////////////////////////////////////
+  // Reset Synchronization onto AXI clk //
+  /////////////////////////////////////////
+
+  logic [RESET_NUM_COPIES-1:0] sync_axi_resetn;
+
+  dla_reset_handler_simple #(
+    .USE_SYNCHRONIZER (RESET_USE_SYNCHRONIZER),
+    .PIPE_DEPTH       (RESET_PIPE_DEPTH),
+    .NUM_COPIES       (RESET_NUM_COPIES)
+  ) axi_resetn_synchronizer (
+    .clk      (clk_axi),
+    .i_resetn (i_aresetn), // same source reset as the DLA side; NOTE(review): confirm i_axi_aresetn is intentionally unused
+    .o_sclrn  (sync_axi_resetn)
+  );
+
+  // last data: remember i_data_done, then pulse o_last_data_received on a subsequent accepted input transfer
+  logic received_last_data;
+  logic xbar_sent_last_data;
+  always_ff @ (posedge clk_dla) begin
+    received_last_data <= 1'b0; // default, so the flag is a single-cycle pulse
+    if (i_data_done) begin
+      xbar_sent_last_data <= 1'b1;
+    end
+    if (xbar_sent_last_data & i_valid & o_ready) begin
+      received_last_data <= 1'b1;
+      xbar_sent_last_data <= 1'b0;
+    end
+    if (~sync_dla_resetn[0]) begin
+      received_last_data <= 1'b0;
+      xbar_sent_last_data <= 1'b0;
+    end
+  end
+  assign o_last_data_received = received_last_data;
+
+  logic w_flush;
+  // Instantiate the flush generation block
+  dla_output_streamer_flush_handler # (
+    .CONFIG_WIDTH(CONFIG_WIDTH)
+  ) flush_generator (
+    .clk_dla(clk_dla),
+    .i_aresetn(sync_dla_resetn[0]),
+    .i_config_data(i_config_flush_data),
+    .i_config_valid(i_config_flush_valid),
+    .o_config_ready(o_config_flush_ready),
+    .i_ready(o_ready),
+    .i_valid(i_valid),
+    .o_flush(w_flush),
+    .o_input_done(o_input_done)
+  );
+
+  // Handle Config data and strb generation
+  // Writing side dla_clk
+  logic config_is_loaded;
+  logic cfg_rd_empty, cfg_rd_almost_empty, cfg_wr_almost_full;
+  logic [CONFIG_WIDTH-1:0] cfg_rd_data;
+  dla_acl_dcfifo #(
+    .DEPTH              (32),
+    .WIDTH              (CONFIG_WIDTH),
+    .ALMOST_FULL_CUTOFF (2)
+  ) dla_acl_fifo_inst_cfg (
+    .async_resetn    (i_aresetn), // dcfifo will synchronize the reset internally
+    .wr_clock        (clk_dla),
+    .wr_req          (i_config_valid),
+    .wr_data         (i_config_data),
+    .wr_almost_full  (cfg_wr_almost_full), // early indication to upstream that soon fifo may no longer be able to accept data, threshold controlled by ALMOST_FULL_CUTOFF
+    .wr_full         (), // inform upstream that we cannot accept data
+
+    .rd_clock        (clk_axi),
+    .rd_empty        (cfg_rd_empty), // advertise to downstream that fifo is empty, a read only occurs when ~rd_empty & rd_ack
+    .rd_ack          (~config_is_loaded), // read acknowledge from downstream, ignored when fifo is empty -- this is like an active low backpressure from downstream
+    .rd_data         (cfg_rd_data),
+    .rd_almost_empty (cfg_rd_almost_empty) // early indication to downstream that soon fifo may no longer be able to supply data, threshold controlled by ALMOST_EMPTY_CUTOFF
+  );
+
+  assign o_config_ready = ~cfg_wr_almost_full;
+
+  // Reading side AXI clock
+  logic [CONFIG_WIDTH-1:0] config_offset;
+  output_streamer_config_t cfg;
+  localparam int NUM_CONFIG_OFFSETS = divCeil($bits(cfg), CONFIG_WIDTH);
+
+  // For now, ensure size of config is exact multiple of CONFIG_WIDTH
+  `DLA_ACL_PARAMETER_ASSERT($bits(cfg) == NUM_CONFIG_OFFSETS * CONFIG_WIDTH);
+
+  logic [CONFIG_WIDTH-1:0] config_total_transfers; // total number of axi_data transfers for a layer
+  logic [CONFIG_WIDTH-1:0] config_total_transfers_adjusted; // total number of axi_data transfers for a layer minus any invalid last transactions
+  logic [CONFIG_WIDTH-1:0] config_transfers_per_hw_pixel; // Decides the total number of transfers needed to send a full set of output channels for a single pixel
+                                                          // for a single width/height pixel given a specific data_width for the axi interface.
+  logic [CONFIG_WIDTH-1:0] config_valid_bytes_stream_width; // Determines how many elements of the last transfer are valid.
+  logic [CONFIG_WIDTH-1:0] config_last_index; // Determines index of last valid transaction per height/width.
+  logic [CONFIG_WIDTH-1:0] config_last_stream; // Determines if this stream is the last stream to generate tlast
+  logic [CONFIG_WIDTH-1:0] channel_chunks_counter; // Counter for the config_transfers_per_hw_pixel
+  logic [CONFIG_WIDTH-1:0] total_counter_out;
+  logic ostreamer_downstream_ready;
+
+  assign config_total_transfers = cfg.total_transfers;
+  assign config_total_transfers_adjusted = cfg.total_transfers_adjusted;
+  assign config_transfers_per_hw_pixel = cfg.transfers_per_hw_pixel;
+  assign config_valid_bytes_stream_width = cfg.valid_bytes_stream_width;
+  assign config_last_index = cfg.last_index;
+  assign config_last_stream = cfg.last_stream;
+  logic output_valid;
+  logic output_tx_received;
+  always_ff @(posedge clk_axi) begin
+    // config state machine
+    if (~config_is_loaded & ~cfg_rd_empty) begin
+      // update progress in accepting NUM_CONFIG_OFFSETS transactions
+      if (config_offset == NUM_CONFIG_OFFSETS-1) begin
+        config_offset <= '0;
+        config_is_loaded <= 1'b1;
+      end
+      else begin
+        config_offset <= config_offset + 1'b1;
+      end
+      cfg <= (cfg_rd_data[CONFIG_WIDTH-1:0] << ($bits(cfg) - CONFIG_WIDTH)) | (cfg >> CONFIG_WIDTH); // new word enters at the top, so the first word received ends up in the LSBs
+    end else begin
+      // keep track of how many transactions are read by AXI to drive t_strb
+      if (config_is_loaded & output_tx_received) begin
+        total_counter_out <= total_counter_out + 1;
+        if (total_counter_out == (config_total_transfers - 1)) begin
+          config_is_loaded <= 1'b0; // layer finished: the next config can now be read from the config fifo
+          total_counter_out <= '0;
+        end
+        channel_chunks_counter <= channel_chunks_counter + 1; // increment counter
+        if (channel_chunks_counter == (config_transfers_per_hw_pixel - 1)) begin
+          channel_chunks_counter <= '0; // wrap after the last transfer of a pixel
+        end
+      end
+    end
+    // resetn
+    if (~sync_axi_resetn[0]) begin
+      config_is_loaded <= 1'b0;
+      channel_chunks_counter <= '0;
+      total_counter_out <= '0;
+      config_offset <= '0;
+    end
+  end
+
+  logic [TDATA_WIDTH-1:0] ostreamer_output_data;
+  localparam int FIFO_CUTOFF = 0; // No need for slack cycles as the full goes back and gets handled in the same cycle in the width adapter
+
+  if (!WA_BEFORE_CDC) begin : GEN_WA_AFTER_DC_FIFO
+    // In this situation we want the width adaptation to happen in the slow clock domain
+    // so that the upstream IP can continue producing data that goes straight into the fifo
+    localparam int XBAR_WIDTH_BITS = INPUT_WIDTH_ELEMENTS * INPUT_ELEMENT_WIDTH;
+    logic [XBAR_WIDTH_BITS-1:0] fifo_data;
+    logic fifo_rd_empty, fifo_downstream_ready;
+    logic wr_full;
+
+    dla_acl_dcfifo #(
+      .DEPTH              (FIFO_DEPTH),
+      .WIDTH              (XBAR_WIDTH_BITS),
+      .ALMOST_FULL_CUTOFF (FIFO_CUTOFF)
+    ) dla_acl_fifo_inst (
+      .async_resetn (i_aresetn), // dcfifo will synchronize the reset internally
+      .wr_clock     (clk_dla),
+      .wr_req       (i_valid && o_ready),
+      .wr_data      (i_data),
+      .wr_full      (wr_full), // inform upstream that we cannot accept data
+
+      .rd_clock     (clk_axi),
+      .rd_empty     (fifo_rd_empty), // advertise to downstream that fifo is empty, a read only occurs when ~rd_empty & rd_ack
+      .rd_ack       (fifo_downstream_ready), // read acknowledge from downstream, ignored when fifo is empty -- this is like an active low backpressure from downstream
+      .rd_data      (fifo_data)
+    );
+
+    logic adapted_valid;
+    // Instantiate a width adapter to convert from xbar_k_vec width to AXI width
+    dla_width_adapter #(
+      .GROUP_NUM                     ( 1 ), // hardcoded
+      .GROUP_DELAY                   ( 0 ),
+      .INPUT_DATA_WIDTH_IN_ELEMENTS  ( INPUT_WIDTH_ELEMENTS ),
+      .OUTPUT_DATA_WIDTH_IN_ELEMENTS ( OUTPUT_WIDTH_ELEMENTS),
+      .ELEMENT_WIDTH                 ( INPUT_ELEMENT_WIDTH ),
+      .FLUSH_ENABLE                  ( 0 )
+    ) wa_output_stream_inst (
+      .clock        ( clk_axi ),
+      .i_aresetn    ( i_aresetn ),
+      .i_flush      ( 1'b0 ),
+      .o_din_ready  ( fifo_downstream_ready ),
+      .i_din_valid  ( ~fifo_rd_empty ),
+      .i_din_data   ( fifo_data ),
+      .i_dout_ready ( ostreamer_downstream_ready ), // to be received from output streamer
+      .o_dout_valid ( adapted_valid ),
+      .o_dout_data  ( ostreamer_output_data )
+    );
+    assign output_tx_received = adapted_valid & ostreamer_downstream_ready;
+    assign output_valid = adapted_valid;
+    // We backpressure the upstream only while the data fifo is full. Leftover invalid transactions that come
+    // out of cvec != axi are drained on the read side through ostreamer_downstream_ready (exit fifo flush)
+    assign o_ready = ~wr_full;
+
+  end else begin: GEN_WA_BEFORE_DC_FIFO
+    logic adapted_valid;
+    logic [TDATA_WIDTH-1:0] adapted_data;
+    logic wr_full; // dc fifo for data after width adaptation
+
+    // Instantiate a width adapter to convert from xbar_k_vec width to AXI width
+    dla_width_adapter #(
+      .GROUP_NUM                     ( 1 ), // hardcoded
+      .GROUP_DELAY                   ( 0 ),
+      .INPUT_DATA_WIDTH_IN_ELEMENTS  ( INPUT_WIDTH_ELEMENTS ),
+      .OUTPUT_DATA_WIDTH_IN_ELEMENTS ( OUTPUT_WIDTH_ELEMENTS ),
+      .ELEMENT_WIDTH                 ( INPUT_ELEMENT_WIDTH ),
+      .FLUSH_ENABLE                  ( 1 )
+    ) wa_output_stream_inst (
+      .clock        ( clk_dla ),
+      .i_aresetn    ( i_aresetn ),
+      .i_flush      ( w_flush ), // flush only activated with an incoming valid transaction
+      .o_din_ready  ( o_ready ),
+      .i_din_valid  ( i_valid ),
+      .i_din_data   ( i_data ),
+      .i_dout_ready ( ~wr_full ), // stall the adapter output while the data fifo is full
+      .o_dout_valid ( adapted_valid ),
+      .o_dout_data  ( adapted_data )
+    );
+
+    logic rd_empty;
+    // Instantiate the output FIFO to perform clock domain crossing
+    dla_acl_dcfifo #(
+      .DEPTH              (FIFO_DEPTH),
+      .WIDTH              (TDATA_WIDTH),
+      .ALMOST_FULL_CUTOFF (FIFO_CUTOFF)
+    ) dla_acl_fifo_inst (
+      .async_resetn   (i_aresetn), // dcfifo will synchronize the reset internally
+      .wr_clock       (clk_dla),
+      .wr_req         (adapted_valid && !wr_full),
+      .wr_data        (adapted_data),
+      .wr_almost_full (), // early indication to upstream that soon fifo may no longer be able to accept data, threshold controlled by ALMOST_FULL_CUTOFF
+      .wr_full        (wr_full), // inform upstream that we cannot accept data
+
+      .rd_clock       (clk_axi),
+      .rd_empty       (rd_empty), // advertise to downstream that fifo is empty, a read only occurs when ~rd_empty & rd_ack
+      .rd_ack         (ostreamer_downstream_ready), // read acknowledge from downstream, ignored when fifo is empty -- this is like an active low backpressure from downstream
+      .rd_data        (ostreamer_output_data)
+    );
+    assign output_tx_received = ~rd_empty & ostreamer_downstream_ready;
+    assign output_valid = ~rd_empty;
+  end
+
+  // Convert FIFO interface to AXI signals
+  logic flush_exit_fifo; // signal used to flush the fifo out of any leftover invalid transactions
+  logic [TSTRB_WIDTH-1:0] strb_signal_normal, strb_signal_last;
+
+  assign o_axi_t_valid = output_valid & config_is_loaded & ((o_axi_t_strb != '0)) & ~flush_exit_fifo;
+  assign ostreamer_downstream_ready = (i_axi_t_ready & config_is_loaded) || (flush_exit_fifo);
+  assign o_axi_t_data = ostreamer_output_data;
+  assign o_axi_t_last = (total_counter_out == (config_total_transfers_adjusted - 1)) &
+                        output_valid & config_is_loaded & ostreamer_downstream_ready & config_last_stream;
+
+
+  // Control logic to produce the o_axi_t_strb signal
+  assign strb_signal_normal = '1;
+  assign strb_signal_last = ((1 << config_valid_bytes_stream_width) - 1);
+  assign o_axi_t_strb = flush_exit_fifo ? '0 :
+                        channel_chunks_counter > config_last_index ? '0 :
+                        channel_chunks_counter == config_last_index ? strb_signal_last :
+                        strb_signal_normal;
+  //
+  // State machine that selects between streaming beats from the exit fifo (ACTIVE) and draining it (FLUSH).
+  // Draining is needed when the tail of a layer coming from the xbar carries no actual data (all zeros):
+  // tlast must accompany the last beat that holds valid data, and the trailing invalid beats must then be
+  // removed from the exit fifo without being presented on the AXI stream.
+  //
+  // For example, if cvec=32 elements, axi=8 elements (128 bits), and channels=6, each cvec produces 4 axi
+  // transactions. In the last cvec transaction (4 axi transactions), the first carries valid data and the
+  // remaining three carry zeros. Up until the 2024.3 release, tlast was produced at the last axi transaction
+  // (#4), which is not efficient since the last valid transaction happens three transactions earlier. With
+  // the state machine, tlast is produced with the first of the last four transactions, and the FLUSH state
+  // then empties the exit fifo of the last three invalid transactions.
+  //
+  // Both state changes are qualified with output_tx_received so that a state is only left on the clock edge
+  // where its final beat is actually transferred. Otherwise a stalled or not-yet-available last valid beat
+  // would be drained in FLUSH instead of being sent, and o_axi_t_valid would drop before the handshake.
+  typedef enum logic {
+    ACTIVE = 1'b0,
+    FLUSH = 1'b1
+  } state_t;
+  state_t state, state_next;
+
+  always_ff @(posedge clk_axi) begin
+    if (~sync_axi_resetn[0]) begin
+      state <= ACTIVE;
+    end else begin
+      state <= state_next;
+    end
+  end
+  logic no_exit_fifo_flush_needed;
+
+  assign no_exit_fifo_flush_needed = config_is_loaded && (config_total_transfers == config_total_transfers_adjusted);
+  always_comb begin
+    state_next = state;
+    flush_exit_fifo = 0;
+    case(state)
+      ACTIVE: begin
+        flush_exit_fifo = 0;
+        if (no_exit_fifo_flush_needed) begin
+          state_next = ACTIVE;
+        end else if ((total_counter_out == config_total_transfers_adjusted - 1) && config_is_loaded && output_tx_received) begin
+          state_next = FLUSH; // the last valid beat is being transferred in this cycle
+        end
+      end
+      FLUSH: begin
+        flush_exit_fifo = 1;
+        if ((total_counter_out == config_total_transfers - 1) && config_is_loaded && output_tx_received) begin
+          state_next = ACTIVE; // the last invalid beat is being drained in this cycle
+        end
+      end
+      default: state_next = ACTIVE; // Default state
+    endcase
+  end
+
+endmodule
