1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
|
// Copyright 2020-2024 Intel Corporation.
//
// This software and the related documents are Intel copyrighted materials,
// and your use of them is governed by the express license under which they
// were provided to you ("License"). Unless the License provides otherwise,
// you may not use, modify, copy, publish, distribute, disclose or transmit
// this software or the related documents without Intel's prior written
// permission.
//
// This software and the related documents are provided as is, with no express
// or implied warranties, other than those that are expressly stated in the
// License.
// Description of functionality:
// This module is responsible for receiving DLA data from the cross bar (essentially coming from PE array)
// in HWC format and dispatches it to an AXI stream interface
// This module is fed by width adapter that converts between xbar_k_vec and the AXI bus width (specified at the arch file
// through the output_stream_interface:bus_width)
// Once data is converted to the correct interface width, it gets stored in a dual-clock FIFO which acts
// as a clock crosser between the clk_dla and the clk_axi. It also acts as a conversion from the simple
// ready-valid protocol in DLA to an AXI stream protocol (most of the signals won't be used)
// Some control logic is used to specify which bytes are valid (through the tstrb signal), which will be used
// when the number of output channels is not a multiple of k_vec. For example, if k_vec = c_vec = 8,
// output_channels (O_C) = 12, we will have two transfers of size 8, the first transfer will have 8 valid outputs,
// but the second transfer will have 4 valid outputs and 4 invalid outputs (zeros), so we use the tstrb to indicate
// which bytes (essentially which FP16 elements) are valid, and it's expected to be consumed by the downstream blocks
// (receiver of the AXI signals)
`resetall
`undefineall
`default_nettype none
`include "dla_acl_parameter_assert.svh"
module dla_output_streamer import dla_common_pkg::*, dla_output_streamer_pkg::*; #(
  // DLA (input data) side parameters
  parameter int CONFIG_WIDTH = 32,
  // AXI side parameters
  parameter int TDATA_WIDTH = 128, // an integer number of bits (typically a power of 2 from 8 - 1024)
  parameter int TID_WIDTH = 8,     // recommended to be no more than 8.
  parameter int TDEST_WIDTH = 8,   // recommended to be no more than 8.
  // Data DC FIFO Depth
  parameter int FIFO_DEPTH = 1024,
  parameter int INPUT_WIDTH_ELEMENTS = 1,
  parameter int INPUT_ELEMENT_WIDTH = 1,
  // Decide if width adaptation resides before or after the data CDC FIFO:
  // it is placed before the FIFO only when the input is narrower (in elements) than the AXI bus.
  localparam int INPUT_DATA_BITS = INPUT_WIDTH_ELEMENTS * INPUT_ELEMENT_WIDTH,
  localparam int OUTPUT_WIDTH_ELEMENTS = TDATA_WIDTH / INPUT_ELEMENT_WIDTH,
  localparam int WA_BEFORE_CDC = INPUT_WIDTH_ELEMENTS < OUTPUT_WIDTH_ELEMENTS,
  // AXI side derived parameters (one strobe/user bit per TDATA byte)
  localparam int TSTRB_WIDTH = TDATA_WIDTH / 8,
  localparam int TUSER_WIDTH = TDATA_WIDTH / 8
) (
  // Master/driver (DLA) signals
  input  wire                       clk_dla,
  input  wire                       i_aresetn,
  // config input for output streaming
  input  wire [CONFIG_WIDTH-1:0]    i_config_data,
  input  wire                       i_config_valid,
  output logic                      o_config_ready,
  // input data
  output logic                      o_ready,              // backpressure to xbar
  input  wire                       i_valid,              // valid from xbar
  input  wire [INPUT_DATA_BITS-1:0] i_data,               // data from xbar after width adaptation
  input  wire                       i_data_done,          // data from xbar sent was the last one (the actual last data comes after WA)
  output logic                      o_last_data_received, // got the last data
  // config input for flush handling
  input  wire [CONFIG_WIDTH-1:0]    i_config_flush_data,
  input  wire                       i_config_flush_valid,
  output logic                      o_config_flush_ready,
  // input signals for flush generation
  output wire                       o_input_done,         // xbar input for a layer is done and received
  // Receiver (AXI) signals
  input  wire                       clk_axi,
  input  wire                       i_axi_aresetn,
  output logic                      o_axi_t_valid,        // indicates the Transmitter is driving a valid transfer
  input  wire                       i_axi_t_ready,        // indicates that a Receiver can accept a transfer.
  output wire                       o_axi_t_last,         // indicates the boundary of a packet
  output wire [TDATA_WIDTH-1:0]     o_axi_t_data,         // the primary payload used to provide the data that is passing across the interface
  output wire [TSTRB_WIDTH-1:0]     o_axi_t_strb,         // the byte qualifier that indicates whether the content of the associated byte of TDATA is valid
  output wire [TSTRB_WIDTH-1:0]     o_axi_t_keep,         // Unused
  output wire [TID_WIDTH-1:0]       o_axi_t_id,           // Unused - data stream identifier
  output wire [TDEST_WIDTH-1:0]     o_axi_t_dest,         // Unused - provides routing information for the data stream
  output wire [TUSER_WIDTH-1:0]     o_axi_t_user,         // Unused - user-defined sideband information that can be transmitted along the data stream.
  output wire                       o_axi_t_wakeup        // Unused - identifies any activity associated with AXI-Stream interface
);

  // NOTE(review): i_axi_aresetn is not referenced anywhere in this module; the AXI-side reset
  // synchronizer below is fed from i_aresetn. Confirm this is intentional.
  // NOTE(review): o_axi_t_keep, o_axi_t_id, o_axi_t_dest, o_axi_t_user and o_axi_t_wakeup are
  // declared but never driven in this module. Confirm the integrator ties them off.

  // reset parameterization
  localparam int RESET_USE_SYNCHRONIZER = 1;
  localparam int RESET_PIPE_DEPTH       = 3;
  localparam int RESET_NUM_COPIES       = 1;

  //////////////////////////////////////////
  // Reset Synchronization onto DLA clk   //
  //////////////////////////////////////////
  logic [RESET_NUM_COPIES-1:0] sync_dla_resetn;
  dla_reset_handler_simple #(
    .USE_SYNCHRONIZER (RESET_USE_SYNCHRONIZER),
    .PIPE_DEPTH       (RESET_PIPE_DEPTH),
    .NUM_COPIES       (RESET_NUM_COPIES)
  ) dla_resetn_synchronizer (
    .clk      (clk_dla),
    .i_resetn (i_aresetn),
    .o_sclrn  (sync_dla_resetn)
  );

  //////////////////////////////////////////
  // Reset Synchronization onto AXI clk   //
  //////////////////////////////////////////
  logic [RESET_NUM_COPIES-1:0] sync_axi_resetn;
  dla_reset_handler_simple #(
    .USE_SYNCHRONIZER (RESET_USE_SYNCHRONIZER),
    .PIPE_DEPTH       (RESET_PIPE_DEPTH),
    .NUM_COPIES       (RESET_NUM_COPIES)
  ) axi_resetn_synchronizer (
    .clk      (clk_axi),
    .i_resetn (i_aresetn),
    .o_sclrn  (sync_axi_resetn)
  );

  // Last-data tracking (clk_dla domain).
  // i_data_done arms xbar_sent_last_data; the next accepted input beat (i_valid & o_ready)
  // then produces a single-cycle pulse on received_last_data and disarms the flag.
  logic received_last_data;
  logic xbar_sent_last_data;
  always_ff @ (posedge clk_dla) begin
    received_last_data <= 1'b0; // default: pulse lasts one cycle
    if (i_data_done) begin
      xbar_sent_last_data <= 1'b1;
    end
    if (xbar_sent_last_data & i_valid & o_ready) begin
      received_last_data  <= 1'b1;
      xbar_sent_last_data <= 1'b0;
    end
    // synchronous reset has the final say
    if (~sync_dla_resetn) begin
      received_last_data  <= 1'b0;
      xbar_sent_last_data <= 1'b0;
    end
  end
  assign o_last_data_received = received_last_data;

  logic w_flush;
  // Instantiate the flush generation block; it observes the input handshake and produces
  // w_flush (used by the width adapter when it sits before the CDC FIFO) and o_input_done.
  dla_output_streamer_flush_handler # (
    .CONFIG_WIDTH(CONFIG_WIDTH)
  ) flush_generator (
    .clk_dla        (clk_dla),
    .i_aresetn      (sync_dla_resetn[0]),
    .i_config_data  (i_config_flush_data),
    .i_config_valid (i_config_flush_valid),
    .o_config_ready (o_config_flush_ready),
    .i_ready        (o_ready),
    .i_valid        (i_valid),
    .o_flush        (w_flush),
    .o_input_done   (o_input_done)
  );

  // Handle Config data and strb generation
  // Writing side dla_clk
  logic config_is_loaded;
  logic cfg_rd_empty, cfg_rd_ack, cfg_rd_amost_empty, cfg_wr_almost_full;
  logic [CONFIG_WIDTH-1:0] cfg_rd_data;
  dla_acl_dcfifo #(
    .DEPTH              (32),
    .WIDTH              (CONFIG_WIDTH),
    .ALMOST_FULL_CUTOFF (2)
  ) dla_acl_fifo_inst_cfg (
    .async_resetn    (i_aresetn),          // dcfifo will synchronize the reset internally
    .wr_clock        (clk_dla),
    .wr_req          (i_config_valid),
    .wr_data         (i_config_data),
    .wr_almost_full  (cfg_wr_almost_full), // early indication to upstream that soon fifo may no longer be able to accept data, threshold controlled by ALMOST_FULL_CUTOFF
    .wr_full         (),                   // inform upstream that we cannot accept data
    .rd_clock        (clk_axi),
    .rd_empty        (cfg_rd_empty),       // advertise to downstream that fifo is empty, a read only occurs when ~rd_empty & rd_ack
    .rd_ack          (~config_is_loaded),  // read acknowledge from downstream, ignored when fifo is empty -- this is like an active low backpressure from downstream
    .rd_data         (cfg_rd_data),
    .rd_almost_empty (cfg_rd_amost_empty)  // early indication to downstream that soon fifo may no longer be able to supply data, threshold controlled by ALMOST_EMPTY_CUTOFF
  );
  assign o_config_ready = ~cfg_wr_almost_full;

  // Reading side AXI clock
  logic [CONFIG_WIDTH-1:0] config_offset;
  output_streamer_config_t cfg;
  localparam int NUM_CONFIG_OFFSETS = divCeil($bits(cfg), CONFIG_WIDTH);
  // For now, ensure size of config is exact multiple of CONFIG_WIDTH
  `DLA_ACL_PARAMETER_ASSERT($bits(cfg) == NUM_CONFIG_OFFSETS * CONFIG_WIDTH);

  logic [CONFIG_WIDTH-1:0] config_total_transfers;          // total number of axi_data transfers for a layer
  logic [CONFIG_WIDTH-1:0] config_total_transfers_adjusted; // total number of axi_data transfers for a layer minus any invalid last transactions
  logic [CONFIG_WIDTH-1:0] config_transfers_per_hw_pixel;   // total number of transfers needed to send a full set of output channels
                                                            // for a single width/height pixel given a specific data_width for the axi interface.
  logic [CONFIG_WIDTH-1:0] config_valid_bytes_stream_width; // Determines how many elements of the last transfer are valid.
  logic [CONFIG_WIDTH-1:0] config_last_index;               // Determines index of last valid transaction per height/width.
  logic [CONFIG_WIDTH-1:0] config_last_stream;              // Determines if this stream is the last stream to generate tlast
  logic [CONFIG_WIDTH-1:0] channel_chunks_counter;          // Counter for the config_transfers_per_hw_pixel
  logic [CONFIG_WIDTH-1:0] total_counter_out;               // Number of beats popped from the exit path for the current config
  logic ostreamer_downstream_ready;

  assign config_total_transfers          = cfg.total_transfers;
  assign config_total_transfers_adjusted = cfg.total_transfers_adjusted;
  assign config_transfers_per_hw_pixel   = cfg.transfers_per_hw_pixel;
  assign config_valid_bytes_stream_width = cfg.valid_bytes_stream_width;
  assign config_last_index               = cfg.last_index;
  assign config_last_stream              = cfg.last_stream;

  logic output_valid;       // a beat is available at the exit of the data path
  logic output_tx_received; // a beat is being popped from the exit of the data path this cycle

  always_ff @(posedge clk_axi) begin
    // config state machine
    if (~config_is_loaded & ~cfg_rd_empty) begin
      // update progress in accepting NUM_CONFIG_OFFSETS transactions
      if (config_offset == NUM_CONFIG_OFFSETS-1) begin
        config_offset    <= '0;
        config_is_loaded <= 1'b1;
      end
      else begin
        config_offset <= config_offset + 1'b1;
      end
      // shift the new word in at the top of cfg while moving previously loaded words down
      cfg <= (cfg_rd_data[CONFIG_WIDTH-1:0] << ($bits(cfg) - CONFIG_WIDTH)) | (cfg >> CONFIG_WIDTH);
    end else begin
      // keep track of how many transactions are read by AXI to drive t_strb
      if (config_is_loaded & output_tx_received) begin
        total_counter_out <= total_counter_out + 1;
        if (total_counter_out == (config_total_transfers - 1)) begin
          // last beat of this config: release it so the next config can be loaded
          config_is_loaded  <= 1'b0;
          total_counter_out <= '0;
        end
        channel_chunks_counter <= channel_chunks_counter + 1; // increment counter
        if (channel_chunks_counter == (config_transfers_per_hw_pixel - 1)) begin
          channel_chunks_counter <= '0; // wrap at the end of each height/width pixel
        end
      end
    end
    // resetn
    if (~sync_axi_resetn[0]) begin
      config_is_loaded       <= 1'b0;
      channel_chunks_counter <= '0;
      total_counter_out      <= '0;
      config_offset          <= '0;
    end
  end

  logic [TDATA_WIDTH-1:0] ostreamer_output_data;
  localparam int FIFO_CUTOFF = 0; // No need for slack cycles as the full goes back and gets handled in the same cycle in the width adapter

  if (!WA_BEFORE_CDC) begin : GEN_WA_AFTER_DC_FIFO
    // In this situation we want the width adaptation to happen in the slow clock domain
    // so that the upstream IP can continue producing data that goes straight into the fifo
    localparam int XBAR_WIDTH_BITS = INPUT_WIDTH_ELEMENTS * INPUT_ELEMENT_WIDTH;
    logic [XBAR_WIDTH_BITS-1:0] fifo_data;
    logic fifo_rd_empty, fifo_downstream_ready;
    logic wr_full;
    dla_acl_dcfifo #(
      .DEPTH              (FIFO_DEPTH),
      .WIDTH              (XBAR_WIDTH_BITS),
      .ALMOST_FULL_CUTOFF (FIFO_CUTOFF)
    ) dla_acl_fifo_inst (
      .async_resetn (i_aresetn),             // dcfifo will synchronize the reset internally
      .wr_clock     (clk_dla),
      .wr_req       (i_valid && o_ready),
      .wr_data      (i_data),
      .wr_full      (wr_full),               // inform upstream that we cannot accept data
      .rd_clock     (clk_axi),
      .rd_empty     (fifo_rd_empty),         // advertise to downstream that fifo is empty, a read only occurs when ~rd_empty & rd_ack
      .rd_ack       (fifo_downstream_ready), // read acknowledge from downstream, ignored when fifo is empty -- this is like an active low backpressure from downstream
      .rd_data      (fifo_data)
    );

    logic adapted_valid;
    // Instantiate a width adapter to convert from xbar_k_vec width to AXI width
    dla_width_adapter #(
      .GROUP_NUM                     ( 1 ), // hardcoded
      .GROUP_DELAY                   ( 0 ),
      .INPUT_DATA_WIDTH_IN_ELEMENTS  ( INPUT_WIDTH_ELEMENTS ),
      .OUTPUT_DATA_WIDTH_IN_ELEMENTS ( OUTPUT_WIDTH_ELEMENTS),
      .ELEMENT_WIDTH                 ( INPUT_ELEMENT_WIDTH ),
      .FLUSH_ENABLE                  ( 0 )
    ) wa_output_stream_inst (
      .clock        ( clk_axi ),
      .i_aresetn    ( i_aresetn ),
      .i_flush      ( 1'b0 ),
      .o_din_ready  ( fifo_downstream_ready ),
      .i_din_valid  ( ~fifo_rd_empty ),
      .i_din_data   ( fifo_data ),
      .i_dout_ready ( ostreamer_downstream_ready ), // to be received from output streamer
      .o_dout_valid ( adapted_valid ),
      .o_dout_data  ( ostreamer_output_data )
    );

    assign output_tx_received = adapted_valid & ostreamer_downstream_ready;
    assign output_valid       = adapted_valid;
    // We backpressure the upstream if the fifo is full
    assign o_ready = ~wr_full;
  end else begin: GEN_WA_BEFORE_DC_FIFO
    logic adapted_valid;
    logic [TDATA_WIDTH-1:0] adapted_data;
    logic wr_full; // dc fifo for data after width adaptation
    // Instantiate a width adapter to convert from xbar_k_vec width to AXI width
    dla_width_adapter #(
      .GROUP_NUM                     ( 1 ), // hardcoded
      .GROUP_DELAY                   ( 0 ),
      .INPUT_DATA_WIDTH_IN_ELEMENTS  ( INPUT_WIDTH_ELEMENTS ),
      .OUTPUT_DATA_WIDTH_IN_ELEMENTS ( OUTPUT_WIDTH_ELEMENTS ),
      .ELEMENT_WIDTH                 ( INPUT_ELEMENT_WIDTH ),
      .FLUSH_ENABLE                  ( 1 )
    ) wa_output_stream_inst (
      .clock        ( clk_dla ),
      .i_aresetn    ( i_aresetn ),
      .i_flush      ( w_flush ), // flush only activated with an incoming valid transaction
      .o_din_ready  ( o_ready ),
      .i_din_valid  ( i_valid ),
      .i_din_data   ( i_data ),
      .i_dout_ready ( ~wr_full ), // to be received from output streamer
      .o_dout_valid ( adapted_valid ),
      .o_dout_data  ( adapted_data )
    );

    logic rd_empty;
    // Instantiate the output FIFO to perform clock domain crossing
    dla_acl_dcfifo #(
      .DEPTH              (FIFO_DEPTH),
      .WIDTH              (TDATA_WIDTH),
      .ALMOST_FULL_CUTOFF (FIFO_CUTOFF)
    ) dla_acl_fifo_inst (
      .async_resetn   (i_aresetn),                  // dcfifo will synchronize the reset internally
      .wr_clock       (clk_dla),
      .wr_req         (adapted_valid && !wr_full),
      .wr_data        (adapted_data),
      .wr_almost_full (),                           // early indication to upstream that soon fifo may no longer be able to accept data, threshold controlled by ALMOST_FULL_CUTOFF
      .wr_full        (wr_full),                    // inform upstream that we cannot accept data
      .rd_clock       (clk_axi),
      .rd_empty       (rd_empty),                   // advertise to downstream that fifo is empty, a read only occurs when ~rd_empty & rd_ack
      .rd_ack         (ostreamer_downstream_ready), // read acknowledge from downstream, ignored when fifo is empty -- this is like an active low backpressure from downstream
      .rd_data        (ostreamer_output_data)
    );

    assign output_tx_received = ~rd_empty & ostreamer_downstream_ready;
    assign output_valid       = ~rd_empty;
  end

  // Convert FIFO interface to AXI signals
  logic flush_exit_fifo; // signal used to flush the fifo out of any leftover invalid transactions
  logic [TSTRB_WIDTH-1:0] strb_signal_normal, strb_signal_last;

  // A beat is only presented on AXI when it carries at least one valid byte and we are not flushing.
  assign o_axi_t_valid = output_valid & config_is_loaded & ((o_axi_t_strb != '0)) & ~flush_exit_fifo;
  // While flushing, beats are popped unconditionally (they are never shown to the receiver).
  assign ostreamer_downstream_ready = (i_axi_t_ready & config_is_loaded) || (flush_exit_fifo);
  assign o_axi_t_data = ostreamer_output_data;
  assign o_axi_t_last = (total_counter_out == (config_total_transfers_adjusted - 1)) &
                        output_valid & config_is_loaded & ostreamer_downstream_ready & config_last_stream;

  // Control logic to produce the o_axi_t_strb signal
  assign strb_signal_normal = '1;
  assign strb_signal_last   = ((1 << config_valid_bytes_stream_width) - 1);
  assign o_axi_t_strb = flush_exit_fifo                            ? '0 :
                        channel_chunks_counter >  config_last_index ? '0 :
                        channel_chunks_counter == config_last_index ? strb_signal_last :
                                                                      strb_signal_normal;

  //
  // State machine to decide if the output streamer is producing output from the exit fifo
  // or flushing (emptying) the exit fifo.
  // We need the ability to flush the exit fifo in situations where part of the last transactions
  // from the xbar do not carry actual data (all zeros). In these situations, tlast must come out
  // with the actual last transaction that carries valid data, and the trailing invalid
  // transactions must be removed from the exit fifo.
  //
  // For example, if cvec=32 elements, axi=8 elements (128 bits), and channels=6, each cvec
  // produces 4 axi transactions. In the last cvec transaction (4 axi transactions), the first
  // carries valid data and the remaining three carry zeros.
  // Up until the 2024.3 release, tlast was produced at the last axi transaction (#4), which is
  // not efficient since the last valid transaction happens three transactions earlier. With the
  // state machine, tlast is produced with the first of the last four transactions, and the FLUSH
  // state then empties the exit fifo of the remaining three invalid transactions.
  //
  // Both transitions are qualified with output_tx_received so that the state only advances when
  // the corresponding beat has actually been popped:
  //  - ACTIVE -> FLUSH must wait until the last valid beat is accepted by the receiver. Switching
  //    on the counter value alone would discard that beat (and its tlast) whenever the receiver
  //    is not ready, or the data has not yet arrived, in the cycle the counter reaches
  //    total_transfers_adjusted-1.
  //  - FLUSH -> ACTIVE must wait until the last invalid beat has been drained, otherwise it would
  //    be left for the ACTIVE state to drain, which depends on i_axi_t_ready.
  //
  typedef enum logic {
    ACTIVE = 1'b0,
    FLUSH  = 1'b1
  } state_t;
  state_t state, state_next;

  always_ff @(posedge clk_axi) begin
    if (~sync_axi_resetn[0]) begin
      state <= ACTIVE;
    end else begin
      state <= state_next;
    end
  end

  // The flush indication is a pure decode of the registered state. Keeping it out of the
  // next-state process avoids that process both reading output_tx_received and driving a
  // signal (flush_exit_fifo) that output_tx_received depends on.
  assign flush_exit_fifo = (state == FLUSH);

  logic no_exit_fifo_flush_needed;
  assign no_exit_fifo_flush_needed = config_is_loaded && (config_total_transfers == config_total_transfers_adjusted);

  always_comb begin
    state_next = state;
    case (state)
      ACTIVE: begin
        if (no_exit_fifo_flush_needed) begin
          // every transfer carries valid data, nothing to flush for this config
          state_next = ACTIVE;
        end else if (config_is_loaded && output_tx_received &&
                     (total_counter_out == config_total_transfers_adjusted - 1)) begin
          // last valid beat is being handed over this cycle, the rest is padding
          state_next = FLUSH;
        end
      end
      FLUSH: begin
        if (config_is_loaded && output_tx_received &&
            (total_counter_out == config_total_transfers - 1)) begin
          // last padding beat is being drained this cycle
          state_next = ACTIVE;
        end
      end
      default: state_next = ACTIVE; // Default state
    endcase
  end

endmodule
|