1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
|
// Copyright 2024 Intel Corporation.
//
// This software and the related documents are Intel copyrighted materials,
// and your use of them is governed by the express license under which they
// were provided to you ("License"). Unless the License provides otherwise,
// you may not use, modify, copy, publish, distribute, disclose or transmit
// this software or the related documents without Intel's prior written
// permission.
//
// This software and the related documents are provided as is, with no express
// or implied warranties, other than those that are expressly stated in the
// License.
/**
* dla_input_streamer.sv
*
* FPGA AI Suite input streaming is handled here. This provides an AXI interface to the top-level entity,
* and is responsible for clock-crossing from AXI to DLA clock domains. This module is also responsible for
* applying the layout transform to incoming data.
*
* For now, enabling streaming implies enabling the hardware layout transform module.
* The layout transform assumes input tensors in HWC format. The input bus width is arbitrary; width
* conversion is done in the layout transform, whose output is always CVEC*sizeof(fp16).
*
* Flow control is handled by the configuration of the stream-buffer writer. Backpressuring from the SB
* writer will be propagated to this AXI connection to avoid the need to configure the transfer frame sizes
* in this module.
*
*/
/**
TODO (arooney):
- Consider behaviour when a frame is done, it gets consumed by the SB, and the
LT can accept a few frames before backpressuring. But then the producer is a few packets into
the transmission. Maybe it's best to only accept data when the layout transform is done AND the
SB is ready.
- Implement strobe signal handling.
- Remove unused AXI signals.
*/
// Clear any compiler-directive and macro state left by previously compiled files, then
// require every net in this file to be declared explicitly.
`resetall
`undefineall
`default_nettype none
// dla_input_streamer
//
// Structural wrapper; it contains no sequential logic of its own. In order of data flow it wires:
//   1. two dla_reset_handler_simple instances (one on clk_axi, one on clk_dla),
//   2. dla_streamer_fsm, which produces ready_input_state and o_reading_first_word,
//   3. dla_layout_transform (clocked by clk_axi), which consumes i_tdata and the config stream,
//   4. dla_acl_dcfifo, written on clk_axi and read on clk_dla, which drives o_istream_data,
//   5. dla_clock_cross_full_sync, which carries the layout transform's param-error flag
//      from clk_axi to clk_ddr.
//
// Parameters (none has a default, so every instantiation must override all of them):
//   TDATA_WIDTH  - bit width of i_tdata; i_tstrb/i_tkeep are TDATA_WIDTH/8 bits and the layout
//                  transform is given DDR_BYTES = TDATA_WIDTH/8.
//   FIFO_DEPTH   - forwarded as DEPTH of the clock-crossing FIFO.
//   TID_WIDTH, TDEST_WIDTH, TUSER_WIDTH - widths of the i_tid / i_tdest / i_tuser ports.
//   LT_ARCH      - dla_lt_pkg::lt_arch_t value whose fields are forwarded one-to-one to the
//                  dla_layout_transform parameters; LT_ARCH.CONFIG_BYTES also sizes i_config_data.
//   OUTPUT_WIDTH - bit width of the layout transform output, the FIFO payload and o_istream_data.
module dla_input_streamer
#(
parameter int TDATA_WIDTH,
parameter int FIFO_DEPTH,
parameter int TID_WIDTH,
parameter int TDEST_WIDTH,
parameter int TUSER_WIDTH,
parameter dla_lt_pkg::lt_arch_t LT_ARCH,
parameter int OUTPUT_WIDTH
) (
// clk_dla: read side of the clock-crossing FIFO; also passed to the streamer FSM.
input wire clk_dla,
// clk_ddr: used only as the destination clock of the o_param_error synchronizer.
input wire clk_ddr,
// AMBA AXI-Stream signals
// clk_axi: clocks the layout transform and the write side of the clock-crossing FIFO.
input wire clk_axi,
// Asynchronous reset input; synchronized locally per clock domain by the reset handlers below.
input wire i_resetn_async,
// Configuration stream; passed straight through to the layout transform.
input wire [LT_ARCH.CONFIG_BYTES*8-1:0] i_config_data,
input wire i_config_valid,
output logic o_config_ready,
// Forwarded to dla_streamer_fsm only.
input wire i_streaming_enable,
input wire i_tvalid, // indicates the transmitter is driving a valid transfer
output logic o_tready, // indicates that the receiver can accept a transfer
input wire [TDATA_WIDTH-1:0] i_tdata, // the primary payload of the interface
// The ports marked (NOT USED) below are not connected to anything inside this module.
input wire [TDATA_WIDTH/8-1:0] i_tstrb, // (NOT USED) byte qualifier indicating whether the
// associated byte in tdata should be processed
// as a data, or position byte
input wire [TDATA_WIDTH/8-1:0] i_tkeep, // (NOT USED) byte qualifier indicating whether the
// contents of tdata is processed as part of the data stream
input wire i_tlast, // (NOT USED) indicates the boundary of a packet
input wire [TID_WIDTH-1:0] i_tid, // (NOT USED) a data stream identifier
input wire [TDEST_WIDTH-1:0] i_tdest, // (NOT USED) provides routing information for the data stream
input wire [TUSER_WIDTH-1:0] i_tuser, // (NOT USED) user-defined sideband information
input wire i_twakeup, // (AXI5-S ONLY, NOT USED) identifies any activity associated with the AXI-s interface
// output
// Read side of the clock-crossing FIFO (clk_dla).
output logic [OUTPUT_WIDTH-1:0] o_istream_data,
output logic o_istream_valid,
input wire i_istream_ready, // from input feeder
output logic o_reading_first_word, // for CSR active-jobs counter
// Layout transform parameter-error flag after crossing into clk_ddr.
output logic o_param_error
);
// resetn: output of the clk_axi reset handler; resets the layout transform and the streamer FSM.
logic resetn;
// resetn_clk_dla: output of the clk_dla reset handler.
// NOTE(review): this signal is driven but never read inside this module.
logic resetn_clk_dla;
// reader_empty: rd_empty of the clock-crossing FIFO; inverted to form o_istream_valid.
logic reader_empty;
// Layout transform output (clk_axi), i.e. the write side of the clock-crossing FIFO.
logic [OUTPUT_WIDTH-1:0] dcfifo_data;
logic dcfifo_valid, dcfifo_stall;
// lt_done: o_last of the layout transform; reported to the streamer FSM as i_lt_done_frame.
logic lt_done;
logic ready_input_state; // state-based input ready signal that accounts for inter-frame back-pressure
logic lt_ready; // ready signal from layout transform, accounts for intra-frame back-pressure
// axi_param_error: o_param_error of the layout transform, before the crossing to clk_ddr.
logic axi_param_error;
//reset parameterization
localparam int RESET_USE_SYNCHRONIZER = 1;
localparam int RESET_PIPE_DEPTH = 3;
localparam int RESET_NUM_COPIES = 1;
// Reset handler for the clk_axi domain: i_resetn_async in, resetn out.
dla_reset_handler_simple #(
.USE_SYNCHRONIZER (RESET_USE_SYNCHRONIZER),
.PIPE_DEPTH (RESET_PIPE_DEPTH),
.NUM_COPIES (RESET_NUM_COPIES)
)
istream_reset_synchronizer
(
.clk (clk_axi),
.i_resetn (i_resetn_async),
.o_sclrn (resetn)
);
// Reset handler for the clk_dla domain, with the same parameterization as the clk_axi one.
dla_reset_handler_simple #(
.USE_SYNCHRONIZER (RESET_USE_SYNCHRONIZER),
.PIPE_DEPTH (RESET_PIPE_DEPTH),
.NUM_COPIES (RESET_NUM_COPIES)
)
istream_clk_dla_reset_synchronizer
(
.clk (clk_dla),
.i_resetn (i_resetn_async),
.o_sclrn (resetn_clk_dla)
);
// Streamer FSM: takes readiness from the input feeder (i_istream_ready) and the layout
// transform (lt_ready), plus the end-of-frame indication (lt_done), and produces
// ready_input_state, which gates both o_tready and the layout transform's i_valid.
// It receives both clocks, the synchronized clk_axi reset and the raw asynchronous reset.
dla_streamer_fsm streamer_fsm (
.clk_dla (clk_dla),
.clk_axi (clk_axi),
.i_resetn_axi (resetn),
.i_resetn_async (i_resetn_async),
.i_dla_ready (i_istream_ready),
.i_lt_ready (lt_ready),
.i_streaming_enable (i_streaming_enable),
.i_lt_done_frame (lt_done),
.i_tvalid (i_tvalid),
.o_stream_ready (ready_input_state),
.o_reading_first_word (o_reading_first_word)
);
// accept new data when LT and input feeder are both ready. This should translate to
// only accepting data when we're prepared to accept a whole image (as opposed to accepting
// a couple transfers until LT is full, then waiting for previous inference, then accepting the rest,
// since this would probably complicate frame dropping).
//
// The layout transform runs entirely on clk_axi. Its i_valid is i_tvalid masked by
// ready_input_state, so it sees no valid input while the FSM holds the stream closed.
// CNT_BITS is fixed at 20 here; every other parameter comes from TDATA_WIDTH or LT_ARCH.
dla_layout_transform #(
.CNT_BITS(20),
.DDR_BYTES(TDATA_WIDTH/8),
.CONFIG_DATA_BYTES(LT_ARCH.CONFIG_BYTES),
.DATA_ELEMENT_WIDTH(LT_ARCH.DATA_ELEMENT_WIDTH),
.MAX_CHANNELS(LT_ARCH.MAX_CHANNELS),
.MAX_FEATURE_HEIGHT(LT_ARCH.MAX_FEATURE_HEIGHT),
.MAX_FEATURE_WIDTH(LT_ARCH.MAX_FEATURE_WIDTH),
.MAX_FEATURE_DEPTH(LT_ARCH.MAX_FEATURE_DEPTH),
.MAX_STRIDE_HEIGHT(LT_ARCH.MAX_STRIDE_HEIGHT),
.MAX_STRIDE_WIDTH(LT_ARCH.MAX_STRIDE_WIDTH),
.MAX_STRIDE_DEPTH(LT_ARCH.MAX_STRIDE_DEPTH),
.CVEC(LT_ARCH.CVEC),
.MAX_PAD_FRONT(LT_ARCH.MAX_PAD_FRONT),
.MAX_PAD_LEFT(LT_ARCH.MAX_PAD_LEFT),
.MAX_PAD_TOP(LT_ARCH.MAX_PAD_TOP),
.MAX_FILTER_WIDTH(LT_ARCH.MAX_FILTER_WIDTH),
.MAX_FILTER_HEIGHT(LT_ARCH.MAX_FILTER_HEIGHT),
.MAX_FILTER_DEPTH(LT_ARCH.MAX_FILTER_DEPTH),
.MAX_DILATION_WIDTH(LT_ARCH.MAX_DILATION_WIDTH),
.MAX_DILATION_HEIGHT(LT_ARCH.MAX_DILATION_HEIGHT),
.MAX_DILATION_DEPTH(LT_ARCH.MAX_DILATION_DEPTH),
.DO_U8_CONV(LT_ARCH.DO_U8_CONV),
.DEVICE(LT_ARCH.DEVICE)
) reader_layout_transform (
.clk(clk_axi),
.i_rstn(resetn),
.i_config_data(i_config_data),
.i_config_valid(i_config_valid),
.o_config_ready(o_config_ready),
.i_data(i_tdata),
.i_valid(i_tvalid & ready_input_state),
.o_ready(lt_ready),
.o_data(dcfifo_data),
.o_valid(dcfifo_valid),
.i_stall(dcfifo_stall),
.o_last(lt_done),
.o_param_error(axi_param_error)
);
// Clock-crossing FIFO from clk_axi (layout transform output) to clk_dla (input feeder).
// Its wr_almost_full output is the stall input of the layout transform.
// NOTE(review): with ALMOST_FULL_CUTOFF = 0, wr_almost_full presumably behaves as a plain
// "full" flag - confirm against dla_acl_dcfifo.
localparam int DCFIFO_ALMOST_FULL_CUTOFF = 0;
dla_acl_dcfifo #(
.WIDTH (OUTPUT_WIDTH),
.DEPTH (FIFO_DEPTH),
.ALMOST_FULL_CUTOFF (DCFIFO_ALMOST_FULL_CUTOFF)
)
clock_crosser
(
.async_resetn (i_resetn_async), //reset synchronization is handled internally
//write side
.wr_clock (clk_axi),
.wr_req (dcfifo_valid),
.wr_data (dcfifo_data),
.wr_almost_full (dcfifo_stall),
//read side
// NOTE(review): rd_ack is driven by i_istream_ready alone, not qualified with ~reader_empty;
// this assumes dla_acl_dcfifo ignores rd_ack while it is empty - confirm.
.rd_clock (clk_dla),
.rd_empty (reader_empty),
.rd_data (o_istream_data),
.rd_ack (i_istream_ready)
);
// Carries the layout transform's param-error flag from clk_axi to clk_ddr. The source-side
// output is left unconnected.
// NOTE(review): both reset inputs are tied to constant 1'b1, which (given the *_resetn names)
// presumably means this synchronizer is never reset - confirm that this is intended.
dla_clock_cross_full_sync cc_param_error (
.clk_src(clk_axi),
.i_src_async_resetn(1'b1),
.i_src_data(axi_param_error),
.o_src_data(),
.clk_dst(clk_ddr),
.i_dst_async_resetn(1'b1),
.o_dst_data(o_param_error)
);
// Output data is valid whenever the clock-crossing FIFO is not empty.
assign o_istream_valid = ~reader_empty;
// AXI-Stream ready: the layout transform can take a word AND the FSM has opened the stream.
assign o_tready = lt_ready & ready_input_state;
endmodule
|