1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
|
// Copyright 2020-2024 Intel Corporation.
//
// This software and the related documents are Intel copyrighted materials,
// and your use of them is governed by the express license under which they
// were provided to you ("License"). Unless the License provides otherwise,
// you may not use, modify, copy, publish, distribute, disclose or transmit
// this software or the related documents without Intel's prior written
// permission.
//
// This software and the related documents are provided as is, with no express
// or implied warranties, other than those that are expressly stated in the
// License.
/**
* dla_layout_transform.sv
*
* Top level of the DLA layout transform (LT) module. The transform can convert u8 data to FP16 (can be disabled)
* and converts DHWC tensors to CDHWCvec (which is the format required by the PE array), it will also
* fold the data into the CVEC dimension whenever the stride dimensions of the first convolution are non-1.
* The folding feature cannot be turned off here - it can be turned off by ensuring that the transform node
* in the compiler has strides of 1. See `dla_pass_folding.cpp` to see how this is done in the compiler.
*
* The main feature of this layout transform is that it can fold input tensor dimensions into the channel dimension
* which improves the efficiency of the PE array. The parameters of the first convolution in the graph are
* required as input to this module. The input tensor is partitioned into volumes equal to the
* STRIDE_HEIGHTxSTRIDE_WIDTHxSTRIDE_DEPTHxCHANNELS of the input convolution.
* The partitioned volume is then copied into one "CVEC" line and output once the CVEC line is complete.
*
* To achieve the folding transform, the DLA layout transform module instantiates the following modules:
* > dla_layout_transform.sv - This module, serves as the interface for users, and instantiates top-level
* signals and submodules.
*
* > dla_lt_conversion.sv - If enabled, converts input data from U8 to FP16 data types.
*
* > dla_lt_dimension_counter.sv - Generates tensor indexes for all tokens in incoming data packet.
*
* > dla_lt_gen_index_info.sv - Uses tensor indexes from the lt_dimension_counter to calculate the mapping
* target output position; this includes which RAM module, RAM line, and position within the RAM line
* (each line holds a CVEC line of output data) each output is mapped to. The memory manager uses this
* data to emplace the incoming data into its position in the RAM.
*
* > dla_lt_memory_manager.sv - Uses the addressing information from the lt_gen_index_info module to emplace
* incoming data into the RAM. The RAM is used to store intermediate results because often, when we fold
* data, we have to buffer an output CVEC for many cycles before all the data becomes available.
* Has these submodules,
* > dla_lt_ram_arb.sv - Arbitrates write requests and read requests from two sources.
* > dla_lt_funnel.sv - Maps data from incoming data packet to the correct position within CVEC in
* a single cycle for all data in the input packet. Uses the indexing info from the lt_gen_index_info
* module.
*
* > dla_lt_output_logic.sv - Keeps track of the number of completed CVEC lines, and writes them to output.
* This module is also responsible for keeping track of the output dimensions and writing padding lines when
* required.
*
*/
`resetall
`undefineall
`default_nettype none
`include "dla_acl_parameter_assert.svh"
// Returns the folded channel count rounded up to a whole number of CVEC lines.
//
// The folded channel count is channels * stride_width * stride_height *
// stride_depth. It is divided by cvec with ceiling rounding and the quotient
// is scaled back by cvec, so the result is always a multiple of cvec.
function int calc_output_channels(
  input int cvec, channels, stride_height, stride_width, stride_depth
);
  integer folded_channels;
  integer cvec_lines;
  // Every element of one stride volume is folded into the channel dimension.
  folded_channels = channels * stride_width * stride_height * stride_depth;
  // Integer ceiling division: number of CVEC lines needed to hold the fold.
  cvec_lines = (folded_channels + cvec - 1) / cvec;
  return cvec_lines * cvec;
endfunction
// Returns the maximum output extent used to size one spatial dimension.
//
// The value is feature_dim + pad_dim + filter_dim. The dilation_dim and
// stride_dim arguments are part of the signature but do not influence the
// result. For reference, the exact convolution formula that is NOT applied is:
//   (feature_dim - ((filter_dim - 1) * dilation_dim + 1) + pad_dim) / stride_dim + 1
function int calc_output_dim_max(
  input int feature_dim, filter_dim, dilation_dim, pad_dim, stride_dim
);
  integer padded_extent;
  // Feature extent including padding.
  padded_extent = feature_dim + pad_dim;
  // Adding the filter extent gives (feature_dim + pad_dim + 1) + filter_dim - 1.
  return padded_extent + filter_dim;
endfunction
// -----------------------------------------------------------------------------
// dla_layout_transform
//
// Top-level wiring of the layout transform. This module instantiates and
// connects:
//   - dla_config_deserialize   (lt_config_deserialize) : drives the
//                               layout_transform_config interface
//   - dla_lt_dimension_counter (dim_counter)           : per-element tensor indexes
//   - dla_lt_data_conversion   (data_conversion)       : only when DO_U8_CONV is set
//   - dla_lt_gen_index_info    (gen_index_info)        : produces addr_queue
//   - dla_lt_memory_manager    (memory_manager)        : buffers partial output lines
//   - dla_lt_output_logic      (output_logic)          : drives o_data/o_valid/o_last
//
// Logic implemented directly in this module:
//   - o_ready / o_stall input handshake (o_stall is the complement of o_ready).
//   - A down-counter (frame_finished) that marks when all input transfers of
//     the current frame have been accepted (done_frame).
//   - An internal reset pulse raised when the last output line is accepted
//     (o_last & o_valid & !i_stall), which resets every submodule and the
//     frame state through resetn_condition.
//
// NOTE(review): o_param_error is declared as an output but is not driven
// anywhere in this module - confirm whether it should be tied off or connected.
// -----------------------------------------------------------------------------
module dla_layout_transform
import dla_common_pkg::*,dla_lt_pkg::*;
#(
// Convolution parameters:
// The MAX_* values are upper bounds; they size counters, index widths and
// buffer counts below and are forwarded to the submodules.
parameter int MAX_CHANNELS =0,
parameter int MAX_FEATURE_HEIGHT=0,
parameter int MAX_FEATURE_WIDTH=0,
parameter int MAX_FEATURE_DEPTH=0,
parameter int MAX_STRIDE_HEIGHT=0,
parameter int MAX_STRIDE_WIDTH=0,
parameter int MAX_STRIDE_DEPTH=0,
parameter int MAX_PAD_FRONT=0,
parameter int MAX_PAD_LEFT=0,
parameter int MAX_PAD_TOP=0,
parameter int MAX_FILTER_WIDTH=4,
parameter int MAX_FILTER_HEIGHT=4,
parameter int MAX_FILTER_DEPTH=4,
// The dilation parameters have no default and must be overridden by the
// instantiating module.
parameter int MAX_DILATION_WIDTH,
parameter int MAX_DILATION_HEIGHT,
parameter int MAX_DILATION_DEPTH,
// Exact parameters
parameter int CVEC=0,                 // number of elements in one o_data line
parameter bit DO_U8_CONV=1,           // 1: instantiate dla_lt_data_conversion; 0: pass i_data through a register
parameter int DATA_ELEMENT_WIDTH = 32, // bits per input element; must be 8 or 16 (see parameter assert below)
parameter int CNT_BITS = 32,          // width of each addr_queue field; forwarded to submodules
parameter int DDR_BYTES = 4,          // i_data is 8*DDR_BYTES bits wide
parameter int CONFIG_DATA_BYTES,      // i_config_data is 8*CONFIG_DATA_BYTES bits wide; no default
device_family_t DEVICE,               // forwarded to dla_lt_memory_manager
// Derived Params
// Bits needed to index the padded feature volume multiplied by CVEC.
localparam int MAX_DIM_BITS = $clog2((MAX_FEATURE_DEPTH + MAX_PAD_FRONT) * (MAX_FEATURE_HEIGHT + MAX_PAD_TOP) * (MAX_FEATURE_WIDTH + MAX_PAD_LEFT) * CVEC),
localparam int unsigned MAX_INPUT_VOLUME = MAX_CHANNELS * MAX_FEATURE_WIDTH * MAX_FEATURE_HEIGHT * MAX_FEATURE_DEPTH,
//todo: Capitalize constants...
// Number of input elements carried by one i_data word.
localparam int unsigned ELEM_PER_DDR = (DDR_BYTES*8)/DATA_ELEMENT_WIDTH,
localparam int unsigned OUTPUT_DATA_WIDTH = 16,
// Number of i_data words needed for the largest input volume (ceiling division).
localparam int MAX_TRANSFERS = (MAX_INPUT_VOLUME + ELEM_PER_DDR -1) / ELEM_PER_DDR
) (
// Module connections
input wire clk,
input wire i_rstn,                                   // active-low reset (combined with internal_reset below)
// Configuration stream, consumed by dla_config_deserialize.
input wire [CONFIG_DATA_BYTES*8-1:0] i_config_data,
input wire i_config_valid,
output logic o_config_ready,
// Input data stream; a word is accepted when i_valid & o_ready.
input wire [8*DDR_BYTES-1:0] i_data,
input wire i_valid,
input wire i_stall,                                  // downstream stall for the output stream
output logic o_ready,
output logic o_stall,                                // complement of o_ready
// Output stream, driven by dla_lt_output_logic.
output logic [CVEC-1:0][OUTPUT_DATA_WIDTH-1:0] o_data,
output logic o_valid,
output logic o_last,
output logic o_param_error                           // NOTE(review): never driven in this module
);
`DLA_ACL_PARAMETER_ASSERT((DO_U8_CONV == 0 && DATA_ELEMENT_WIDTH == 16) || (DO_U8_CONV == 1 && DATA_ELEMENT_WIDTH == 8)); // only supported combinations currently.
// Maximum output tensor dimensions, derived from the MAX_* parameters.
localparam int unsigned MAX_OUTPUT_C = calc_output_channels(CVEC, MAX_CHANNELS, MAX_STRIDE_HEIGHT, MAX_STRIDE_WIDTH, MAX_STRIDE_DEPTH);
localparam int unsigned MAX_OUTPUT_W = calc_output_dim_max(MAX_FEATURE_WIDTH, MAX_FILTER_WIDTH, MAX_DILATION_WIDTH, MAX_PAD_LEFT, MAX_STRIDE_WIDTH);
localparam int unsigned MAX_OUTPUT_H = calc_output_dim_max(MAX_FEATURE_HEIGHT, MAX_FILTER_HEIGHT, MAX_DILATION_HEIGHT, MAX_PAD_TOP, MAX_STRIDE_HEIGHT);
localparam int unsigned MAX_OUTPUT_D = calc_output_dim_max(MAX_FEATURE_DEPTH, MAX_FILTER_DEPTH, MAX_DILATION_DEPTH, MAX_PAD_FRONT, MAX_STRIDE_DEPTH);
// Number of CVEC lines per output position beyond the first (0 when MAX_OUTPUT_C fits in one CVEC).
localparam int unsigned MAX_INNER_ROWS = MAX_OUTPUT_C > CVEC ? MAX_OUTPUT_C/CVEC-1 : 0;
// NOTE(review): MAX_OUTPUT_VOLUME omits MAX_OUTPUT_D and is not referenced elsewhere in this module.
localparam int unsigned MAX_OUTPUT_VOLUME = MAX_OUTPUT_W * MAX_OUTPUT_H * MAX_OUTPUT_C;
// if CHANNELS > ELEM_PER_DDR then only 2 CVECs are modified per cycle in the worst case. Bump up to 4 to reduce congestion; this value heavily affects area!
// Note on precedence: the "+ 20" binds to the calc_max_partitions() branch only, not to the constant 4.
localparam shortint unsigned max_num_partitions = MAX_CHANNELS > ELEM_PER_DDR ? 4 : calc_max_partitions(
MAX_FEATURE_HEIGHT, MAX_FEATURE_WIDTH, MAX_CHANNELS, MAX_STRIDE_HEIGHT, MAX_STRIDE_WIDTH, ELEM_PER_DDR
) + 20; // TODO: The max_partition calculation does a pretty good job, but there are some edge conditions that are not accounted for. Review. For now, +N works.
localparam integer num_buffers = (max_num_partitions)+(MAX_FEATURE_WIDTH/MAX_STRIDE_WIDTH) + MAX_INNER_ROWS*(MAX_FEATURE_WIDTH/MAX_STRIDE_WIDTH)*(MAX_FEATURE_HEIGHT/MAX_STRIDE_HEIGHT)*(MAX_FEATURE_DEPTH/MAX_STRIDE_DEPTH);// minimum number of buffers!!!
// Pool count is max_num_partitions rounded up to a power of two.
localparam int n_pool_bits = $clog2((max_num_partitions));
localparam int n_buffer_pools = $rtoi($pow(2, n_pool_bits));
// Lines per pool: ceil(num_buffers / max_num_partitions) + 1 + 30, rounded up to a power of two.
// NOTE(review): the +30 margin (and the *7 below) look like empirical constants - confirm their origin.
localparam int cvec_per_buffer = $rtoi($pow(2, $clog2(($rtoi($ceil((num_buffers*1.0)/(max_num_partitions)))+1) + 30))); // round to nearest power of 2 since we mod by the value in a few places.
localparam int total_buffers = n_buffer_pools * cvec_per_buffer;
localparam int buffers_in_progress = max_num_partitions*7;
// Threshold compared against buffer_usage to throttle the input (see next_transfer_overflow).
localparam int available = total_buffers - buffers_in_progress;
// Count of output lines written, reported by output_logic (o_lines_written).
logic [$clog2((MAX_OUTPUT_W * MAX_OUTPUT_H * MAX_OUTPUT_D * (MAX_OUTPUT_C/CVEC)))-1:0] lines_written;
// completed_vol_tally[0]: gen_index_info -> memory_manager; completed_vol_tally[1]: memory_manager -> output_logic.
logic [ELEM_PER_DDR-1:0] completed_vol_tally [1:0];
// Per-pool, per-element addressing record produced by gen_index_info and consumed by memory_manager.
// One CNT_BITS-wide field per member of the address_info_e enum.
logic [address_info_e.num()-1:0][CNT_BITS-1:0] addr_queue [n_buffer_pools-1:0][ELEM_PER_DDR-1:0];
logic ready_for_config;        // cleared once a valid config is seen; set again only by reset
logic ready_for_transfer;      // from gen_index_info (o_ready_for_transfer)
logic input_data_valid;        // valid qualifier for fp16_val
logic cnt_ready;               // from dim_counter (o_ready), forwarded to gen_index_info
logic next_transfer_overflow;  // high when buffer_usage has reached the 'available' threshold
logic internal_reset;          // pulses when the last output line is accepted
logic resetn_condition, start_rx, done_frame;
logic config_ready;            // from the config deserializer
// One bit wider than $clog2(MAX_TRANSFERS); the extra MSB is sampled into done_frame.
// NOTE(review): transfer_count is declared but not used in this module.
logic [$clog2(MAX_TRANSFERS):0] frame_finished, transfer_count;
layout_transform_config_if layout_transform_config();
shortint unsigned finished_lines_reg; // from output_logic (o_finished_lines)
int buffer_usage;
// counters used by dimension counter logic:
// One entry per element of the incoming i_data word. Widths are clog2 of the
// corresponding MAX_* parameter, with max(..., 1) keeping at least one bit.
logic [max($clog2(MAX_CHANNELS), 1)-1:0] C_d [ELEM_PER_DDR-1:0];
logic [max($clog2(MAX_FEATURE_WIDTH), 1)-1:0] W_d [ELEM_PER_DDR-1:0];
logic [max($clog2(MAX_STRIDE_WIDTH), 1)-1:0] IN_W_d [ELEM_PER_DDR-1:0];
logic [max($clog2(MAX_FEATURE_WIDTH), 1)-1:0] S_W_d [ELEM_PER_DDR-1:0];
logic [max($clog2(MAX_FEATURE_HEIGHT), 1)-1:0] H_d [ELEM_PER_DDR-1:0];
logic [max($clog2(MAX_STRIDE_HEIGHT), 1)-1:0] IN_H_d [ELEM_PER_DDR-1:0];
logic [max($clog2(MAX_FEATURE_HEIGHT), 1)-1:0] S_H_d [ELEM_PER_DDR-1:0];
logic [max($clog2(MAX_FEATURE_DEPTH), 1)-1:0] D_d [ELEM_PER_DDR-1:0];
logic [max($clog2(MAX_STRIDE_DEPTH), 1)-1:0] IN_D_d [ELEM_PER_DDR-1:0];
logic [max($clog2(MAX_FEATURE_DEPTH), 1)-1:0] S_D_d [ELEM_PER_DDR-1:0];
logic [MAX_DIM_BITS-1:0] Index_d [ELEM_PER_DDR-1:0];
// Effective active-low reset: asserted by the external reset or by the
// end-of-frame internal_reset pulse.
assign resetn_condition = i_rstn & !internal_reset;
// A new config is accepted only when the deserializer is ready and no config
// has been latched since the last reset.
assign o_config_ready = config_ready & ready_for_config;
// Config deserializer: drives the layout_transform_config interface from the
// i_config_data / i_config_valid stream.
dla_config_deserialize #(
.CONFIG_WIDTH(CONFIG_DATA_BYTES*8)
) lt_config_deserialize (
.clk(clk),
.i_resetn(resetn_condition),
.i_valid(i_config_valid),
.i_config(i_config_data),
.o_ready(config_ready),
.if_config(layout_transform_config)
);
// Frame tracking. frame_finished is a down-counter of input transfers still
// expected; done_frame is set from its MSB. Reset is synchronous and, being
// the last assignment in the block, overrides the branches above it.
always_ff @( posedge clk ) begin : latch_last_frame
frame_finished <= frame_finished;
if (i_valid & o_ready & ~done_frame) begin
// We've started to accept output for this inference. This
// signals to the output logic that we can start writing outputs.
start_rx <= 1;
frame_finished <= frame_finished - 1;
// Non-blocking: samples the MSB of frame_finished as it was before this
// cycle's decrement; the MSB is used here as the underflow flag.
done_frame <= frame_finished[$clog2(MAX_TRANSFERS)];
end else if (layout_transform_config.valid & ~start_rx) begin
// Don't accept a new config until we're finished with this transform.
ready_for_config <= 1'b0;
// Load the counter with ceil(feature_volume / ELEM_PER_DDR) - 2.
frame_finished <= ((layout_transform_config.data.feature_volume + ELEM_PER_DDR - 1) / ELEM_PER_DDR) - 2;
end
if (~resetn_condition) begin
start_rx <= 0;
frame_finished <= MAX_TRANSFERS-2;
done_frame <= 0;
ready_for_config <= 1'b1;
end
end
// Lines finished but not yet written out, as reported by output_logic.
assign buffer_usage = (finished_lines_reg - lines_written);
assign next_transfer_overflow = available <= buffer_usage; // may need to pipeline this. Maybe change to 'almost full signal'.
// Input handshake: accept data only when the index generator is ready, the
// buffers are below the threshold and the frame's input is not yet complete.
// o_stall is the exact complement of o_ready.
assign o_ready = ready_for_transfer == 1'b1 & next_transfer_overflow == 1'b0 & !done_frame;
assign o_stall = ready_for_transfer == 1'b0 | next_transfer_overflow == 1'b1 | done_frame;
// End of frame: the last output line is valid and not stalled downstream.
assign internal_reset = o_last & o_valid & !i_stall;
// Dimension counter: Keeps track of position within tensor of incoming data.
// It is incremented on valid input data, and also whenever gen_index_info is
// not ready for a transfer.
dla_lt_dimension_counter #(
.ELEMENTS_PER_CYCLE(ELEM_PER_DDR),
.MAX_CHANNELS(MAX_CHANNELS),
.MAX_FEATURE_WIDTH(MAX_FEATURE_WIDTH),
.MAX_FEATURE_HEIGHT(MAX_FEATURE_HEIGHT),
.MAX_FEATURE_DEPTH(MAX_FEATURE_DEPTH),
.MAX_STRIDE_WIDTH(MAX_STRIDE_WIDTH),
.MAX_STRIDE_HEIGHT(MAX_STRIDE_HEIGHT),
.MAX_STRIDE_DEPTH(MAX_STRIDE_DEPTH),
.MAX_INPUT_VOLUME(MAX_INPUT_VOLUME),
.MAX_DIM_BITS(MAX_DIM_BITS)
) dim_counter (
.clk(clk),
.i_rstn(resetn_condition),
.i_increment(input_data_valid | ~ready_for_transfer),
.if_lt_config(layout_transform_config),
.o_ready(cnt_ready),
.o_c_dim(C_d),
.o_w_dim(W_d),
.o_h_dim(H_d),
.o_d_dim(D_d),
.o_w_inner(IN_W_d),
.o_h_inner(IN_H_d),
.o_d_inner(IN_D_d),
.o_w_stride(S_W_d),
.o_h_stride(S_H_d),
.o_d_stride(S_D_d),
.o_index(Index_d)
);
// TODO(arooney): add more conversions
// 16-bit-per-element view of the incoming word, consumed by memory_manager.
logic [ELEM_PER_DDR-1:0][15:0] fp16_val;
// Generate: with DO_U8_CONV the conversion module produces both fp16_val and
// input_data_valid; otherwise i_data is registered straight into fp16_val.
if (DO_U8_CONV) begin
dla_lt_data_conversion #(
.DDR_BYTES(DDR_BYTES),
.DATA_ELEMENT_WIDTH(DATA_ELEMENT_WIDTH),
.ELEMENTS_PER_CYCLE(ELEM_PER_DDR)
) data_conversion (
.clk(clk),
.i_valid(i_valid & o_ready),
.i_data(i_data),
.o_fp16_val(fp16_val),
.o_valid(input_data_valid)
);
end
else begin
// NOTE(review): input_data_valid is combinational here while fp16_val is
// registered (one clock later) - presumably the downstream pipeline accounts
// for this offset; confirm against gen_index_info / memory_manager latency.
assign input_data_valid = i_valid & o_ready;
always_ff @(posedge clk) begin
fp16_val <= i_data;
end
end
// Index generator: consumes the dimension-counter outputs and produces the
// addr_queue records, the completed-volume tally and ready_for_transfer.
dla_lt_gen_index_info #(
.ELEMENTS_PER_CYCLE(ELEM_PER_DDR),
.N_BUFFER_POOLS(n_buffer_pools),
.CVEC_PER_BUFFER(cvec_per_buffer),
.N_POOL_BITS(n_pool_bits),
.CNT_BITS(CNT_BITS),
.MAX_CHANNELS(MAX_CHANNELS),
.CVEC(CVEC),
.MAX_FEATURE_WIDTH(MAX_FEATURE_WIDTH),
.MAX_FEATURE_HEIGHT(MAX_FEATURE_HEIGHT),
.MAX_FEATURE_DEPTH(MAX_FEATURE_DEPTH),
.MAX_STRIDE_WIDTH(MAX_STRIDE_WIDTH),
.MAX_STRIDE_HEIGHT(MAX_STRIDE_HEIGHT),
.MAX_STRIDE_DEPTH(MAX_STRIDE_DEPTH),
.MAX_DIM_BITS(MAX_DIM_BITS),
.MAX_INPUT_VOLUME(MAX_INPUT_VOLUME)
) gen_index_info (
.clk(clk),
.i_rstn(resetn_condition),
.i_next_overflow(next_transfer_overflow),
.i_valid(input_data_valid),
.i_ready(cnt_ready),
.i_c_dim(C_d),
.i_w_inner(IN_W_d),
.i_h_inner(IN_H_d),
.i_d_inner(IN_D_d),
.i_w_stride(S_W_d),
.i_h_stride(S_H_d),
.i_d_stride(S_D_d),
.i_index(Index_d),
.if_lt_config(layout_transform_config),
.o_addr_queue(addr_queue),
.o_completed_vol_tally(completed_vol_tally[0]),
.o_ready_for_transfer(ready_for_transfer)
);
// Read-side signals exchanged between output_logic and memory_manager, one
// entry per buffer pool.
logic [($clog2(cvec_per_buffer))-1:0] output_line_num [n_buffer_pools-1:0]; // output_logic -> memory_manager
logic [($clog2(cvec_per_buffer))-1:0] curr_out_line [n_buffer_pools-1:0];   // memory_manager -> output_logic
logic [n_buffer_pools-1:0] actively_reading;                                // output_logic -> memory_manager
logic [MAX_OUTPUT_C-1:0][16-1:0] output_line_data [n_buffer_pools-1:0];     // memory_manager -> output_logic
// Memory manager: takes the addressing records and fp16 data, and returns the
// buffered output lines requested by output_logic.
dla_lt_memory_manager #(
.NUM_BUFFER_POOLS(n_buffer_pools),
.CVEC_PER_BUFFER(cvec_per_buffer),
.ELEMENTS_PER_CYCLE(ELEM_PER_DDR),
.CNT_BITS(CNT_BITS),
.CVEC(CVEC),
.MAX_OUTPUT_C(MAX_OUTPUT_C),
.DEVICE(DEVICE)
) memory_manager (
.clk(clk),
.i_rstn(resetn_condition),
.i_addr_queue(addr_queue),
.i_output_line_num(output_line_num),
.i_actively_reading(actively_reading),
.i_fp16_data(fp16_val),
.i_completed_vol_tally(completed_vol_tally[0]),
.o_completed_vol_tally(completed_vol_tally[1]),
.o_output_line_data(output_line_data),
.o_curr_out_line(curr_out_line)
);
// Output logic: drives the module's output stream (o_data/o_valid/o_last) and
// reports the line counters used for input throttling. It is enabled by
// start_rx, which is set once the first input word has been accepted.
dla_lt_output_logic #(
.NUM_BUFFER_POOLS(n_buffer_pools),
.CVEC_PER_BUFFER(cvec_per_buffer),
.ELEMENTS_PER_CYCLE(ELEM_PER_DDR),
.N_POOL_BITS(n_pool_bits),
.MAX_OUTPUT_W(MAX_OUTPUT_W),
.MAX_OUTPUT_H(MAX_OUTPUT_H),
.MAX_OUTPUT_D(MAX_OUTPUT_D),
.MAX_OUTPUT_C(MAX_OUTPUT_C),
.CVEC(CVEC),
.CNT_BITS(CNT_BITS),
.MAX_DIM_BITS(MAX_DIM_BITS)
) output_logic (
.clk(clk),
.i_rstn(resetn_condition),
.i_output_line_data(output_line_data),
.i_curr_out_line(curr_out_line),
.i_completed_vol_tally(completed_vol_tally[1]),
.i_stall(i_stall),
.if_lt_config(layout_transform_config),
.i_ready(start_rx),
.o_line_num(output_line_num),
.o_read_req(actively_reading),
.o_data(o_data),
.o_valid(o_valid),
.o_last(o_last),
.o_lines_written(lines_written),
.o_finished_lines(finished_lines_reg)
);
endmodule
|