blob: ab78d2ec14b0b45061f7191b2f4f0b7fe0a7b480 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
|
# Width of the AXI stream data bus in bytes.
# This design example only supports an AXI Width of 128 bits = 16 bytes.
# Input file sizes and egress transfer sizes are checked against this value.
variable AXI_STREAM_DATA_WIDTH_BYTES 16
# Limit of the ingress on-chip memory size in bytes (524288 = 512 KiB).
# Input files larger than this are streamed in partitions of at most this size.
variable INGRESS_ON_CHIP_MEMORY_SIZE_BYTES 524288
# Limit of the egress on-chip memory size in bytes (131072 = 128 KiB).
# Inference results larger than this are read back in partitions of at most this size.
variable EGRESS_ON_CHIP_MEMORY_SIZE_BYTES 131072
# DDR-Free ED Address Map Constants
# Base address of the CSR block of DLA IP instance 0
variable DLA_IP_0_CSR_ADDR 0x00038000
# Base address of the ingress mSGDMA CSR block (polled for transfer completion)
variable INGRESS_SGDMA_CSR_ADDR 0x00030000
# Base address of the ingress mSGDMA descriptor slave port
variable INGRESS_SGDMA_DESCRIPTOR_ADDR 0x00030020
# Base address of the egress mSGDMA CSR block (polled for transfer completion)
variable EGRESS_SGDMA_CSR_ADDR 0x00030040
# Base address of the egress mSGDMA descriptor slave port
variable EGRESS_SGDMA_DESCRIPTOR_ADDR 0x00030060
# Validate the command-line arguments of the script.
#
# Arguments:
#   input_file      Path of the binary input file to stream into the IP.
#   num_inferences  Requested number of inferences; must be an integer > 0.
#
# Returns:
#   The size of input_file in bytes.
#
# On an invalid argument an error message is printed and the script exits
# with status 1. An input file larger than the ingress on-chip memory is not
# an error: a notice is printed and the caller is expected to partition it.
proc validate_args {input_file num_inferences} {
    global INGRESS_ON_CHIP_MEMORY_SIZE_BYTES
    global AXI_STREAM_DATA_WIDTH_BYTES

    # Require a real integer first: comparing a non-numeric string with "<"
    # silently falls back to string comparison and would let it through.
    # Zero is rejected as well so that the check agrees with its message.
    if {![string is integer -strict $num_inferences] || $num_inferences <= 0} {
        puts "Number of inferences must be greater than 0."
        exit 1
    }

    # Check if the file exists
    if {![file exists $input_file]} {
        puts "Error: The file '$input_file' does not exist."
        exit 1
    }

    # Get the size of the file in bytes
    set file_size [file size $input_file]

    # An empty file would result in zero ingress partitions, so nothing would
    # ever be streamed to the IP; reject it up front.
    if {$file_size == 0} {
        puts "Error: The file '$input_file' is empty."
        exit 1
    }

    # Inform the user when the input does not fit into on-chip memory at once
    if {$file_size > $INGRESS_ON_CHIP_MEMORY_SIZE_BYTES} {
        puts "Input file '$input_file' is too large to fully fit into on-chip memory of size
$INGRESS_ON_CHIP_MEMORY_SIZE_BYTES bytes. Input file will be partitioned and transferred partially.\n"
    }

    # Make sure the input file is aligned to the mSGDMA/FPGA AI Suite stream width
    if {($file_size % $AXI_STREAM_DATA_WIDTH_BYTES) != 0} {
        puts "Error: this design example only supports input sizes aligned to 128 bits. Please pad accordingly."
        exit 1
    }

    return $file_size
}
# Compute the number of bytes the output streamer sends for one inference.
#
# The byte count is derived from the output shape and the stream width:
#   bytes = H * W * ceil(C/8) * 16
#
# Arguments:
#   C  Number of output channels
#   H  Output height
#   W  Output width
#
# Returns:
#   The number of bytes that will be transferred by the output streamer.
#
# Prints an error and exits with status 1 when the computed size is not
# positive or not a multiple of AXI_STREAM_DATA_WIDTH_BYTES. A result larger
# than the egress on-chip memory only produces a notice.
proc calulate_egress_axi_transfers {C H W} {
    global EGRESS_ON_CHIP_MEMORY_SIZE_BYTES AXI_STREAM_DATA_WIDTH_BYTES

    # Channels are rounded up to whole groups of 8; each group accounts for 16 bytes
    set channel_groups [expr {int(($C + 7) / 8)}]
    set egress_bytes [expr {$H * $W * ($channel_groups * 16)}]

    # A non-positive or misaligned size points at bad shape arguments
    set bad_size [expr {$egress_bytes <= 0 || ($egress_bytes % $AXI_STREAM_DATA_WIDTH_BYTES) != 0}]
    if {$bad_size} {
        puts "Error with egress AXI transfer calculation. Please check your output shape size arguments (C H W)"
        exit 1
    }

    # Tell the user when the result has to be read back in several partitions
    if {$egress_bytes > $EGRESS_ON_CHIP_MEMORY_SIZE_BYTES} {
        puts "Output inference results is too large to fully fit into on-chip memory of size
$EGRESS_ON_CHIP_MEMORY_SIZE_BYTES bytes. Output inference results will be partitioned and transferred partially.\n"
    }

    # Report the expected transfer size in hex
    set egress_bytes_hex [format "0x%X" $egress_bytes]
    puts "Expecting $egress_bytes_hex bytes to be transferred by FPGA AI Suite output streamer"
    return $egress_bytes
}
# Pulse the design reset through the source/probe (ISSP) IP.
#
# Claims the first ISSP service path reported by System Console and drives
# its source to 0x0 and then to 0x1.
# NOTE(review): the caller describes this as asserting resetn, so the 0 -> 1
# sequence is presumably an active-low reset pulse - confirm against the design.
#
# Prints an error and exits with status 1 when no ISSP service is available,
# instead of handing an empty path to claim_service.
proc assert_reset {} {
    set issp_paths [get_service_paths issp]
    if {[llength $issp_paths] == 0} {
        puts "Error: no source/probe (ISSP) service found. Cannot reset the design."
        exit 1
    }

    set claimed_issp [claim_service issp [lindex $issp_paths 0] mylib]

    # Drive the source low, then high again
    issp_write_source_data $claimed_issp 0x0
    issp_write_source_data $claimed_issp 0x1
}
# Bring the coreDLA CSRs and both mSGDMA engines into their start state.
# (register map: fpga/csr/rtl/inc/dla_csr_constants.svh)
#
# Arguments:
#   master_path  Claimed System Console master service used for the writes
#
# The writes are issued in a fixed order: four DLA CSR writes followed by the
# egress and then the ingress mSGDMA reset.
proc initialize_coredla {master_path} {
    global DLA_IP_0_CSR_ADDR INGRESS_SGDMA_CSR_ADDR EGRESS_SGDMA_CSR_ADDR

    # DLA CSR setup; see the register map for the meaning of each offset
    master_write_32 $master_path [expr {$DLA_IP_0_CSR_ADDR + 0x220}] 0
    master_write_32 $master_path [expr {$DLA_IP_0_CSR_ADDR + 0x204}] 0
    master_write_32 $master_path [expr {$DLA_IP_0_CSR_ADDR + 0x200}] 3

    # Writing 0x1 to this register will instruct DLA to accept input until register is cleared
    master_write_32 $master_path [expr {$DLA_IP_0_CSR_ADDR + 0x22c}] 1

    # Reset egress SGDMA
    master_write_32 $master_path [expr {$EGRESS_SGDMA_CSR_ADDR + 0x4}] 0x2

    # Reset ingress SGDMA
    master_write_32 $master_path [expr {$INGRESS_SGDMA_CSR_ADDR + 0x4}] 0x2
}
# Copy the contents of a binary file into memory through the master service.
#
# Arguments:
#   input_file   Path of the binary file to load
#   master_path  Claimed System Console master service used for the write
#   base_addr    Optional address the file is written to. Defaults to
#                0x00200000, which is the source address that
#                queue_ingress_descriptor programs into the ingress mSGDMA.
proc stage_input {input_file master_path {base_addr 0x00200000}} {
    # Initializing rom with input image
    master_write_from_file $master_path $input_file $base_addr
}
# Queue one descriptor on the egress streaming mSGDMA.
#
# Arguments:
#   master_path  Claimed System Console master service used for the writes
#   size         Transfer length in bytes; should be 128 bit aligned
#
# Three descriptor words are written in order: the destination address at
# offset 0x4, the length at offset 0x8 and the control word at offset 0xc.
proc queue_egress_descriptor {master_path size} {
    global EGRESS_SGDMA_DESCRIPTOR_ADDR

    # Resolve the descriptor register addresses up front
    set dest_addr_reg [expr {$EGRESS_SGDMA_DESCRIPTOR_ADDR + 0x4}]
    set length_reg [expr {$EGRESS_SGDMA_DESCRIPTOR_ADDR + 0x8}]
    set control_reg [expr {$EGRESS_SGDMA_DESCRIPTOR_ADDR + 0xc}]

    # Destination address of the streamed data
    master_write_32 $master_path $dest_addr_reg 0x00280000
    # Number of bytes to transfer
    master_write_32 $master_path $length_reg $size
    # Queue descriptor (Writing 0x8000_0000)
    master_write_32 $master_path $control_reg 0x80000000
}
# Queue one descriptor on the ingress streaming mSGDMA.
#
# Arguments:
#   master_path  Claimed System Console master service used for the writes
#   size         Transfer length in bytes (input size)
#
# Three descriptor words are written in order: the source address at the
# descriptor base, the length at offset 0x8 and the control word at offset 0xc.
proc queue_ingress_descriptor {master_path size} {
    global INGRESS_SGDMA_DESCRIPTOR_ADDR

    # Source address the input data is read from
    master_write_32 $master_path $INGRESS_SGDMA_DESCRIPTOR_ADDR 0x00200000

    # Number of bytes to transfer
    master_write_32 $master_path [expr {$INGRESS_SGDMA_DESCRIPTOR_ADDR + 0x8}] $size

    # Writing 0x80000000 to the control word queues the descriptor
    master_write_32 $master_path [expr {$INGRESS_SGDMA_DESCRIPTOR_ADDR + 0xc}] 0x80000000
}
# Read output from on-chip memory into a file.
#
# Arguments:
#   master_path  Claimed System Console master service used for the read
#   output_file  Path of the file that receives the data
#   size         Number of bytes to read
#   base_addr    Optional address to read from. Defaults to 0x00280000, which
#                is the destination address that queue_egress_descriptor
#                programs into the egress mSGDMA.
proc read_output {master_path output_file size {base_addr 0x00280000}} {
    master_read_to_file $master_path $output_file $base_addr $size
}
# Verify that the completion counter CSR matches the inference iteration.
#
# Arguments:
#   master_path  Claimed System Console master service used for the read
#   iteration    Number of inferences expected to have completed so far
#
# Prints the value read from hardware. Raises a Tcl error when the counter
# read from offset 0x224 of the DLA CSR block differs from the expected value.
proc check_inference_count {master_path iteration} {
    global DLA_IP_0_CSR_ADDR

    # Expected counter value, rendered as an 8-digit hex number
    set expected_count [format "0x%08X" [expr {0x00000000 + $iteration}]]

    # Read a single 32-bit word from the completion counter CSR
    set hw_count [master_read_32 $master_path [expr {$DLA_IP_0_CSR_ADDR + 0x224}] 1]
    puts "Completion counter from HW: $hw_count"

    if {$hw_count == $expected_count} {
        return
    }
    error "Error: completion counter should be equal to $expected_count but instead is $hw_count"
}
# Write one partition of input data to its own binary file.
#
# Arguments:
#   partition_data  Binary data of the partition
#   index           Partition index, used to build the file name
#
# Returns:
#   The name of the file that was written: chunk_<index>.bin, relative to the
#   current working directory.
proc create_input_bin {partition_data index} {
    set chunk_name [format "chunk_%s.bin" $index]

    # Write the data untranslated so that the file holds exactly these bytes
    set chunk_fh [open $chunk_name "wb"]
    fconfigure $chunk_fh -translation binary
    puts -nonewline $chunk_fh $partition_data
    close $chunk_fh

    return $chunk_name
}
# Poll a register until it holds the expected value or a timeout expires.
#
# Arguments:
#   master_path          Claimed System Console master service used for reads
#   register_addr        Address of the register to poll
#   register_val_assert  Value the register is expected to reach
#
# The register is read up to 3000 times with a 10 ms pause after every
# mismatch, i.e. roughly 30 seconds of waiting. Returns as soon as the values
# match; otherwise prints a timeout message and exits with status 1.
proc poll_register {master_path register_addr register_val_assert} {
    for {set attempts_left 3000} {$attempts_left > 0} {incr attempts_left -1} {
        set observed_val [master_read_32 $master_path $register_addr 1]
        if {$observed_val == $register_val_assert} {
            return
        }
        # Wait 10 ms before the next read
        after 10
    }

    # Every attempt was used up without seeing the expected value
    puts "Register polling timeout. CSR addr: $register_addr = $observed_val \nRegister should be = $register_val_assert"
    exit 1
}
# Print the command-line usage of the script and exit with status 1.
proc print_usage {} {
    set usage_text "Usage: system-console --script system_console_script.tcl <input.bin file> <# of inferences>
<output channels> <output height> <output width>"
    puts $usage_text
    exit 1
}
# Main Function: run the requested number of inferences end to end.
#
# Arguments:
#   argc  Number of script arguments
#   argv  Script arguments: <input.bin file> <# of inferences>
#         <output channels> <output height> <output width>
#
# After argument validation the design is reset and initialized. For every
# inference the input file is staged and streamed in through the ingress
# mSGDMA (in partitions when it exceeds the ingress on-chip memory), the
# result is streamed out through the egress mSGDMA and collected into
# output<i>.bin, and the hardware completion counter is checked.
# Usage errors and validation failures exit the script with status 1.
proc main {argc argv} {
# NOTE(review): AXI_STREAM_DATA_WIDTH_BYTES and the two descriptor addresses
# are declared here but not referenced anywhere in this proc.
global INGRESS_ON_CHIP_MEMORY_SIZE_BYTES
global EGRESS_ON_CHIP_MEMORY_SIZE_BYTES
global AXI_STREAM_DATA_WIDTH_BYTES
global INGRESS_SGDMA_DESCRIPTOR_ADDR
global EGRESS_SGDMA_DESCRIPTOR_ADDR
global INGRESS_SGDMA_CSR_ADDR
global EGRESS_SGDMA_CSR_ADDR
# Check if the script should display help information
if {$argc > 0} {
set firstArg [lindex $argv 0]
if {[string equal $firstArg "help"] || [string equal $firstArg "--help"] || [string equal $firstArg "-help"]} {
print_usage
}
}
# Check the total number of arguments; exactly five are required
if {$argc != 5} {
print_usage
}
# Setting script arguments to variables
set input_file [lindex $argv 0]
set num_inferences [lindex $argv 1]
# Output shape of the inference result: channels, height, width
set C [lindex $argv 2]
set H [lindex $argv 3]
set W [lindex $argv 4]
# Validating script arguments. Return input file size in bytes
set file_size [validate_args $input_file $num_inferences]
set file_size_hex [format "0x%X" $file_size]
# Calculate # of AXI transfers from FPGA AI Suite IP output streamer in bytes
set output_streamer_transfers [calulate_egress_axi_transfers $C $H $W]
puts "\nInput file provided: $input_file and is of size $file_size_hex bytes"
puts "Number of inferences: $num_inferences"
# Claim service path to System Console; the first master service found is used
set mpath [lindex [get_service_paths master] 0]
set master_path [claim_service master $mpath ""]
puts "\n________________________________________________________________________________"
puts " STARTING FPGA AI SUITE INFERENCE "
puts "________________________________________________________________________________\n"
# Assert resetn using source/probe IP
assert_reset
# Initialize coreDLA's CSR registers
initialize_coredla $master_path
# Run the inferences one after another. The index i is 1-based because it is
# also the value the completion counter is expected to hold after the iteration.
for {set i 1} {$i <= $num_inferences} {incr i} {
# Open input file per iteration due to the potential partitioning in the case where input file > INGRESS_ON_CHIP_MEMORY_SIZE_BYTES.
set input_fh [open $input_file "rb"]
fconfigure $input_fh -translation binary
# Create an output file every iteration of inferences
set combined_fh [open "output$i.bin" "wb"]
fconfigure $combined_fh -translation binary
# Logic to ensure input image can fully fit into ingress on-chip memory
# If not, must partition input data into chunks at a time. This allows us to queue
# descriptors for partial input sizes.
# Number of partitions = file size divided by the memory size, rounded up
set num_input_partition [expr {int(($file_size + $INGRESS_ON_CHIP_MEMORY_SIZE_BYTES - 1) / $INGRESS_ON_CHIP_MEMORY_SIZE_BYTES)}]
for {set j 0} {$j < $num_input_partition} {incr j} {
# Byte offset of this partition within the input file
set offset [expr {$j * $INGRESS_ON_CHIP_MEMORY_SIZE_BYTES}]
# Partition size: a full memory-sized chunk, or the remainder for the last one
set size [
expr {($file_size - $offset) < $INGRESS_ON_CHIP_MEMORY_SIZE_BYTES ? ($file_size - $offset) : $INGRESS_ON_CHIP_MEMORY_SIZE_BYTES}
]
# Seek to the offset and read the chunk
# Any seek failure is reported and aborts the script.
# NOTE(review): the partition count above keeps offset below the file size, so
# this guard looks purely defensive - confirm.
if {[catch {seek $input_fh $offset} err]} {
puts "Error seeking to offset $offset: $err"
close $input_fh
exit 1
}
# Begin partitioning the input data to INGRESS_ON_CHIP_MEMORY_SIZE_BYTES chunks:
# the chunk is written to a temporary file, loaded into memory, queued on the
# ingress mSGDMA, and the temporary file is removed again
set partition_data [read $input_fh $size]
set partition_data_file_name [create_input_bin $partition_data $j]
stage_input $partition_data_file_name $master_path
queue_ingress_descriptor $master_path $size
file delete $partition_data_file_name
# Poll SGDMA register to check if input data streaming is complete
# (the CSR at the ingress base address is expected to read 0x00000002)
set sgdma_csr_assert 0x00000002
poll_register $master_path $INGRESS_SGDMA_CSR_ADDR $sgdma_csr_assert
}
close $input_fh
# Logic to ensure output inference results can fully fit into egress on-chip memory
# If not, must partition output data into chunks at a time. This allows us to queue
# descriptors for partial output sizes.
# Number of partitions = output size divided by the memory size, rounded up
set num_output_partition [expr {int(($output_streamer_transfers + $EGRESS_ON_CHIP_MEMORY_SIZE_BYTES - 1) / $EGRESS_ON_CHIP_MEMORY_SIZE_BYTES)}]
for {set j 0} {$j < $num_output_partition} {incr j} {
# Byte offset of this partition within the inference result
set offset [expr {$j * $EGRESS_ON_CHIP_MEMORY_SIZE_BYTES}]
# Partition size: a full memory-sized chunk, or the remainder for the last one
set size [
expr {($output_streamer_transfers - $offset) < $EGRESS_ON_CHIP_MEMORY_SIZE_BYTES ? ($output_streamer_transfers - $offset) : $EGRESS_ON_CHIP_MEMORY_SIZE_BYTES}
]
# Queue chunks of EGRESS_ON_CHIP_MEMORY_SIZE_BYTES at a time to ensure a fit in egress on-chip memory
queue_egress_descriptor $master_path $size
# Poll SGDMA register to check if output data streaming is complete
# (the CSR at the egress base address is expected to read 0x00000002)
set sgdma_csr_assert 0x00000002
poll_register $master_path $EGRESS_SGDMA_CSR_ADDR $sgdma_csr_assert
# Write a partition of the inference result to the partition file
set output_file "partition_out_$j.bin"
read_output $master_path $output_file $size
# Open partitioned output inference result
set bin_fh [open $output_file "rb"]
fconfigure $bin_fh -translation binary
set bin_data [read $bin_fh]
# Append smaller partition of inference result to larger output$i.bin file for inference iteration
puts -nonewline $combined_fh $bin_data
close $bin_fh
# The temporary partition file is no longer needed once it has been appended
file delete $output_file
}
# Ensure inference count has gone up; a mismatch raises a Tcl error
check_inference_count $master_path $i
close $combined_fh
}
puts "\n$num_inferences inferences successfully completed"
}
# Main function call: pass the argument count and argument list of the script to main
main $argc $argv
|