blob: f0cd5f79c3dfa6f17be565a1b468cb1e147e9d1f (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
|
# Pulse the reset driven by the In-System Sources and Probes (ISSP) IP.
#
# Claims the first ISSP service reported by System Console and writes 0x0
# followed by 0x1 to its source data.
# NOTE(review): the main flow describes this as asserting "resetn", which
# suggests an active-low reset (0 = held in reset, 1 = released) - confirm
# against the design.
#
# Raises an error if no ISSP service is available.
proc assert_reset {} {
    set issp_paths [get_service_paths issp]
    if {[llength $issp_paths] == 0} {
        error "No ISSP (source/probe) service found; cannot drive reset."
    }
    set claimed_issp [claim_service issp [lindex $issp_paths 0] mylib]
    # Order matters: drive the source low first, then high.
    foreach source_data {0x0 0x1} {
        issp_write_source_data $claimed_issp $source_data
    }
}
# Put the coreDLA CSRs and both mSGDMA descriptor engines into their initial
# state (register map: fpga/csr/rtl/inc/dla_csr_constants.svh).
#
# Arguments:
#   master_path - claimed master service used for the register writes
#
# The writes are issued strictly in the order listed in the table below:
#   0x00038220 <- 0, 0x00038204 <- 0, 0x00038200 <- 3
#   0x0003822c <- 1     writing 0x1 instructs DLA to accept input until the
#                       register is cleared
#   0x00030044 <- 0x2   reset egress descriptor
#   0x00030044 <- 0x20  stop the egress descriptor
#   0x00030004 <- 0x2   reset ingress descriptor
#   0x00030004 <- 0x20  stop the ingress descriptor
proc initialize_coredla {master_path} {
    # Flat list of address/value pairs. Comments cannot live inside the
    # braces because Tcl would treat them as list elements.
    set csr_writes {
        0x00038220 0
        0x00038204 0
        0x00038200 3
        0x0003822c 1
        0x00030044 0x2
        0x00030044 0x20
        0x00030004 0x2
        0x00030004 0x20
    }
    foreach {address value} $csr_writes {
        master_write_32 $master_path $address $value
    }
}
# Start both descriptor engines by writing 0x00 to their control registers:
# the egress one (0x00030044) first, then the ingress one (0x00030004).
#
# Arguments:
#   master_path - claimed master service used for the register writes
proc start_stream {master_path} {
    foreach control_address {0x00030044 0x00030004} {
        master_write_32 $master_path $control_address 0x00
    }
}
# Fail if either descriptor buffer reports that it is full.
#
# Reads the egress status word (0x00030040) and the ingress status word
# (0x00030000), then tests bit 2 (mask 0x4) of each.
#
# Arguments:
#   master_path - claimed master service used for the register reads
#
# Raises an error naming the first full descriptor buffer (egress is checked
# before ingress).
proc check_descriptor_buffer_full {master_path} {
    set full_mask 0x4
    # Both status words are read before either one is inspected.
    set egress_status [master_read_32 $master_path 0x00030040 1]
    set ingress_status [master_read_32 $master_path 0x00030000 1]

    if {($egress_status & $full_mask) != 0} {
        error "Egress descriptor is full."
    }
    if {($ingress_status & $full_mask) != 0} {
        error "Ingress descriptor is full."
    }
}
# Load the input image file into memory at address 0x00200000.
#
# Arguments:
#   input_file  - path of the binary file to load
#   master_path - claimed master service used for the write
proc stage_input {input_file master_path} {
    set input_base_address 0x00200000
    master_write_from_file $master_path $input_file $input_base_address
}
# Queue one descriptor on the egress streaming mSGDMA.
#
# Arguments:
#   master_path - claimed master service used for the register writes
#
# Writes, in order:
#   0x00030064 <- 0x00280000  destination address
#   0x00030068 <- 0xA800      length (should be 128 bit aligned)
#   0x0003006c <- 0x80000000  queue the descriptor
proc queue_egress_descriptor {master_path} {
    set descriptor_fields {
        0x00030064 0x00280000
        0x00030068 0xA800
        0x0003006c 0x80000000
    }
    foreach {address value} $descriptor_fields {
        master_write_32 $master_path $address $value
    }
}
# Queue one descriptor on the ingress streaming mSGDMA.
#
# Arguments:
#   master_path - claimed master service used for the register writes
#
# Writes, in order:
#   0x00030020 <- 0x00200000  source address
#   0x00030028 <- 0x17A00     transfer length in bytes (input size)
#   0x0003002c <- 0x80000000  queue the descriptor
proc queue_ingress_descriptor {master_path} {
    set descriptor_fields {
        0x00030020 0x00200000
        0x00030028 0x17A00
        0x0003002c 0x80000000
    }
    foreach {address value} $descriptor_fields {
        master_write_32 $master_path $address $value
    }
}
# Dump the staged input region back to a file for validation.
#
# Reads 0x17A00 bytes starting at 0x00200000 and stores them in input.bin.
# Only the input region is copied here.
#
# Arguments:
#   master_path - claimed master service used for the read
proc copy_input_for_validation {master_path} {
    set input_base_address 0x00200000
    set input_size_bytes 0x17A00
    master_read_to_file $master_path input.bin $input_base_address $input_size_bytes
}
# Read the inference cycle counters and report throughput.
#
# Arguments:
#   master_path     - claimed master service used for the CSR reads
#   num_inferences  - number of inferences covered by the counters (> 0)
#
# Each counter is exposed as two 32-bit registers (low word, high word) that
# are combined into one 64-bit value. Cycle counts are printed in hexadecimal.
# The throughput figure assumes clk_ddr runs at 100MHz.
#
# Raises an error if num_inferences is not positive.
proc get_performance {master_path num_inferences} {
    if {$num_inferences <= 0} {
        error "num_inferences must be a positive integer, got '$num_inferences'"
    }

    set active_clk_lo [master_read_32 $master_path 0x00038240 1]
    set active_clk_hi [master_read_32 $master_path 0x00038244 1]
    set total_active_clk_count [expr {$active_clk_lo | ($active_clk_hi << 32)}]
    # Format explicitly: the combined value is a decimal integer, so simply
    # prefixing it with "0x" would print a misleading number.
    puts "Total active clk cycles: [format "0x%llX" $total_active_clk_count]"

    # These two reads are kept so the CSR access sequence matches the
    # original script; the value is currently not reported.
    master_read_32 $master_path 0x00038248 1
    master_read_32 $master_path 0x0003824c 1

    set core_active_clk_lo [master_read_32 $master_path 0x0003827c 1]
    set core_active_clk_hi [master_read_32 $master_path 0x00038280 1]
    set total_core_active_clk_count [expr {$core_active_clk_lo | ($core_active_clk_hi << 32)}]
    puts "Total core active clk cycles (without input and output streamer): [format "0x%llX" $total_core_active_clk_count]"

    puts "--------------------------------------------------------"
    if {$total_active_clk_count == 0} {
        # Avoid dividing by zero when the counter never advanced.
        puts "Final Throughput: unavailable (active clk counter is 0)"
        return
    }

    # fps = inferences / elapsed seconds, computed in floating point so the
    # per-inference cycle count is not truncated to an integer first.
    set clk_period [expr {1.0 / 100000000.0}]
    set final_fps [expr {$num_inferences / ($clk_period * $total_active_clk_count)}]
    puts "Final Throughput: $final_fps fps assuming 100MHz clk_ddr"
}
# Block until the completion counter CSR (0x00038224) equals the expected
# number of inferences.
#
# Arguments:
#   master_path     - claimed master service used for the CSR reads
#   num_inferences  - counter value that signals completion
#
# The counter is polled at most 3000 times with a 10 ms pause after each
# unsuccessful poll, i.e. roughly 30 seconds of waiting.
#
# Raises an error if the counter has not reached num_inferences by then.
proc wait_for_completion_counter {master_path num_inferences} {
    for {set polls_left 3000} {$polls_left > 0} {incr polls_left -1} {
        set completed [master_read_32 $master_path 0x00038224 1]
        if {$completed == $num_inferences} {
            return
        }
        after 10
    }
    error "Timeout hit at 30 seconds. Increase the timeout if the inference is expected to take longer."
}
# Save the output region to output0.bin after confirming that the hardware
# completion counter matches the expected inference count.
#
# Arguments:
#   master_path   - claimed master service used for the reads
#   num_inference - expected value of the completion counter CSR (0x00038224)
#
# On a match, 0xA800 bytes starting at 0x00280000 are written to output0.bin.
# Raises an error if the counter differs from the expected value.
proc read_last_output {master_path num_inference} {
    # Expected counter value rendered as 0xXXXXXXXX for the messages below.
    set expected_counter [format "0x%08X" [expr {0x00000000 + $num_inference}]]

    set hw_counter [master_read_32 $master_path 0x00038224 1]
    puts "Completion counter from HW: $hw_counter"

    if {$hw_counter != $expected_counter} {
        error "Error: completion counter should be equal to $expected_counter but instead is $hw_counter"
    }
    master_read_to_file $master_path output0.bin 0x00280000 0xA800
}
# This design example has a limit to ingress on-chip memory size in bytes
set INGRESS_ON_CHIP_MEMORY_SIZE_BYTES 524288

# Main flow: validate the argument, stage the input, initialize coreDLA,
# queue one egress/ingress descriptor pair per inference, start streaming,
# wait for completion, then report performance and save the last output.
if {$argc != 1} {
    error "Usage: system-console --script system_console_script_perf.tcl <input.bin file>"
}
set input_file [lindex $argv 0]
puts "Input file provided: $input_file"
set file_size [file size $input_file]

# Make sure the input file can fit into on-chip memory
if {$file_size > $INGRESS_ON_CHIP_MEMORY_SIZE_BYTES} {
    puts "Input file '$input_file' is too large to fully fit into on-chip memory of size
$INGRESS_ON_CHIP_MEMORY_SIZE_BYTES bytes.\nThe `system_console_script.tcl` file will
partition the input file for partial transfers to solve this problem but it should not
be used for performance testing. Please increase the on-chip memory size for performance
testing.\n"
    exit 1
}

# Claim the first master service; fail early with a clear message if
# System Console reports none.
set master_paths [get_service_paths master]
if {[llength $master_paths] == 0} {
    error "No master service found; cannot access the design."
}
set mpath [lindex $master_paths 0]
set master_path [claim_service master $mpath ""]

# Assert resetn using source/probe IP
assert_reset
# Stage input file into on-chip memory
stage_input $input_file $master_path
# Initialize coreDLA's CSR registers
initialize_coredla $master_path

# Number of inferences cannot exceed the descriptor queue FIFO size
set num_inferences 32
for {set i 1} {$i <= $num_inferences} {incr i} {
    check_descriptor_buffer_full $master_path
    # Queue egress descriptor into mSGDMA
    queue_egress_descriptor $master_path
    # Queue ingress descriptor into mSGDMA
    queue_ingress_descriptor $master_path
}

start_stream $master_path
wait_for_completion_counter $master_path $num_inferences
get_performance $master_path $num_inferences
read_last_output $master_path $num_inferences
puts "\n$num_inferences inferences successfully completed"
|