1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
|
// Copyright 2020-2020 Intel Corporation.
//
// This software and the related documents are Intel copyrighted materials,
// and your use of them is governed by the express license under which they
// were provided to you ("License"). Unless the License provides otherwise,
// you may not use, modify, copy, publish, distribute, disclose or transmit
// this software or the related documents without Intel's prior written
// permission.
//
// This software and the related documents are provided as is, with no express
// or implied warranties, other than those that are expressly stated in the
// License.
// This module implements the CSR for DMA. It also includes the descriptor queue
// and interrupt request generator. The CSR is implemented with a RAM. Certain
// values are kept live in registers, such as the interrupt control and mask. This
// makes it easier to detect when a change has happened (instead of trying to do a
// read-modify-write with the RAM).
//
// The AXI4 lite slave interface is usually going to backpressure PCIe. There is
// a state machine which allows one outstanding read request, or one outstanding
// write request at a time (write requests can be outstanding if the writeack is
// backpressured which AXI allows). There is a register which tracks whether the
// last request was a read or write, this enables round robin arbitration. Each
// request takes a few clock cycles to process, as the address needs to be decoded
// to determine if a write is allowed to commit to the RAM, or if we need to use
// read data from one of the registers instead of the RAM.
//
// Special offsets are defined as localparams below. Writing to DLA_DMA_CSR_OFFSET_INPUT_OUTPUT_BASE_ADDR/4
// will cause one unit of work to be enqueued in the descriptor queue. Currently
// this involves writing 8 values to a fifo, which are then consumed by the config
// reader. Internal to the config reader, 4 values go to the config reader address
// generator, the other 4 go to the config reader intercept.
//
// Beware the following assumptions about how the host issues requests to this CSR:
// - no bursts (required by AXI4 lite)
// - byte enables are assumed to be all 1 (no partial word access)
// - all addresses must be word aligned (e.g. if CSR_DATA_BYTES=4 then the bottom 2 bits of address must be 0)
`resetall
`undefineall
`default_nettype none
`include "dla_acl_parameter_assert.svh"
module dla_dma_csr #(
parameter int CSR_ADDR_WIDTH, //width of the byte address signal, determines CSR address space size, e.g. 11 bit address = 2048 bytes, the largest size that uses only 1 M20K
parameter int CSR_DATA_BYTES, //width of the CSR data path, typically 4 bytes
parameter int CONFIG_DATA_BYTES, //data width of the config network output port, typically 4 bytes, the descriptor queue matches this so that config decode can be reused
parameter int CONFIG_READER_DATA_BYTES, //data width of the config network input port, needed by config reader address generator for loop update
parameter int ENABLE_INPUT_STREAMING, //nonzero enables the input streaming interface (used as a boolean below)
parameter int ENABLE_OUTPUT_STREAMING, //nonzero enables the output streaming interface (used as a boolean below)
parameter int ENABLE_ON_CHIP_PARAMETERS //nonzero indicates config parameters are stored on chip (used as a boolean below)
) (
input wire clk_ddr,
input wire clk_pcie,
input wire clk_dla,
input wire i_sclrn_ddr, //active low reset that has already been synchronized to clk_ddr
input wire i_resetn_async, //active low reset that has NOT been synchronized to any clock, only to be consumed by dcfifo
//updates for interrupt, runs on ddr clock
input wire i_token_done, //feature writer reports it is done
input wire i_token_stream_started, //input streamer is reading the first word
input wire i_stream_received_first_word,
input wire i_stream_sent_last_word,
input wire i_token_error, //dla has encountered some error, assert high for one clock cycle to report it to host (assuming mask bit is 1)
input wire i_license_flag,
input wire i_token_out_of_inferences,
//snoop signals for the input feature, output feature, and filter LSU's core <--> fabric traffic
//run on clk_ddr
input wire i_input_feature_rvalid,
input wire i_input_feature_rready,
input wire i_input_filter_rvalid,
input wire i_input_filter_rready,
input wire i_output_feature_wvalid,
input wire i_output_feature_wready,
//interrupt request to pcie, runs on pcie clock
output logic o_interrupt_level, //level sensitive interrupt
//read side of descriptor queue goes to config reader, runs on ddr clock
output logic o_config_valid,
output logic [8*CONFIG_DATA_BYTES-1:0] o_config_data,
output logic o_config_for_intercept, //0 = goes to config reader addr gen, 1 = goes to config reader intercept
input wire i_config_ready,
//debug network AXI-4 lite interface, read request and read response channels, runs on dla_clock
output logic o_debug_network_arvalid,
output logic [8*CSR_DATA_BYTES-1:0] o_debug_network_araddr,
input wire i_debug_network_arready,
input wire i_debug_network_rvalid,
input wire [8*CSR_DATA_BYTES-1:0] i_debug_network_rdata,
output logic o_debug_network_rready,
//AXI4-lite slave interface for host control, runs on ddr clock
//no bursts, byte enables are assumed to be all 1, all addresses must be word aligned (e.g. if CSR_DATA_BYTES=4 then the bottom 2 bits of address must be 0)
input wire i_csr_arvalid,
input wire [CSR_ADDR_WIDTH-1:0] i_csr_araddr,
output logic o_csr_arready,
output logic o_csr_rvalid,
output logic [8*CSR_DATA_BYTES-1:0] o_csr_rdata,
input wire i_csr_rready,
input wire i_csr_awvalid,
input wire [CSR_ADDR_WIDTH-1:0] i_csr_awaddr,
output logic o_csr_awready,
input wire i_csr_wvalid,
input wire [8*CSR_DATA_BYTES-1:0] i_csr_wdata,
output logic o_csr_wready,
output logic o_csr_bvalid,
input wire i_csr_bready,
//reset request for the whole ip, runs on ddr clock
output logic o_request_ip_reset,
//output bit to start/stop streaming interface
output logic o_streaming_active
);
/////////////////////////////////
// Parameter legality checks //
/////////////////////////////////
//signal widths cannot be trivial
`DLA_ACL_PARAMETER_ASSERT(CSR_DATA_BYTES >= 1)
`DLA_ACL_PARAMETER_ASSERT(CONFIG_DATA_BYTES >= 1)
//csr address space cannot be trivial
`DLA_ACL_PARAMETER_ASSERT(2**CSR_ADDR_WIDTH > CONFIG_DATA_BYTES)
//offsets must be within address space
localparam int CSR_LO_ADDR = $clog2(CSR_DATA_BYTES); //number of LSBs that must be 0 in order for byte address to be word aligned
localparam int CSR_WORD_ADDR_WIDTH = CSR_ADDR_WIDTH - CSR_LO_ADDR; //width of the word-aligned address used to index the RAM
/////////////////
// Constants //
/////////////////
`include "dla_dma_constants.svh"
//special offsets -- these values are defined in one place and shared between hardware and software
//the constants from the dla_dma_constants.svh header file that CSR cares about are named DLA_DMA_CSR_OFFSET_**** and DLA_DMA_CSR_INTERRUPT_****
//state machine
//the first enum assigns a bit position to each state, the second builds the 1-hot state encodings from those positions
enum {
STATE_IDLE_BIT,
STATE_READ_ACCEPT_BIT,
STATE_READ_ADDR_BIT,
STATE_READ_DATA_BIT,
STATE_WRITE_ACCEPT_BIT,
STATE_WRITE_COMMIT_BIT,
STATE_DESCRIPTOR_BIT,
STATE_AWAIT_RESET_BIT
} index;
enum logic [index.num()-1:0] {
//1-hot encodings
STATE_IDLE = 1 << STATE_IDLE_BIT,
STATE_READ_ACCEPT = 1 << STATE_READ_ACCEPT_BIT,
STATE_READ_ADDR = 1 << STATE_READ_ADDR_BIT,
STATE_READ_DATA = 1 << STATE_READ_DATA_BIT,
STATE_WRITE_ACCEPT = 1 << STATE_WRITE_ACCEPT_BIT,
STATE_WRITE_COMMIT = 1 << STATE_WRITE_COMMIT_BIT,
STATE_DESCRIPTOR = 1 << STATE_DESCRIPTOR_BIT,
STATE_AWAIT_RESET = 1 << STATE_AWAIT_RESET_BIT,
XXX = 'x //used so that a unique case with no match propagates x in simulation
} state;
localparam int MAX_JOBS_ACTIVE = 64; //upper bounded by how many descriptors the queue can hold
localparam int JOBS_ACTIVE_WIDTH = $clog2(MAX_JOBS_ACTIVE+1);
///////////////
// Signals //
///////////////
//ram
logic ram_wr_en;
logic [CSR_WORD_ADDR_WIDTH-1:0] ram_wr_addr, ram_rd_addr;
logic [8*CSR_DATA_BYTES-1:0] ram_wr_data, ram_rd_data;
//descriptor queue
logic descriptor_queue_forced_write, descriptor_queue_full, descriptor_diagnostics_almost_full;
logic [8*CONFIG_DATA_BYTES:0] descriptor_queue_data; //1 extra MSB carries the for-intercept flag alongside the data word
logic [2:0] descriptor_words_read;
logic first_word_of_descriptor_being_read, jobs_active_is_nonzero, core_jobs_active_is_nonzero;
logic [JOBS_ACTIVE_WIDTH-1:0] jobs_active, core_jobs_active;
//Performance counters connections
logic [31:0] total_clocks_active_lo, total_clocks_active_hi;
logic [31:0] total_core_clocks_active_lo, total_core_clocks_active_hi;
logic [31:0] total_clocks_for_all_jobs_lo, total_clocks_for_all_jobs_hi;
logic [31:0] number_of_input_feature_reads_lo, number_of_input_feature_reads_hi;
logic [31:0] number_of_input_filter_reads_lo, number_of_input_filter_reads_hi;
logic [31:0] number_of_output_feature_writes_lo, number_of_output_feature_writes_hi;
//state machine
logic previous_was_write; //tracks whether the last accepted request was a write, for round robin arbitration
logic [3:0] descriptor_count; //steps through the words of one descriptor in STATE_DESCRIPTOR
//specific offsets are implemented in registers instead of RAM
logic interrupt_control_error, interrupt_control_done, interrupt_mask_error, interrupt_mask_done;
logic [8*CSR_DATA_BYTES-1:0] completion_count;
logic descriptor_diagnostics_overflow;
//address decode for specific offsets that are implemented in registers or require some action to be taken
logic write_to_interrupt_control, read_from_interrupt_control, write_to_interrupt_mask, read_from_interrupt_mask;
logic write_to_ram, read_from_desc_diagnostics, read_from_completion_count, enqueue_descriptor;
logic read_from_clocks_active_lo, read_from_clocks_active_hi, read_from_clocks_all_jobs_lo, read_from_clocks_all_jobs_hi;
logic read_from_core_clocks_active_lo, read_from_core_clocks_active_hi;
logic read_from_input_feature_reads_lo, read_from_input_feature_reads_hi;
logic read_from_input_filter_reads_lo, read_from_input_filter_reads_hi;
logic read_from_output_feature_writes_lo, read_from_output_feature_writes_hi;
logic write_to_debug_network_addr, read_from_debug_network_valid, read_from_debug_network_data;
logic read_from_license_flag;
logic read_from_ip_reset, write_to_ip_reset;
//clock crosser for interrupt
logic ddr_interrupt_level;
//debug network read request address
logic debug_network_arvalid, not_o_debug_network_arvalid;
logic [8*CSR_DATA_BYTES-1:0] debug_network_araddr;
//debug network read response data
logic not_o_debug_network_rready, debug_network_dcfifo_empty, debug_network_rvalid, debug_network_rready;
logic [8*CSR_DATA_BYTES-1:0] debug_network_dcfifo_data, debug_network_rdata;
//streaming states
logic write_ready_streaming_interface;
logic read_ready_streaming_interface;
logic dla_sclrn; //active low reset synchronized to clk_dla, produced by the reset handler below
//reset parameterization
localparam int RESET_USE_SYNCHRONIZER = 1;
localparam int RESET_PIPE_DEPTH = 3;
localparam int RESET_NUM_COPIES = 1;
dla_reset_handler_simple #(
.USE_SYNCHRONIZER (RESET_USE_SYNCHRONIZER),
.PIPE_DEPTH (RESET_PIPE_DEPTH),
.NUM_COPIES (RESET_NUM_COPIES)
)
ddr_reset_synchronizer
(
.clk (clk_dla),
.i_resetn (i_resetn_async),
.o_sclrn (dla_sclrn)
);
///////////
// RAM //
///////////
//could use hld_ram, but this simple ram doesn't need the depth stitching or clock enable magic that hld_ram provides
//simple dual port: port a = write (host writes), port b = read (host reads and descriptor assembly)
//initialized from a discovery ROM image so read-only discovery values come for free
altera_syncram
#(
.address_aclr_b ("NONE"),
.address_reg_b ("CLOCK0"),
.clock_enable_input_a ("BYPASS"),
.clock_enable_input_b ("BYPASS"),
.clock_enable_output_b ("BYPASS"),
.enable_ecc ("FALSE"),
.init_file ("dla_dma_csr_discovery_rom.mif"),
.intended_device_family ("Arria 10"), //Quartus will fix this automatically
.lpm_type ("altera_syncram"),
.numwords_a (2**CSR_WORD_ADDR_WIDTH),
.numwords_b (2**CSR_WORD_ADDR_WIDTH),
.operation_mode ("DUAL_PORT"),
.outdata_aclr_b ("NONE"),
.outdata_sclr_b ("NONE"),
.outdata_reg_b ("CLOCK0"),
.power_up_uninitialized ("FALSE"),
.ram_block_type ("M20K"),
.read_during_write_mode_mixed_ports ("DONT_CARE"),
.widthad_a (CSR_WORD_ADDR_WIDTH),
.widthad_b (CSR_WORD_ADDR_WIDTH),
.width_a (8*CSR_DATA_BYTES),
.width_b (8*CSR_DATA_BYTES),
.width_byteena_a (1)
)
csr_ram
(
.address_a (ram_wr_addr),
.address_b (ram_rd_addr),
.clock0 (clk_ddr),
.data_a (ram_wr_data),
.wren_a (ram_wr_en),
.q_b (ram_rd_data),
.address2_a (1'b1),
.address2_b (1'b1),
.addressstall_a (1'b0),
.addressstall_b (1'b0),
.byteena_a (1'b1),
.byteena_b (1'b1),
.clock1 (1'b1),
.clocken0 (1'b1),
.clocken1 (1'b1),
.clocken2 (1'b1),
.clocken3 (1'b1),
.data_b ({(8*CSR_DATA_BYTES){1'b1}}),
.eccencbypass (1'b0),
.eccencparity (8'b0),
.eccstatus (),
.q_a (),
.rden_a (1'b1),
.rden_b (1'b1),
.wren_b (1'b0)
);
////////////////////////
// Descriptor Queue //
////////////////////////
//runtime knows how many jobs it has enqueued and how many jobs have finished
//runtime is responsible for not overflowing the descriptor queue, it must limit the number of outstanding jobs queued in hardware
localparam int DESCRIPTOR_QUEUE_ALMOST_FULL_CUTOFF = DLA_DMA_CSR_DESCRIPTOR_QUEUE_WORDS_PER_JOB; //almost full asserts when queue only has space for 1 more job
dla_hld_fifo #(
.WIDTH (8*CONFIG_DATA_BYTES + 1), //+1 MSB for the for-intercept flag
.DEPTH (DLA_DMA_CSR_DESCRIPTOR_QUEUE_PHYSICAL_SIZE), //this is set to 512 in dla_dma_constants.svh, may as well use up full depth of M20K
.ALMOST_FULL_CUTOFF (DESCRIPTOR_QUEUE_ALMOST_FULL_CUTOFF),
.ASYNC_RESET (0), //consume reset synchronously
.SYNCHRONIZE_RESET (0), //reset is already synchronized
.STYLE ("ms")
)
descriptor_queue
(
.clock (clk_ddr),
.resetn (i_sclrn_ddr),
.i_valid (descriptor_queue_forced_write), //"forced" = written even if full, overflow is tracked in a sticky diagnostic bit instead
.i_data (descriptor_queue_data),
.o_stall (descriptor_queue_full), //software is responsible for not overflowing this fifo
.o_almost_full (descriptor_diagnostics_almost_full),
.o_valid (o_config_valid),
.o_data ({o_config_for_intercept, o_config_data}),
.i_stall (~i_config_ready | i_token_out_of_inferences) //also blocks descriptor dispatch while out of inferences
);
////////////////////////////
// Performance counters //
////////////////////////////
//Auxiliary logic that controls the jobs active counters
assign first_word_of_descriptor_being_read = o_config_valid & i_config_ready & (descriptor_words_read==3'h0); //desc words read was 0, going to be 1
always_ff @(posedge clk_ddr) begin
//descriptor_words_read is a mod-8 counter of words handed to the config reader; wraps to 0 at each job boundary (8 words per job)
if (o_config_valid & i_config_ready) descriptor_words_read <= descriptor_words_read + 1'b1;
if (ENABLE_INPUT_STREAMING & ENABLE_OUTPUT_STREAMING & ENABLE_ON_CHIP_PARAMETERS) begin
// In this case, we should only track the cycles between the feature data being read, and
// results being streamed out, since we continually read the on-chip config params
if (i_token_stream_started & ~i_token_done) jobs_active <= jobs_active + 1'b1;
if (~i_token_stream_started & i_token_done) jobs_active <= jobs_active - 1'b1;
end else begin
//simultaneous start+done leaves jobs_active unchanged (neither branch fires)
if (first_word_of_descriptor_being_read & ~i_token_done) jobs_active <= jobs_active + 1'b1;
if (~first_word_of_descriptor_being_read & i_token_done) jobs_active <= jobs_active - 1'b1;
end
if (~i_sclrn_ddr) begin
descriptor_words_read <= 3'h0;
jobs_active <= '0;
jobs_active_is_nonzero <= 1'b0; //NOTE(review): jobs_active_is_nonzero is only ever reset here, never set -- looks vestigial (jobs_active != 0 is used directly below); confirm before removing
end
end
logic core_jobs_active_is_nonzero_ddr_clk;
always_ff @(posedge clk_dla) begin
//core job tracking only runs in the fully-streaming on-chip-parameters configuration; otherwise core_jobs_active_is_nonzero stays 0
if (ENABLE_INPUT_STREAMING & ENABLE_OUTPUT_STREAMING & ENABLE_ON_CHIP_PARAMETERS) begin
// In this case, we should only track the cycles between the feature data being read, and
// results being streamed out, since we continually read the on-chip config params
if (i_stream_received_first_word & ~i_stream_sent_last_word) core_jobs_active <= core_jobs_active + 1'b1;
if (~i_stream_received_first_word & i_stream_sent_last_word) core_jobs_active <= core_jobs_active - 1'b1;
core_jobs_active_is_nonzero <= core_jobs_active != 0;
end
if (~dla_sclrn) begin
core_jobs_active <= '0;
core_jobs_active_is_nonzero <= 1'b0;
end
end
// crossover core_jobs_active_is_nonzero from dla to ddr clk
dla_clock_cross_full_sync dla_to_ddr_clock_cross_sync
(
.clk_src (clk_dla),
.i_src_async_resetn (1'b1),
.i_src_data (core_jobs_active_is_nonzero),
.o_src_data (),
.clk_dst (clk_ddr),
.i_dst_async_resetn (1'b1),
.o_dst_data (core_jobs_active_is_nonzero_ddr_clk)
);
//counts clk_ddr cycles during which the core has at least one active job (synchronized flag from the dla clock domain)
dla_dma_counter_64 count_total_core_clocks_active (
.i_clk (clk_ddr),
.i_sclrn (i_sclrn_ddr),
.i_increment_en (core_jobs_active_is_nonzero_ddr_clk),
.i_increment_val (32'b1),
.i_read_counter_low_bits (read_from_core_clocks_active_lo),
.o_counter_low_bits (total_core_clocks_active_lo),
.o_counter_high_bits_latch (total_core_clocks_active_hi)
);
//a job is active once the first word of its descriptor is read from the queue
//a job is finished once the feature writer sends a done token
dla_dma_counter_64 count_total_clocks_active (
.i_clk (clk_ddr),
.i_sclrn (i_sclrn_ddr),
.i_increment_en (jobs_active != 0),
.i_increment_val (32'b1),
.i_read_counter_low_bits (read_from_clocks_active_lo),
.o_counter_low_bits (total_clocks_active_lo),
.o_counter_high_bits_latch (total_clocks_active_hi)
);
//accumulates jobs_active every cycle, i.e. the sum over jobs of each job's active clock count
dla_dma_counter_64 count_total_clocks_for_all_jobs (
.i_clk (clk_ddr),
.i_sclrn (i_sclrn_ddr),
.i_increment_en (1'b1),
.i_increment_val (jobs_active),
.i_read_counter_low_bits (read_from_clocks_all_jobs_lo),
.o_counter_low_bits (total_clocks_for_all_jobs_lo),
.o_counter_high_bits_latch (total_clocks_for_all_jobs_hi)
);
//tracks the number of input feature reads in terms of memory word transfers.
dla_dma_counter_64 count_input_feature_reads (
.i_clk (clk_ddr),
.i_sclrn (i_sclrn_ddr),
.i_increment_en (i_input_feature_rready & i_input_feature_rvalid),
.i_increment_val (32'b1),
.i_read_counter_low_bits (read_from_input_feature_reads_lo),
.o_counter_low_bits (number_of_input_feature_reads_lo),
.o_counter_high_bits_latch (number_of_input_feature_reads_hi)
);
//tracks the number of output feature writes in terms of memory word transfers.
dla_dma_counter_64 count_output_feature_writes (
.i_clk (clk_ddr),
.i_sclrn (i_sclrn_ddr),
.i_increment_en (i_output_feature_wready & i_output_feature_wvalid),
.i_increment_val (32'b1),
.i_read_counter_low_bits (read_from_output_feature_writes_lo),
.o_counter_low_bits (number_of_output_feature_writes_lo),
.o_counter_high_bits_latch (number_of_output_feature_writes_hi)
);
//tracks the number of input filter reads in terms of memory word transfers.
dla_dma_counter_64 count_input_filter_reads (
.i_clk (clk_ddr),
.i_sclrn (i_sclrn_ddr),
.i_increment_en (i_input_filter_rready & i_input_filter_rvalid),
.i_increment_val (32'b1),
.i_read_counter_low_bits (read_from_input_filter_reads_lo),
.o_counter_low_bits (number_of_input_filter_reads_lo),
.o_counter_high_bits_latch (number_of_input_filter_reads_hi)
);
//////////////////////
// Address decode //
//////////////////////
//registered decode: these flags become valid one cycle after ram_wr_addr/ram_rd_addr are set,
//which the state machine accounts for (write decode is consumed in STATE_WRITE_COMMIT, read decode in STATE_READ_DATA)
//NOTE(review): the "/4" word-address conversions assume CSR_DATA_BYTES == 4 -- confirm whether other data widths are ever instantiated
always_ff @(posedge clk_ddr) begin
//the csr address space is mostly read only, except for a few specific offsets listed below
write_to_ram <= 1'b0;
if (ram_wr_addr == DLA_DMA_CSR_OFFSET_CONFIG_BASE_ADDR/4) write_to_ram <= 1'b1;
if (ram_wr_addr == DLA_DMA_CSR_OFFSET_CONFIG_RANGE_MINUS_TWO/4) write_to_ram <= 1'b1;
if (ram_wr_addr == DLA_DMA_CSR_OFFSET_INPUT_OUTPUT_BASE_ADDR/4) write_to_ram <= 1'b1;
if (ram_wr_addr == DLA_DMA_CSR_OFFSET_INTERMEDIATE_BASE_ADDR/4) write_to_ram <= 1'b1;
if (ram_wr_addr == DLA_DMA_CSR_OFFSET_DEBUG_NETWORK_ADDR/4) write_to_ram <= 1'b1;
if (ram_wr_addr == DLA_CSR_OFFSET_READY_STREAMING_IFACE/4) write_to_ram <= 1'b1;
//decode specific addresses in which the storage lives in registers
write_to_interrupt_control <= (ram_wr_addr == DLA_DMA_CSR_OFFSET_INTERRUPT_CONTROL/4);
read_from_interrupt_control <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_INTERRUPT_CONTROL/4);
write_to_interrupt_mask <= (ram_wr_addr == DLA_DMA_CSR_OFFSET_INTERRUPT_MASK/4);
read_from_interrupt_mask <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_INTERRUPT_MASK/4);
read_from_desc_diagnostics <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_DESC_DIAGNOSTICS/4);
read_from_completion_count <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_COMPLETION_COUNT/4);
read_from_clocks_active_lo <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_CLOCKS_ACTIVE_LO/4);
read_from_clocks_active_hi <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_CLOCKS_ACTIVE_HI/4);
read_from_core_clocks_active_lo <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_CORE_CLOCKS_ACTIVE_LO/4);
read_from_core_clocks_active_hi <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_CORE_CLOCKS_ACTIVE_HI/4);
read_from_clocks_all_jobs_lo <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_CLOCKS_ALL_JOBS_LO/4);
read_from_clocks_all_jobs_hi <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_CLOCKS_ALL_JOBS_HI/4);
write_to_debug_network_addr <= (ram_wr_addr == DLA_DMA_CSR_OFFSET_DEBUG_NETWORK_ADDR/4);
read_from_debug_network_valid <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_DEBUG_NETWORK_VALID/4);
read_from_debug_network_data <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_DEBUG_NETWORK_DATA/4);
read_from_license_flag <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_LICENSE_FLAG /4);
read_from_ip_reset <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_IP_RESET/4);
read_from_input_filter_reads_lo <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_INPUT_FILTER_READ_COUNT_LO/4);
read_from_input_filter_reads_hi <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_INPUT_FILTER_READ_COUNT_HI/4);
read_from_input_feature_reads_lo <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_INPUT_FEATURE_READ_COUNT_LO/4);
read_from_input_feature_reads_hi <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_INPUT_FEATURE_READ_COUNT_HI/4);
read_from_output_feature_writes_lo <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_OUTPUT_FEATURE_WRITE_COUNT_LO/4);
read_from_output_feature_writes_hi <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_OUTPUT_FEATURE_WRITE_COUNT_HI/4);
read_ready_streaming_interface<= (ram_rd_addr == DLA_CSR_OFFSET_READY_STREAMING_IFACE/4);
//decode specific addresses in which an action must be taken
enqueue_descriptor <= (ram_wr_addr == DLA_DMA_CSR_OFFSET_INPUT_OUTPUT_BASE_ADDR/4);
write_to_ip_reset <= (ram_wr_addr == DLA_DMA_CSR_OFFSET_IP_RESET/4);
if (ENABLE_INPUT_STREAMING) begin
//NOTE(review): write_ready_streaming_interface has no reset and is never assigned when ENABLE_INPUT_STREAMING==0,
//in which case it stays at its power-up value -- confirm the STATE_WRITE_COMMIT consumer is safe in that configuration
write_ready_streaming_interface <= (ram_wr_addr == DLA_CSR_OFFSET_READY_STREAMING_IFACE/4);
end
end
/////////////////////
// State machine //
/////////////////////
//one-hot FSM servicing one AXI4-lite read OR write at a time; the final reset block overrides all earlier assignments (last-assignment-wins)
always_ff @(posedge clk_ddr) begin
//default behavior
o_csr_arready <= 1'b0;
o_csr_rvalid <= 1'b0;
o_csr_awready <= 1'b0;
o_csr_wready <= 1'b0;
o_csr_bvalid <= 1'b0;
ram_wr_en <= 1'b0;
descriptor_queue_forced_write <= 1'b0;
descriptor_queue_data <= 'x;
debug_network_arvalid <= 1'b0;
debug_network_rready <= 1'b0;
o_request_ip_reset <= 1'b0;
o_streaming_active <= o_streaming_active; //hold value unless explicitly changed below
unique case (1'b1)
state[STATE_IDLE_BIT]: begin
//round robin arbitration between pending read and pending write using previous_was_write
if (i_csr_arvalid && (previous_was_write || ~(i_csr_awvalid && i_csr_wvalid))) begin
o_csr_arready <= 1'b1;
state <= STATE_READ_ACCEPT;
ram_rd_addr <= i_csr_araddr[CSR_ADDR_WIDTH-1:CSR_LO_ADDR]; //byte address -> word address
end
if (i_csr_awvalid && i_csr_wvalid && (~previous_was_write || ~i_csr_arvalid)) begin
o_csr_awready <= 1'b1;
o_csr_wready <= 1'b1;
state <= STATE_WRITE_ACCEPT;
ram_wr_addr <= i_csr_awaddr[CSR_ADDR_WIDTH-1:CSR_LO_ADDR]; //byte address -> word address
ram_wr_data <= i_csr_wdata;
end
end
state[STATE_READ_ACCEPT_BIT]: begin
//o_csr_arready is asserted now, indicates csr has accepted a read
//ram_rd_addr valid now
state <= STATE_READ_ADDR;
previous_was_write <= 1'b0;
end
state[STATE_READ_ADDR_BIT]: begin
//hardened input register inside m20k valid now
state <= STATE_READ_DATA;
end
state[STATE_READ_DATA_BIT]: begin
//hardened output register inside m20k valid now
//default read data comes from the RAM, then specific register-backed offsets override it (decode flags registered in the address decode block)
o_csr_rvalid <= 1'b1;
o_csr_rdata <= ram_rd_data;
if (read_from_interrupt_control) begin
o_csr_rdata <= '0;
o_csr_rdata[DLA_DMA_CSR_INTERRUPT_ERROR_BIT] <= interrupt_control_error;
o_csr_rdata[DLA_DMA_CSR_INTERRUPT_DONE_BIT] <= interrupt_control_done;
end
if (read_from_interrupt_mask) begin
o_csr_rdata <= '0;
o_csr_rdata[DLA_DMA_CSR_INTERRUPT_ERROR_BIT] <= interrupt_mask_error;
o_csr_rdata[DLA_DMA_CSR_INTERRUPT_DONE_BIT] <= interrupt_mask_done;
end
if (read_from_desc_diagnostics) begin
o_csr_rdata <= '0;
o_csr_rdata[DLA_DMA_CSR_DESC_DIAGNOSTICS_OVERFLOW_BIT] <= descriptor_diagnostics_overflow;
o_csr_rdata[DLA_DMA_CSR_DESC_DIAGNOSTICS_ALMOST_FULL_BIT] <= descriptor_diagnostics_almost_full;
o_csr_rdata[DLA_DMA_CSR_DESC_DIAGNOSTICS_OUT_OF_INFERENCES_BIT] <= i_token_out_of_inferences;
end
if (read_from_completion_count) o_csr_rdata <= completion_count;
if (read_from_clocks_active_lo) o_csr_rdata <= total_clocks_active_lo;
if (read_from_clocks_active_hi) o_csr_rdata <= total_clocks_active_hi;
if (read_from_core_clocks_active_lo) o_csr_rdata <= total_core_clocks_active_lo;
if (read_from_core_clocks_active_hi) o_csr_rdata <= total_core_clocks_active_hi;
if (read_from_clocks_all_jobs_lo) o_csr_rdata <= total_clocks_for_all_jobs_lo;
if (read_from_clocks_all_jobs_hi) o_csr_rdata <= total_clocks_for_all_jobs_hi;
if (read_from_input_feature_reads_lo) o_csr_rdata <= number_of_input_feature_reads_lo;
if (read_from_input_feature_reads_hi) o_csr_rdata <= number_of_input_feature_reads_hi;
if (read_from_input_filter_reads_lo) o_csr_rdata <= number_of_input_filter_reads_lo;
if (read_from_input_filter_reads_hi) o_csr_rdata <= number_of_input_filter_reads_hi;
if (read_from_output_feature_writes_lo) o_csr_rdata <= number_of_output_feature_writes_lo;
if (read_from_output_feature_writes_hi) o_csr_rdata <= number_of_output_feature_writes_hi;
if (read_from_debug_network_valid) o_csr_rdata <= debug_network_rvalid; //read prefetch after dcfifo has valid data
if (read_from_debug_network_data) begin
o_csr_rdata <= debug_network_rdata; //read prefetch after dcfifo
debug_network_rready <= 1'b1; //rdack the read prefetch
end
if (read_from_license_flag) o_csr_rdata <= i_license_flag;
if (read_from_ip_reset) o_csr_rdata <= '0; //this read will always return 0
if (read_ready_streaming_interface) o_csr_rdata <= o_streaming_active;
//stay here (re-asserting rvalid each cycle) until the host accepts the read data
if (o_csr_rvalid && i_csr_rready) begin
o_csr_rvalid <= 1'b0;
state <= STATE_IDLE;
end
end
state[STATE_WRITE_ACCEPT_BIT]: begin
//o_csr_awready and o_csr_wready are asserted now, indicates csr has accepted a write
//ram_wr_addr valid now
previous_was_write <= 1'b1;
state <= STATE_WRITE_COMMIT;
end
state[STATE_WRITE_COMMIT_BIT]: begin
//write_to_ram valid now
ram_wr_en <= write_to_ram;
if (write_to_interrupt_control) begin //write 1 to clear
if (ram_wr_data[DLA_DMA_CSR_INTERRUPT_ERROR_BIT]) interrupt_control_error <= 1'b0;
if (ram_wr_data[DLA_DMA_CSR_INTERRUPT_DONE_BIT]) interrupt_control_done <= 1'b0;
end
if (write_to_interrupt_mask) begin
interrupt_mask_error <= ram_wr_data[DLA_DMA_CSR_INTERRUPT_ERROR_BIT];
interrupt_mask_done <= ram_wr_data[DLA_DMA_CSR_INTERRUPT_DONE_BIT];
end
if (write_to_debug_network_addr) begin
//don't care if dcfifo is full, handshaking scheme is already tolerant to debug network not responding to requests
debug_network_arvalid <= 1'b1;
debug_network_araddr <= ram_wr_data;
end
//stay here (re-asserting bvalid each cycle) until the host accepts the write response, then branch on the decoded action
o_csr_bvalid <= 1'b1;
if (o_csr_bvalid && i_csr_bready) begin
o_csr_bvalid <= 1'b0;
if (enqueue_descriptor) state <= STATE_DESCRIPTOR;
else if (write_to_ip_reset) state <= (ram_wr_data != '0) ? STATE_AWAIT_RESET : STATE_IDLE;
else if (write_ready_streaming_interface) begin
if (ram_wr_data == 1) begin
state <= STATE_IDLE;
if (~ENABLE_ON_CHIP_PARAMETERS) state <= STATE_DESCRIPTOR; //config must still be fetched from memory, so enqueue a descriptor too
o_streaming_active <= 1'b1;
end else begin
state <= STATE_IDLE;
o_streaming_active <= 1'b0;
end
end
else state <= STATE_IDLE;
end
descriptor_count <= 0; //prepare for STATE_DESCRIPTOR
end
state[STATE_DESCRIPTOR_BIT]: begin
//assemble one 8-word descriptor: issue RAM read addresses on counts 0..7, capture/enqueue the (3-cycle delayed) data on counts 3..10
descriptor_count <= descriptor_count + 1'b1;
case (descriptor_count)
4'h0: ram_rd_addr <= DLA_DMA_CSR_OFFSET_CONFIG_BASE_ADDR/4; //addr gen 0: config reader base addr
4'h1: ram_rd_addr <= 'x; //addr gen 1: token
4'h2: ram_rd_addr <= DLA_DMA_CSR_OFFSET_CONFIG_RANGE_MINUS_TWO/4; //addr gen 2: config reader num words minus two
4'h3: ram_rd_addr <= 'x; //addr gen 3: addr update
4'h4: ram_rd_addr <= DLA_DMA_CSR_OFFSET_CONFIG_RANGE_MINUS_TWO/4; //intercept 0: config reader num words minus two
4'h5: ram_rd_addr <= DLA_DMA_CSR_OFFSET_CONFIG_BASE_ADDR/4; //intercept 1: filter reader offset correction
4'h6: ram_rd_addr <= DLA_DMA_CSR_OFFSET_INPUT_OUTPUT_BASE_ADDR/4; //intercept 2: feature input/output offset
4'h7: ram_rd_addr <= DLA_DMA_CSR_OFFSET_INTERMEDIATE_BASE_ADDR/4; //intercept 3: feature intermediate offset
default: ram_rd_addr <= 'x;
endcase
//there are 3 clocks of latency from the time ram_rd_addr is set until ram_rd_data is valid
//This is why the config_reader struct in the dma/dual_inc folder has to be laid out in that order
//MSB of descriptor_queue_data is the for-intercept flag (0 = addr gen, 1 = intercept)
case (descriptor_count)
4'h3: descriptor_queue_data <= {1'b0, ram_rd_data}; //addr gen 0: config reader base addr
4'h4: descriptor_queue_data <= '0; //addr gen 1: token
4'h5: descriptor_queue_data <= {1'b0, ram_rd_data}; //addr gen 2: config reader num words minus two
4'h6: descriptor_queue_data <= CONFIG_READER_DATA_BYTES; //addr gen 3: addr update
4'h7: descriptor_queue_data <= {1'b1, ram_rd_data}; //intercept 0: config reader num words minus two
4'h8: descriptor_queue_data <= {1'b1, ram_rd_data}; //intercept 1: filter reader offset correction
4'h9: descriptor_queue_data <= {1'b1, ram_rd_data}; //intercept 2: feature input/output offset
4'ha: descriptor_queue_data <= {1'b1, ram_rd_data}; //intercept 3: feature intermediate offset
default: descriptor_queue_data <= 'x;
endcase
descriptor_queue_forced_write <= (descriptor_count >= 4'h3); //8 writes, on counts 3 through 10 (data lags the address by 3 cycles)
if (descriptor_count == 4'ha) state <= STATE_IDLE;
end
state[STATE_AWAIT_RESET_BIT]: begin
//reset request was triggered by a CSR write
// -we completed the axi4-lite write response handshake in STATE_WRITE_COMMIT
// -we don't want to return to STATE_IDLE, since a new transaction might get initiated and then interrupted when reset hits
// -we should assert o_request_ip_reset for multiple cycles to ensure the async signal is synchronized into all clock domains
//so, just hang out here and wait for reset
o_request_ip_reset <= 1'b1;
state <= STATE_AWAIT_RESET;
end
default: begin
state <= STATE_IDLE;
end
endcase
//completion tracking
completion_count <= completion_count + i_token_done;
//interrupt tracking -- sticky bits set here, cleared by write-1-to-clear in STATE_WRITE_COMMIT
if (i_token_error) interrupt_control_error <= 1'b1;
if (i_token_done) interrupt_control_done <= 1'b1;
//sticky bit for detecting if descriptor queue has overflowed
if (descriptor_queue_forced_write & descriptor_queue_full) descriptor_diagnostics_overflow <= 1'b1;
if (~i_sclrn_ddr) begin
//state
state <= STATE_IDLE;
previous_was_write <= 1'b0;
//AXI4-lite outputs to host control
o_csr_arready <= 1'b0;
o_csr_rvalid <= 1'b0;
o_csr_awready <= 1'b0;
o_csr_wready <= 1'b0;
o_csr_bvalid <= 1'b0;
//ram
ram_wr_en <= 1'b0;
//specific offsets implemented in registers
interrupt_control_error <= 1'b0;
interrupt_control_done <= 1'b0;
interrupt_mask_error <= 1'b0;
interrupt_mask_done <= 1'b0;
completion_count <= '0;
descriptor_diagnostics_overflow <= 1'b0;
//descriptor queue
descriptor_queue_forced_write <= 1'b0;
//debug network
debug_network_arvalid <= 1'b0;
debug_network_rready <= 1'b0;
// stops streaming reload
o_streaming_active <= 1'b0;
end
end
//////////////////////////////////////////////////////////
// Bring the level interrupt to the host clock domain //
//////////////////////////////////////////////////////////
//interrupt is level sensitive: asserted while any unmasked sticky status bit is set
always_ff @(posedge clk_ddr) begin
ddr_interrupt_level <= 1'b0;
if (interrupt_mask_error & interrupt_control_error) ddr_interrupt_level <= 1'b1;
if (interrupt_mask_done & interrupt_control_done ) ddr_interrupt_level <= 1'b1;
end
//this is a 3-stage register-based synchronizer
dla_clock_cross_full_sync dla_clock_cross_sync
(
.clk_src (clk_ddr),
.i_src_async_resetn (1'b1),
.i_src_data (ddr_interrupt_level),
.o_src_data (),
.clk_dst (clk_pcie),
.i_dst_async_resetn (1'b1),
.o_dst_data (o_interrupt_level)
);
///////////////////////////
// Clock crossing FIFOS //
///////////////////////////
localparam int DCFIFO_DEPTH = 32; //dcfifo is RAM-based, may as well use an entire MLAB
//ddr -> dla: carry the debug network read request address
dla_acl_dcfifo #(
.WIDTH (8*CSR_DATA_BYTES),
.DEPTH (DCFIFO_DEPTH)
)
clock_cross_debug_network_request
(
.async_resetn (i_resetn_async), //reset synchronization is handled internally
//write side -- write is ignored if fifo is full, this is okay since debug network handshaking is fault tolerant
.wr_clock (clk_ddr),
.wr_req (debug_network_arvalid),
.wr_data (debug_network_araddr),
//read side
.rd_clock (clk_dla),
.rd_empty (not_o_debug_network_arvalid),
.rd_data (o_debug_network_araddr),
.rd_ack (i_debug_network_arready)
);
assign o_debug_network_arvalid = ~not_o_debug_network_arvalid; //fifo not-empty = request valid
//dla -> ddr: carry the debug network read response data back
dla_acl_dcfifo #(
.WIDTH (8*CSR_DATA_BYTES),
.DEPTH (DCFIFO_DEPTH)
)
clock_cross_debug_network_response
(
.async_resetn (i_resetn_async), //reset synchronization is handled internally
//write side
.wr_clock (clk_dla),
.wr_req (i_debug_network_rvalid),
.wr_data (i_debug_network_rdata),
.wr_full (not_o_debug_network_rready),
//read side
.rd_clock (clk_ddr),
.rd_empty (debug_network_dcfifo_empty),
.rd_data (debug_network_dcfifo_data),
.rd_ack (~debug_network_dcfifo_empty) //consume read data immediately, cached in a read prefetch
);
assign o_debug_network_rready = ~not_o_debug_network_rready; //ready = response fifo not full
//cache the most recent value returned from the debug network
//rvalid is set when new data arrives, cleared when the host reads the data register (or new data overwrites the cache first)
always_ff @(posedge clk_ddr) begin
if (~debug_network_dcfifo_empty) begin
debug_network_rdata <= debug_network_dcfifo_data;
debug_network_rvalid <= 1'b1;
end
if (debug_network_rready) begin
debug_network_rvalid <= 1'b0;
end
if (~i_sclrn_ddr) begin
debug_network_rvalid <= 1'b0;
end
end
endmodule
|