// Copyright 2020-2020 Intel Corporation.
//
// This software and the related documents are Intel copyrighted materials,
// and your use of them is governed by the express license under which they
// were provided to you ("License"). Unless the License provides otherwise,
// you may not use, modify, copy, publish, distribute, disclose or transmit
// this software or the related documents without Intel's prior written
// permission.
//
// This software and the related documents are provided as is, with no express
// or implied warranties, other than those that are expressly stated in the
// License.


// This module implements the CSR for DMA. It also includes the descriptor queue
// and interrupt request generator. The CSR is implemented with a RAM. Certain
// values are kept live in registers, such as the interrupt control and mask. This
// makes it easier to detect when a change has happened (instead of trying to do a
// read-modify-write with the RAM).
//
// The AXI4 lite slave interface is usually going to backpressure PCIe. There is
// a state machine which allows one outstanding read request, or one outstanding
// write request at a time (write requests can be outstanding if the writeack is
// backpressured, which AXI allows). There is a register which tracks whether the
// last request was a read or a write, which enables round-robin arbitration. Each
// request takes a few clock cycles to process, as the address needs to be decoded
// to determine if a write is allowed to commit to the RAM, or if we need to use
// read data from one of the registers instead of the RAM.
//
// Special offsets are defined as localparams below. Writing to DLA_DMA_CSR_OFFSET_INPUT_OUTPUT_BASE_ADDR/4
// will cause one unit of work to be enqueued in the descriptor queue. Currently
// this involves writing 8 values to a fifo, which are then consumed by the config
// reader. Internal to the config reader, 4 values go to the config reader address
// generator, the other 4 go to the config reader intercept.
//
// Beware the following assumptions about how the host issues requests to this CSR:
// - no bursts (required by AXI4 lite)
// - byte enables are assumed to be all 1 (no partial word access)
// - all addresses must be word aligned (e.g. if CSR_DATA_BYTES=4 then the bottom 2 bits of address must be 0)
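//
// Illustrative host-side enqueue sequence (a sketch only, not part of the synthesized logic; the
// register names are the DLA_DMA_CSR_OFFSET_* byte offsets from dla_dma_constants.svh, and the
// ordering of the first three writes is an assumption -- only the final write is the trigger
// described above):
//   1. write DLA_DMA_CSR_OFFSET_CONFIG_BASE_ADDR        //config reader base address
//   2. write DLA_DMA_CSR_OFFSET_CONFIG_RANGE_MINUS_TWO  //config reader number of words minus two
//   3. write DLA_DMA_CSR_OFFSET_INTERMEDIATE_BASE_ADDR  //intermediate buffer offset
//   4. write DLA_DMA_CSR_OFFSET_INPUT_OUTPUT_BASE_ADDR  //this final write enqueues one unit of
//                                                       //work into the descriptor queue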

`resetall
`undefineall
`default_nettype none
`include "dla_acl_parameter_assert.svh"

module dla_dma_csr #(
    parameter int CSR_ADDR_WIDTH,           //width of the byte address signal, determines CSR address space size, e.g. 11 bit address = 2048 bytes, the largest size that uses only 1 M20K
    parameter int CSR_DATA_BYTES,           //width of the CSR data path, typically 4 bytes
    parameter int CONFIG_DATA_BYTES,        //data width of the config network output port, typically 4 bytes, the descriptor queue matches this so that config decode can be reused
    parameter int CONFIG_READER_DATA_BYTES, //data width of the config network input port, needed by config reader address generator for loop update

    parameter int ENABLE_INPUT_STREAMING,
    parameter int ENABLE_OUTPUT_STREAMING,
    parameter int ENABLE_ON_CHIP_PARAMETERS

    ) (
    input  wire                             clk_ddr,
    input  wire                             clk_pcie,
    input  wire                             clk_dla,
    input  wire                             i_sclrn_ddr,        //active low reset that has already been synchronized to clk_ddr
    input  wire                             i_resetn_async,     //active low reset that has NOT been synchronized to any clock, only to be consumed by dcfifo

    //updates for interrupt, runs on ddr clock
    input  wire                             i_token_done,       //feature writer reports it is done
    input  wire                             i_token_stream_started, //input streamer is reading the first word
    input  wire                             i_stream_received_first_word,
    input  wire                             i_stream_sent_last_word,
    input  wire                             i_token_error,      //dla has encountered some error, assert high for one clock cycle to report it to host (assuming mask bit is 1)
    input  wire                             i_license_flag,
    input  wire                             i_token_out_of_inferences,

    //snoop signals for the input feature, output feature, and filter LSUs' core <--> fabric traffic
    //run on clk_ddr
    input  wire                             i_input_feature_rvalid,
    input  wire                             i_input_feature_rready,
    input  wire                             i_input_filter_rvalid,
    input  wire                             i_input_filter_rready,
    input  wire                             i_output_feature_wvalid,
    input  wire                             i_output_feature_wready,

    //interrupt request to pcie, runs on pcie clock
    output logic                            o_interrupt_level,  //level sensitive interrupt

    //read side of descriptor queue goes to config reader, runs on ddr clock
    output logic                            o_config_valid,
    output logic  [8*CONFIG_DATA_BYTES-1:0] o_config_data,
    output logic                            o_config_for_intercept, //0 = goes to config reader addr gen, 1 = goes to config reader intercept
    input  wire                             i_config_ready,

    //debug network AXI-4 lite interface, read request and read response channels, runs on dla_clock
    output logic                            o_debug_network_arvalid,
    output logic     [8*CSR_DATA_BYTES-1:0] o_debug_network_araddr,
    input  wire                             i_debug_network_arready,
    input  wire                             i_debug_network_rvalid,
    input  wire      [8*CSR_DATA_BYTES-1:0] i_debug_network_rdata,
    output logic                            o_debug_network_rready,

    //AXI4-lite slave interface for host control, runs on ddr clock
    //no bursts, byte enables are assumed to be all 1, all addresses must be word aligned (e.g. if CSR_DATA_BYTES=4 then the bottom 2 bits of address must be 0)
    input  wire                             i_csr_arvalid,
    input  wire        [CSR_ADDR_WIDTH-1:0] i_csr_araddr,
    output logic                            o_csr_arready,
    output logic                            o_csr_rvalid,
    output logic     [8*CSR_DATA_BYTES-1:0] o_csr_rdata,
    input  wire                             i_csr_rready,
    input  wire                             i_csr_awvalid,
    input  wire        [CSR_ADDR_WIDTH-1:0] i_csr_awaddr,
    output logic                            o_csr_awready,
    input  wire                             i_csr_wvalid,
    input  wire      [8*CSR_DATA_BYTES-1:0] i_csr_wdata,
    output logic                            o_csr_wready,
    output logic                            o_csr_bvalid,
    input  wire                             i_csr_bready,

    //reset request for the whole ip, runs on ddr clock
    output logic                            o_request_ip_reset,

    //output bit to start/stop streaming interface
    output logic                            o_streaming_active
);


    /////////////////////////////////
    //  Parameter legality checks  //
    /////////////////////////////////

    //signal widths cannot be trivial
    `DLA_ACL_PARAMETER_ASSERT(CSR_DATA_BYTES >= 1)
    `DLA_ACL_PARAMETER_ASSERT(CONFIG_DATA_BYTES >= 1)

    //csr address space cannot be trivial
    `DLA_ACL_PARAMETER_ASSERT(2**CSR_ADDR_WIDTH > CONFIG_DATA_BYTES)

    //offsets must be within address space
    localparam int CSR_LO_ADDR = $clog2(CSR_DATA_BYTES);    //number of LSBs that must be 0 in order for byte address to be word aligned
    localparam int CSR_WORD_ADDR_WIDTH = CSR_ADDR_WIDTH - CSR_LO_ADDR;



    /////////////////
    //  Constants  //
    /////////////////
    `include "dla_dma_constants.svh"
    //special offsets -- these values are defined in one place and shared between hardware and software
    //the constants from the dla_dma_constants.svh header file that CSR cares about are named DLA_DMA_CSR_OFFSET_**** and DLA_DMA_CSR_INTERRUPT_****

    //state machine
    enum {
        STATE_IDLE_BIT,
        STATE_READ_ACCEPT_BIT,
        STATE_READ_ADDR_BIT,
        STATE_READ_DATA_BIT,
        STATE_WRITE_ACCEPT_BIT,
        STATE_WRITE_COMMIT_BIT,
        STATE_DESCRIPTOR_BIT,
        STATE_AWAIT_RESET_BIT
    } index;

    enum logic [index.num()-1:0] {
        //1-hot encodings
        STATE_IDLE = 1 << STATE_IDLE_BIT,
        STATE_READ_ACCEPT = 1 << STATE_READ_ACCEPT_BIT,
        STATE_READ_ADDR = 1 << STATE_READ_ADDR_BIT,
        STATE_READ_DATA = 1 << STATE_READ_DATA_BIT,
        STATE_WRITE_ACCEPT = 1 << STATE_WRITE_ACCEPT_BIT,
        STATE_WRITE_COMMIT = 1 << STATE_WRITE_COMMIT_BIT,
        STATE_DESCRIPTOR = 1 << STATE_DESCRIPTOR_BIT,
        STATE_AWAIT_RESET = 1 << STATE_AWAIT_RESET_BIT,
        XXX = 'x
    } state;

    localparam int MAX_JOBS_ACTIVE   = 64;  //upper bounded by how many descriptors the queue can hold
    localparam int JOBS_ACTIVE_WIDTH = $clog2(MAX_JOBS_ACTIVE+1);
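    //this bound follows from the queue geometry: assuming DLA_DMA_CSR_DESCRIPTOR_QUEUE_PHYSICAL_SIZE
    //is 512 words and each descriptor takes 8 words (the 8 values noted in the module header), the
    //queue can hold at most 512/8 = 64 descriptors, hence MAX_JOBS_ACTIVE = 64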



    ///////////////
    //  Signals  //
    ///////////////

    //ram
    logic                           ram_wr_en;
    logic [CSR_WORD_ADDR_WIDTH-1:0] ram_wr_addr, ram_rd_addr;
    logic    [8*CSR_DATA_BYTES-1:0] ram_wr_data, ram_rd_data;

    //descriptor queue
    logic                           descriptor_queue_forced_write, descriptor_queue_full, descriptor_diagnostics_almost_full;
    logic   [8*CONFIG_DATA_BYTES:0] descriptor_queue_data;
    logic                     [2:0] descriptor_words_read;
    logic                           first_word_of_descriptor_being_read, jobs_active_is_nonzero, core_jobs_active_is_nonzero;
    logic   [JOBS_ACTIVE_WIDTH-1:0] jobs_active, core_jobs_active;

    //Performance counter connections
    logic                    [31:0] total_clocks_active_lo, total_clocks_active_hi;
    logic                    [31:0] total_core_clocks_active_lo, total_core_clocks_active_hi;
    logic                    [31:0] total_clocks_for_all_jobs_lo, total_clocks_for_all_jobs_hi;
    logic                    [31:0] number_of_input_feature_reads_lo, number_of_input_feature_reads_hi;
    logic                    [31:0] number_of_input_filter_reads_lo, number_of_input_filter_reads_hi;
    logic                    [31:0] number_of_output_feature_writes_lo, number_of_output_feature_writes_hi;

    //state machine
    logic                           previous_was_write;
    logic                     [3:0] descriptor_count;

    //specific offsets are implemented in registers instead of RAM
    logic                           interrupt_control_error, interrupt_control_done, interrupt_mask_error, interrupt_mask_done;
    logic    [8*CSR_DATA_BYTES-1:0] completion_count;
    logic                           descriptor_diagnostics_overflow;

    //address decode for specific offsets that are implemented in registers or require some action to be taken
    logic                           write_to_interrupt_control, read_from_interrupt_control, write_to_interrupt_mask, read_from_interrupt_mask;
    logic                           write_to_ram, read_from_desc_diagnostics, read_from_completion_count, enqueue_descriptor;
    logic                           read_from_clocks_active_lo, read_from_clocks_active_hi, read_from_clocks_all_jobs_lo, read_from_clocks_all_jobs_hi;
    logic                           read_from_core_clocks_active_lo, read_from_core_clocks_active_hi;
    logic                           read_from_input_feature_reads_lo, read_from_input_feature_reads_hi;
    logic                           read_from_input_filter_reads_lo, read_from_input_filter_reads_hi;
    logic                           read_from_output_feature_writes_lo, read_from_output_feature_writes_hi;
    logic                           write_to_debug_network_addr, read_from_debug_network_valid, read_from_debug_network_data;
    logic                           read_from_license_flag;
    logic                           read_from_ip_reset, write_to_ip_reset;

    //clock crosser for interrupt
    logic                           ddr_interrupt_level;

    //debug network read request address
    logic                           debug_network_arvalid, not_o_debug_network_arvalid;
    logic    [8*CSR_DATA_BYTES-1:0] debug_network_araddr;

    //debug network read response data
    logic                           not_o_debug_network_rready, debug_network_dcfifo_empty, debug_network_rvalid, debug_network_rready;
    logic    [8*CSR_DATA_BYTES-1:0] debug_network_dcfifo_data, debug_network_rdata;

    //streaming states
    logic                           write_ready_streaming_interface;
    logic                           read_ready_streaming_interface;

    logic dla_sclrn;

    //reset parameterization
    localparam int RESET_USE_SYNCHRONIZER = 1;
    localparam int RESET_PIPE_DEPTH       = 3;
    localparam int RESET_NUM_COPIES       = 1;
    dla_reset_handler_simple #(
        .USE_SYNCHRONIZER   (RESET_USE_SYNCHRONIZER),
        .PIPE_DEPTH         (RESET_PIPE_DEPTH),
        .NUM_COPIES         (RESET_NUM_COPIES)
    )
    ddr_reset_synchronizer
    (
        .clk                (clk_dla),
        .i_resetn           (i_resetn_async),
        .o_sclrn            (dla_sclrn)
    );

    ///////////
    //  RAM  //
    ///////////

    //could use hld_ram, but this simple ram doesn't need the depth stitching or clock enable magic that hld_ram provides

    altera_syncram
    #(
        .address_aclr_b                     ("NONE"),
        .address_reg_b                      ("CLOCK0"),
        .clock_enable_input_a               ("BYPASS"),
        .clock_enable_input_b               ("BYPASS"),
        .clock_enable_output_b              ("BYPASS"),
        .enable_ecc                         ("FALSE"),
        .init_file                          ("dla_dma_csr_discovery_rom.mif"),
        .intended_device_family             ("Arria 10"),       //Quartus will fix this automatically
        .lpm_type                           ("altera_syncram"),
        .numwords_a                         (2**CSR_WORD_ADDR_WIDTH),
        .numwords_b                         (2**CSR_WORD_ADDR_WIDTH),
        .operation_mode                     ("DUAL_PORT"),
        .outdata_aclr_b                     ("NONE"),
        .outdata_sclr_b                     ("NONE"),
        .outdata_reg_b                      ("CLOCK0"),
        .power_up_uninitialized             ("FALSE"),
        .ram_block_type                     ("M20K"),
        .read_during_write_mode_mixed_ports ("DONT_CARE"),
        .widthad_a                          (CSR_WORD_ADDR_WIDTH),
        .widthad_b                          (CSR_WORD_ADDR_WIDTH),
        .width_a                            (8*CSR_DATA_BYTES),
        .width_b                            (8*CSR_DATA_BYTES),
        .width_byteena_a                    (1)
    )
    csr_ram
    (
        .address_a                          (ram_wr_addr),
        .address_b                          (ram_rd_addr),
        .clock0                             (clk_ddr),
        .data_a                             (ram_wr_data),
        .wren_a                             (ram_wr_en),
        .q_b                                (ram_rd_data),
        .address2_a                         (1'b1),
        .address2_b                         (1'b1),
        .addressstall_a                     (1'b0),
        .addressstall_b                     (1'b0),
        .byteena_a                          (1'b1),
        .byteena_b                          (1'b1),
        .clock1                             (1'b1),
        .clocken0                           (1'b1),
        .clocken1                           (1'b1),
        .clocken2                           (1'b1),
        .clocken3                           (1'b1),
        .data_b                             ({(8*CSR_DATA_BYTES){1'b1}}),
        .eccencbypass                       (1'b0),
        .eccencparity                       (8'b0),
        .eccstatus                          (),
        .q_a                                (),
        .rden_a                             (1'b1),
        .rden_b                             (1'b1),
        .wren_b                             (1'b0)
    );



    ////////////////////////
    //  Descriptor Queue  //
    ////////////////////////

    //runtime knows how many jobs it has enqueued and how many jobs have finished
    //runtime is responsible for not overflowing the descriptor queue; it must limit the number of outstanding jobs queued in hardware

    localparam int DESCRIPTOR_QUEUE_ALMOST_FULL_CUTOFF = DLA_DMA_CSR_DESCRIPTOR_QUEUE_WORDS_PER_JOB;    //almost full asserts when queue only has space for 1 more job

    dla_hld_fifo #(
        .WIDTH                      (8*CONFIG_DATA_BYTES + 1),
        .DEPTH                      (DLA_DMA_CSR_DESCRIPTOR_QUEUE_PHYSICAL_SIZE),   //this is set to 512 in dla_dma_constants.svh, may as well use up full depth of M20K
        .ALMOST_FULL_CUTOFF         (DESCRIPTOR_QUEUE_ALMOST_FULL_CUTOFF),
        .ASYNC_RESET                (0),    //consume reset synchronously
        .SYNCHRONIZE_RESET          (0),    //reset is already synchronized
        .STYLE                      ("ms")
    )
    descriptor_queue
    (
        .clock                      (clk_ddr),
        .resetn                     (i_sclrn_ddr),

        .i_valid                    (descriptor_queue_forced_write),
        .i_data                     (descriptor_queue_data),
        .o_stall                    (descriptor_queue_full),    //software is responsible for not overflowing this fifo
        .o_almost_full              (descriptor_diagnostics_almost_full),

        .o_valid                    (o_config_valid),
        .o_data                     ({o_config_for_intercept, o_config_data}),
        .i_stall                    (~i_config_ready | i_token_out_of_inferences)
    );
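
    //illustrative overflow-avoidance strategy for the runtime (a sketch, not enforced in hardware):
    //bound the number of in-flight jobs using completion tracking (jobs enqueued minus the value read
    //from DLA_DMA_CSR_OFFSET_COMPLETION_COUNT), or check the almost-full and overflow bits exposed at
    //DLA_DMA_CSR_OFFSET_DESC_DIAGNOSTICS before enqueueing another job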



    ////////////////////////////
    //  Performance counters  //
    ////////////////////////////

    //Auxiliary logic that controls the jobs active counters
    assign first_word_of_descriptor_being_read = o_config_valid & i_config_ready & (descriptor_words_read==3'h0);   //desc words read was 0, going to be 1
    always_ff @(posedge clk_ddr) begin
        if (o_config_valid & i_config_ready) descriptor_words_read <= descriptor_words_read + 1'b1;

        if (ENABLE_INPUT_STREAMING & ENABLE_OUTPUT_STREAMING & ENABLE_ON_CHIP_PARAMETERS) begin
            // In this case, we should only track the cycles between the feature data being read, and
            // results being streamed out, since we continually read the on-chip config params
            if (i_token_stream_started & ~i_token_done) jobs_active <= jobs_active + 1'b1;
            if (~i_token_stream_started & i_token_done) jobs_active <= jobs_active - 1'b1;
        end else begin
            if (first_word_of_descriptor_being_read & ~i_token_done) jobs_active <= jobs_active + 1'b1;
            if (~first_word_of_descriptor_being_read & i_token_done) jobs_active <= jobs_active - 1'b1;
        end

        if (~i_sclrn_ddr) begin
            descriptor_words_read <= 3'h0;
            jobs_active <= '0;
            jobs_active_is_nonzero <= 1'b0;
        end
    end

    logic core_jobs_active_is_nonzero_ddr_clk;

    always_ff @(posedge clk_dla) begin
        if (ENABLE_INPUT_STREAMING & ENABLE_OUTPUT_STREAMING & ENABLE_ON_CHIP_PARAMETERS) begin
            // In this case, we should only track the cycles between the feature data being read, and
            // results being streamed out, since we continually read the on-chip config params
            if (i_stream_received_first_word & ~i_stream_sent_last_word) core_jobs_active <= core_jobs_active + 1'b1;
            if (~i_stream_received_first_word & i_stream_sent_last_word) core_jobs_active <= core_jobs_active - 1'b1;
            core_jobs_active_is_nonzero <= core_jobs_active != 0;
        end
        if (~dla_sclrn) begin
            core_jobs_active <= '0;
            core_jobs_active_is_nonzero <= 1'b0;
        end
    end

    // crossover core_jobs_active_is_nonzero from dla to ddr clk
    dla_clock_cross_full_sync dla_to_ddr_clock_cross_sync
    (
        .clk_src            (clk_dla),
        .i_src_async_resetn (1'b1),
        .i_src_data         (core_jobs_active_is_nonzero),
        .o_src_data         (),

        .clk_dst            (clk_ddr),
        .i_dst_async_resetn (1'b1),
        .o_dst_data         (core_jobs_active_is_nonzero_ddr_clk)
    );


    //count the number of clock cycles during which at least one job is active inside the core
    dla_dma_counter_64 count_total_core_clocks_active (
        .i_clk                      (clk_ddr),
        .i_sclrn                    (i_sclrn_ddr),
        .i_increment_en             (core_jobs_active_is_nonzero_ddr_clk),
        .i_increment_val            (32'b1),
        .i_read_counter_low_bits    (read_from_core_clocks_active_lo),
        .o_counter_low_bits         (total_core_clocks_active_lo),
        .o_counter_high_bits_latch  (total_core_clocks_active_hi)
    );
    //a job is active once the first word of its descriptor is read from the queue
    //a job is finished once the feature writer sends a done token
    dla_dma_counter_64 count_total_clocks_active (
        .i_clk                      (clk_ddr),
        .i_sclrn                    (i_sclrn_ddr),
        .i_increment_en             (jobs_active != 0),
        .i_increment_val            (32'b1),
        .i_read_counter_low_bits    (read_from_clocks_active_lo),
        .o_counter_low_bits         (total_clocks_active_lo),
        .o_counter_high_bits_latch  (total_clocks_active_hi)
    );

    dla_dma_counter_64 count_total_clocks_for_all_jobs (
        .i_clk                      (clk_ddr),
        .i_sclrn                    (i_sclrn_ddr),
        .i_increment_en             (1'b1),
        .i_increment_val            (jobs_active),
        .i_read_counter_low_bits    (read_from_clocks_all_jobs_lo),
        .o_counter_low_bits         (total_clocks_for_all_jobs_lo),
        .o_counter_high_bits_latch  (total_clocks_for_all_jobs_hi)
    );

    //tracks the number of input feature reads in terms of memory word transfers.
    dla_dma_counter_64 count_input_feature_reads (
        .i_clk                      (clk_ddr),
        .i_sclrn                    (i_sclrn_ddr),
        .i_increment_en             (i_input_feature_rready & i_input_feature_rvalid),
        .i_increment_val            (32'b1),
        .i_read_counter_low_bits    (read_from_input_feature_reads_lo),
        .o_counter_low_bits         (number_of_input_feature_reads_lo),
        .o_counter_high_bits_latch  (number_of_input_feature_reads_hi)
    );

    //tracks the number of output feature writes in terms of memory word transfers.
    dla_dma_counter_64 count_output_feature_writes (
        .i_clk                      (clk_ddr),
        .i_sclrn                    (i_sclrn_ddr),
        .i_increment_en             (i_output_feature_wready & i_output_feature_wvalid),
        .i_increment_val            (32'b1),
        .i_read_counter_low_bits    (read_from_output_feature_writes_lo),
        .o_counter_low_bits         (number_of_output_feature_writes_lo),
        .o_counter_high_bits_latch  (number_of_output_feature_writes_hi)
    );

    //tracks the number of input filter reads in terms of memory word transfers.
    dla_dma_counter_64 count_input_filter_reads (
        .i_clk                      (clk_ddr),
        .i_sclrn                    (i_sclrn_ddr),
        .i_increment_en             (i_input_filter_rready & i_input_filter_rvalid),
        .i_increment_val            (32'b1),
        .i_read_counter_low_bits    (read_from_input_filter_reads_lo),
        .o_counter_low_bits         (number_of_input_filter_reads_lo),
        .o_counter_high_bits_latch  (number_of_input_filter_reads_hi)
    );


    //////////////////////
    //  Address decode  //
    //////////////////////
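    //note: the DLA_DMA_CSR_OFFSET_* constants are byte offsets; dividing them by 4 below converts
    //them to word addresses, which assumes CSR_DATA_BYTES = 4 (the typical value noted in the
    //parameter declarations)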

    always_ff @(posedge clk_ddr) begin
        //the csr address space is mostly read only, except for a few specific offsets listed below
        write_to_ram <= 1'b0;
        if (ram_wr_addr == DLA_DMA_CSR_OFFSET_CONFIG_BASE_ADDR/4)       write_to_ram <= 1'b1;
        if (ram_wr_addr == DLA_DMA_CSR_OFFSET_CONFIG_RANGE_MINUS_TWO/4) write_to_ram <= 1'b1;
        if (ram_wr_addr == DLA_DMA_CSR_OFFSET_INPUT_OUTPUT_BASE_ADDR/4) write_to_ram <= 1'b1;
        if (ram_wr_addr == DLA_DMA_CSR_OFFSET_INTERMEDIATE_BASE_ADDR/4) write_to_ram <= 1'b1;
        if (ram_wr_addr == DLA_DMA_CSR_OFFSET_DEBUG_NETWORK_ADDR/4)     write_to_ram <= 1'b1;
        if (ram_wr_addr == DLA_CSR_OFFSET_READY_STREAMING_IFACE/4)      write_to_ram <= 1'b1;

        //decode specific addresses in which the storage lives in registers
        write_to_interrupt_control    <= (ram_wr_addr == DLA_DMA_CSR_OFFSET_INTERRUPT_CONTROL/4);
        read_from_interrupt_control   <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_INTERRUPT_CONTROL/4);
        write_to_interrupt_mask       <= (ram_wr_addr == DLA_DMA_CSR_OFFSET_INTERRUPT_MASK/4);
        read_from_interrupt_mask      <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_INTERRUPT_MASK/4);
        read_from_desc_diagnostics    <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_DESC_DIAGNOSTICS/4);
        read_from_completion_count    <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_COMPLETION_COUNT/4);
        read_from_clocks_active_lo    <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_CLOCKS_ACTIVE_LO/4);
        read_from_clocks_active_hi    <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_CLOCKS_ACTIVE_HI/4);
        read_from_core_clocks_active_lo    <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_CORE_CLOCKS_ACTIVE_LO/4);
        read_from_core_clocks_active_hi    <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_CORE_CLOCKS_ACTIVE_HI/4);
        read_from_clocks_all_jobs_lo  <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_CLOCKS_ALL_JOBS_LO/4);
        read_from_clocks_all_jobs_hi  <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_CLOCKS_ALL_JOBS_HI/4);
        write_to_debug_network_addr   <= (ram_wr_addr == DLA_DMA_CSR_OFFSET_DEBUG_NETWORK_ADDR/4);
        read_from_debug_network_valid <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_DEBUG_NETWORK_VALID/4);
        read_from_debug_network_data  <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_DEBUG_NETWORK_DATA/4);
        read_from_license_flag        <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_LICENSE_FLAG /4);
        read_from_ip_reset            <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_IP_RESET/4);
        read_from_input_filter_reads_lo <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_INPUT_FILTER_READ_COUNT_LO/4);
        read_from_input_filter_reads_hi <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_INPUT_FILTER_READ_COUNT_HI/4);
        read_from_input_feature_reads_lo <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_INPUT_FEATURE_READ_COUNT_LO/4);
        read_from_input_feature_reads_hi <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_INPUT_FEATURE_READ_COUNT_HI/4);
        read_from_output_feature_writes_lo <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_OUTPUT_FEATURE_WRITE_COUNT_LO/4);
        read_from_output_feature_writes_hi <= (ram_rd_addr == DLA_DMA_CSR_OFFSET_OUTPUT_FEATURE_WRITE_COUNT_HI/4);
        read_ready_streaming_interface<= (ram_rd_addr == DLA_CSR_OFFSET_READY_STREAMING_IFACE/4);

        //decode specific addresses in which an action must be taken
        enqueue_descriptor <= (ram_wr_addr == DLA_DMA_CSR_OFFSET_INPUT_OUTPUT_BASE_ADDR/4);
        write_to_ip_reset  <= (ram_wr_addr == DLA_DMA_CSR_OFFSET_IP_RESET/4);
        if (ENABLE_INPUT_STREAMING) begin
            write_ready_streaming_interface <= (ram_wr_addr == DLA_CSR_OFFSET_READY_STREAMING_IFACE/4);
        end
    end



    /////////////////////
    //  State machine  //
    /////////////////////

    always_ff @(posedge clk_ddr) begin
        //default behavior
        o_csr_arready <= 1'b0;
        o_csr_rvalid  <= 1'b0;
        o_csr_awready <= 1'b0;
        o_csr_wready  <= 1'b0;
        o_csr_bvalid  <= 1'b0;
        ram_wr_en     <= 1'b0;
        descriptor_queue_forced_write <= 1'b0;
        descriptor_queue_data         <= 'x;
        debug_network_arvalid <= 1'b0;
        debug_network_rready  <= 1'b0;
        o_request_ip_reset    <= 1'b0;
        o_streaming_active <= o_streaming_active;

        unique case (1'b1)
        state[STATE_IDLE_BIT]: begin
            if (i_csr_arvalid && (previous_was_write || ~(i_csr_awvalid && i_csr_wvalid))) begin
                o_csr_arready <= 1'b1;
                state <= STATE_READ_ACCEPT;
                ram_rd_addr <= i_csr_araddr[CSR_ADDR_WIDTH-1:CSR_LO_ADDR];
            end
            if (i_csr_awvalid && i_csr_wvalid && (~previous_was_write || ~i_csr_arvalid)) begin
                o_csr_awready <= 1'b1;
                o_csr_wready <= 1'b1;
                state <= STATE_WRITE_ACCEPT;
                ram_wr_addr <= i_csr_awaddr[CSR_ADDR_WIDTH-1:CSR_LO_ADDR];
                ram_wr_data <= i_csr_wdata;
            end
        end

        state[STATE_READ_ACCEPT_BIT]: begin
            //o_csr_arready is asserted now, indicates csr has accepted a read
            //ram_rd_addr valid now
            state <= STATE_READ_ADDR;
            previous_was_write <= 1'b0;
        end
        state[STATE_READ_ADDR_BIT]: begin
            //hardened input register inside m20k valid now
            state <= STATE_READ_DATA;
        end
        state[STATE_READ_DATA_BIT]: begin
            //hardened output register inside m20k valid now
            o_csr_rvalid <= 1'b1;
            o_csr_rdata <= ram_rd_data;
            if (read_from_interrupt_control) begin
                o_csr_rdata <= '0;
                o_csr_rdata[DLA_DMA_CSR_INTERRUPT_ERROR_BIT] <= interrupt_control_error;
                o_csr_rdata[DLA_DMA_CSR_INTERRUPT_DONE_BIT]  <= interrupt_control_done;
            end
            if (read_from_interrupt_mask) begin
                o_csr_rdata <= '0;
                o_csr_rdata[DLA_DMA_CSR_INTERRUPT_ERROR_BIT] <= interrupt_mask_error;
                o_csr_rdata[DLA_DMA_CSR_INTERRUPT_DONE_BIT]  <= interrupt_mask_done;
            end
            if (read_from_desc_diagnostics) begin
                o_csr_rdata <= '0;
                o_csr_rdata[DLA_DMA_CSR_DESC_DIAGNOSTICS_OVERFLOW_BIT]    <= descriptor_diagnostics_overflow;
                o_csr_rdata[DLA_DMA_CSR_DESC_DIAGNOSTICS_ALMOST_FULL_BIT] <= descriptor_diagnostics_almost_full;
                o_csr_rdata[DLA_DMA_CSR_DESC_DIAGNOSTICS_OUT_OF_INFERENCES_BIT] <= i_token_out_of_inferences;
            end
            if (read_from_completion_count) o_csr_rdata <= completion_count;
            if (read_from_clocks_active_lo) o_csr_rdata <= total_clocks_active_lo;
            if (read_from_clocks_active_hi) o_csr_rdata <= total_clocks_active_hi;
            if (read_from_core_clocks_active_lo) o_csr_rdata <= total_core_clocks_active_lo;
            if (read_from_core_clocks_active_hi) o_csr_rdata <= total_core_clocks_active_hi;
            if (read_from_clocks_all_jobs_lo) o_csr_rdata <= total_clocks_for_all_jobs_lo;
            if (read_from_clocks_all_jobs_hi) o_csr_rdata <= total_clocks_for_all_jobs_hi;
            if (read_from_input_feature_reads_lo) o_csr_rdata <= number_of_input_feature_reads_lo;
            if (read_from_input_feature_reads_hi) o_csr_rdata <= number_of_input_feature_reads_hi;
            if (read_from_input_filter_reads_lo) o_csr_rdata <= number_of_input_filter_reads_lo;
            if (read_from_input_filter_reads_hi) o_csr_rdata <= number_of_input_filter_reads_hi;
            if (read_from_output_feature_writes_lo) o_csr_rdata <= number_of_output_feature_writes_lo;
            if (read_from_output_feature_writes_hi) o_csr_rdata <= number_of_output_feature_writes_hi;
            if (read_from_debug_network_valid) o_csr_rdata <= debug_network_rvalid; //read prefetch after dcfifo has valid data
            if (read_from_debug_network_data) begin
                o_csr_rdata <= debug_network_rdata; //read prefetch after dcfifo
                debug_network_rready <= 1'b1;       //rdack the read prefetch
            end
            if (read_from_license_flag) o_csr_rdata <= i_license_flag;
            if (read_from_ip_reset)     o_csr_rdata <= '0; //this read will always return 0
            if (read_ready_streaming_interface) o_csr_rdata <= o_streaming_active;

            if (o_csr_rvalid && i_csr_rready) begin
                o_csr_rvalid <= 1'b0;
                state <= STATE_IDLE;
            end
        end

        state[STATE_WRITE_ACCEPT_BIT]: begin
            //o_csr_awready and o_csr_wready are asserted now, indicates csr has accepted a write
            //ram_wr_addr valid now
            previous_was_write <= 1'b1;
            state <= STATE_WRITE_COMMIT;
        end
        state[STATE_WRITE_COMMIT_BIT]: begin
            //write_to_ram valid now
            ram_wr_en <= write_to_ram;
            if (write_to_interrupt_control) begin   //write 1 to clear
                if (ram_wr_data[DLA_DMA_CSR_INTERRUPT_ERROR_BIT]) interrupt_control_error <= 1'b0;
                if (ram_wr_data[DLA_DMA_CSR_INTERRUPT_DONE_BIT])  interrupt_control_done  <= 1'b0;
            end
            if (write_to_interrupt_mask) begin
                interrupt_mask_error <= ram_wr_data[DLA_DMA_CSR_INTERRUPT_ERROR_BIT];
                interrupt_mask_done  <= ram_wr_data[DLA_DMA_CSR_INTERRUPT_DONE_BIT];
            end
            if (write_to_debug_network_addr) begin
                //don't care if dcfifo is full, handshaking scheme is already tolerant to debug network not responding to requests
                debug_network_arvalid <= 1'b1;
                debug_network_araddr  <= ram_wr_data;
            end
            o_csr_bvalid <= 1'b1;
            if (o_csr_bvalid && i_csr_bready) begin
                o_csr_bvalid <= 1'b0;
                if (enqueue_descriptor)      state <= STATE_DESCRIPTOR;
                else if (write_to_ip_reset)  state <= (ram_wr_data != '0) ? STATE_AWAIT_RESET : STATE_IDLE;
                else if (write_ready_streaming_interface) begin
                    if (ram_wr_data == 1) begin
                        state <= STATE_IDLE;
                        if (~ENABLE_ON_CHIP_PARAMETERS) state <= STATE_DESCRIPTOR;
                        o_streaming_active <= 1'b1;
                    end else begin
                        state <= STATE_IDLE;
                        o_streaming_active <= 1'b0;
                    end
                end
                else                         state <= STATE_IDLE;
            end
            descriptor_count <= 0;
        end

        state[STATE_DESCRIPTOR_BIT]: begin
            descriptor_count <= descriptor_count + 1'b1;
            case (descriptor_count)
            4'h0: ram_rd_addr <= DLA_DMA_CSR_OFFSET_CONFIG_BASE_ADDR/4;         //addr gen 0: config reader base addr
            4'h1: ram_rd_addr <= 'x;                                            //addr gen 1: token
            4'h2: ram_rd_addr <= DLA_DMA_CSR_OFFSET_CONFIG_RANGE_MINUS_TWO/4;   //addr gen 2: config reader num words minus two
            4'h3: ram_rd_addr <= 'x;                                            //addr gen 3: addr update
            4'h4: ram_rd_addr <= DLA_DMA_CSR_OFFSET_CONFIG_RANGE_MINUS_TWO/4;   //intercept 0: config reader num words minus two
            4'h5: ram_rd_addr <= DLA_DMA_CSR_OFFSET_CONFIG_BASE_ADDR/4;         //intercept 1: filter reader offset correction
            4'h6: ram_rd_addr <= DLA_DMA_CSR_OFFSET_INPUT_OUTPUT_BASE_ADDR/4;   //intercept 2: feature input/output offset
            4'h7: ram_rd_addr <= DLA_DMA_CSR_OFFSET_INTERMEDIATE_BASE_ADDR/4;   //intercept 3: feature intermediate offset
            default: ram_rd_addr <= 'x;
            endcase

            //there are 3 clocks of latency from the time ram_rd_addr is set until ram_rd_data is valid
            //This is why the config_reader struct in the dma/dual_inc folder has to be laid out in that order
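            //for example (a worked alignment implied by that latency): the address issued at
            //descriptor_count == 0 produces ram_rd_data during descriptor_count == 3, which is captured
            //by the 4'h3 case below; likewise the address issued at count 4'h7 is captured at count 4'ha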
            case (descriptor_count)
            4'h3: descriptor_queue_data <= {1'b0, ram_rd_data};         //addr gen 0: config reader base addr
            4'h4: descriptor_queue_data <= '0;                          //addr gen 1: token
            4'h5: descriptor_queue_data <= {1'b0, ram_rd_data};         //addr gen 2: config reader num words minus two
            4'h6: descriptor_queue_data <= CONFIG_READER_DATA_BYTES;    //addr gen 3: addr update
            4'h7: descriptor_queue_data <= {1'b1, ram_rd_data};         //intercept 0: config reader num words minus two
            4'h8: descriptor_queue_data <= {1'b1, ram_rd_data};         //intercept 1: filter reader offset correction
            4'h9: descriptor_queue_data <= {1'b1, ram_rd_data};         //intercept 2: feature input/output offset
            4'ha: descriptor_queue_data <= {1'b1, ram_rd_data};         //intercept 3: feature intermediate offset
            default: descriptor_queue_data <= 'x;
            endcase

            descriptor_queue_forced_write <= (descriptor_count >= 4'h3);
            if (descriptor_count == 4'ha) state <= STATE_IDLE;
        end

        state[STATE_AWAIT_RESET_BIT]: begin
            //reset request was triggered by a CSR write
            // -we completed the axi4-lite write response handshake in STATE_WRITE_COMMIT
            // -we don't want to return to STATE_IDLE, since a new transaction might get initiated and then interrupted when reset hits
            // -we should assert o_request_ip_reset for multiple cycles to ensure the async signal is synchronized into all clock domains
            //so, just hang out here and wait for reset
            o_request_ip_reset <= 1'b1;
            state <= STATE_AWAIT_RESET;
        end

        default: begin
            state <= STATE_IDLE;
        end
        endcase

        //completion tracking
        completion_count <= completion_count + i_token_done;

        //interrupt tracking
        if (i_token_error) interrupt_control_error <= 1'b1;
        if (i_token_done)  interrupt_control_done  <= 1'b1;

        //sticky bit for detecting if descriptor queue has overflowed
        if (descriptor_queue_forced_write & descriptor_queue_full) descriptor_diagnostics_overflow <= 1'b1;

        if (~i_sclrn_ddr) begin
            //state
            state              <= STATE_IDLE;
            previous_was_write <= 1'b0;

            //AXI4-lite outputs to host control
            o_csr_arready <= 1'b0;
            o_csr_rvalid  <= 1'b0;
            o_csr_awready <= 1'b0;
            o_csr_wready  <= 1'b0;
            o_csr_bvalid  <= 1'b0;

            //ram
            ram_wr_en   <= 1'b0;

            //specific offsets implemented in registers
            interrupt_control_error <= 1'b0;
            interrupt_control_done  <= 1'b0;
            interrupt_mask_error    <= 1'b0;
            interrupt_mask_done     <= 1'b0;
            completion_count        <= '0;
            descriptor_diagnostics_overflow <= 1'b0;

            //descriptor queue
            descriptor_queue_forced_write <= 1'b0;

            //debug network
            debug_network_arvalid <= 1'b0;
            debug_network_rready  <= 1'b0;

            // stops streaming reload
            o_streaming_active <= 1'b0;
        end
    end



    //////////////////////////////////////////////////////////
    //  Bring the level interrupt to the host clock domain  //
    //////////////////////////////////////////////////////////

    always_ff @(posedge clk_ddr) begin
        ddr_interrupt_level <= 1'b0;
        if (interrupt_mask_error & interrupt_control_error) ddr_interrupt_level <= 1'b1;
        if (interrupt_mask_done  & interrupt_control_done ) ddr_interrupt_level <= 1'b1;
    end
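
    //illustrative host-side handling (a sketch, not enforced by hardware): o_interrupt_level stays
    //asserted while a control bit and its corresponding mask bit are both set, so a typical handler
    //reads DLA_DMA_CSR_OFFSET_INTERRUPT_CONTROL to find the cause and writes 1s back to the same
    //offset to clear it (write-1-to-clear, see STATE_WRITE_COMMIT above), which deasserts the interrupt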

    //this is a 3-stage register-based synchronizer
    dla_clock_cross_full_sync dla_clock_cross_sync
    (
        .clk_src            (clk_ddr),
        .i_src_async_resetn (1'b1),
        .i_src_data         (ddr_interrupt_level),
        .o_src_data         (),

        .clk_dst            (clk_pcie),
        .i_dst_async_resetn (1'b1),
        .o_dst_data         (o_interrupt_level)
    );



    ///////////////////////////
    //  Clock crossing FIFOs //
    ///////////////////////////

    localparam int DCFIFO_DEPTH = 32;   //dcfifo is RAM-based, may as well use an entire MLAB

    dla_acl_dcfifo #(
        .WIDTH                      (8*CSR_DATA_BYTES),
        .DEPTH                      (DCFIFO_DEPTH)
    )
    clock_cross_debug_network_request
    (
        .async_resetn               (i_resetn_async),   //reset synchronization is handled internally

        //write side -- write is ignored if fifo is full, this is okay since debug network handshaking is fault tolerant
        .wr_clock                   (clk_ddr),
        .wr_req                     (debug_network_arvalid),
        .wr_data                    (debug_network_araddr),

        //read side
        .rd_clock                   (clk_dla),
        .rd_empty                   (not_o_debug_network_arvalid),
        .rd_data                    (o_debug_network_araddr),
        .rd_ack                     (i_debug_network_arready)
    );
    assign o_debug_network_arvalid = ~not_o_debug_network_arvalid;

    dla_acl_dcfifo #(
        .WIDTH                      (8*CSR_DATA_BYTES),
        .DEPTH                      (DCFIFO_DEPTH)
    )
    clock_cross_debug_network_response
    (
        .async_resetn               (i_resetn_async),   //reset synchronization is handled internally

        //write side
        .wr_clock                   (clk_dla),
        .wr_req                     (i_debug_network_rvalid),
        .wr_data                    (i_debug_network_rdata),
        .wr_full                    (not_o_debug_network_rready),

        //read side
        .rd_clock                   (clk_ddr),
        .rd_empty                   (debug_network_dcfifo_empty),
        .rd_data                    (debug_network_dcfifo_data),
        .rd_ack                     (~debug_network_dcfifo_empty)    //consume read data immediately, cached in a read prefetch
    );
    assign o_debug_network_rready = ~not_o_debug_network_rready;

    //cache the most recent value returned from the debug network
    always_ff @(posedge clk_ddr) begin
        if (~debug_network_dcfifo_empty) begin
            debug_network_rdata <= debug_network_dcfifo_data;
            debug_network_rvalid <= 1'b1;
        end
        if (debug_network_rready) begin
            debug_network_rvalid <= 1'b0;
        end
        if (~i_sclrn_ddr) begin
            debug_network_rvalid <= 1'b0;
        end
    end
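
    //illustrative host-side debug network read (a sketch based on the address decode and read
    //prefetch logic above):
    //  1. write the target address to DLA_DMA_CSR_OFFSET_DEBUG_NETWORK_ADDR (issues the read request)
    //  2. poll DLA_DMA_CSR_OFFSET_DEBUG_NETWORK_VALID until it reads nonzero
    //  3. read DLA_DMA_CSR_OFFSET_DEBUG_NETWORK_DATA to retrieve the value; this read also acks the
    //     prefetch register, so the data offset should be read once per request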

endmodule