// Copyright 2021 Intel Corporation.
//
// This software and the related documents are Intel copyrighted materials,
// and your use of them is governed by the express license under which they
// were provided to you ("License"). Unless the License provides otherwise,
// you may not use, modify, copy, publish, distribute, disclose or transmit
// this software or the related documents without Intel's prior written
// permission.
//
// This software and the related documents are provided as is, with no express
// or implied warranties, other than those that are expressly stated in the
// License.

`resetall
`undefineall
`default_nettype none
`include "dla_acl_parameter_assert.svh"

// This module snoops on the control signals of several stallable interfaces throughout CoreDLA. It uses each valid and
// ready pair to produce several counters which are useful for debug purposes. These counters as well as the steady state
// values of valid and ready are readable by the debug network.

// It is expected that a human (and not some automated script) will examine the data dumped by the debug network. To assist
// the parsing of this information, this module also contains a ROM that describes which offsets the profiling counters
// are available at as well as a human readable string to describe what each profiling counter is. As a contingency plan
// in case the FPGA runs low on M20K memory blocks, the ROM functionality could be moved entirely into software, however
// this would then require strict version matching between hardware and runtime.

// Future optimizations:
// 1. Not all interfaces run on clk_dla. Probably the easiest way to deal with this is to have one instantiation of this
//    module per clock domain (use clock crossers to connect to the debug network which runs on clk_dla). Would need to
//    expose RAM depth, mem init file name, etc. as parameters.
In case there are lots of stallable interfaces, probably // better to use multiple instantiations so that the ROM and read data mux won't grow super large. // 2. Upper 32 bits of each 64-bit counter can probably be implemented with RAM instead of registers. Each time the bottom // 32 bits overflows, assert some flag. Enhance the state machine to check and clear such flags, and upon doing so // perform a read-modify-write to the RAM to add 1 to the upper 32 bits of the corresponding counter. module dla_interface_profiling_counters #( parameter int NUM_INTERFACES, //number of stallable interfaces to snoop on valid and ready parameter int ADDR_WIDTH, //width of the read request address, typically 24 parameter int DATA_WIDTH //width of the read response data, typically 32 ) ( input wire clk, input wire i_resetn_async, //active low reset that has NOT been synchronized to any clock //snoop on valid and ready from various stallable interfaces input wire i_snoop_valid [NUM_INTERFACES-1:0], input wire i_snoop_ready [NUM_INTERFACES-1:0], //debug network interfaces input wire i_dbg_arvalid, input wire [ADDR_WIDTH-1:0] i_dbg_araddr, output logic o_dbg_arready, output logic o_dbg_rvalid, output logic [DATA_WIDTH-1:0] o_dbg_rdata, input wire i_dbg_rready ); ///////////////////////////////// // Parameter legality checks // ///////////////////////////////// //signal widths cannot be trivial `DLA_ACL_PARAMETER_ASSERT(NUM_INTERFACES >= 1) `DLA_ACL_PARAMETER_ASSERT(ADDR_WIDTH == 24) //this is the only configuration ever tested `DLA_ACL_PARAMETER_ASSERT(DATA_WIDTH == 32) //this is the only configuration ever tested ///////////////// // Constants // ///////////////// //ROM sizing localparam int RAM_DEPTH = 2048; //BEWARE: make sure this is at least as deep as the MIF file localparam int RAM_ADDR = $clog2(RAM_DEPTH); //state machine localparam int STATE_IDLE = 0; localparam int STATE_ROM_ACCEPT = 1; localparam int STATE_ROM_ADDR = 2; localparam int STATE_ROM_DATA = 3; localparam int 
STATE_COUNT_ACCEPT = 4; localparam int STATE_COUNT_ADDR = 5; localparam int STATE_COUNT_DATA = 6; localparam int STATE_FREEZE_DATA = 7; localparam int NUM_STATES = 8; /////////////// // Signals // /////////////// //reset logic sclrn; //rom - implemented as ram for now //future optimization: upper bits of profiling counters change slowly, can probably move these inside the ram logic ram_wr_en; logic [RAM_ADDR-1:0] ram_wr_addr, ram_rd_addr; logic [DATA_WIDTH-1:0] ram_wr_data, ram_rd_data; //profiling counters genvar g; logic freeze; logic [DATA_WIDTH-1:0] profiling_counter_per_interface [NUM_INTERFACES-1:0]; logic [DATA_WIDTH-1:0] profiling_counter_final; //state machine logic [$clog2(NUM_STATES)-1:0] state; logic [ADDR_WIDTH-1:0] captured_addr; ///////////////////////////// // Reset Synchronization // ///////////////////////////// dla_reset_handler_simple #( .USE_SYNCHRONIZER (1), .PIPE_DEPTH (1), .NUM_COPIES (1) ) reset_synchronizer ( .clk (clk), .i_resetn (i_resetn_async), .o_sclrn (sclrn) ); /////////// // ROM // /////////// // See create_mif.cpp for a description of the offset/string encoding that the runtime expects and that // the memory initialization file needs to implement. 
//The description ROM is built from a simple dual-port altera_syncram whose write port (A) is
//permanently tied off, so the only contents ever visible on port B are those of the init file.
//Parameters and connections below are grouped by role; all connections are by name.
altera_syncram #(
    //general configuration
    .lpm_type                           ("altera_syncram"),
    .intended_device_family             ("Arria 10"),   //Quartus will fix this automatically
    .operation_mode                     ("DUAL_PORT"),
    .ram_block_type                     ("M20K"),
    .enable_ecc                         ("FALSE"),
    .init_file                          ("dla_interface_profiling_counters.mif"),
    .power_up_uninitialized             ("FALSE"),
    .read_during_write_mode_mixed_ports ("DONT_CARE"),
    //port A geometry (write side)
    .numwords_a                         (RAM_DEPTH),
    .widthad_a                          (RAM_ADDR),
    .width_a                            (DATA_WIDTH),
    .width_byteena_a                    (1),
    .clock_enable_input_a               ("BYPASS"),
    //port B geometry (read side): address and output data are both registered on clock0
    .numwords_b                         (RAM_DEPTH),
    .widthad_b                          (RAM_ADDR),
    .width_b                            (DATA_WIDTH),
    .address_reg_b                      ("CLOCK0"),
    .address_aclr_b                     ("NONE"),
    .outdata_reg_b                      ("CLOCK0"),
    .outdata_aclr_b                     ("NONE"),
    .outdata_sclr_b                     ("NONE"),
    .clock_enable_input_b               ("BYPASS"),
    .clock_enable_output_b              ("BYPASS")
)
ram
(
    //clocking: only clock0 is a real clock, every clock enable is held active
    .clock0         (clk),
    .clock1         (1'b1),
    .clocken0       (1'b1),
    .clocken1       (1'b1),
    .clocken2       (1'b1),
    .clocken3       (1'b1),
    //port A: write side, driven by the tie-off signals assigned below, read data unused
    .address_a      (ram_wr_addr),
    .data_a         (ram_wr_data),
    .wren_a         (ram_wr_en),
    .rden_a         (1'b1),
    .byteena_a      (1'b1),
    .address2_a     (1'b1),
    .addressstall_a (1'b0),
    .q_a            (),
    //port B: read side, never written
    .address_b      (ram_rd_addr),
    .q_b            (ram_rd_data),
    .rden_b         (1'b1),
    .wren_b         (1'b0),
    .data_b         ({DATA_WIDTH{1'b1}}),
    .byteena_b      (1'b1),
    .address2_b     (1'b1),
    .addressstall_b (1'b0),
    //ecc is disabled
    .eccencbypass   (1'b0),
    .eccencparity   (8'b0),
    .eccstatus      ()
);

//read address: RAM_ADDR bits of the captured request address starting at bit 2, i.e. the two
//least significant bits are skipped (presumably the byte offset within a 32-bit word -- confirm
//against how captured_addr is loaded)
assign ram_rd_addr = captured_addr[2 +: RAM_ADDR];

//the write port is never used
assign ram_wr_data = '0;
assign ram_wr_addr = '0;
assign ram_wr_en   = 1'b0;



//////////////////////////
//  Profiling Counters  //
//////////////////////////

// Each stallable interface has a set of profiling counters which occupies a 32 byte chunk of the address space.
// // Byte offset | Interpretation // ------------+---------------------------------------------------------------------------------------------- // 0 | Steady state value of valid // 4 | Steady state value of ready // 8 | Lower 32-bits of 64-bit counter for number of transactions accepted (valid & ready) // 12 | Upper 32-bits of the above counter // 16 | Lower 32-bits of 64-bit counter for number of clock cycles of backpressure (valid & ~ready) // 20 | Upper 32-bits of the above counter // 24 | Lower 32-bits of 64-bit counter for number of clock cycles of data starvation (~valid & ready) // 28 | Upper 32-bits of the above counter // // There is some special handling for data starvation. It is a bit trickier to profile since "~valid & ready" will be // true before any work has begun as well as after all the work has finished. To resolve this, only start counting // after the first item of work has been seen, and every time a new item of work is seen capture the value in a // shadow register (avoid observing any increment in the raw counter after the last item of work). // // Since it takes multiple 32-bit reads to access the entire 64-bit value, there is the ability to freeze the counter // values. This is implemented by masking ready and valid i.e. the increment to the counter is set to 0. Note the freeze // does not affect the reading of the steady state value of valid and ready. for (g=0; g