summaryrefslogtreecommitdiff
path: root/python/openvino/demo/ip/intel_ai_ip/verilog/dla_acl_burst_splitter.sv
blob: 903424d0ab765d54ae3dc3cce778514d48f41f78 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
// Copyright 2020 Intel Corporation.
//
// This software and the related documents are Intel copyrighted materials,
// and your use of them is governed by the express license under which they
// were provided to you ("License"). Unless the License provides otherwise,
// you may not use, modify, copy, publish, distribute, disclose or transmit
// this software or the related documents without Intel's prior written
// permission.
//
// This software and the related documents are provided as is, with no express
// or implied warranties, other than those that are expressly stated in the
// License.

//  AvalonMM Burst Splitter
//
//  This module splits AvalonMM bursts. One can configure it to split only write bursts, split only read bursts, or split both. It is legal to configure this module to not
//  split any bursts; Quartus should sweep away all the unused logic and this module should become passthrough wires.
//
//  The interfaces are only the command portion of the AvalonMM interface; it is assumed that the timing of the response path is independent of the command. This is certainly
//  the case when the response path includes readdatavalid, as the read response can happen an arbitrary number of clock cycles later than when the read request was accepted.
//  
//  This module has no capacity, it splits bursts on-the-fly. There is no change in control flow when splitting write bursts since a write burst of length N already takes N
//  clock cycles to transfer (it contains N words of data). Splitting a write burst basically involves calculating the address for the words inside the burst. Conversely, if
//  splitting a read burst, one read request of length N (which can be transferred in 1 clock cycle) will result in N read requests of length 1, and those N read requests
//  need N clock cycles to be transferred. Therefore the burst splitter must stall the upstream interface while these read requests are provided to the downstream interface.
//
//  If this module is configured to split read bursts, there must be a zero-cycle handshake with the upstream interface. If only splitting write bursts, then one may
//  optionally use stall latency handshaking with upstream. Any style of handshaking (stall/valid or stall latency) can be used with the downstream interface, if using stall
//  latency then downstream should provide an almost full signal as backpressure.
//
//  By default, the adder used to calculate the address for words inside a burst is the full width of the address. To improve fmax, we can reduce the adder width, however it
//  has to be known that bursts will not cross some boundary. For example, if it is known that bursts cannot cross a 4096 byte boundary, then adder only needs to span the lower
//  12 bits of the address. Shortening a long carry chain helps to improve fmax, and saves area.
//
//  Required files:
//  - dla_acl_burst_splitter.sv
//  - dla_acl_reset_handler.sv
//  - dla_acl_parameter_assert.svh

`default_nettype none
`include "dla_acl_parameter_assert.svh"

module dla_acl_burst_splitter #(
    //signal width -- all must be at least 1, and ADDRESS_WIDTH must be larger than $clog2(BYTEENABLE_WIDTH) so that at least one word-address bit exists
    parameter int unsigned ADDRESS_WIDTH,       // byte address must be word aligned, e.g. if BYTEENABLE_WIDTH = 4, then address must be 4-byte aligned, bottom 2 bits must be 0
    parameter int unsigned BURSTCOUNT_WIDTH,
    parameter int unsigned BYTEENABLE_WIDTH,    // must be a power of 2, specifies word size
    
    //burst splitting configuration
    parameter bit SPLIT_WRITE_BURSTS = 1,       // 0 means leave write bursts untouched, 1 means split write bursts
    parameter bit SPLIT_READ_BURSTS = 1,        // likewise for read bursts
    
    //special configuration
    parameter int unsigned BURST_BOUNDARY = 0,  // set to nonzero to specify what address size a burst will not cross, e.g. 12 means bursts cannot cross a 4K boundary
    parameter bit USE_STALL_LATENCY = 0,        // for write burst splitting only where there is no change in control flow (address inside the burst is computed on-the-fly),
                                                // 0 means stall/valid (up_write means we MAY accept it), 1 means stall/latency (up_write means we MUST accept it)
    //reset configuration
    parameter bit ASYNC_RESET = 0,              // how do we use reset: 1 means registers are reset asynchronously, 0 means registers are reset synchronously
    parameter bit SYNCHRONIZE_RESET = 1,        // based on how reset gets to us, what do we need to do: 1 means synchronize reset before consumption (if reset arrives asynchronously), 0 means passthrough (managed externally)
    parameter bit BACKPRESSURE_DURING_RESET = 1,// determine whether up_waitrequest will backpressure during reset, safer to do so but adds combinational logic
    
    //derived parameters
    localparam int unsigned DATA_WIDTH = 8*BYTEENABLE_WIDTH,
    localparam int unsigned ADDRESS_BITS_PER_WORD = $clog2(BYTEENABLE_WIDTH)    // how many lower bits of the byte address are stuck at 0 to ensure it is word aligned
) (
    input  wire                         clock,
    input  wire                         resetn,
    
    //upstream interface - avalon slave (command path only)
    output logic                        up_waitrequest,
    input  wire                         up_read,
    input  wire                         up_write,
    input  wire     [ADDRESS_WIDTH-1:0] up_address,
    input  wire        [DATA_WIDTH-1:0] up_writedata,
    input  wire  [BYTEENABLE_WIDTH-1:0] up_byteenable,
    input  wire  [BURSTCOUNT_WIDTH-1:0] up_burstcount,
    
    //downstream interface - avalon master (command path only)
    input  wire                         down_waitrequest,
    output logic                        down_read,
    output logic                        down_write,
    output logic    [ADDRESS_WIDTH-1:0] down_address,
    output logic       [DATA_WIDTH-1:0] down_writedata,
    output logic [BYTEENABLE_WIDTH-1:0] down_byteenable,
    output logic [BURSTCOUNT_WIDTH-1:0] down_burstcount
);

    
    
    //////////////////////////////////////
    //                                  //
    //  Sanity check on the parameters  //
    //                                  //
    //////////////////////////////////////
    
    generate
    `DLA_ACL_PARAMETER_ASSERT(ADDRESS_WIDTH >= 1)
    `DLA_ACL_PARAMETER_ASSERT(BURSTCOUNT_WIDTH >= 1)
    `DLA_ACL_PARAMETER_ASSERT(BYTEENABLE_WIDTH >= 1)
    //BYTEENABLE_WIDTH must be a power of 2
    `DLA_ACL_PARAMETER_ASSERT(BYTEENABLE_WIDTH == 2**ADDRESS_BITS_PER_WORD)
    //down_address is built from the part-select down_address_raw[ADDRESS_WIDTH-1:ADDRESS_BITS_PER_WORD], which is only a legal range
    //if the address is wider than the byte offset within a word, so report a bad configuration here instead of at the part-select
    `DLA_ACL_PARAMETER_ASSERT(ADDRESS_WIDTH > ADDRESS_BITS_PER_WORD)
    //the short adder part-selects [BURST_BOUNDARY-1:0] and [ADDRESS_WIDTH-1:BURST_BOUNDARY] must both be non-empty when BURST_BOUNDARY is used
    `DLA_ACL_PARAMETER_ASSERT(BURST_BOUNDARY < ADDRESS_WIDTH)
    `DLA_ACL_PARAMETER_ASSERT(BURST_BOUNDARY == 0 || BURST_BOUNDARY >= ADDRESS_BITS_PER_WORD)
    //stall latency on the upstream interface is only allowed when read bursts are not split, since splitting reads must stall upstream
    `DLA_ACL_PARAMETER_ASSERT(USE_STALL_LATENCY == 0 || SPLIT_READ_BURSTS == 0)
    endgenerate
    
    
    
    /////////////
    //         //
    //  Reset  //
    //         //
    /////////////
    
    //aclrn is used as the asynchronous clear of the registers below, sclrn as their synchronous clear
    //NOTE(review): which of the two is actually active for a given ASYNC_RESET setting is decided inside dla_acl_reset_handler -- confirm there
    logic aclrn, sclrn;
    dla_acl_reset_handler
    #(
        .ASYNC_RESET            (ASYNC_RESET),
        .USE_SYNCHRONIZER       (SYNCHRONIZE_RESET),
        .SYNCHRONIZE_ACLRN      (SYNCHRONIZE_RESET),
        .PIPE_DEPTH             (2),
        .NUM_COPIES             (1)
    )
    dla_acl_reset_handler_inst
    (
        .clk                    (clock),
        .i_resetn               (resetn),
        .o_aclrn                (aclrn),
        .o_resetn_synchronized  (),
        .o_sclrn                (sclrn)
    );
    
    
    
    //////////////////////
    //                  //
    //  Burst splitter  //
    //                  //
    //////////////////////
    
    logic                        inside_read_burst;                         //inside a read burst
    logic                        inside_burst;                              //inside some burst -- we are inside a write burst if inside_burst & ~inside_read_burst
    logic [ADDRESS_WIDTH-1:0]    internal_address;                          //address for words inside a burst
    logic [BURSTCOUNT_WIDTH-1:0] internal_burstcount;                       //counts down as words of a burst are transferred, it is 2 while the last word of the burst is being presented
    logic                        internal_burstcount_eq_two;                //register the check for internal_burstcount == 2 by looking at how we get into that condition
    logic                        backpressure_during_reset;                 //helper signal which sets up_waitrequest = 1 during reset under various reset configurations
    logic                        backpressure_during_read_burst;            //stall upstream while we split read bursts
    logic [ADDRESS_WIDTH-1:0]    up_address_plus_byteenable_width;          //manually split the bits of the adder in the case where the bursts are known to not cross some boundary
    logic [ADDRESS_WIDTH-1:0]    internal_address_plus_byteenable_width;    //same idea as above
    logic                        down_burstcount_mask;                      //under which conditions should we override down_burstcount to 1
    logic [ADDRESS_WIDTH-1:0]    down_address_raw;                          //before outputting down_address, set the bottom ADDRESS_BITS_PER_WORD bits to 0, Quartus will prune any logic that drove these bits
    
    //address of the next word = address of the current word + one word (BYTEENABLE_WIDTH bytes)
    generate
    if (BURST_BOUNDARY) begin : GEN_SHORT_ADDRESS_ADDER     //burst will not cross a boundary of 2**BURST_BOUNDARY
        assign up_address_plus_byteenable_width[BURST_BOUNDARY-1:0] = up_address[BURST_BOUNDARY-1:0] + BYTEENABLE_WIDTH;                    //only the lower address bits within a burst need the adder
        assign up_address_plus_byteenable_width[ADDRESS_WIDTH-1:BURST_BOUNDARY] = up_address[ADDRESS_WIDTH-1:BURST_BOUNDARY];               //upper bits come directly from the input address
        assign internal_address_plus_byteenable_width[BURST_BOUNDARY-1:0] = internal_address[BURST_BOUNDARY-1:0] + BYTEENABLE_WIDTH;
        assign internal_address_plus_byteenable_width[ADDRESS_WIDTH-1:BURST_BOUNDARY] = internal_address[ADDRESS_WIDTH-1:BURST_BOUNDARY];   //reg holds its value
    end
    else begin : GEN_FULL_ADDRESS_ADDER
        assign up_address_plus_byteenable_width = up_address + BYTEENABLE_WIDTH;
        assign internal_address_plus_byteenable_width = internal_address + BYTEENABLE_WIDTH;
    end
    endgenerate
    
    
    //burst tracking state machine
    //  outside a burst: every clock cycle capture the address of the second word and the burst length from upstream, and enter a burst when
    //                   the first word of a multi-word burst (of a kind that is configured to be split) is transferred
    //  inside a burst:  every time a word is transferred, advance the address and count down, leave the burst when the last word is transferred
    always_ff @(posedge clock or negedge aclrn) begin
        if (~aclrn) begin
            inside_read_burst <= 1'b0;
            inside_burst <= 1'b0;
            internal_address <= '0;
            internal_burstcount <= '0;
            internal_burstcount_eq_two <= 1'b0;
        end
        else begin
            if (~inside_burst) begin
                //these are loaded unconditionally, the values only matter if we enter a burst on this clock edge
                internal_address <= up_address_plus_byteenable_width;
                internal_burstcount <= up_burstcount;
                internal_burstcount_eq_two <= (up_burstcount == 2);
                
                //whether or not we enter inside a burst for splitting depends on the burst splitting configuration
                //a burst of length 1 has nothing to split, so we never enter a burst for it
                if (SPLIT_WRITE_BURSTS && SPLIT_READ_BURSTS) begin      //split both read and write bursts
                    if ((up_read | up_write) & ~down_waitrequest & (up_burstcount != 1)) inside_burst <= 1'b1;
                    if ( up_read             & ~down_waitrequest & (up_burstcount != 1)) inside_read_burst <= 1'b1;
                end
                else if (SPLIT_WRITE_BURSTS) begin                      //split write bursts only
                    if (up_write & (~down_waitrequest | USE_STALL_LATENCY) & (up_burstcount != 1)) inside_burst <= 1'b1;
                    //inside_read_burst will be stuck at 0
                end
                else if (SPLIT_READ_BURSTS) begin                       //split read bursts only
                    if (up_read & ~down_waitrequest & (up_burstcount != 1)) begin
                        inside_burst <= 1'b1;
                        inside_read_burst <= 1'b1;
                    end
                end
                //else no burst splitting, inside_burst and inside_read_burst will both be stuck at 0
            end
            else begin
                //note that USE_STALL_LATENCY applies to the upstream interface, but one can only set USE_STALL_LATENCY = 1 when splitting only write bursts
                //in which case the control flow does not change, i.e. up_waitrequest = down_waitrequest
                //a word is transferred if downstream accepts it and we are either generating read requests ourselves or upstream is presenting write data
                if ((~down_waitrequest | USE_STALL_LATENCY) & (backpressure_during_read_burst | up_write)) begin
                    internal_address <= internal_address_plus_byteenable_width;
                    internal_burstcount <= internal_burstcount - 1;
                    internal_burstcount_eq_two <= (internal_burstcount == 3);   //the count is about to become 2
                    if (internal_burstcount_eq_two) begin                       //the word transferred now is the last word of the burst
                        inside_read_burst <= 1'b0;
                        inside_burst <= 1'b0;
                    end
                end
            end
            //synchronous reset is placed last so that it overrides the assignments above, only the control state needs to be cleared
            if (~sclrn) begin
                inside_read_burst <= 1'b0;
                inside_burst <= 1'b0;
            end
        end
    end
    
    //backpressure
    assign backpressure_during_reset = (!BACKPRESSURE_DURING_RESET) ? 1'b0 : (~aclrn) ? 1'b1 : (~sclrn) ? 1'b1 : 1'b0;
    assign backpressure_during_read_burst = (SPLIT_READ_BURSTS) ? inside_read_burst : 1'b0;
    assign up_waitrequest = down_waitrequest | backpressure_during_read_burst | backpressure_during_reset;
    
    //write only data path can simply pass through
    assign down_writedata  = up_writedata;
    assign down_byteenable = up_byteenable;
    
    //if we are inside a read burst, the read ack has already gone to upstream so the next transaction is being presented
    //ignore what upstream presents and keep issuing read requests downstream until the split read burst is done
    assign down_read  = (backpressure_during_read_burst) ? 1'b1 : up_read;
    assign down_write = (backpressure_during_read_burst) ? 1'b0 : up_write;
    
    //original address is used at the beginning of a burst (or if burst was not split), we computed the address inside a burst
    assign down_address_raw = (inside_burst) ? internal_address : up_address;
    
    //set lower ADDRESS_BITS_PER_WORD bits to 0, the lower bits will be pruned away from all logic related to address
    assign down_address = down_address_raw[ADDRESS_WIDTH-1:ADDRESS_BITS_PER_WORD] << ADDRESS_BITS_PER_WORD;
    
    //under what conditions are we splitting a burst, and therefore down_burstcount should be set to 1
    //if only one kind of burst is split, the other kind passes through with its original burstcount
    assign down_burstcount_mask = (SPLIT_WRITE_BURSTS && SPLIT_READ_BURSTS) ? 1'b1 : (SPLIT_WRITE_BURSTS) ? down_write : (SPLIT_READ_BURSTS) ? down_read : 1'b0;
    assign down_burstcount = (down_burstcount_mask) ? 1'h1 : up_burstcount;
    
endmodule

`default_nettype wire