summaryrefslogtreecommitdiff
path: root/python/openvino/demo/ip/intel_ai_ip/verilog/dla_aux_depthwise_control.sv
blob: 9c5ae6946810f41545335b041a3baf4cddca050f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
// Copyright 2020-2023 Intel Corporation.
//
// This software and the related documents are Intel copyrighted materials,
// and your use of them is governed by the express license under which they
// were provided to you ("License"). Unless the License provides otherwise,
// you may not use, modify, copy, publish, distribute, disclose or transmit
// this software or the related documents without Intel's prior written
// permission.
//
// This software and the related documents are provided as is, with no express
// or implied warranties, other than those that are expressly stated in the
// License.

/*
 * Module `dla_aux_depthwise_control`
 *
 * Control of the core functionality of the auxiliary block.
 *
 * WARNING!  ONLY EDIT THE PARTS MARKED IN BETWEEN
 *           "START EDITING" AND "END EDITING"
 *
 * See README.md of the Example Aux block for more details.
 */

`undefineall
`resetall
`default_nettype none

`include "dla_acl_parameter_assert.svh"

module dla_aux_depthwise_control
  import dla_aux_depthwise_pkg::*;
#(
  parameter aux_depthwise_arch_params_t ARCH // Architecture parameters
) (
  input  wire                         clk                , // Clock
  input  wire                         i_resetn           , // active low reset
  //
  depthwise_config_to_control_if.receiver  i_config_to_control, // Config to control connection
  output control_to_config_t          o_control_to_config, // Control to config connection
  depthwise_control_to_lane_if.sender      o_control_to_lane  , // Control to lane connection
  input  lane_to_control_t            i_lane_to_control  , // Lane to control connection
  //
  output debug_control_t              o_debug              // Debug output
);

/* synthesis translate_off */
`DLA_ACL_PARAMETER_ASSERT_MESSAGE(aux_data_pack_params_t'(i_config_to_control.data_pack_params) == ARCH.AUX_DATA_PACK_PARAMS,
  "i_config_to_control if parameters don't match data pack params")
`DLA_ACL_PARAMETER_ASSERT_MESSAGE(aux_special_params_t'(i_config_to_control.special_params) == ARCH.AUX_SPECIAL_PARAMS,
  "i_config_to_control if parameters don't match special params")
`DLA_ACL_PARAMETER_ASSERT_MESSAGE(aux_data_pack_params_t'(o_control_to_lane.data_pack_params) == ARCH.AUX_DATA_PACK_PARAMS,
  "o_control_to_lane if parameters don't match data pack params")
`DLA_ACL_PARAMETER_ASSERT_MESSAGE(aux_special_params_t'(o_control_to_lane.special_params) == ARCH.AUX_SPECIAL_PARAMS,
  "o_control_to_lane if parameters don't match special params")
/* synthesis translate_on */

//
// ------------------------------ START EDITING ------------------------------
//
  // Shorthand versions of parameters
  localparam NATIVE_VECTOR_SIZE    = ARCH.AUX_DATA_PACK_PARAMS.NATIVE_VECTOR_SIZE ;
  localparam VECTOR_SIZE           = ARCH.AUX_DATA_PACK_PARAMS.VECTOR_SIZE        ;
  localparam MAX_WINDOW_HEIGHT     = ARCH.AUX_SPECIAL_PARAMS.MAX_WINDOW_HEIGHT    ;
  localparam MAX_WINDOW_WIDTH      = ARCH.AUX_SPECIAL_PARAMS.MAX_WINDOW_WIDTH     ;
  localparam MAX_STRIDE_HORIZONTAL = ARCH.AUX_SPECIAL_PARAMS.MAX_STRIDE_HORIZONTAL;
  localparam MAX_STRIDE_VERTICAL   = ARCH.AUX_SPECIAL_PARAMS.MAX_STRIDE_VERTICAL  ;
  localparam MAX_DILATION_VERTICAL = ARCH.AUX_SPECIAL_PARAMS.MAX_DILATION_VERTICAL ;
  localparam MAX_DILATION_HORIZONTAL = ARCH.AUX_SPECIAL_PARAMS.MAX_DILATION_HORIZONTAL ;
  localparam VERTICAL_LINES =  MAX_WINDOW_HEIGHT + ((MAX_WINDOW_HEIGHT-1) * (MAX_DILATION_VERTICAL-1));
  localparam WINDOW_BITS_VERTICAL      = $clog2(MAX_WINDOW_HEIGHT + 1);
  localparam WINDOW_BITS_HORIZONTAL    = $clog2(MAX_WINDOW_WIDTH + 1);
  localparam DILATION_BITS_VERTICAL      = $clog2(MAX_DILATION_VERTICAL + 1);
  localparam DILATION_BITS_HORIZONTAL      = $clog2(MAX_DILATION_HORIZONTAL + 1);
  localparam VECTOR_RATIO = NATIVE_VECTOR_SIZE / VECTOR_SIZE;
  localparam TILE_COUNT = ARCH.AUX_DATA_PACK_PARAMS.GROUP_SIZE *
    ARCH.AUX_DATA_PACK_PARAMS.GROUP_NUM;


  // input valid counters
  logic [$clog2(                        VECTOR_RATIO    + 1                    )-1:0] count_in_vector;
  logic [$clog2(ARCH.AUX_SPECIAL_PARAMS.MAX_TILE_WIDTH  + 1                    )-1:0] count_in_width ;
  logic [$clog2(ARCH.AUX_SPECIAL_PARAMS.MAX_TILE_HEIGHT + 1                    )-1:0] count_in_height;
  logic [$clog2(ARCH.AUX_SPECIAL_PARAMS.MAX_TILE_CHANNELS  + NATIVE_VECTOR_SIZE)-1:0] count_in_channels ;

  // output valid counters
  logic [$clog2(                        VECTOR_RATIO    + 1                    )-1:0] count_out_vector;
  logic [$clog2(ARCH.AUX_SPECIAL_PARAMS.MAX_TILE_WIDTH  + 1                    )-1:0] count_out_width ;
  logic [$clog2(ARCH.AUX_SPECIAL_PARAMS.MAX_TILE_HEIGHT + 1                    )-1:0] count_out_height;
  logic [$clog2(ARCH.AUX_SPECIAL_PARAMS.MAX_TILE_CHANNELS  + NATIVE_VECTOR_SIZE)-1:0] count_out_channels ;

  // Registered (pipelined) computation of the effective, dilation-expanded filter sizes
  logic [WINDOW_BITS_VERTICAL-1:0] kernel_vert_minus_one;
  logic [WINDOW_BITS_HORIZONTAL-1:0] kernel_horiz_minus_one;

  logic [DILATION_BITS_VERTICAL-1:0] dilation_vert_minus_one;
  logic [DILATION_BITS_HORIZONTAL-1:0] dilation_horiz_minus_one;

  logic [WINDOW_BITS_VERTICAL+DILATION_BITS_VERTICAL-1:0] kernel_x_dilation_vert;
  logic [WINDOW_BITS_HORIZONTAL+DILATION_BITS_HORIZONTAL-1:0] kernel_x_dilation_horiz;

  logic [WINDOW_BITS_VERTICAL+DILATION_BITS_VERTICAL:0] eff_kernel_vert;
  logic [WINDOW_BITS_HORIZONTAL+DILATION_BITS_HORIZONTAL:0] eff_kernel_horiz;

  // Pipelined computation of the effective kernel extent along each axis:
  //   eff_kernel = window + (window - 1) * (dilation - 1)
  // Stage 1 registers (window - 1) and (dilation - 1), stage 2 registers their
  // product, and stage 3 adds the currently configured window size to that
  // product. Every stage is cleared by the synchronous, active-low reset.
  always_ff @(posedge clk) begin
    if (~i_resetn) begin
      // Vertical path
      kernel_vert_minus_one    <= '0;
      dilation_vert_minus_one  <= '0;
      kernel_x_dilation_vert   <= '0;
      eff_kernel_vert          <= '0;
      // Horizontal path
      kernel_horiz_minus_one   <= '0;
      dilation_horiz_minus_one <= '0;
      kernel_x_dilation_horiz  <= '0;
      eff_kernel_horiz         <= '0;
    end else begin
      // Vertical path
      kernel_vert_minus_one    <= (i_config_to_control.data[0][0].window_height - 1);
      dilation_vert_minus_one  <= (i_config_to_control.data[0][0].dilation_vertical - 1);
      kernel_x_dilation_vert   <= kernel_vert_minus_one * dilation_vert_minus_one;
      eff_kernel_vert          <= i_config_to_control.data[0][0].window_height + kernel_x_dilation_vert;
      // Horizontal path
      kernel_horiz_minus_one   <= (i_config_to_control.data[0][0].window_width - 1);
      dilation_horiz_minus_one <= (i_config_to_control.data[0][0].dilation_horizontal - 1);
      kernel_x_dilation_horiz  <= kernel_horiz_minus_one * dilation_horiz_minus_one;
      eff_kernel_horiz         <= i_config_to_control.data[0][0].window_width + kernel_x_dilation_horiz;
    end
  end
  //
  // Input valid counter comprises cascaded counters of vector, width, height and channels.
  //
  // The input backpressure signal is also generated in this process.
  //
  logic input_group_done;
  logic feature_ready;
  logic feature_almost_ready;
  logic configured_delayed;
  assign o_control_to_lane.data[0][0].ready = feature_ready;
  //
  // proc_input_counters: tracks the position of the incoming feature stream and
  // generates the pulses consumed by the feature/filter state machine.
  //
  //  * count_in_vector -> count_in_width -> count_in_height -> count_in_channels
  //    form a cascade: each counter advances only when the previous one wraps,
  //    and only on cycles where i_lane_to_control.core_input_valid is set.
  //  * input_group_done is a one-cycle pulse raised when the height counter wraps.
  //  * feature_almost_ready is a one-cycle pulse raised when the rows/columns
  //    received so far are close to a full window (see the condition below).
  //  * configured_starting toggles on every rising edge of 'configured';
  //    configured_ending toggles on every o_control_to_config.done.
  //
  // The 'ready' backpressure output itself is driven from feature_ready, which
  // is produced by the state machine, not by this process.
  //
  always_ff @(posedge clk) begin : proc_input_counters
    // Previous-cycle copy of 'configured', used for rising-edge detection
    configured_delayed <= i_config_to_control.data[0][0].configured;
    // Pulses are cleared by default and set for a single cycle further down
    input_group_done <= 0;
    feature_almost_ready <= 0;
    // Toggle flags sent to the lane at the start and at the end of a configuration
    if (i_config_to_control.data[0][0].configured & ~configured_delayed)
      o_control_to_lane.data[0][0].configured_starting <= ~o_control_to_lane.data[0][0].configured_starting;
    if (o_control_to_config.done)
      o_control_to_lane.data[0][0].configured_ending <= ~o_control_to_lane.data[0][0].configured_ending;
    // Nested counters for channels, line and column, which operate only when the core's input is valid
    if (i_lane_to_control.core_input_valid) begin
      // shallow channels counter
      count_in_vector <= count_in_vector + 1'b1;
      if (count_in_vector >= VECTOR_RATIO-1) begin
        count_in_vector <= '0;
        // column counter
        count_in_width <= count_in_width + 1'b1;
        // We want to stop reading features if filters are not ready and we are close to having
        // enough features to produce an output, i.e. a number of rows equal to window_height
        // and a number of columns equal to window_width.
        // The column test is written as "count + 2 >= width" instead of "count >= width - 2":
        // the subtraction wraps around as an unsigned value when window_width < 2, which made
        // the comparison permanently false for 1-wide windows. For window_width >= 2 both forms
        // are equivalent.
        if ((count_in_height >= kernel_vert_minus_one) && (count_in_width + 2 >= i_config_to_control.data[0][0].window_width)) begin
          feature_almost_ready <= 1;
        end
        if (count_in_width >= i_config_to_control.data[0][0].tile_width-1) begin
          count_in_width <= '0;
          // line counter
          count_in_height <= count_in_height + 1'b1;
          if (count_in_height >= i_config_to_control.data[0][0].tile_height-1) begin
            count_in_height <= '0;
            // a complete width x height plane has been received for this channel group
            input_group_done <= 1;
            // channels counter, advancing by one native vector per plane
            count_in_channels <= $bits(count_in_channels)'(count_in_channels + NATIVE_VECTOR_SIZE);
            if (count_in_channels >= i_config_to_control.data[0][0].tile_channels - NATIVE_VECTOR_SIZE) begin
              // last channel group of the input tensor: wrap the channels counter
              count_in_channels <= '0;
            end
          end
        end
      end
    end
    // reset counters if in reset or not configured (these assignments take priority over the ones above)
    if (~i_resetn || ~i_config_to_control.data[0][0].configured) begin
      count_in_vector    <= '0;
      count_in_width     <= '0;
      count_in_height    <= '0;
      count_in_channels  <= '0;
      input_group_done   <= 1'b0;
      feature_almost_ready <= 0;
      configured_delayed <= '0;
    end
    // the toggle flags are only cleared by reset, not by de-configuration
    if (~i_resetn) begin
      o_control_to_lane.data[0][0].configured_starting <= 0;
      o_control_to_lane.data[0][0].configured_ending <= 0;
    end
  end : proc_input_counters
  //
  // state machine to handle when features and filters should be ready to be received
  // right now, we receive filters first then features, then we process
  //
  // States of the feature/filter sequencing machine:
  //   IDLE           - waiting for 'configured'
  //   FILTER_FEATURE - features are accepted while waiting for the filters
  //   FILTER         - features are held off until the filters are ready
  //   FEATURE        - features are accepted until the input group is done
  //   PROCESSING     - features are held off until a 'done' is observed
  typedef enum logic [2:0] {
    IDLE           = 3'b000,
    FILTER_FEATURE = 3'b001,
    FILTER         = 3'b010,
    FEATURE        = 3'b011,
    PROCESSING     = 3'b100
  } state_t;

  state_t state, state_next;

  // State register with synchronous, active-low reset
  always_ff @(posedge clk) begin
    if (~i_resetn) state <= IDLE;
    else           state <= state_next;
  end

  // Next-state and feature_ready (input backpressure release) decode
  always_comb begin
    // Defaults: hold the current state and keep the feature input stalled
    state_next    = state;
    feature_ready = 0;
    case (state)
      IDLE: begin
        // Open the feature input as soon as a configuration is present
        if (i_config_to_control.data[0][0].configured) begin
          feature_ready = 1;
          state_next    = FILTER_FEATURE;
        end
      end
      FILTER_FEATURE: begin
        // Accept features until nearly a full window has been received
        if (feature_almost_ready) begin
          state_next = FILTER;
        end else begin
          feature_ready = 1;
        end
        // Filters becoming ready takes priority over the transition above
        if (i_lane_to_control.depthwise_filter_ready)
          state_next = FEATURE;
      end
      FILTER: begin
        // Features stay stalled until the filters are ready
        if (i_lane_to_control.depthwise_filter_ready)
          state_next = FEATURE;
      end
      FEATURE: begin
        // Features are accepted unless one of the exit conditions fires
        if (input_group_done) begin
          state_next = PROCESSING;
        end else if (o_control_to_lane.data[0][0].done & i_config_to_control.data[0][0].configured) begin
          state_next = FILTER;
        end else if (o_control_to_lane.data[0][0].done & ~i_config_to_control.data[0][0].configured) begin
          state_next = IDLE;
        end else begin
          feature_ready = 1;
        end
      end
      PROCESSING: begin
        // Whole tensor finished -> IDLE; one channel group finished -> next group
        if (o_control_to_config.done)
          state_next = IDLE;
        else if (o_control_to_lane.data[0][0].done & i_config_to_control.data[0][0].configured)
          state_next = FILTER_FEATURE;
        else if (o_control_to_lane.data[0][0].done & ~i_config_to_control.data[0][0].configured)
          state_next = IDLE;
      end
      default: state_next = IDLE; // Unused encodings fall back to IDLE
    endcase
  end
  // Pass dilation from config to lane
  assign o_control_to_lane.data[0][0].dilation_vertical = i_config_to_control.data[0][0].dilation_vertical;
  assign o_control_to_lane.data[0][0].dilation_horizontal = i_config_to_control.data[0][0].dilation_horizontal;
  //
  // Line-buffers inside the core are implemented as FIFOs. FIFO synchronization and handover
  // between consecutive tensors are achieved by the following steps:
  //  * Line buffers are filled with tensor-width amount of data at the beginning of each tensor.
  //  * The fill level is kept constant throughout the tensor.
  //  * At the end of each tensor all FIFOs are drained to prepare them for the next tensor.
  //
  // Registered line-buffer control flags, in priority order:
  //   1. reset:                           both flags cleared
  //   2. window_height == 1 & configured: no fill wait, flush asserted
  //   3. otherwise:                       wait_fill on the first input line,
  //                                       flush on the last input line of the tile
  always_ff @(posedge clk) begin : proc_line_buff_control
    if (~i_resetn) begin
      o_control_to_lane.data[0][0].line_buff_wait_fill <= 1'b0;
      o_control_to_lane.data[0][0].line_buff_flush     <= 1'b0;
    end else if (i_config_to_control.data[0][0].window_height == 1 && i_config_to_control.data[0][0].configured) begin
      // Flush the FIFO fill level when window height is configured to be 1
      o_control_to_lane.data[0][0].line_buff_wait_fill <= 1'b0;
      o_control_to_lane.data[0][0].line_buff_flush     <= 1'b1;
    end else begin
      o_control_to_lane.data[0][0].line_buff_wait_fill <= count_in_height == 0;
      o_control_to_lane.data[0][0].line_buff_flush     <= count_in_height == i_config_to_control.data[0][0].tile_height-1;
    end
  end : proc_line_buff_control

  //
  // Padding generator control consists of multiple enable flags. Each flag enables a set/reset
  // mode of a register or act like select bits of a multiplexer.
  //
  // If max window size is larger than the configured window size, then the generator is used to
  // load the identity element of the operation into the out of bound registers.
  //
  // Registered padding controls sent to the lane.
  //
  // For every tile and every line/column position, the en_pad_* flags are set when
  // the current input counter lies outside the [start, end] range configured for
  // that tile, offset by the position index. Which flag is set depends on
  // padding_mode (zero padding when 2'b00) and padding_ignore (NaN padding).
  // Positions outside the active window always get the NaN flag.
  //
  // NOTE(review): en_pad_zero_* is not written for positions outside the active
  // window, so it holds its previous value there - confirm the lane gives
  // en_pad_nan_* priority in that case.
  always_ff @(posedge clk) begin : proc_pad_control
    // Vertical padding control: one flag per tile and per line of the (dilated) window
    for (int tile = 0; tile < TILE_COUNT; tile++) begin : proc_pad_control_vert
      for (int line = 0; line < VERTICAL_LINES; line++) begin
        if (line < eff_kernel_vert) begin
          // Line belongs to the active window: pad when the input row is out of range
          o_control_to_lane.data[0][0].en_pad_zero_vert[tile][line] <= (
            count_in_height < line + i_config_to_control.data[0][0].tile_vertical_start[tile]  ||
            count_in_height > line + i_config_to_control.data[0][0].tile_vertical_end  [tile]) &&
            i_config_to_control.data[0][0].padding_mode == 2'b00;
          //
          // TODO: Implement constant and reflection boundary conditions
          //
          o_control_to_lane.data[0][0].en_pad_nan_vert[tile][line] <= (
            count_in_height < line + i_config_to_control.data[0][0].tile_vertical_start[tile]  ||
            count_in_height > line + i_config_to_control.data[0][0].tile_vertical_end  [tile]) &&
            i_config_to_control.data[0][0].padding_ignore;
        end else begin
          // Line lies outside the active window: always flag it as NaN padding
          // (described by the original author as the identity element)
          o_control_to_lane.data[0][0].en_pad_nan_vert[tile][line] <= 1'b1;
        end
      end
    end : proc_pad_control_vert
    // Horizontal padding control: one flag per tile and per column of the window
    for (int tile = 0; tile < TILE_COUNT; tile++) begin : proc_pad_control_horiz
      for (int col = 0; col < MAX_WINDOW_WIDTH; col++) begin
        if (col < i_config_to_control.data[0][0].window_width) begin
          // Column belongs to the active window: pad when the input column is out of range
          o_control_to_lane.data[0][0].en_pad_zero_horiz[tile][col] <= (
            count_in_width < col + i_config_to_control.data[0][0].tile_horizontal_start[tile]  ||
            count_in_width > col + i_config_to_control.data[0][0].tile_horizontal_end  [tile]) &&
            i_config_to_control.data[0][0].padding_mode == 2'b00;
          //
          // TODO: Implement constant and reflection boundary conditions
          //
          o_control_to_lane.data[0][0].en_pad_nan_horiz[tile][col] <= (
            count_in_width < col + i_config_to_control.data[0][0].tile_horizontal_start[tile]  ||
            count_in_width > col + i_config_to_control.data[0][0].tile_horizontal_end  [tile]) &&
            i_config_to_control.data[0][0].padding_ignore;
        end else begin
          // Column lies outside the active window: always flag it as NaN padding
          o_control_to_lane.data[0][0].en_pad_nan_horiz[tile][col] <= 1'b1;
        end
      end
    end : proc_pad_control_horiz
    if (~i_resetn) begin
      // Reset: report the padding zone as active and the window at its maximum size
      o_control_to_lane.data[0][0].is_padding_zone_vert  <= 1'b1;
      o_control_to_lane.data[0][0].is_padding_zone_horiz <= 1'b1;
      o_control_to_lane.data[0][0].window_height <= MAX_WINDOW_HEIGHT;
      o_control_to_lane.data[0][0].window_width <= MAX_WINDOW_WIDTH;
    end else begin
      // Forward the configured window size to the lane
      o_control_to_lane.data[0][0].window_height <= i_config_to_control.data[0][0].window_height;
      o_control_to_lane.data[0][0].window_width <= i_config_to_control.data[0][0].window_width;
      // These flags mark the area of padding: the first (effective kernel - 1)
      // rows/columns of the tile, for windows larger than 1
      o_control_to_lane.data[0][0].is_padding_zone_vert  <= i_config_to_control.data[0][0].window_height > 1 &&
        count_in_height < (eff_kernel_vert-1);
      o_control_to_lane.data[0][0].is_padding_zone_horiz <= i_config_to_control.data[0][0].window_width > 1 &&
        count_in_width < (eff_kernel_horiz-1);
    end
  end : proc_pad_control

  //
  // Stride counters and stride valid signal generator.
  //
  // Other input counters are used in conjunction
  //
  // proc_stride: maintains the horizontal and vertical stride counters and derives
  // the registered stride_valid flag from them. Later non-blocking assignments in
  // this process override earlier ones, so the statement order below defines the
  // priority: increment < wrap < end-of-line reset < window_height==1 clear <
  // reset/unconfigured preset.
  always_ff @(posedge clk) begin : proc_stride
    // stride counters, local to this process
    // (declared one bit wider than $clog2(MAX_STRIDE + 1))
    logic [$clog2(MAX_STRIDE_VERTICAL  +1):0] count_stride_vert ;
    logic [$clog2(MAX_STRIDE_HORIZONTAL+1):0] count_stride_horiz;
    // Count only when input is valid and shallow channels counter is about to overflow (which means
    // we are moving on to the next face coordinates)
    if (i_lane_to_control.core_input_valid) begin
      if (count_in_vector >= VECTOR_RATIO-1) begin
        // By default, increment the horizontal stride counter, as long as the width-counter has
        // counted minimum window-width number of elements (so core has a full window to operate on).
        if (count_in_width >= kernel_horiz_minus_one) begin
          count_stride_horiz <= count_stride_horiz + 1'b1;
        end
        // Reset horizontal stride counter when it overflows. This check is independent of the
        // increment condition above and overrides the increment when both apply.
        if (count_stride_horiz >= i_config_to_control.data[0][0].stride_horizontal - 1) begin
          count_stride_horiz <= '0;
        end
        // Vertical stride counter is manipulated only when the input width-counter is about to
        // overflow
        if (count_in_width >= i_config_to_control.data[0][0].tile_width-1) begin
          // Reset the horizontal counter at the end of every input line
          count_stride_horiz <= '0;
          // By default, increment the vertical stride counter, as long as the height-counter has
          // counted minimum window-height number of elements (so core has a full window to operate
          // on).
          if (count_in_height >= kernel_vert_minus_one) begin
            count_stride_vert <= count_stride_vert + 1'b1;
          end
          // Reset vertical stride counter when it overflows or when input height-counter is about
          // to overflow
          if (
            count_stride_vert >= i_config_to_control.data[0][0].stride_vertical - 1 ||
            count_in_height   >= i_config_to_control.data[0][0].tile_height     - 1
          ) begin
            count_stride_vert <= '0;
          end
        end
      end
    end
    // Stride counters must be reset when window height is configured to 1: once configured, the
    // all-ones preset value (see below) is replaced by zero on both counters
    if (
      i_config_to_control.data[0][0].window_height == 1 && i_config_to_control.data[0][0].configured &&
      count_stride_horiz == '1 && count_stride_vert == '1
    ) begin
      count_stride_vert <= '0;
      count_stride_horiz <= '0;
    end
    // During reset (or while not configured) both counters are preset to all-ones to give one
    // extra cycle to the counters; stride_valid is therefore low while they hold that value
    if (~i_resetn || ~i_config_to_control.data[0][0].configured) begin
      count_stride_horiz <= '1;
      count_stride_vert <= '1;
    end
    //
    // Stride is valid when both horizontal and vertical counters are zero
    // (registered, so it reflects the counter values of the previous cycle)
    //
    o_control_to_lane.data[0][0].stride_valid <= count_stride_vert == '0 && count_stride_horiz == '0;
  end : proc_stride

  //
  // Output valid counter comprises cascaded counters of vector, width, height and channels.
  //
  // A 'done' pulse is sent to the config decoder when the last tensor element is processed.
  //
  // proc_output_counters: tracks the position of the outgoing result stream with
  // cascaded vector/width/height/channels counters and generates two one-cycle
  // pulses: o_control_to_lane.data[0][0].done when a width x height plane of
  // results is complete, and o_control_to_config.done when the channels counter
  // wraps as well.
  //
  // NOTE(review): the wrap thresholds are computed with unsigned subtraction
  // (tile size - window size - kernel_x_dilation); this presumably relies on the
  // configured tile being at least as large as the effective window - confirm.
  // NOTE(review): the reset/unconfigured override at the bottom clears the
  // counters only; the 'done' pulses are not masked by it.
  always_ff @(posedge clk) begin : proc_output_counters
    // clear the done signals by default so that they are single-cycle pulses
    o_control_to_config.done <= 1'b0;
    o_control_to_lane.data[0][0].done <= 1'b0;
    // Nested counters for channels, line and column, which operate only when core has a valid result.
    if (i_lane_to_control.core_output_valid) begin
      // shallow channels counter
      count_out_vector <= count_out_vector + 1'b1;
      if (count_out_vector >= VECTOR_RATIO-1) begin
        count_out_vector <= '0;
        // column counter, advancing by the horizontal stride per result
        count_out_width <= $bits(count_out_width)'(count_out_width + i_config_to_control.data[0][0].stride_horizontal);
        // last result column: tile_width - window_width - (window_width-1)*(dilation-1)
        if (count_out_width >= (i_config_to_control.data[0][0].tile_width -
                                i_config_to_control.data[0][0].window_width) -
                                kernel_x_dilation_horiz) begin
          count_out_width <= '0;
          // line counter, advancing by the vertical stride per result line
          count_out_height <= $bits(count_out_height)'(count_out_height + i_config_to_control.data[0][0].stride_vertical);
          //-i_config_to_control.data[0][0].stride_vertical + 1
          // last result line: tile_height - window_height - (window_height-1)*(dilation-1)
          if (count_out_height >= i_config_to_control.data[0][0].tile_height -
                                  i_config_to_control.data[0][0].window_height -
                                  kernel_x_dilation_vert) begin
            count_out_height <= '0;
            // send a 1 clock cycle long 'done' every time channels counter increment
            // to indicate a new set of filters
            o_control_to_lane.data[0][0].done <= 1'b1;
            // channels counter, advancing by one native vector per plane
            count_out_channels <= $bits(count_out_channels)'(count_out_channels + NATIVE_VECTOR_SIZE);
            if (count_out_channels >= i_config_to_control.data[0][0].tile_channels - NATIVE_VECTOR_SIZE) begin
              count_out_channels <= '0;
              // send a 1 clock cycle long 'done' pulse after all counters reset to 0
              o_control_to_config.done <= 1'b1;
            end
          end
        end
      end
    end
    // Reset counters if the module is in reset or not configured
    // (overrides any counter update made above in the same cycle)
    if (~i_resetn || ~i_config_to_control.data[0][0].configured) begin
      count_out_vector <= '0;
      count_out_width  <= '0;
      count_out_height <= '0;
      count_out_channels  <= '0;
    end
  end : proc_output_counters
//
// ------------------------------  END EDITING  ------------------------------
//

endmodule