// Copyright 2020-2024 Intel Corporation.
//
// This software and the related documents are Intel copyrighted materials,
// and your use of them is governed by the express license under which they
// were provided to you ("License"). Unless the License provides otherwise,
// you may not use, modify, copy, publish, distribute, disclose or transmit
// this software or the related documents without Intel's prior written
// permission.
//
// This software and the related documents are provided as is, with no express
// or implied warranties, other than those that are expressly stated in the
// License.

/**
 * dla_layout_transform.sv
 *
 * Top level of the DLA layout transform (LT) module. The transform can convert U8 data to FP16 (this can
 * be disabled) and converts DHWC tensors to CDHWCvec (which is the format required by the PE array). It will
 * also fold the data into the CVEC dimension whenever the stride dimensions of the first convolution are non-1.
 * The folding feature cannot be turned off here - it can be turned off by ensuring that the transform node
 * in the compiler has strides of 1. See `dla_pass_folding.cpp` to see how this is done in the compiler.
 *
 * The main feature of this layout transform is that it can fold input tensor dimensions into the channel dimension
 * which improves the efficiency of the PE array. The parameters of the first convolution in the graph are
 * required as input to this module. The input tensor is partitioned into volumes equal to the
 * STRIDE_HEIGHTxSTRIDE_WIDTHxSTRIDE_DEPTHxCHANNELS of the input convolution.
 * The partitioned volume is then copied into one "CVEC" line and output once the CVEC line is complete.
 *
 * To achieve the folding transform, the DLA layout transform module instantiates the following modules:
 *   > dla_layout_transform.sv - This module, serves as the interface for users, and instantiates top-level
 *     signals and submodules.
 *
 *   > dla_lt_conversion.sv - If enabled, converts input data from U8 to FP16 data types.
 *
 *   > dla_lt_dimension_counter.sv - Generates tensor indexes for all tokens in incoming data packet.
 *
 *   > dla_lt_gen_index_info.sv - Uses tensor indexes from the lt_dimension_counter to calculate the mapping
 *     target output position; this includes which RAM module, RAM line, and position within the RAM line
 *     (each line holds a CVEC line of output data) each output is mapped to. The memory manager uses this
 *     data to emplace the incoming data into its position in the RAM.
 *
 *   > dla_lt_memory_manager.sv - Uses the addressing information from the lt_gen_index_info module to emplace
 *     incoming data into the RAM. The RAM is used to store intermediate results because often, when we fold
 *     data, we have to buffer an output CVEC for many cycles before all the data becomes available.
 *     Has these submodules,
 *      > dla_lt_ram_arb.sv - Arbitrates write requests and read requests from two sources.
 *      > dla_lt_funnel.sv - Maps data from incoming data packet to the correct position within CVEC in
 *        a single cycle for all data in the input packet. Uses the indexing info from the lt_gen_index_info
 *        module.
 *
 *    > dla_lt_output_logic.sv - Keeps track of the number of completed CVEC lines, and writes them to output.
 *      This module is also responsible for keeping track of the output dimensions and writing padding lines when
 *      required.
 *
 */

`resetall
`undefineall
`default_nettype none

`include "dla_acl_parameter_assert.svh"

// Returns the folded channel count (channels * stride_height * stride_width * stride_depth)
// rounded up to the next whole multiple of cvec.
function int calc_output_channels(
  input int cvec,
  input int channels,
  input int stride_height,
  input int stride_width,
  input int stride_depth
);
  integer folded_channels;
  integer cvec_lines;
  // Number of input elements folded into the channel dimension.
  folded_channels = channels * stride_width * stride_height * stride_depth;
  // Ceiling division: how many cvec-wide lines are needed to hold them.
  cvec_lines = (folded_channels + cvec - 1) / cvec;
  return cvec_lines * cvec;
endfunction

// Returns a sizing bound for one output dimension, computed as
//   (feature_dim + pad_dim + 1) + filter_dim - 1
// dilation_dim and stride_dim are part of the signature but are not used by this
// calculation; the result is therefore independent of them.
function int calc_output_dim_max(
  input int feature_dim,
  input int filter_dim,
  input int dilation_dim,
  input int pad_dim,
  input int stride_dim
);
  integer padded_extent;
  // Padded feature extent plus one.
  padded_extent = (feature_dim + pad_dim) + 1;
  return padded_extent + filter_dim - 1;
endfunction

 // Top-level layout transform module.
 //
 // Wires together the config deserializer, dimension counter, optional data conversion,
 // index-info generator, memory manager and output logic, and owns the frame-level
 // handshake (o_ready / o_stall), the per-frame transfer countdown, and the self-reset
 // that fires once the last output line has been accepted downstream.
 //
 // Ports:
 //   clk            - clock for all sequential logic in this module and its submodules.
 //   i_rstn         - active-low reset; combined with the internal end-of-frame reset.
 //   i_config_data  - serialized configuration word fed to the config deserializer.
 //   i_config_valid - qualifies i_config_data.
 //   o_config_ready - high when the deserializer is ready AND no frame is pending/in flight.
 //   i_data         - one DDR word of input elements (8*DDR_BYTES bits).
 //   i_valid        - qualifies i_data; a word is accepted when i_valid & o_ready.
 //   i_stall        - downstream back-pressure for the output stream.
 //   o_ready        - high when an input word can be accepted this cycle.
 //   o_stall        - logical complement of o_ready.
 //   o_data         - one output line of CVEC 16-bit elements.
 //   o_valid        - qualifies o_data.
 //   o_last         - marks the final output line of the frame (driven by the output logic).
 //   o_param_error  - tied low in this module (no parameter checking is implemented here).
 module dla_layout_transform
 import dla_common_pkg::*,dla_lt_pkg::*;
  #(
    // Convolution parameters:
    parameter int MAX_CHANNELS =0,
    parameter int MAX_FEATURE_HEIGHT=0,
    parameter int MAX_FEATURE_WIDTH=0,
    parameter int MAX_FEATURE_DEPTH=0,
    parameter int MAX_STRIDE_HEIGHT=0,
    parameter int MAX_STRIDE_WIDTH=0,
    parameter int MAX_STRIDE_DEPTH=0,
    parameter int MAX_PAD_FRONT=0,
    parameter int MAX_PAD_LEFT=0,
    parameter int MAX_PAD_TOP=0,
    parameter int MAX_FILTER_WIDTH=4,
    parameter int MAX_FILTER_HEIGHT=4,
    parameter int MAX_FILTER_DEPTH=4,
    parameter int MAX_DILATION_WIDTH,
    parameter int MAX_DILATION_HEIGHT,
    parameter int MAX_DILATION_DEPTH,

    // Exact parameters
    parameter int CVEC=0,
    parameter bit DO_U8_CONV=1,
    parameter int DATA_ELEMENT_WIDTH = 32,
    parameter int CNT_BITS = 32,
    parameter int DDR_BYTES = 4,
    parameter int CONFIG_DATA_BYTES,

    device_family_t DEVICE,
    // Derived Params
    localparam int MAX_DIM_BITS = $clog2((MAX_FEATURE_DEPTH + MAX_PAD_FRONT) * (MAX_FEATURE_HEIGHT + MAX_PAD_TOP) * (MAX_FEATURE_WIDTH + MAX_PAD_LEFT) * CVEC),
    localparam int unsigned MAX_INPUT_VOLUME = MAX_CHANNELS * MAX_FEATURE_WIDTH * MAX_FEATURE_HEIGHT * MAX_FEATURE_DEPTH,
    //todo: Capitalize constants...
    // Number of input elements carried by one DDR word.
    localparam int unsigned ELEM_PER_DDR = (DDR_BYTES*8)/DATA_ELEMENT_WIDTH,
    localparam int unsigned OUTPUT_DATA_WIDTH = 16,
    // Worst-case number of DDR words per frame (ceiling division).
    localparam int MAX_TRANSFERS = (MAX_INPUT_VOLUME + ELEM_PER_DDR -1) / ELEM_PER_DDR
   ) (
     // Module connections
     input wire clk,
     input wire i_rstn,
     input wire [CONFIG_DATA_BYTES*8-1:0] i_config_data,
     input wire i_config_valid,
     output logic o_config_ready,
     input wire [8*DDR_BYTES-1:0] i_data,
     input wire i_valid,
     input wire i_stall,
     output logic o_ready,
     output logic o_stall,
     output logic [CVEC-1:0][OUTPUT_DATA_WIDTH-1:0] o_data,
     output logic o_valid,
     output logic o_last,
     output logic o_param_error
   );

  `DLA_ACL_PARAMETER_ASSERT((DO_U8_CONV == 0 && DATA_ELEMENT_WIDTH == 16) || (DO_U8_CONV == 1 && DATA_ELEMENT_WIDTH == 8)); // only supported combinations currently.

  // Worst-case output tensor dimensions derived from the MAX_* parameters.
  localparam int unsigned MAX_OUTPUT_C = calc_output_channels(CVEC, MAX_CHANNELS, MAX_STRIDE_HEIGHT, MAX_STRIDE_WIDTH, MAX_STRIDE_DEPTH);
  localparam int unsigned MAX_OUTPUT_W = calc_output_dim_max(MAX_FEATURE_WIDTH, MAX_FILTER_WIDTH, MAX_DILATION_WIDTH, MAX_PAD_LEFT, MAX_STRIDE_WIDTH);
  localparam int unsigned MAX_OUTPUT_H = calc_output_dim_max(MAX_FEATURE_HEIGHT, MAX_FILTER_HEIGHT, MAX_DILATION_HEIGHT, MAX_PAD_TOP, MAX_STRIDE_HEIGHT);
  localparam int unsigned MAX_OUTPUT_D = calc_output_dim_max(MAX_FEATURE_DEPTH, MAX_FILTER_DEPTH, MAX_DILATION_DEPTH, MAX_PAD_FRONT, MAX_STRIDE_DEPTH);
  // Number of CVEC lines per output position beyond the first one.
  localparam int unsigned MAX_INNER_ROWS = MAX_OUTPUT_C > CVEC ? MAX_OUTPUT_C/CVEC-1 : 0;
  // NOTE(review): MAX_OUTPUT_D is not part of this product although the lines_written width below
  // includes it. The value is not referenced anywhere in this module - confirm intent.
  localparam int unsigned MAX_OUTPUT_VOLUME = MAX_OUTPUT_W * MAX_OUTPUT_H * MAX_OUTPUT_C;
  // if CHANNELS > ELEM_PER_DDR then only 2 CVECs are modified per cycle in the worst case. Bump up to 4 to reduce congestion; this value heavily affects area!
  // Note: "+ 20" binds tighter than "?:", so it is only added to the calc_max_partitions() branch.
  localparam shortint unsigned max_num_partitions = MAX_CHANNELS > ELEM_PER_DDR ? 4 : calc_max_partitions(
    MAX_FEATURE_HEIGHT, MAX_FEATURE_WIDTH, MAX_CHANNELS, MAX_STRIDE_HEIGHT, MAX_STRIDE_WIDTH, ELEM_PER_DDR
   ) + 20; // TODO: The max_partition calculation does a pretty good job, but there are some edge conditions that are not accounted for. Review. For now, +N works.
  localparam integer num_buffers = (max_num_partitions)+(MAX_FEATURE_WIDTH/MAX_STRIDE_WIDTH) + MAX_INNER_ROWS*(MAX_FEATURE_WIDTH/MAX_STRIDE_WIDTH)*(MAX_FEATURE_HEIGHT/MAX_STRIDE_HEIGHT)*(MAX_FEATURE_DEPTH/MAX_STRIDE_DEPTH);// minimum number of buffers!!!
  // Pool count is max_num_partitions rounded up to a power of two.
  localparam int n_pool_bits = $clog2((max_num_partitions));
  localparam int n_buffer_pools = $rtoi($pow(2, n_pool_bits));
  localparam int cvec_per_buffer = $rtoi($pow(2, $clog2(($rtoi($ceil((num_buffers*1.0)/(max_num_partitions)))+1) + 30))); // round to nearest power of 2 since we mod by the value in a few places.
  localparam int total_buffers = n_buffer_pools * cvec_per_buffer;
  // Head-room subtracted from the total when deciding whether another transfer may be accepted.
  localparam int buffers_in_progress = max_num_partitions*7;

  localparam int available = total_buffers - buffers_in_progress;

  logic [$clog2((MAX_OUTPUT_W * MAX_OUTPUT_H * MAX_OUTPUT_D * (MAX_OUTPUT_C/CVEC)))-1:0] lines_written;
  // [0]: tally produced by gen_index_info, [1]: tally forwarded by the memory manager to the output logic.
  logic [ELEM_PER_DDR-1:0] completed_vol_tally [1:0];
  logic [address_info_e.num()-1:0][CNT_BITS-1:0] addr_queue [n_buffer_pools-1:0][ELEM_PER_DDR-1:0];

  logic ready_for_config;
  logic ready_for_transfer;
  logic input_data_valid;
  logic cnt_ready;
  logic next_transfer_overflow;
  logic internal_reset;
  logic resetn_condition, start_rx, done_frame;
  logic config_ready;
  // One extra MSB so that counting below zero can be detected (see latch_last_frame).
  // NOTE(review): transfer_count is declared but not used in this module.
  logic [$clog2(MAX_TRANSFERS):0] frame_finished, transfer_count;

  layout_transform_config_if layout_transform_config();

  shortint unsigned finished_lines_reg;
  int buffer_usage;

  // counters used by dimension counter logic:
  logic [max($clog2(MAX_CHANNELS), 1)-1:0]        C_d [ELEM_PER_DDR-1:0];
  logic [max($clog2(MAX_FEATURE_WIDTH), 1)-1:0]   W_d [ELEM_PER_DDR-1:0];
  logic [max($clog2(MAX_STRIDE_WIDTH), 1)-1:0]    IN_W_d [ELEM_PER_DDR-1:0];
  logic [max($clog2(MAX_FEATURE_WIDTH), 1)-1:0]   S_W_d [ELEM_PER_DDR-1:0];

  logic [max($clog2(MAX_FEATURE_HEIGHT), 1)-1:0]  H_d [ELEM_PER_DDR-1:0];
  logic [max($clog2(MAX_STRIDE_HEIGHT), 1)-1:0]   IN_H_d [ELEM_PER_DDR-1:0];
  logic [max($clog2(MAX_FEATURE_HEIGHT), 1)-1:0]  S_H_d [ELEM_PER_DDR-1:0];

  logic [max($clog2(MAX_FEATURE_DEPTH), 1)-1:0]   D_d [ELEM_PER_DDR-1:0];
  logic [max($clog2(MAX_STRIDE_DEPTH), 1)-1:0]    IN_D_d [ELEM_PER_DDR-1:0];
  logic [max($clog2(MAX_FEATURE_DEPTH), 1)-1:0]   S_D_d [ELEM_PER_DDR-1:0];
  logic [MAX_DIM_BITS-1:0]    Index_d [ELEM_PER_DDR-1:0];

  // Effective active-low reset: external reset OR the one-cycle end-of-frame self-reset.
  assign resetn_condition = i_rstn & !internal_reset;
  // Only advertise config-ready while no frame is pending or in flight.
  assign o_config_ready = config_ready & ready_for_config;

  // o_param_error previously had no driver in this module, leaving the output floating
  // (X in simulation). Tie it low so downstream logic always sees a defined value.
  // TODO: connect to real parameter checking if/when it is implemented.
  assign o_param_error = 1'b0;

  dla_config_deserialize #(
    .CONFIG_WIDTH(CONFIG_DATA_BYTES*8)
  ) lt_config_deserialize (
    .clk(clk),
    .i_resetn(resetn_condition),
    .i_valid(i_config_valid),
    .i_config(i_config_data),
    .o_ready(config_ready),

    .if_config(layout_transform_config)
  );

  // Frame bookkeeping.
  //  - When a config is valid and no input has been accepted yet, frame_finished is loaded with
  //    (number of DDR transfers in the frame) - 2 and further configs are blocked.
  //  - Every accepted input word decrements frame_finished; done_frame samples the MSB of the
  //    pre-decrement value, so it is set by the accepted word that finds the counter already
  //    below zero.
  //  - The synchronous reset at the bottom takes priority over both branches.
  always_ff @( posedge clk ) begin : latch_last_frame
    // Explicit hold; overridden by the assignments below.
    frame_finished <= frame_finished;

    if (i_valid & o_ready & ~done_frame) begin
      // We've started to accept input for this inference. This
      // signals to the output logic that we can start writing outputs.
      start_rx <= 1;
      frame_finished <= frame_finished - 1;
      done_frame <= frame_finished[$clog2(MAX_TRANSFERS)];
    end else if (layout_transform_config.valid & ~start_rx) begin
      // Don't accept a new config until we're finished with this transform.
      ready_for_config <= 1'b0;
      frame_finished <= ((layout_transform_config.data.feature_volume + ELEM_PER_DDR - 1) / ELEM_PER_DDR) - 2;
    end

    if (~resetn_condition) begin
      start_rx <= 0;
      frame_finished <= MAX_TRANSFERS-2;
      done_frame <= 0;
      ready_for_config <= 1'b1;
    end
  end

  // Lines completed but not yet written out.
  assign buffer_usage = (finished_lines_reg - lines_written);
  assign next_transfer_overflow = available <= buffer_usage; // may need to pipeline this. Maybe change to 'almost full signal'.
  // o_ready and o_stall are exact complements of each other.
  assign o_ready = ready_for_transfer == 1'b1 & next_transfer_overflow == 1'b0 & !done_frame;
  assign o_stall = ready_for_transfer == 1'b0 | next_transfer_overflow == 1'b1 | done_frame;
  // Self-reset once the last output line has been accepted downstream.
  assign internal_reset = o_last & o_valid & !i_stall;
  // Dimension counter: Keeps track of position within tensor of incoming data.
  dla_lt_dimension_counter #(
    .ELEMENTS_PER_CYCLE(ELEM_PER_DDR),
    .MAX_CHANNELS(MAX_CHANNELS),
    .MAX_FEATURE_WIDTH(MAX_FEATURE_WIDTH),
    .MAX_FEATURE_HEIGHT(MAX_FEATURE_HEIGHT),
    .MAX_FEATURE_DEPTH(MAX_FEATURE_DEPTH),
    .MAX_STRIDE_WIDTH(MAX_STRIDE_WIDTH),
    .MAX_STRIDE_HEIGHT(MAX_STRIDE_HEIGHT),
    .MAX_STRIDE_DEPTH(MAX_STRIDE_DEPTH),
    .MAX_INPUT_VOLUME(MAX_INPUT_VOLUME),
    .MAX_DIM_BITS(MAX_DIM_BITS)
  ) dim_counter (
    .clk(clk),
    .i_rstn(resetn_condition),
    .i_increment(input_data_valid | ~ready_for_transfer),
    .if_lt_config(layout_transform_config),

    .o_ready(cnt_ready),
    .o_c_dim(C_d),
    .o_w_dim(W_d),
    .o_h_dim(H_d),
    .o_d_dim(D_d),
    .o_w_inner(IN_W_d),
    .o_h_inner(IN_H_d),
    .o_d_inner(IN_D_d),
    .o_w_stride(S_W_d),
    .o_h_stride(S_H_d),
    .o_d_stride(S_D_d),
    .o_index(Index_d)
  );

  // TODO(arooney): add more conversions
  logic [ELEM_PER_DDR-1:0][15:0] fp16_val;
  if (DO_U8_CONV) begin
    // U8 input: the conversion block produces both the FP16 data and its valid.
    dla_lt_data_conversion #(
      .DDR_BYTES(DDR_BYTES),
      .DATA_ELEMENT_WIDTH(DATA_ELEMENT_WIDTH),
      .ELEMENTS_PER_CYCLE(ELEM_PER_DDR)
    ) data_conversion (
      .clk(clk),
      .i_valid(i_valid & o_ready),
      .i_data(i_data),

      .o_fp16_val(fp16_val),
      .o_valid(input_data_valid)
    );
  end
  else begin
    // Input is already 16 bits per element: pass it through a single register stage.
    // NOTE(review): the data is registered here but input_data_valid is combinational;
    // confirm this matches the valid/data alignment of the conversion branch above.
    assign input_data_valid = i_valid & o_ready;
    always_ff @(posedge clk) begin
      fp16_val <= i_data;
    end
  end

  // Index generation: turns the dimension counters into per-element address info.
  dla_lt_gen_index_info #(
    .ELEMENTS_PER_CYCLE(ELEM_PER_DDR),
    .N_BUFFER_POOLS(n_buffer_pools),
    .CVEC_PER_BUFFER(cvec_per_buffer),
    .N_POOL_BITS(n_pool_bits),
    .CNT_BITS(CNT_BITS),
    .MAX_CHANNELS(MAX_CHANNELS),
    .CVEC(CVEC),
    .MAX_FEATURE_WIDTH(MAX_FEATURE_WIDTH),
    .MAX_FEATURE_HEIGHT(MAX_FEATURE_HEIGHT),
    .MAX_FEATURE_DEPTH(MAX_FEATURE_DEPTH),

    .MAX_STRIDE_WIDTH(MAX_STRIDE_WIDTH),
    .MAX_STRIDE_HEIGHT(MAX_STRIDE_HEIGHT),
    .MAX_STRIDE_DEPTH(MAX_STRIDE_DEPTH),
    .MAX_DIM_BITS(MAX_DIM_BITS),
    .MAX_INPUT_VOLUME(MAX_INPUT_VOLUME)
  ) gen_index_info (
    .clk(clk),
    .i_rstn(resetn_condition),
    .i_next_overflow(next_transfer_overflow),
    .i_valid(input_data_valid),
    .i_ready(cnt_ready),
    .i_c_dim(C_d),
    .i_w_inner(IN_W_d),
    .i_h_inner(IN_H_d),
    .i_d_inner(IN_D_d),
    .i_w_stride(S_W_d),
    .i_h_stride(S_H_d),
    .i_d_stride(S_D_d),
    .i_index(Index_d),
    .if_lt_config(layout_transform_config),

    .o_addr_queue(addr_queue),
    .o_completed_vol_tally(completed_vol_tally[0]),
    .o_ready_for_transfer(ready_for_transfer)
  );

  // Read-side signals exchanged between the output logic and the memory manager.
  logic [($clog2(cvec_per_buffer))-1:0] output_line_num [n_buffer_pools-1:0];
  logic [($clog2(cvec_per_buffer))-1:0] curr_out_line [n_buffer_pools-1:0];
  logic [n_buffer_pools-1:0] actively_reading;
  logic [MAX_OUTPUT_C-1:0][16-1:0] output_line_data [n_buffer_pools-1:0];
  dla_lt_memory_manager #(
    .NUM_BUFFER_POOLS(n_buffer_pools),
    .CVEC_PER_BUFFER(cvec_per_buffer),
    .ELEMENTS_PER_CYCLE(ELEM_PER_DDR),
    .CNT_BITS(CNT_BITS),
    .CVEC(CVEC),
    .MAX_OUTPUT_C(MAX_OUTPUT_C),
    .DEVICE(DEVICE)
  ) memory_manager (
    .clk(clk),
    .i_rstn(resetn_condition),
    .i_addr_queue(addr_queue),
    .i_output_line_num(output_line_num),
    .i_actively_reading(actively_reading),
    .i_fp16_data(fp16_val),
    .i_completed_vol_tally(completed_vol_tally[0]),

    .o_completed_vol_tally(completed_vol_tally[1]),
    .o_output_line_data(output_line_data),
    .o_curr_out_line(curr_out_line)
  );

  // Output logic: drives the module's output stream and reports line counts used for
  // the buffer_usage / next_transfer_overflow calculation above.
  dla_lt_output_logic #(
    .NUM_BUFFER_POOLS(n_buffer_pools),
    .CVEC_PER_BUFFER(cvec_per_buffer),
    .ELEMENTS_PER_CYCLE(ELEM_PER_DDR),
    .N_POOL_BITS(n_pool_bits),
    .MAX_OUTPUT_W(MAX_OUTPUT_W),
    .MAX_OUTPUT_H(MAX_OUTPUT_H),
    .MAX_OUTPUT_D(MAX_OUTPUT_D),
    .MAX_OUTPUT_C(MAX_OUTPUT_C),
    .CVEC(CVEC),
    .CNT_BITS(CNT_BITS),
    .MAX_DIM_BITS(MAX_DIM_BITS)
  ) output_logic (
    .clk(clk),
    .i_rstn(resetn_condition),
    .i_output_line_data(output_line_data),
    .i_curr_out_line(curr_out_line),
    .i_completed_vol_tally(completed_vol_tally[1]),
    .i_stall(i_stall),
    .if_lt_config(layout_transform_config),
    .i_ready(start_rx),

    .o_line_num(output_line_num),
    .o_read_req(actively_reading),
    .o_data(o_data),
    .o_valid(o_valid),
    .o_last(o_last),
    .o_lines_written(lines_written),
    .o_finished_lines(finished_lines_reg)
  );

endmodule