1 files changed, 384 insertions, 0 deletions
diff --git a/python/openvino/demo/ip/intel_ai_ip/verilog/dla_layout_transform.sv b/python/openvino/demo/ip/intel_ai_ip/verilog/dla_layout_transform.sv
new file mode 100644
index 0000000..e52fe53
--- /dev/null
+++ b/python/openvino/demo/ip/intel_ai_ip/verilog/dla_layout_transform.sv
@@ -0,0 +1,384 @@
+// Copyright 2020-2024 Intel Corporation.
+//
+// This software and the related documents are Intel copyrighted materials,
+// and your use of them is governed by the express license under which they
+// were provided to you ("License"). Unless the License provides otherwise,
+// you may not use, modify, copy, publish, distribute, disclose or transmit
+// this software or the related documents without Intel's prior written
+// permission.
+//
+// This software and the related documents are provided as is, with no express
+// or implied warranties, other than those that are expressly stated in the
+// License.
+
+/**
+ * dla_layout_transform.sv
+ *
+ * Top level of the DLA layout transform (LT) module. The transform can convert u8 data to FP16 (this
+ * can be disabled) and converts DHWC tensors to CDHWCvec (the format required by the PE array). It also
+ * folds the data into the CVEC dimension whenever the stride dimensions of the first convolution are non-1.
+ * The folding feature cannot be turned off here - it can be turned off by ensuring that the transform node
+ * in the compiler has strides of 1. See `dla_pass_folding.cpp` to see how this is done in the compiler.
+ *
+ * The main feature of this layout transform is that it can fold input tensor dimensions into the channel dimension
+ * which improves the efficiency of the PE array. The parameters of the first convolution in the graph are
+ * required as input to this module. The input tensor is partitioned into volumes equal to the
+ * STRIDE_HEIGHTxSTRIDE_WIDTHxSTRIDE_DEPTHxCHANNELS of the input convolution.
+ * The partitioned volume is then copied into one "CVEC" line and output once the CVEC line is complete.
+ *
+ * To achieve the folding transform, the DLA layout transform module instantiates the following modules:
+ *   > dla_layout_transform.sv - This module, serves as the interface for users, and instantiates top-level
+ *     signals and submodules.
+ *
+ *   > dla_lt_conversion.sv - If enabled, converts input data from U8 to FP16 data types.
+ *
+ *   > dla_lt_dimension_counter.sv - Generates tensor indexes for all tokens in incoming data packet.
+ *
+ *   > dla_lt_gen_index_info.sv - Uses tensor indexes from the lt_dimension_counter to calculate the mapping
+ *     target output position; this includes which RAM module, RAM line, and position within the RAM line
+ *     (each line holds a CVEC line of output data) each output is mapped to. The memory manager uses this
+ *     data to emplace the incoming data into its position in the RAM.
+ *
+ *   > dla_lt_memory_manager.sv - Uses the addressing information from the lt_gen_index_info module to emplace
+ *     incoming data into the RAM. The RAM is used to store intermediate results because often, when we fold
+ *     data, we have to buffer an output CVEC for many cycles before all the data becomes available.
+ *     Has these submodules,
+ *      > dla_lt_ram_arb.sv - Arbitrates write requests and read requests from two sources.
+ *      > dla_lt_funnel.sv - Maps data from incoming data packet to the correct position within CVEC in
+ *        a single cycle for all data in the input packet. Uses the indexing info from the lt_gen_index_info
+ *        module.
+ *
+ *    > dla_lt_output_logic.sv - Keeps track of the number of completed CVEC lines, and writes them to output.
+ *      This module is also responsible for keeping track of the output dimensions and writing padding lines when
+ *      required.
+ *
+ */
+
+`resetall
+`undefineall
+`default_nettype none
+
+`include "dla_acl_parameter_assert.svh"
+
+function int calc_output_channels(
+  input int cvec, channels, stride_height, stride_width, stride_depth
+);
+  integer cvec_lines; // folded channel count divided by cvec, rounded up (for positive inputs)
+  cvec_lines = (cvec - 1 + (channels * stride_depth * stride_height * stride_width)) / cvec;
+  return cvec_lines * cvec; // folded channel count padded up to a whole multiple of cvec
+endfunction
+
+function int calc_output_dim_max(
+  input int feature_dim, filter_dim, dilation_dim, pad_dim, stride_dim
+);
+  // Result is feature_dim + pad_dim + filter_dim. dilation_dim and stride_dim are part of the
+  // signature but are not used by the calculation.
+  integer padded_extent;
+  padded_extent = 1 + pad_dim + feature_dim;
+  return padded_extent + (filter_dim - 1);
+endfunction
+
+ module dla_layout_transform  // LT top level: instantiates and wires the config deserializer, dimension counter, index generator, memory manager and output logic.
+ import dla_common_pkg::*,dla_lt_pkg::*;
+  #(
+    // Convolution parameters: compile-time maxima used below to size counters and buffers. NOTE(review): the 0 defaults look like must-override placeholders - confirm.
+    parameter int MAX_CHANNELS =0,
+    parameter int MAX_FEATURE_HEIGHT=0,
+    parameter int MAX_FEATURE_WIDTH=0,
+    parameter int MAX_FEATURE_DEPTH=0,
+    parameter int MAX_STRIDE_HEIGHT=0,
+    parameter int MAX_STRIDE_WIDTH=0,
+    parameter int MAX_STRIDE_DEPTH=0,
+    parameter int MAX_PAD_FRONT=0,
+    parameter int MAX_PAD_LEFT=0,
+    parameter int MAX_PAD_TOP=0,
+    parameter int MAX_FILTER_WIDTH=4,
+    parameter int MAX_FILTER_HEIGHT=4,
+    parameter int MAX_FILTER_DEPTH=4,
+    parameter int MAX_DILATION_WIDTH,  // the three dilation maxima have no default, so the instantiating module must set them;
+    parameter int MAX_DILATION_HEIGHT, // within this module they are only forwarded to calc_output_dim_max.
+    parameter int MAX_DILATION_DEPTH,
+
+    // Exact parameters
+    parameter int CVEC=0,                  // number of OUTPUT_DATA_WIDTH-bit elements in one o_data line
+    parameter bit DO_U8_CONV=1,            // 1: instantiate dla_lt_data_conversion; 0: register i_data directly into fp16_val
+    parameter int DATA_ELEMENT_WIDTH = 32, // bits per input element. NOTE(review): the default 32 does not satisfy the parameter assert below (8 or 16 required).
+    parameter int CNT_BITS = 32,           // width of each field stored in addr_queue
+    parameter int DDR_BYTES = 4,           // i_data is 8*DDR_BYTES bits wide
+    parameter int CONFIG_DATA_BYTES,       // i_config_data is 8*CONFIG_DATA_BYTES bits wide; no default, must be set by the parent
+
+    device_family_t DEVICE,
+    // Derived Params
+    localparam int MAX_DIM_BITS = $clog2((MAX_FEATURE_DEPTH + MAX_PAD_FRONT) * (MAX_FEATURE_HEIGHT + MAX_PAD_TOP) * (MAX_FEATURE_WIDTH + MAX_PAD_LEFT) * CVEC),
+    localparam int unsigned MAX_INPUT_VOLUME = MAX_CHANNELS * MAX_FEATURE_WIDTH * MAX_FEATURE_HEIGHT * MAX_FEATURE_DEPTH,
+    //todo: Capitalize constants...
+    localparam int unsigned ELEM_PER_DDR = (DDR_BYTES*8)/DATA_ELEMENT_WIDTH, // input elements carried by one i_data word
+    localparam int unsigned OUTPUT_DATA_WIDTH = 16,                           // bits per o_data element
+    localparam int MAX_TRANSFERS = (MAX_INPUT_VOLUME + ELEM_PER_DDR -1) / ELEM_PER_DDR // ceil(MAX_INPUT_VOLUME / ELEM_PER_DDR)
+   ) (
+     // Module connections
+     input wire clk,
+     input wire i_rstn,                                     // active-low; ANDed with !internal_reset to form resetn_condition
+     input wire [CONFIG_DATA_BYTES*8-1:0] i_config_data,    // config stream into lt_config_deserialize
+     input wire i_config_valid,
+     output logic o_config_ready,                           // config_ready & ready_for_config
+     input wire [8*DDR_BYTES-1:0] i_data,                   // one input word, ELEM_PER_DDR elements
+     input wire i_valid,
+     input wire i_stall,                                    // forwarded to output_logic; also gates internal_reset
+     output logic o_ready,                                  // high when an input word can be accepted (see assign below)
+     output logic o_stall,                                  // logical complement of o_ready (see assign below)
+     output logic [CVEC-1:0][OUTPUT_DATA_WIDTH-1:0] o_data, // o_data/o_valid/o_last are driven by output_logic
+     output logic o_valid,
+     output logic o_last,
+     output logic o_param_error                             // NOTE(review): no driver for this output exists in this module
+   );
+
+  `DLA_ACL_PARAMETER_ASSERT((DO_U8_CONV == 0 && DATA_ELEMENT_WIDTH == 16) || (DO_U8_CONV == 1 && DATA_ELEMENT_WIDTH == 8)); // only supported combinations currently.
+
+  localparam int unsigned MAX_OUTPUT_C = calc_output_channels(CVEC, MAX_CHANNELS, MAX_STRIDE_HEIGHT, MAX_STRIDE_WIDTH, MAX_STRIDE_DEPTH);
+  localparam int unsigned MAX_OUTPUT_W = calc_output_dim_max(MAX_FEATURE_WIDTH, MAX_FILTER_WIDTH, MAX_DILATION_WIDTH, MAX_PAD_LEFT, MAX_STRIDE_WIDTH);
+  localparam int unsigned MAX_OUTPUT_H = calc_output_dim_max(MAX_FEATURE_HEIGHT, MAX_FILTER_HEIGHT, MAX_DILATION_HEIGHT, MAX_PAD_TOP, MAX_STRIDE_HEIGHT);
+  localparam int unsigned MAX_OUTPUT_D = calc_output_dim_max(MAX_FEATURE_DEPTH, MAX_FILTER_DEPTH, MAX_DILATION_DEPTH, MAX_PAD_FRONT, MAX_STRIDE_DEPTH);
+  localparam int unsigned MAX_INNER_ROWS = MAX_OUTPUT_C > CVEC ? MAX_OUTPUT_C/CVEC-1 : 0; // CVEC lines per output position beyond the first
+  localparam int unsigned MAX_OUTPUT_VOLUME = MAX_OUTPUT_W * MAX_OUTPUT_H * MAX_OUTPUT_C; // NOTE(review): omits MAX_OUTPUT_D and is not referenced elsewhere in this module
+  // if CHANNELS > ELEM_PER_DDR then only 2 CVECs are modified per cycle in the worst case. Bump up to 4 to reduce congestion; this value heavily affects area!
+  localparam shortint unsigned max_num_partitions = MAX_CHANNELS > ELEM_PER_DDR ? 4 : calc_max_partitions(
+    MAX_FEATURE_HEIGHT, MAX_FEATURE_WIDTH, MAX_CHANNELS, MAX_STRIDE_HEIGHT, MAX_STRIDE_WIDTH, ELEM_PER_DDR
+   ) + 20; // TODO: The max_partition calculation does a pretty good job, but there are some edge conditions that are not accounted for. Review. For now, +N works. Note: by operator precedence the +20 applies only to the calc_max_partitions branch of the ternary.
+  localparam integer num_buffers = (max_num_partitions)+(MAX_FEATURE_WIDTH/MAX_STRIDE_WIDTH) + MAX_INNER_ROWS*(MAX_FEATURE_WIDTH/MAX_STRIDE_WIDTH)*(MAX_FEATURE_HEIGHT/MAX_STRIDE_HEIGHT)*(MAX_FEATURE_DEPTH/MAX_STRIDE_DEPTH);// minimum number of buffers!!!
+  localparam int n_pool_bits = $clog2((max_num_partitions));
+  localparam int n_buffer_pools = $rtoi($pow(2, n_pool_bits)); // max_num_partitions rounded up to a power of 2
+  localparam int cvec_per_buffer = $rtoi($pow(2, $clog2(($rtoi($ceil((num_buffers*1.0)/(max_num_partitions)))+1) + 30))); // ceil(num_buffers/max_num_partitions) + 31, rounded up to a power of 2 since we mod by the value in a few places.
+  localparam int total_buffers = n_buffer_pools * cvec_per_buffer;
+  localparam int buffers_in_progress = max_num_partitions*7;
+  // Threshold compared against buffer_usage to raise next_transfer_overflow.
+  localparam int available = total_buffers - buffers_in_progress;
+
+  logic [$clog2((MAX_OUTPUT_W * MAX_OUTPUT_H * MAX_OUTPUT_D * (MAX_OUTPUT_C/CVEC)))-1:0] lines_written; // driven by output_logic.o_lines_written
+  logic [ELEM_PER_DDR-1:0] completed_vol_tally [1:0]; // [0]: gen_index_info -> memory_manager, [1]: memory_manager -> output_logic
+  logic [address_info_e.num()-1:0][CNT_BITS-1:0] addr_queue [n_buffer_pools-1:0][ELEM_PER_DDR-1:0]; // gen_index_info -> memory_manager
+
+  logic ready_for_config;        // set by reset, cleared once a valid config is seen before start_rx
+  logic ready_for_transfer;      // driven by gen_index_info.o_ready_for_transfer
+  logic input_data_valid;        // accepted-input qualifier (from data_conversion, or i_valid & o_ready when DO_U8_CONV == 0)
+  logic cnt_ready;               // dim_counter.o_ready -> gen_index_info.i_ready
+  logic next_transfer_overflow;  // high when buffer_usage has reached `available`
+  logic internal_reset;          // high while the last output line is being accepted downstream
+  logic resetn_condition, start_rx, done_frame;
+  logic config_ready;            // driven by lt_config_deserialize.o_ready
+  logic [$clog2(MAX_TRANSFERS):0] frame_finished, transfer_count; // NOTE(review): transfer_count is not referenced elsewhere in this module
+
+  layout_transform_config_if layout_transform_config();
+
+  shortint unsigned finished_lines_reg; // driven by output_logic.o_finished_lines
+  int buffer_usage;
+
+  // counters used by dimension counter logic (one entry per element of the incoming word):
+  logic [max($clog2(MAX_CHANNELS), 1)-1:0]        C_d [ELEM_PER_DDR-1:0];
+  logic [max($clog2(MAX_FEATURE_WIDTH), 1)-1:0]   W_d [ELEM_PER_DDR-1:0];
+  logic [max($clog2(MAX_STRIDE_WIDTH), 1)-1:0]    IN_W_d [ELEM_PER_DDR-1:0];
+  logic [max($clog2(MAX_FEATURE_WIDTH), 1)-1:0]   S_W_d [ELEM_PER_DDR-1:0];
+
+  logic [max($clog2(MAX_FEATURE_HEIGHT), 1)-1:0]  H_d [ELEM_PER_DDR-1:0];
+  logic [max($clog2(MAX_STRIDE_HEIGHT), 1)-1:0]   IN_H_d [ELEM_PER_DDR-1:0];
+  logic [max($clog2(MAX_FEATURE_HEIGHT), 1)-1:0]  S_H_d [ELEM_PER_DDR-1:0];
+
+  logic [max($clog2(MAX_FEATURE_DEPTH), 1)-1:0]   D_d [ELEM_PER_DDR-1:0];
+  logic [max($clog2(MAX_STRIDE_DEPTH), 1)-1:0]    IN_D_d [ELEM_PER_DDR-1:0];
+  logic [max($clog2(MAX_FEATURE_DEPTH), 1)-1:0]   S_D_d [ELEM_PER_DDR-1:0];
+  logic [MAX_DIM_BITS-1:0]    Index_d [ELEM_PER_DDR-1:0];
+  // Effective active-low reset for this block and every submodule: external reset, or internal_reset when the last output line leaves.
+  assign resetn_condition = i_rstn & !internal_reset;
+  assign o_config_ready = config_ready & ready_for_config;
+  // Config deserializer (defined elsewhere); presumably drives the layout_transform_config interface read below.
+  dla_config_deserialize #(
+    .CONFIG_WIDTH(CONFIG_DATA_BYTES*8)
+  ) lt_config_deserialize (
+    .clk(clk),
+    .i_resetn(resetn_condition),
+    .i_valid(i_config_valid),
+    .i_config(i_config_data),
+    .o_ready(config_ready),
+
+    .if_config(layout_transform_config)
+  );
+  // Tracks one frame: latches the expected transfer count from the config, counts accepted input words, and flags done_frame.
+  always_ff @( posedge clk ) begin : latch_last_frame
+    frame_finished <= frame_finished; // default: hold
+    // An input word is accepted this cycle.
+    if (i_valid & o_ready & ~done_frame) begin
+      // We've started to accept input data for this inference. This
+      // signals to the output logic that we can start writing outputs.
+      start_rx <= 1;
+      frame_finished <= frame_finished - 1;
+      done_frame <= frame_finished[$clog2(MAX_TRANSFERS)]; // MSB of the pre-decrement count (nonblocking read)
+    end else if (layout_transform_config.valid & ~start_rx) begin
+      // Don't accept a new config until we're finished with this transform.
+      ready_for_config <= 1'b0;
+      frame_finished <= ((layout_transform_config.data.feature_volume + ELEM_PER_DDR - 1) / ELEM_PER_DDR) - 2; // ceil(feature_volume / ELEM_PER_DDR) - 2
+    end
+    // Synchronous reset; placed last so it overrides the assignments above.
+    if (~resetn_condition) begin
+      start_rx <= 0;
+      frame_finished <= MAX_TRANSFERS-2;
+      done_frame <= 0;
+      ready_for_config <= 1'b1;
+    end
+  end
+  // Back-pressure: lines finished but not yet written out, compared against the `available` threshold.
+  assign buffer_usage = (finished_lines_reg - lines_written);
+  assign next_transfer_overflow = available <= buffer_usage; // may need to pipeline this. Maybe change to 'almost full signal'.
+  assign o_ready = ready_for_transfer == 1'b1 & next_transfer_overflow == 1'b0 & !done_frame;
+  assign o_stall = ready_for_transfer == 1'b0 | next_transfer_overflow == 1'b1 | done_frame; // complement of o_ready
+  assign internal_reset = o_last & o_valid & !i_stall; // last output line is valid and not stalled downstream
+  // Dimension counter: Keeps track of position within tensor of incoming data.
+  dla_lt_dimension_counter #(
+    .ELEMENTS_PER_CYCLE(ELEM_PER_DDR),
+    .MAX_CHANNELS(MAX_CHANNELS),
+    .MAX_FEATURE_WIDTH(MAX_FEATURE_WIDTH),
+    .MAX_FEATURE_HEIGHT(MAX_FEATURE_HEIGHT),
+    .MAX_FEATURE_DEPTH(MAX_FEATURE_DEPTH),
+    .MAX_STRIDE_WIDTH(MAX_STRIDE_WIDTH),
+    .MAX_STRIDE_HEIGHT(MAX_STRIDE_HEIGHT),
+    .MAX_STRIDE_DEPTH(MAX_STRIDE_DEPTH),
+    .MAX_INPUT_VOLUME(MAX_INPUT_VOLUME),
+    .MAX_DIM_BITS(MAX_DIM_BITS)
+  ) dim_counter (
+    .clk(clk),
+    .i_rstn(resetn_condition),
+    .i_increment(input_data_valid | ~ready_for_transfer), // also asserted whenever gen_index_info is not ready for a transfer
+    .if_lt_config(layout_transform_config),
+
+    .o_ready(cnt_ready),
+    .o_c_dim(C_d),
+    .o_w_dim(W_d),
+    .o_h_dim(H_d),
+    .o_d_dim(D_d),
+    .o_w_inner(IN_W_d),
+    .o_h_inner(IN_H_d),
+    .o_d_inner(IN_D_d),
+    .o_w_stride(S_W_d),
+    .o_h_stride(S_H_d),
+    .o_d_stride(S_D_d),
+    .o_index(Index_d)
+  );
+  // Input data path: produces fp16_val and input_data_valid, either through the U8 conversion module or directly.
+  // TODO(arooney): add more conversions
+  logic [ELEM_PER_DDR-1:0][15:0] fp16_val;
+  if (DO_U8_CONV) begin
+    dla_lt_data_conversion #(
+      .DDR_BYTES(DDR_BYTES),
+      .DATA_ELEMENT_WIDTH(DATA_ELEMENT_WIDTH),
+      .ELEMENTS_PER_CYCLE(ELEM_PER_DDR)
+    ) data_conversion (
+      .clk(clk),
+      .i_valid(i_valid & o_ready),
+      .i_data(i_data),
+
+      .o_fp16_val(fp16_val),
+      .o_valid(input_data_valid)
+    );
+  end
+  else begin
+    assign input_data_valid = i_valid & o_ready;
+    always_ff @(posedge clk) begin
+      fp16_val <= i_data; // NOTE(review): data is registered here while input_data_valid above is combinational - confirm downstream expects this offset
+    end
+  end
+  // Index generation: consumes the dimension counters and produces per-element address info plus the first-stage completion tally.
+  dla_lt_gen_index_info #(
+    .ELEMENTS_PER_CYCLE(ELEM_PER_DDR),
+    .N_BUFFER_POOLS(n_buffer_pools),
+    .CVEC_PER_BUFFER(cvec_per_buffer),
+    .N_POOL_BITS(n_pool_bits),
+    .CNT_BITS(CNT_BITS),
+    .MAX_CHANNELS(MAX_CHANNELS),
+    .CVEC(CVEC),
+    .MAX_FEATURE_WIDTH(MAX_FEATURE_WIDTH),
+    .MAX_FEATURE_HEIGHT(MAX_FEATURE_HEIGHT),
+    .MAX_FEATURE_DEPTH(MAX_FEATURE_DEPTH),
+
+    .MAX_STRIDE_WIDTH(MAX_STRIDE_WIDTH),
+    .MAX_STRIDE_HEIGHT(MAX_STRIDE_HEIGHT),
+    .MAX_STRIDE_DEPTH(MAX_STRIDE_DEPTH),
+    .MAX_DIM_BITS(MAX_DIM_BITS),
+    .MAX_INPUT_VOLUME(MAX_INPUT_VOLUME)
+  ) gen_index_info (
+    .clk(clk),
+    .i_rstn(resetn_condition),
+    .i_next_overflow(next_transfer_overflow),
+    .i_valid(input_data_valid),
+    .i_ready(cnt_ready),
+    .i_c_dim(C_d),
+    .i_w_inner(IN_W_d),
+    .i_h_inner(IN_H_d),
+    .i_d_inner(IN_D_d),
+    .i_w_stride(S_W_d),
+    .i_h_stride(S_H_d),
+    .i_d_stride(S_D_d),
+    .i_index(Index_d),
+    .if_lt_config(layout_transform_config),
+
+    .o_addr_queue(addr_queue),
+    .o_completed_vol_tally(completed_vol_tally[0]),
+    .o_ready_for_transfer(ready_for_transfer)
+  );
+  // Read-side signals exchanged between memory_manager and output_logic, one entry per buffer pool.
+  logic [($clog2(cvec_per_buffer))-1:0] output_line_num [n_buffer_pools-1:0];  // output_logic -> memory_manager
+  logic [($clog2(cvec_per_buffer))-1:0] curr_out_line [n_buffer_pools-1:0];    // memory_manager -> output_logic
+  logic [n_buffer_pools-1:0] actively_reading;                                 // output_logic.o_read_req -> memory_manager
+  logic [MAX_OUTPUT_C-1:0][16-1:0] output_line_data [n_buffer_pools-1:0];      // memory_manager -> output_logic
+  dla_lt_memory_manager #(
+    .NUM_BUFFER_POOLS(n_buffer_pools),
+    .CVEC_PER_BUFFER(cvec_per_buffer),
+    .ELEMENTS_PER_CYCLE(ELEM_PER_DDR),
+    .CNT_BITS(CNT_BITS),
+    .CVEC(CVEC),
+    .MAX_OUTPUT_C(MAX_OUTPUT_C),
+    .DEVICE(DEVICE)
+  ) memory_manager (
+    .clk(clk),
+    .i_rstn(resetn_condition),
+    .i_addr_queue(addr_queue),
+    .i_output_line_num(output_line_num),
+    .i_actively_reading(actively_reading),
+    .i_fp16_data(fp16_val),
+    .i_completed_vol_tally(completed_vol_tally[0]),
+
+    .o_completed_vol_tally(completed_vol_tally[1]),
+    .o_output_line_data(output_line_data),
+    .o_curr_out_line(curr_out_line)
+  );
+  // Output logic: drives the module's o_data/o_valid/o_last and reports the line counts used for back-pressure above.
+  dla_lt_output_logic #(
+    .NUM_BUFFER_POOLS(n_buffer_pools),
+    .CVEC_PER_BUFFER(cvec_per_buffer),
+    .ELEMENTS_PER_CYCLE(ELEM_PER_DDR),
+    .N_POOL_BITS(n_pool_bits),
+    .MAX_OUTPUT_W(MAX_OUTPUT_W),
+    .MAX_OUTPUT_H(MAX_OUTPUT_H),
+    .MAX_OUTPUT_D(MAX_OUTPUT_D),
+    .MAX_OUTPUT_C(MAX_OUTPUT_C),
+    .CVEC(CVEC),
+    .CNT_BITS(CNT_BITS),
+    .MAX_DIM_BITS(MAX_DIM_BITS)
+  ) output_logic (
+    .clk(clk),
+    .i_rstn(resetn_condition),
+    .i_output_line_data(output_line_data),
+    .i_curr_out_line(curr_out_line),
+    .i_completed_vol_tally(completed_vol_tally[1]),
+    .i_stall(i_stall),
+    .if_lt_config(layout_transform_config),
+    .i_ready(start_rx), // set once the first input word of the frame has been accepted
+
+    .o_line_num(output_line_num),
+    .o_read_req(actively_reading),
+    .o_data(o_data),
+    .o_valid(o_valid),
+    .o_last(o_last),
+    .o_lines_written(lines_written),
+    .o_finished_lines(finished_lines_reg)
+  );
+
+endmodule