Add interface module (#1)
End-to-End test and sim
Co-authored-by: Ian Zhang <ianboyanzhang@gmail.com>
diff --git a/verilog/dv/la_test2/la_test2.c b/verilog/dv/la_test2/la_test2.c
index 25fad48..003bf4e 100644
--- a/verilog/dv/la_test2/la_test2.c
+++ b/verilog/dv/la_test2/la_test2.c
@@ -29,6 +29,48 @@
int clk = 0;
int i;
+uint32_t mat_A[9] = { // 9 operand words; each holds an IEEE-754 half-precision bit pattern in its low 16 bits
+ //1.126105
+ 0x00003c81,
+ //0.407351
+ 0x00003685,
+ //2.315680
+ 0x000040a2,
+ //0.930338
+ 0x00003b71,
+ //2.542255
+ 0x00004116,
+ //1.070112
+ 0x00003c48,
+ //1.107074
+ 0x00003c6e,
+ //1.020977
+ 0x00003c15,
+ //2.659628
+ 0x00004152
+};
+
+uint32_t mat_B[9] = { // 9 operand words; each holds an IEEE-754 half-precision bit pattern in its low 16 bits
+ //1.435914
+ 0x00003dbe,
+ //1.319322
+ 0x00003d47,
+ //0.348074
+ 0x00003592,
+ //1.898164
+ 0x00003f98,
+ //1.588423
+ 0x00003e5b,
+ //0.815995
+ 0x00003a87,
+ //2.724823
+ 0x00004173,
+ //0.130791
+ 0x0000302f,
+ //2.339815
+ 0x000040ae
+};
+
void main()
{
/* Set up the housekeeping SPI to be connected internally so */
@@ -79,6 +121,7 @@
reg_mprj_io_1 = GPIO_MODE_USER_STD_OUTPUT;
reg_mprj_io_0 = GPIO_MODE_USER_STD_OUTPUT;
+
/* Apply configuration */
reg_mprj_xfer = 1;
while (reg_mprj_xfer == 1);
@@ -92,29 +135,70 @@
// Flag start of the test
reg_mprj_datal = 0xAB600000;
- // Configure LA[64] LA[65] as outputs from the cpu
- reg_la2_oenb = reg_la2_iena = 0x00000003;
+ // Configure LA[64] LA[65] LA[66] as outputs from the cpu
+ reg_la2_oenb = reg_la2_iena = 0x00000007;
+ // clk, reset, cs
+ //reg_la2_oenb = reg_la2_iena = 0x00000007;
// Set clk & reset to one
reg_la2_data = 0x00000003;
+ // Configure LA[63:32] output from the cpu
+ reg_la1_oenb = reg_la1_iena = 0xFFFFFFFF;
+ reg_la1_data = 0x00000000;
+
// DELAY
for (i=0; i<5; i=i+1) {}
// Toggle clk & de-assert reset
- for (i=0; i<11; i=i+1) {
- clk = !clk;
- reg_la2_data = 0x00000000 | clk;
+ for (i=0; i<5; i=i+1) {
+ clk = !clk;
+ reg_la2_data = 0x00000000 | clk;
}
- // reg_mprj_datal = 0xAB610000;
+ // cs/data_input is ready/valid
+ reg_la2_data = reg_la2_data | 0x00000004;
+
+ uint32_t i_mat = 0;
+
+ // Toggle clk & send mat_A data
+ for (i=0; i<17; i=i+1) {
+ clk = !clk;
+ reg_la2_data = 0x00000000 | clk;
+ reg_la2_data = reg_la2_data | 0x00000004;
+ if (clk == 0) {
+ reg_la1_data = mat_A[i_mat];
+ i_mat += 1;
+ }
+ }
+
+ i_mat = 0;
+ // Toggle clk & send mat_B data
+ for (i=0; i<17; i=i+1) {
+ clk = !clk;
+ reg_la2_data = 0x00000000 | clk;
+ reg_la2_data = reg_la2_data | 0x00000004;
+ if (clk == 0) {
+ reg_la1_data = mat_B[i_mat];
+ i_mat += 1;
+ }
+ }
+
+ for (i=0; i<1; i=i+1) {
+ clk = !clk;
+ reg_la2_data = 0x00000000 | clk;
+ reg_la2_data = reg_la2_data | 0x00000004;
+ }
while (1){
- if (reg_la0_data_in >= 0x05) {
+ clk = !clk;
+ reg_la2_data = 0x00000000 | clk;
+
+ if ((reg_la0_data_in & 0x0000FFFF) >= 0x00000015) {
reg_mprj_datal = 0xAB610000;
break;
}
-
}
}
+
diff --git a/verilog/dv/la_test2/la_test2_tb.v b/verilog/dv/la_test2/la_test2_tb.v
index fff3b72..613bfa3 100644
--- a/verilog/dv/la_test2/la_test2_tb.v
+++ b/verilog/dv/la_test2/la_test2_tb.v
@@ -140,7 +140,8 @@
// Repeat cycles of 1000 clock edges as needed to complete testbench
repeat (75) begin
- repeat (1000) @(posedge clock);
+ //repeat (3000) @(posedge clock);
+ repeat (7000) @(posedge clock);
// $display("+1000 cycles");
end
$display("%c[1;31m",27);
diff --git a/verilog/includes/includes.rtl.caravel_user_project b/verilog/includes/includes.rtl.caravel_user_project
index 31ab09b..e24a0f6 100644
--- a/verilog/includes/includes.rtl.caravel_user_project
+++ b/verilog/includes/includes.rtl.caravel_user_project
@@ -1,5 +1,6 @@
# Caravel user project includes
-v $(USER_PROJECT_VERILOG)/rtl/user_project_wrapper.v
-v $(USER_PROJECT_VERILOG)/rtl/user_proj_example.v
+#-v $(USER_PROJECT_VERILOG)/rtl/srcs/*.v
-
\ No newline at end of file
+
diff --git a/verilog/rtl/user_proj_example.v b/verilog/rtl/user_proj_example.v
index 26081e9..8fa97d2 100644
--- a/verilog/rtl/user_proj_example.v
+++ b/verilog/rtl/user_proj_example.v
@@ -1,3 +1,1610 @@
+`ifndef ADD_NORMALIZER_V_
+`define ADD_NORMALIZER_V_
+
+`timescale 1ns / 1ps
+
+module add_normalizer (
+ input sign,
+ input [ 4:0] exponent,
+ input [10:0] mantissa_add,
+ output reg [15:0] result,
+ input if_carray,
+ input if_sub
+);
+
+ // Packs the FP16 adder result as {sign, 5-bit exponent, 10-bit fraction}.
+ // exponent : exponent of the larger operand
+ // mantissa_add : 11-bit mantissa adder output
+ // if_carray : carry out of the mantissa adder
+ // if_sub : operand signs differed (effective subtraction)
+
+ reg [4:0] number_of_zero_lead;
+ reg [10:0] norm_mantissa_add;
+ integer bit_idx;
+
+ wire [4:0] shift_left_exp;
+ wire c1;
+
+ // Leading-zero count and left-normalized mantissa. The loop scans upward and
+ // the last match wins, so the highest set bit decides the shift. All of
+ // 0..10 leading zeros are handled; an all-zero input leaves both defaults.
+ always @ (*) begin
+ number_of_zero_lead = 5'd0;
+ norm_mantissa_add = mantissa_add;
+ for (bit_idx = 0; bit_idx <= 10; bit_idx = bit_idx + 1) begin
+ if (mantissa_add[bit_idx]) begin
+ number_of_zero_lead = 10 - bit_idx;
+ norm_mantissa_add = mantissa_add << (10 - bit_idx);
+ end
+ end
+ end
+
+ always @(*) begin
+ result[15] = sign;
+ if (!if_sub) begin
+ result[14:10] = if_carray ? exponent + 1'b1 : exponent;
+ result[9:0] = if_carray ? mantissa_add[10:1] : mantissa_add[9:0];
+ end else if (mantissa_add == 11'b0) begin
+ // Exact cancellation (x - x): return +0 rather than {sign, exponent, 0}.
+ result = 16'b0;
+ end else begin
+ result[14:10] = shift_left_exp;
+ result[9:0] = norm_mantissa_add[9:0];
+ end
+ end
+
+ // shift_left_exp = exponent - number_of_zero_lead (two's-complement add).
+ // NOTE(review): nothing guards exponent < number_of_zero_lead (underflow) - confirm intent.
+ cla_nbit #(.n(5)) u1(exponent,~number_of_zero_lead+1'b1,1'b0,shift_left_exp,c1);
+
+endmodule
+
+`endif
+`ifndef CONTROL_V_
+`define CONTROL_V_
+
+
+`timescale 1ns / 1ps
+`default_nettype none
+
+// Sequencer that feeds the systolic array. The case statement below is written for
+// 3x3 operands: states 1..3 each present three W-bit words of i_A and i_B, and
+// lanes 1 and 2 are delayed by one and two clocks (delay2). o_done is high in state 9.
+// 6 logic cycles + 2 (buffered?) delay cycles
+module control #(
+ parameter W = 16,
+ parameter N = 3
+) (
+ input wire i_clk,
+ input wire i_rst,
+ input wire i_en,
+ input wire i_mode,
+ input wire [ W * N * N - 1 : 0] i_A,
+ input wire [ W * N * N - 1 : 0] i_B,
+ output wire [ W * N * N - 1 : 0] o_C,
+ output wire o_done,
+
+ // debug
+ output wire [ W * N * 2 - 1 : 0] debug_pe_a,
+ output wire [ W * N * 2 - 1 : 0] debug_pe_b
+);
+
+ reg [3 : 0] states, next_states;
+
+ reg [W - 1 : 0] a00, a01, a02;
+ wire [W - 1 : 0] a01_q, a02_q;
+
+ reg [W - 1 : 0] b00, b01, b02;
+ wire [W - 1 : 0] b01_q, b02_q;
+
+ wire [W * N - 1 : 0] A_in;
+ wire [W * N - 1 : 0] B_in;
+
+ assign A_in = {a02_q, a01_q, a00};
+ assign B_in = {b02_q, b01_q, b00};
+
+ // 0000 is the idle / standby state
+ always @(posedge i_clk) begin
+ if (i_rst | ~i_en) begin
+ states <= 4'b0000;
+ end
+ else if (i_en) begin
+ states <= next_states;
+ end
+ end
+
+ always @(*) begin
+ if (states == 4'b1001) begin
+ // Done: hold state 9 until i_rst or ~i_en returns the counter to idle.
+ // The test uses the registered state; testing next_states would feed it back on itself.
+ next_states = 4'b1001;
+ end else begin
+ next_states = states + 4'b0001;
+ end
+ end
+
+ assign o_done = (states == 4'b1001);
+
+ always @(*) begin
+ case (states)
+ 4'b0001: begin
+ a00 = i_A[ W - 1 : 0 * W];
+ a01 = i_A[2 * W - 1 : 1 * W];
+ a02 = i_A[3 * W - 1 : 2 * W];
+
+
+ b00 = i_B[ W - 1 : 0 * W];
+ b01 = i_B[2 * W - 1 : 1 * W];
+ b02 = i_B[3 * W - 1 : 2 * W];
+ end
+ 4'b0010: begin
+ a00 = i_A[4 * W - 1 : 3 * W];
+ a01 = i_A[5 * W - 1 : 4 * W];
+ a02 = i_A[6 * W - 1 : 5 * W];
+
+ b00 = i_B[4 * W - 1 : 3 * W];
+ b01 = i_B[5 * W - 1 : 4 * W];
+ b02 = i_B[6 * W - 1 : 5 * W];
+ end
+ 4'b0011: begin
+ a00 = i_A[7 * W - 1 : 6 * W];
+ a01 = i_A[8 * W - 1 : 7 * W];
+ a02 = i_A[9 * W - 1 : 8 * W];
+
+ b00 = i_B[7 * W - 1 : 6 * W];
+ b01 = i_B[8 * W - 1 : 7 * W];
+ b02 = i_B[9 * W - 1 : 8 * W];
+ end
+ default: begin
+ a00 = 0;
+ a01 = 0;
+ a02 = 0;
+
+ b00 = 0;
+ b01 = 0;
+ b02 = 0;
+ end
+ endcase
+ end
+
+ systolic #(.W(W), .N(N)) sys(.i_clk(i_clk), .i_rst(i_rst), .i_en(i_en), .i_mode(i_mode), .i_A(A_in), .i_B(B_in), .o_C(o_C), .debug_pe_a(debug_pe_a), .debug_pe_b(debug_pe_b));
+
+ delay2 #(.WIDTH(W), .DEPTH(1)) delayA1(.clk(i_clk), .reset(i_rst), .data_in(a01), .data_out(a01_q));
+ delay2 #(.WIDTH(W), .DEPTH(2)) delayA2(.clk(i_clk), .reset(i_rst), .data_in(a02), .data_out(a02_q));
+
+ delay2 #(.WIDTH(W), .DEPTH(1)) delayB1(.clk(i_clk), .reset(i_rst), .data_in(b01), .data_out(b01_q));
+ delay2 #(.WIDTH(W), .DEPTH(2)) delayB2(.clk(i_clk), .reset(i_rst), .data_in(b02), .data_out(b02_q));
+
+
+endmodule
+`default_nettype wire
+`endif
+// No pipelined/piplined MAC
+// Version: 1.0
+
+// Description:
+
+// Function : mac_out = in_a * in_b + in_c. Both work for INT8 and FP16 mode. Default INT8 and FP16 are signed number
+// Exception : error detection for overflow and underflow in FP16 mode
+`ifndef MAC_UNIT_V_
+`define MAC_UNIT_V_
+
+
+`timescale 1ns / 1ps
+
+module mac_unit
+(
+`ifdef PIPELINE
+ input clk,
+ input rst_n,
+`endif
+ input [15:0] in_a, // multiplier input1
+ input [15:0] in_b, // multiplier input2
+ input [15:0] in_c, // adder input2 ; adder input1 = in_a*in_b
+ input mode, // forwarded to both sub-units; 1 selects their FP16 path, 0 the integer path
+ // mac_out = in_a * in_b + in_c
+ output [15:0] mac_out,
+ output error // overflow | underflow flag from int_fp_mul
+);
+
+ wire [15:0] mul_out; // product of in_a and in_b, used as adder operand a
+
+ int_fp_add add(
+ `ifdef PIPELINE
+ .clk (clk ),
+ .rst_n (rst_n ),
+ `endif
+ .mode (mode ),
+ .a (mul_out),
+ .b (in_c ),
+ .c (mac_out)
+ );
+
+ int_fp_mul mul(
+ `ifdef PIPELINE
+ .clk (clk ),
+ .rst_n (rst_n ),
+ `endif
+ .mode (mode ),
+ .a (in_a ),
+ .b (in_b ),
+ .c (mul_out),
+ .error (error )
+ );
+
+endmodule
+`endif
+`ifndef PE_2_V_
+`define PE_2_V_
+
+
+`timescale 1ns / 1ps
+`default_nettype none
+
+module PE #(
+ //parameter W = 32
+ parameter W = 16
+) (
+ input wire i_clk,
+ input wire i_rst,
+ input wire i_en,
+ input wire i_mode,
+ input wire [ W - 1 : 0] i_A,
+ input wire [ W - 1 : 0] i_B,
+ output wire [ W - 1 : 0] o_A,
+ output wire [ W - 1 : 0] o_B,
+ //output wire [ W - 1 : 0] o_C
+ output reg [ W - 1 : 0] o_C
+);
+
+ // Processing element: registers i_A/i_B for one clock, forwards them on o_A/o_B,
+ // and accumulates mac_out = i_A_buffered * i_B_buffered + accu on every enabled clock.
+ // The buffered nets are declared before first use because default_nettype is none.
+ wire [W - 1 : 0] i_A_buffered;
+ wire [W - 1 : 0] i_B_buffered;
+
+ wire sync_load;
+ assign o_A = i_A_buffered;
+ assign o_B = i_B_buffered;
+ assign sync_load = i_rst | ~i_en;
+
+ reg [15 : 0] accu;
+ wire [15 : 0] mac_out;
+
+ // Buffered in MAC
+ delay2 #(.WIDTH(W), .DEPTH(1)) delayA(.clk(i_clk), .reset(i_rst), .data_in(i_A), .data_out(i_A_buffered));
+ delay2 #(.WIDTH(W), .DEPTH(1)) delayB(.clk(i_clk), .reset(i_rst), .data_in(i_B), .data_out(i_B_buffered));
+
+ always @(posedge i_clk) begin
+ if (sync_load) begin
+ accu <= 0;
+ o_C <= 0;
+ end
+ else begin
+ accu <= mac_out;
+ o_C <= mac_out;
+ end
+ end
+
+ // Optional: making it clocked
+ mac_unit u0_mac(
+ .in_a (i_A_buffered),
+ .in_b (i_B_buffered),
+ .in_c (accu),
+ .mode (i_mode),
+ .mac_out (mac_out)
+ );
+
+endmodule
+
+`default_nettype wire
+`endif
+`ifndef DELAY_2_V_
+`define DELAY_2_V_
+
+`timescale 1ns / 1ps
+`default_nettype none
+
+module delay2 #(
+ parameter WIDTH = 16,
+ parameter DEPTH = 3
+) (
+ input wire clk,
+ input wire reset,
+ input wire [WIDTH - 1 : 0] data_in,
+ output wire [WIDTH - 1 : 0] data_out
+);
+ // Delay line: data_out is data_in passed through DEPTH chained dff stages.
+ wire [WIDTH - 1 : 0] connect_wire [DEPTH : 0];
+
+ assign data_out = connect_wire[DEPTH];
+ assign connect_wire[0] = data_in;
+ // Stage i registers connect_wire[i-1] into connect_wire[i]; reset is passed to every stage.
+ genvar i;
+ generate
+ for (i = 1; i <= DEPTH; i = i + 1) begin
+ dff #(.WIDTH(WIDTH)) DFF(
+ .clk(clk),
+ .rst(reset),
+ .inp(connect_wire[i-1]),
+ .outp(connect_wire[i]));
+ end
+ endgenerate
+endmodule
+
+// WIDTH-bit D flip-flop with synchronous, active-high reset (outp clears to 0)
+module dff#(
+ parameter WIDTH = 1
+ ) (
+ input wire clk,
+ input wire rst,
+
+ input wire [WIDTH-1:0] inp,
+ output reg [WIDTH-1:0] outp
+ );
+ // rst is only sampled on the rising clock edge
+ always @(posedge clk) begin
+ outp <= rst ? 0 : inp;
+ end
+
+endmodule
+
+`default_nettype wire
+`endif
+`ifndef MUL_2x2_V_
+`define MUL_2x2_V_
+
+`timescale 1ns / 1ps
+
+module mul2x2(
+ input [1:0] a,
+ input [1:0] b,
+ output [3:0] c
+);
+ // Gate-level unsigned 2x2 multiplier: c = a * b.
+ wire [3:0] tmp;
+ // (a[0]&b[1]) & (a[1]&b[0]) is the carry out of bit 1; & binds tighter than ^ in tmp[2].
+ assign tmp[0] = a[0] & b[0];
+ assign tmp[1] = (a[1]&b[0]) ^ (a[0]&b[1]);
+ assign tmp[2] = (a[0]&b[1]) & (a[1]&b[0]) ^ (a[1]&b[1]);
+ assign tmp[3] = (a[0]&b[1]) & (a[1]&b[0]) & (a[1]&b[1]);
+ assign c = {tmp[3],tmp[2],tmp[1],tmp[0]};
+
+endmodule
+`endif
+`ifndef MUL_4x4_V_
+`define MUL_4x4_V_
+
+`timescale 1ns / 1ps
+
+module mul4x4(
+ input [3:0] a,
+ input [3:0] b,
+ output [7:0] c
+);
+ // Unsigned 4x4 multiplier: four mul2x2 partial products summed by 6-bit cla_nbit adders.
+ wire [15:0] tmp1;
+ wire [ 5:0] result1;
+ wire [ 5:0] result2;
+ wire co1,co2,co3;
+ // tmp1 = {aH*bH, aL*bH, aH*bL, aL*bL}
+ mul2x2 u1(a[3:2],b[3:2],tmp1[15:12]);
+ mul2x2 u2(a[1:0],b[3:2],tmp1[11:8]);
+ mul2x2 u3(a[3:2],b[1:0],tmp1[7:4]);
+ mul2x2 u4(a[1:0],b[1:0],tmp1[3:0]);
+ // The adders produce c[7:2]; each carry-out is wired to the next adder's carry-in.
+ cla_nbit #(.n(6)) u5({tmp1[15:12],2'b0},{2'b0,tmp1[11:8]},1'b0 ,result1 ,co1);
+ cla_nbit #(.n(6)) u6({2'b0,tmp1[7:4]} ,{4'b0,tmp1[3:2]} ,co1 ,result2 ,co2);
+ cla_nbit #(.n(6)) u7(result1 ,result2 ,co2 ,c[7:2] ,co3);
+ // The two lowest product bits need no addition.
+ assign c[1:0] = tmp1[1:0];
+
+endmodule
+`endif
+`ifndef MUL_8x8_V_
+`define MUL_8x8_V_
+
+`timescale 1ns / 1ps
+
+module mul8x8(
+ input [ 7:0] a,
+ input [ 7:0] b,
+ output [15:0] c
+);
+ // Unsigned 8x8 multiplier: four mul4x4 partial products summed by 12-bit cla_nbit adders.
+ wire [31:0] tmp1;
+ wire [11:0] result1;
+ wire [11:0] result2;
+ wire co1,co2,co3;
+ // tmp1 = {aH*bH, aL*bH, aH*bL, aL*bL}
+ mul4x4 u1(a[7:4],b[7:4],tmp1[31:24]);
+ mul4x4 u2(a[3:0],b[7:4],tmp1[23:16]);
+ mul4x4 u3(a[7:4],b[3:0],tmp1[15:8]);
+ mul4x4 u4(a[3:0],b[3:0],tmp1[7:0]);
+ // The adders produce c[15:4]; each carry-out is wired to the next adder's carry-in.
+ cla_nbit #(.n(12)) u5({tmp1[31:24],4'b0} ,{4'b0,tmp1[23:16]} ,1'b0 ,result1 ,co1);
+ cla_nbit #(.n(12)) u6({4'b0,tmp1[15:8]} ,{8'b0,tmp1[7:4]} ,co1 ,result2 ,co2);
+ cla_nbit #(.n(12)) u7(result1 ,result2 ,co2 ,c[15:4] ,co3);
+ // The four lowest product bits need no addition.
+ assign c[3:0] = tmp1[3:0];
+
+endmodule
+`endif
+`ifndef MUL_16x16_V_
+`define MUL_16x16_V_
+
+`timescale 1ns / 1ps
+
+module mul16x16(
+`ifdef PIPELINE
+ input clk,
+ input rst_n,
+`endif
+ input [15:0] a,
+ input [15:0] b,
+ output [31:0] c);
+ // Unsigned 16x16 -> 32-bit multiplier built from four mul8x8 partial products.
+ wire [63:0] tmp1,tmp2;
+ wire [23:0] result1;
+ wire [23:0] result2;
+ wire co1,co2,co3;
+ // Guarded by PIPELINE, the macro int_fp_mul tests before passing clk/rst_n here.
+`ifdef PIPELINE
+ // one-stage pipeline: register the partial products
+ reg [63:0] tmp1_reg;
+ always @ (posedge clk or negedge rst_n) begin
+ if (!rst_n) begin
+ tmp1_reg <= 64'b0;
+ end else begin
+ tmp1_reg <= tmp1;
+ end
+ end
+ assign tmp2 = tmp1_reg;
+
+`else
+ assign tmp2 = tmp1;
+
+`endif
+ // tmp1 = {aH*bH, aL*bH, aH*bL, aL*bL}
+ mul8x8 u1(a[15:8],b[15:8],tmp1[63:48]);
+ mul8x8 u2(a[7:0] ,b[15:8],tmp1[47:32]);
+ mul8x8 u3(a[15:8],b[ 7:0],tmp1[31:16]);
+ mul8x8 u4(a[7:0] ,b[ 7:0],tmp1[15:0]);
+ // Sum everything above bit 7; c[7:0] comes straight from aL*bL.
+ cla_nbit #(.n(24)) u5({tmp2[63:48],8'b0} ,{8'b0,tmp2[47:32]} ,1'b0 ,result1 ,co1);
+ cla_nbit #(.n(24)) u6({8'b0,tmp2[31:16]} ,{16'b0,tmp2[15:8]} ,co1 ,result2 ,co2);
+ cla_nbit #(.n(24)) u7(result1 ,result2 ,co2 ,c[31:8] ,co3);
+
+ assign c[7:0] = tmp2[7:0];
+
+endmodule
+`endif
+`ifndef ALIGNMENT_V_
+`define ALIGNMENT_V_
+
+`timescale 1ns / 1ps
+
+module alignment (
+ input [14:0] bigger,
+ input [14:0] smaller,
+ output [10:0] aligned_small
+);
+ // Right-shifts the smaller operand's mantissa (leading 1 prepended) by the exponent difference.
+ wire c1;
+ wire [4:0] bigger_exponent, smaller_exponent, shift_bits;
+ // Both inputs are used as {exponent[14:10], fraction[9:0]}.
+ assign bigger_exponent = bigger [14:10];
+ assign smaller_exponent = smaller [14:10];
+ assign aligned_small = ({1'b1,smaller[9:0]} >> shift_bits);
+ // shift_bits = bigger_exponent - smaller_exponent, computed as a two's-complement add.
+ cla_nbit #(.n(5)) u1(bigger_exponent,~smaller_exponent+1'b1,1'b0,shift_bits,c1);
+
+endmodule
+`endif
+`ifndef SYSTOLIC_V_
+`define SYSTOLIC_V_
+
+
+`timescale 1ns / 1ps
+`default_nettype none
+
+// Fixed 3x3 grid of PE cells. A words enter at the left of each row and travel along o_A;
+// B words enter at the top of each column and travel along o_B. o_C packs c00..c22, c00 lowest.
+module systolic #(
+ parameter W = 16,
+ parameter N = 3
+) (
+ input wire i_clk,
+ input wire i_rst,
+ input wire i_en,
+ input wire i_mode,
+ input wire [ W * N - 1 : 0] i_A,
+ input wire [ W * N - 1 : 0] i_B,
+ output wire [ W * N * N - 1 : 0] o_C,
+
+ // debug
+ output wire [ W * N * 2 - 1 : 0] debug_pe_a,
+ output wire [ W * N * 2 - 1 : 0] debug_pe_b
+);
+
+ //localparam O_VEC_WIDTH = 2 * W;
+ localparam O_VEC_WIDTH = W;
+ // pe_a_RC_RC' / pe_b_RC_R'C are the links between neighbouring PEs (row, column indices).
+ wire [W - 1 : 0] a00, a01, a02, b00, b01, b02;
+ wire [W - 1 : 0] pe_a_00_01, pe_a_01_02, pe_a_10_11, pe_a_11_12, pe_a_20_21, pe_a_21_22;
+ wire [W - 1 : 0] pe_b_00_10, pe_b_01_11, pe_b_02_12, pe_b_10_20, pe_b_11_21, pe_b_12_22;
+
+ wire [O_VEC_WIDTH - 1 : 0] c00, c01, c02, c10, c11, c12, c20, c21, c22;
+ // Word k of i_A feeds row k; word k of i_B feeds column k.
+ assign a00 = i_A[0 * W +: W];
+ assign a01 = i_A[1 * W +: W];
+ assign a02 = i_A[2 * W +: W];
+
+ assign b00 = i_B[0 * W +: W];
+ assign b01 = i_B[1 * W +: W];
+ assign b02 = i_B[2 * W +: W];
+
+ PE #(.W(W)) PE00(.i_clk(i_clk),.i_rst(i_rst),.i_en(i_en), .i_mode(i_mode), .i_A(a00), .i_B(b00), .o_A(pe_a_00_01),.o_B(pe_b_00_10),.o_C(c00));
+ PE #(.W(W)) PE01(.i_clk(i_clk),.i_rst(i_rst),.i_en(i_en), .i_mode(i_mode), .i_A(pe_a_00_01), .i_B(b01), .o_A(pe_a_01_02),.o_B(pe_b_01_11),.o_C(c01));
+ PE #(.W(W)) PE02(.i_clk(i_clk),.i_rst(i_rst),.i_en(i_en), .i_mode(i_mode), .i_A(pe_a_01_02), .i_B(b02), .o_A(), .o_B(pe_b_02_12),.o_C(c02));
+
+ PE #(.W(W)) PE10(.i_clk(i_clk),.i_rst(i_rst),.i_en(i_en), .i_mode(i_mode), .i_A(a01), .i_B(pe_b_00_10),.o_A(pe_a_10_11),.o_B(pe_b_10_20),.o_C(c10));
+ PE #(.W(W)) PE11(.i_clk(i_clk),.i_rst(i_rst),.i_en(i_en), .i_mode(i_mode), .i_A(pe_a_10_11),.i_B(pe_b_01_11),.o_A(pe_a_11_12),.o_B(pe_b_11_21),.o_C(c11));
+ PE #(.W(W)) PE12(.i_clk(i_clk),.i_rst(i_rst),.i_en(i_en), .i_mode(i_mode), .i_A(pe_a_11_12),.i_B(pe_b_02_12),.o_A(), .o_B(pe_b_12_22),.o_C(c12));
+
+ PE #(.W(W)) PE20(.i_clk(i_clk),.i_rst(i_rst),.i_en(i_en), .i_mode(i_mode), .i_A(a02), .i_B(pe_b_10_20),.o_A(pe_a_20_21),.o_B(),.o_C(c20));
+ PE #(.W(W)) PE21(.i_clk(i_clk),.i_rst(i_rst),.i_en(i_en), .i_mode(i_mode), .i_A(pe_a_20_21),.i_B(pe_b_11_21),.o_A(pe_a_21_22),.o_B(),.o_C(c21));
+ PE #(.W(W)) PE22(.i_clk(i_clk),.i_rst(i_rst),.i_en(i_en), .i_mode(i_mode), .i_A(pe_a_21_22),.i_B(pe_b_12_22),.o_A(), .o_B(),.o_C(c22));
+
+
+ // https://stackoverflow.com/questions/18067571/indexing-vectors-and-arrays-with
+ // https://standards.ieee.org/ieee/1800/6700/
+ // a_vect[ 0 +: 8] // == a_vect[ 7 : 0]
+ //assign o_C[1 * O_VEC_WIDTH - 1 -: O_VEC_WIDTH] = c00;
+ // Results are packed row by row: word index = 3 * row + column.
+ assign o_C[0 * O_VEC_WIDTH +: O_VEC_WIDTH] = c00;
+ assign o_C[1 * O_VEC_WIDTH +: O_VEC_WIDTH] = c01;
+ assign o_C[2 * O_VEC_WIDTH +: O_VEC_WIDTH] = c02;
+ assign o_C[3 * O_VEC_WIDTH +: O_VEC_WIDTH] = c10;
+ assign o_C[4 * O_VEC_WIDTH +: O_VEC_WIDTH] = c11;
+ assign o_C[5 * O_VEC_WIDTH +: O_VEC_WIDTH] = c12;
+ assign o_C[6 * O_VEC_WIDTH +: O_VEC_WIDTH] = c20;
+ assign o_C[7 * O_VEC_WIDTH +: O_VEC_WIDTH] = c21;
+ assign o_C[8 * O_VEC_WIDTH +: O_VEC_WIDTH] = c22;
+ // Debug buses expose the inter-PE links; the first name listed lands in the top bits.
+ assign debug_pe_a = {pe_a_00_01, pe_a_01_02, pe_a_10_11, pe_a_11_12, pe_a_20_21, pe_a_21_22};
+ assign debug_pe_b = {pe_b_00_10, pe_b_01_11, pe_b_02_12, pe_b_10_20, pe_b_11_21, pe_b_12_22};
+
+endmodule
+`endif
+`ifndef CLA_NBIT_V_
+`define CLA_NBIT_V_
+
+`timescale 1ns / 1ps
+
+// Carry Look-ahead adder (CLA): {co, s} = a + b + ci for n-bit operands
+module cla_nbit #(
+ parameter n = 4
+) (
+ input [n-1:0] a,
+ input [n-1:0] b,
+ input ci,
+ output [n-1:0] s,
+ output co
+);
+ // g = generate, p = propagate, c[i] = carry into bit i (c[0] = ci, c[n] = co)
+ wire [n-1:0] g;
+ wire [n-1:0] p;
+ wire [ n:0] c;
+
+ assign c[0] = ci;
+ assign co = c[n];
+
+ genvar i; /* i - generate index variable */
+ // As written, each carry is derived from the previous one: c[i+1] = g[i] | (p[i] & c[i]).
+ generate
+ for (i = 0; i < n; i = i + 1) begin : addbit
+ assign s[i] = a[i] ^ b[i] ^ c[i];
+ assign g[i] = a[i] & b[i];
+ assign p[i] = a[i] | b[i];
+ assign c[i + 1] = g[i] | (p[i] & c[i]);
+ end
+ endgenerate
+
+endmodule
+`endif
+`ifndef INT_FP_ADD_V_
+`define INT_FP_ADD_V_
+
+
+`timescale 1ns / 1ps
+
+module int_fp_add (
+`ifdef PIPELINE
+ input clk,
+ input rst_n,
+`endif
+ input mode,
+ input [15:0] a,
+ input [15:0] b,
+ output [15:0] c
+);
+ // mode 0: 16-bit integer add (11-bit adder u2 chained into 5-bit adder u3). mode 1: FP16 add.
+ wire [10:0] adder_input_1,adder_input_2,aligned_small,adder_output;
+ wire if_sub,a_sign, b_sign, c_sign,c1, c2;
+ wire [15:0] normalized_out;
+
+ // only used in INT8 MAC mode
+ wire [4:0] higher_add,higher_a,higher_b;
+
+ wire [15:0] result;
+ reg [14:0] bigger, smaller;
+ reg a_larger_b;
+
+`ifdef PIPELINE
+ reg [14:0] bigger_reg, smaller_reg;
+ reg [10:0] adder_output_reg;
+ wire [14:0] bigger_tmp, smaller_tmp;
+ wire [10:0] adder_output_tmp;
+`endif
+ // if_sub: the operand signs differ, so the aligned smaller mantissa is negated before adding.
+ // c_sign: the result takes the sign of whichever operand has the larger magnitude.
+ assign a_sign = a[15];
+ assign b_sign = b[15];
+ assign if_sub = (a_sign == b_sign) ? 1'b0 : 1'b1;
+ assign c_sign = a_larger_b ? a_sign : b_sign;
+ assign higher_a = (mode == 1'b0) ? a[15:11] : 5'b0;
+ assign higher_b = (mode == 1'b0) ? b[15:11] : 5'b0;
+ assign adder_input_1 = (mode==1'b0) ? a[10:0] :{1'b1,bigger[9:0]};
+ assign adder_input_2 = (mode==1'b0) ? b[10:0] : (if_sub ? ~aligned_small + 1'b1 : aligned_small);
+ assign c = (mode == 1'b0) ? {higher_add,adder_output} : result;
+
+ // Compare magnitudes (bits [14:0], sign excluded); on a tie b is treated as the bigger one
+ always @(*) begin
+ if (a[14:0] > b[14:0]) begin
+ bigger = a[14:0];
+ smaller = b[14:0];
+ a_larger_b = 1'b1;
+ end else begin
+ bigger = b[14:0];
+ smaller = a[14:0];
+ a_larger_b = 1'b0;
+ end
+ end
+
+`ifdef PIPELINE
+ always @ (posedge clk or negedge rst_n) begin
+ if (!rst_n) begin
+ bigger_reg <= 15'b0;
+ smaller_reg <= 15'b0;
+ adder_output_reg <= 11'b0;
+ end else begin
+ bigger_reg <= bigger;
+ smaller_reg <= smaller;
+ adder_output_reg <= adder_output;
+ end
+ end
+ assign bigger_tmp = bigger_reg[14:0];
+ assign smaller_tmp = smaller_reg[14:0];
+ assign adder_output_tmp = adder_output_reg[10:0];
+`endif
+
+`ifdef PIPELINE
+ // align small number
+ alignment u1(bigger_tmp,smaller_tmp,aligned_small);
+`else
+ // align small number
+ alignment u1(bigger,smaller,aligned_small);
+`endif
+ // Mantissa adder (FP16) / low 11 bits of the integer sum; c1 is its carry out.
+ cla_nbit #(.n(11)) u2(adder_input_1,adder_input_2,1'b0,adder_output,c1);
+
+ // This 5 bit adder only used in INT8 MAC mode
+ cla_nbit #(.n(5)) u3(higher_a,higher_b,c1,higher_add,c2);
+
+`ifdef PIPELINE
+ add_normalizer u4(c_sign,bigger[14:10],adder_output_tmp,result,c1,if_sub);
+`else
+ add_normalizer u4(c_sign,bigger[14:10],adder_output,result,c1,if_sub);
+`endif
+
+endmodule
+`endif
+`ifndef INT_FP_MUL_V_
+`define INT_FP_MUL_V_
+
+
+module int_fp_mul (
+`ifdef PIPELINE
+ input clk,
+ input rst_n,
+`endif
+ input mode,
+ input [15:0] a,
+ input [15:0] b,
+ output [15:0] c,
+ output error // valid in fp16 mode
+);
+ // mode 1: FP16 multiply (11-bit mantissas with leading 1, exponents added, bias removed).
+ wire [15:0] c_tmp;
+ wire c_sign,a_zero,b_zero;
+ wire [ 4:0] sum_exponent, biased_sum_exponent;
+ wire [15:0] multiplier_input1,multiplier_input2;
+ // mode 0: multiplies the 7-bit magnitudes taken from a[7:0] and b[7:0]; bit 7 is the sign.
+ wire [31:0] multiplier_output;
+ wire [14:0] normalized_out;
+ wire [21:0] mantissa_prod;
+ wire c1,c2,underflow,overflow;
+ // c1 / c2 are the carry outs of the exponent sum (u2) and of the bias subtraction (u3).
+ assign overflow = (c1 && c2 && ~biased_sum_exponent[4]) ? 1'b1 :1'b0;
+ assign underflow = (~c1 && ~c2 && biased_sum_exponent[4]) ? 1'b1:1'b0;
+ // Only the all-zero pattern 16'h0000 counts as zero here (16'h8000 does not).
+ assign a_zero = ~(|a);
+ assign b_zero = ~(|b);
+ assign c_sign = a[15] ^ b[15];
+ assign multiplier_input1 = mode ? {5'b0,1'b1,a[9:0]} : ((a[7]==1'b0) ? {9'b0,a[6:0]} : {9'b0,~a[6:0]+1'b1});
+ assign multiplier_input2 = mode ? {5'b0,1'b1,b[9:0]} : ((b[7]==1'b0) ? {9'b0,b[6:0]} : {9'b0,~b[6:0]+1'b1});
+ // Note: == binds tighter than ^, so the test below is a[7] ^ (b[7] == 1'b0), i.e. "signs equal".
+ assign c = mode ? ((a_zero | b_zero) ? 16'b0 : c_tmp) : ((a[7]^b[7] == 1'b0) ? multiplier_output[15:0] : {1'b1,~multiplier_output[14:0]+1'b1});
+ // On error: underflow -> signed zero, overflow -> signed {exponent 11111, fraction 0}
+ assign c_tmp = (~error) ? {c_sign,normalized_out} : (underflow ? {c_sign,15'b0000_0000_0000_000} : {c_sign,5'b1111_1,10'b0000_0000_00});
+
+ assign error = overflow | underflow;
+
+
+`ifdef PIPELINE
+
+ reg [31:0] multiplier_output_tmp;
+
+ always @ (posedge clk or negedge rst_n) begin
+ if (!rst_n) begin
+ multiplier_output_tmp <= 32'b0;
+ end else begin
+ multiplier_output_tmp <= multiplier_output;
+ end
+ end
+
+ assign mantissa_prod = multiplier_output_tmp[21:0];
+ mul16x16 u1(clk,rst_n,multiplier_input1,multiplier_input2,multiplier_output);
+
+`else
+
+ assign mantissa_prod = multiplier_output[21:0];
+ mul16x16 u1(multiplier_input1,multiplier_input2,multiplier_output);
+
+`endif
+
+ cla_nbit #(.n(5)) u2(a[14:10],b[14:10],1'b0,sum_exponent,c1); // add exponent
+ cla_nbit #(.n(5)) u3(sum_exponent, 5'b10001,1'b0,biased_sum_exponent,c2); // minus bias
+ mul_normalizer u4(biased_sum_exponent,mantissa_prod,normalized_out);
+
+endmodule
+`endif
+`ifndef MUL_NORMALIZER_V_
+`define MUL_NORMALIZER_V_
+
+`timescale 1ns / 1ps
+
+module mul_normalizer (
+ input [ 4:0] exponent,
+ input [21:0] mantissa_prod,
+ output [14:0] result
+);
+ // Packs {exponent, 10-bit fraction} from a 22-bit mantissa product.
+ wire [4:0] result_exponent;
+ wire [9:0] result_mantissa;
+ // If bit 21 is set, the exponent is incremented and the fraction window moves up one bit.
+ assign result_exponent = (mantissa_prod[21]) ? (exponent + 1'b1): exponent;
+ assign result_mantissa = (mantissa_prod[21]) ? mantissa_prod[20:11]:mantissa_prod[19:10];
+ assign result = {result_exponent,result_mantissa};
+
+// No rounding (lower product bits are dropped) and no overflow/underflow detection
+
+endmodule
+`endif
+`ifndef ADD_NORMALIZER_V_
+`define ADD_NORMALIZER_V_
+
+`timescale 1ns / 1ps
+
+module add_normalizer (
+ input sign,
+ input [ 4:0] exponent,
+ input [10:0] mantissa_add,
+ output reg [15:0] result,
+ input if_carray,
+ input if_sub
+);
+
+ // Packs the FP16 adder result as {sign, 5-bit exponent, 10-bit fraction}.
+ // exponent : exponent of the larger operand
+ // mantissa_add : 11-bit mantissa adder output
+ // if_carray : carry out of the mantissa adder
+ // if_sub : operand signs differed (effective subtraction)
+
+ reg [4:0] number_of_zero_lead;
+ reg [10:0] norm_mantissa_add;
+ integer bit_idx;
+
+ wire [4:0] shift_left_exp;
+ wire c1;
+
+ // Leading-zero count and left-normalized mantissa. The loop scans upward and
+ // the last match wins, so the highest set bit decides the shift. All of
+ // 0..10 leading zeros are handled; an all-zero input leaves both defaults.
+ always @ (*) begin
+ number_of_zero_lead = 5'd0;
+ norm_mantissa_add = mantissa_add;
+ for (bit_idx = 0; bit_idx <= 10; bit_idx = bit_idx + 1) begin
+ if (mantissa_add[bit_idx]) begin
+ number_of_zero_lead = 10 - bit_idx;
+ norm_mantissa_add = mantissa_add << (10 - bit_idx);
+ end
+ end
+ end
+
+ always @(*) begin
+ result[15] = sign;
+ if (!if_sub) begin
+ result[14:10] = if_carray ? exponent + 1'b1 : exponent;
+ result[9:0] = if_carray ? mantissa_add[10:1] : mantissa_add[9:0];
+ end else if (mantissa_add == 11'b0) begin
+ // Exact cancellation (x - x): return +0 rather than {sign, exponent, 0}.
+ result = 16'b0;
+ end else begin
+ result[14:10] = shift_left_exp;
+ result[9:0] = norm_mantissa_add[9:0];
+ end
+ end
+
+ // shift_left_exp = exponent - number_of_zero_lead (two's-complement add).
+ // NOTE(review): nothing guards exponent < number_of_zero_lead (underflow) - confirm intent.
+ cla_nbit #(.n(5)) u1(exponent,~number_of_zero_lead+1'b1,1'b0,shift_left_exp,c1);
+
+endmodule
+
+`endif
+`ifndef CONTROL_V_
+`define CONTROL_V_
+
+
+`timescale 1ns / 1ps
+`default_nettype none
+
+// Sequencer that feeds the systolic array. The case statement below is written for
+// 3x3 operands: states 1..3 each present three W-bit words of i_A and i_B, and
+// lanes 1 and 2 are delayed by one and two clocks (delay2). o_done is high in state 9.
+// 6 logic cycles + 2 (buffered?) delay cycles
+module control #(
+ parameter W = 16,
+ parameter N = 3
+) (
+ input wire i_clk,
+ input wire i_rst,
+ input wire i_en,
+ input wire i_mode,
+ input wire [ W * N * N - 1 : 0] i_A,
+ input wire [ W * N * N - 1 : 0] i_B,
+ output wire [ W * N * N - 1 : 0] o_C,
+ output wire o_done,
+ output wire [ W * N - 1 : 0] A_in,
+ output wire [ W * N - 1 : 0] B_in,
+
+ // debug
+ output wire [ W * N * 2 - 1 : 0] debug_pe_a,
+ output wire [ W * N * 2 - 1 : 0] debug_pe_b
+);
+
+ reg [3 : 0] states, next_states;
+
+ reg [W - 1 : 0] a00, a01, a02;
+ wire [W - 1 : 0] a01_q, a02_q;
+
+ reg [W - 1 : 0] b00, b01, b02;
+ wire [W - 1 : 0] b01_q, b02_q;
+
+ assign A_in = {a02_q, a01_q, a00};
+ assign B_in = {b02_q, b01_q, b00};
+
+ // 0000 is the idle / standby state
+ always @(posedge i_clk) begin
+ if (i_rst | ~i_en) begin
+ states <= 4'b0000;
+ end
+ else if (i_en) begin
+ states <= next_states;
+ end
+ end
+
+ always @(*) begin
+ if (states == 4'b1001) begin
+ // Done: hold state 9 until i_rst or ~i_en returns the counter to idle.
+ // The test uses the registered state; testing next_states would feed it back on itself.
+ next_states = 4'b1001;
+ end else begin
+ next_states = states + 4'b0001;
+ end
+ end
+
+ assign o_done = (states == 4'b1001);
+
+ always @(*) begin
+ case (states)
+ 4'b0001: begin
+ a00 = i_A[ W - 1 : 0 * W];
+ a01 = i_A[2 * W - 1 : 1 * W];
+ a02 = i_A[3 * W - 1 : 2 * W];
+
+
+ b00 = i_B[ W - 1 : 0 * W];
+ b01 = i_B[2 * W - 1 : 1 * W];
+ b02 = i_B[3 * W - 1 : 2 * W];
+ end
+ 4'b0010: begin
+ a00 = i_A[4 * W - 1 : 3 * W];
+ a01 = i_A[5 * W - 1 : 4 * W];
+ a02 = i_A[6 * W - 1 : 5 * W];
+
+ b00 = i_B[4 * W - 1 : 3 * W];
+ b01 = i_B[5 * W - 1 : 4 * W];
+ b02 = i_B[6 * W - 1 : 5 * W];
+ end
+ 4'b0011: begin
+ a00 = i_A[7 * W - 1 : 6 * W];
+ a01 = i_A[8 * W - 1 : 7 * W];
+ a02 = i_A[9 * W - 1 : 8 * W];
+
+ b00 = i_B[7 * W - 1 : 6 * W];
+ b01 = i_B[8 * W - 1 : 7 * W];
+ b02 = i_B[9 * W - 1 : 8 * W];
+ end
+ default: begin
+ a00 = 0;
+ a01 = 0;
+ a02 = 0;
+
+ b00 = 0;
+ b01 = 0;
+ b02 = 0;
+ end
+ endcase
+ end
+
+ systolic #(.W(W), .N(N)) sys(.i_clk(i_clk), .i_rst(i_rst), .i_en(i_en), .i_mode(i_mode), .i_A(A_in), .i_B(B_in), .o_C(o_C), .debug_pe_a(debug_pe_a), .debug_pe_b(debug_pe_b));
+
+ delay2 #(.WIDTH(W), .DEPTH(1)) delayA1(.clk(i_clk), .reset(i_rst), .data_in(a01), .data_out(a01_q));
+ delay2 #(.WIDTH(W), .DEPTH(2)) delayA2(.clk(i_clk), .reset(i_rst), .data_in(a02), .data_out(a02_q));
+
+ delay2 #(.WIDTH(W), .DEPTH(1)) delayB1(.clk(i_clk), .reset(i_rst), .data_in(b01), .data_out(b01_q));
+ delay2 #(.WIDTH(W), .DEPTH(2)) delayB2(.clk(i_clk), .reset(i_rst), .data_in(b02), .data_out(b02_q));
+
+
+endmodule
+`default_nettype wire
+`endif
+// No pipelined/piplined MAC
+// Version: 1.0
+
+// Description:
+
+// Function : mac_out = in_a * in_b + in_c. Both work for INT8 and FP16 mode. Default INT8 and FP16 are signed number
+// Exception : error detection for overflow and underflow in FP16 mode
+`ifndef MAC_UNIT_V_
+`define MAC_UNIT_V_
+
+
+`timescale 1ns / 1ps
+
+module mac_unit
+(
+`ifdef PIPELINE
+ input clk,
+ input rst_n,
+`endif
+ input [15:0] in_a, // multiplier input1
+ input [15:0] in_b, // multiplier input2
+ input [15:0] in_c, // adder input2 ; adder input1 = in_a*in_b
+ input mode,
+ //output [15:0] mac_out,
+ output [15:0] mac_out,
+ output error
+);
+
+ wire [15:0] mul_out;
+
+ int_fp_add add(
+ `ifdef PIPELINE
+ .clk (clk ),
+ .rst_n (rst_n ),
+ `endif
+ .mode (mode ),
+ .a (mul_out),
+ .b (in_c ),
+ .c (mac_out)
+ );
+
+ int_fp_mul mul(
+ `ifdef PIPELINE
+ .clk (clk ),
+ .rst_n (rst_n ),
+ `endif
+ .mode (mode ),
+ .a (in_a ),
+ .b (in_b ),
+ .c (mul_out),
+ .error (error )
+ );
+
+endmodule
+`endif
+`ifndef PE_2_V_
+`define PE_2_V_
+
+
+`timescale 1ns / 1ps
+`default_nettype none
+
+module PE #(
+ //parameter W = 32
+ parameter W = 16
+) (
+ input wire i_clk,
+ input wire i_rst,
+ input wire i_en,
+ input wire i_mode,
+ input wire [ W - 1 : 0] i_A,
+ input wire [ W - 1 : 0] i_B,
+ output wire [ W - 1 : 0] o_A,
+ output wire [ W - 1 : 0] o_B,
+ //output wire [ W - 1 : 0] o_C
+ output reg [ W - 1 : 0] o_C
+);
+
+ // Processing element: registers i_A/i_B for one clock, forwards them on o_A/o_B,
+ // and accumulates mac_out = i_A_buffered * i_B_buffered + accu on every enabled clock.
+ // The buffered nets are declared before first use because default_nettype is none.
+ wire [W - 1 : 0] i_A_buffered;
+ wire [W - 1 : 0] i_B_buffered;
+
+ wire sync_load;
+ assign o_A = i_A_buffered;
+ assign o_B = i_B_buffered;
+ assign sync_load = i_rst | ~i_en;
+
+ reg [15 : 0] accu;
+ wire [15 : 0] mac_out;
+
+ // Buffered in MAC
+ delay2 #(.WIDTH(W), .DEPTH(1)) delayA(.clk(i_clk), .reset(i_rst), .data_in(i_A), .data_out(i_A_buffered));
+ delay2 #(.WIDTH(W), .DEPTH(1)) delayB(.clk(i_clk), .reset(i_rst), .data_in(i_B), .data_out(i_B_buffered));
+
+ always @(posedge i_clk) begin
+ if (sync_load) begin
+ accu <= 0;
+ o_C <= 0;
+ end
+ else begin
+ accu <= mac_out;
+ o_C <= mac_out;
+ end
+ end
+
+ // Optional: making it clocked
+ mac_unit u0_mac(
+ .in_a (i_A_buffered),
+ .in_b (i_B_buffered),
+ .in_c (accu),
+ .mode (i_mode),
+ .mac_out (mac_out)
+ );
+
+endmodule
+
+`default_nettype wire
+`endif
+`ifndef DELAY_2_V_
+`define DELAY_2_V_
+
+`timescale 1ns / 1ps
+`default_nettype none
+
+module delay2 #(
+ parameter WIDTH = 16,
+ parameter DEPTH = 3
+) (
+ input wire clk,
+ input wire reset,
+ input wire [WIDTH - 1 : 0] data_in,
+ output wire [WIDTH - 1 : 0] data_out
+);
+
+ wire [WIDTH - 1 : 0] connect_wire [DEPTH : 0];
+
+ assign data_out = connect_wire[DEPTH];
+ assign connect_wire[0] = data_in;
+
+ genvar i;
+ generate
+ for (i = 1; i <= DEPTH; i = i + 1) begin
+ dff #(.WIDTH(WIDTH)) DFF(
+ .clk(clk),
+ .rst(reset),
+ .inp(connect_wire[i-1]),
+ .outp(connect_wire[i]));
+ end
+ endgenerate
+endmodule
+
+// D flip-flop with synchronous reset
+module dff#(
+ parameter WIDTH = 1
+ ) (
+ input wire clk,
+ input wire rst,
+
+ input wire [WIDTH-1:0] inp,
+ output reg [WIDTH-1:0] outp
+ );
+
+ always @(posedge clk) begin
+ outp <= rst ? 0 : inp;
+ end
+
+endmodule
+
+`default_nettype wire
+`endif
+`ifndef MUL_2x2_V_
+`define MUL_2x2_V_
+
+`timescale 1ns / 1ps
+
+module mul2x2(
+ input [1:0] a,
+ input [1:0] b,
+ output [3:0] c
+);
+
+ wire [3:0] tmp;
+
+ assign tmp[0] = a[0] & b[0];
+ assign tmp[1] = (a[1]&b[0]) ^ (a[0]&b[1]);
+ assign tmp[2] = (a[0]&b[1]) & (a[1]&b[0]) ^ (a[1]&b[1]);
+ assign tmp[3] = (a[0]&b[1]) & (a[1]&b[0]) & (a[1]&b[1]);
+ assign c = {tmp[3],tmp[2],tmp[1],tmp[0]};
+
+endmodule
+`endif
+`ifndef MUL_4x4_V_
+`define MUL_4x4_V_
+
+`timescale 1ns / 1ps
+
+// 4-bit x 4-bit unsigned multiplier assembled from four mul2x2 blocks and
+// three 6-bit cla_nbit adders.
+//
+// With a = {aH, aL} and b = {bH, bL} split into 2-bit halves:
+//   a * b = (aH*bH << 4) + ((aL*bH + aH*bL) << 2) + aL*bL
+// The two low bits of aL*bL go straight to c[1:0]; everything else is
+// summed at weight 4 and lands in c[7:2].
+//
+//   a, b - unsigned 4-bit operands.
+//   c    - 8-bit product.
+module mul4x4(
+    input  wire [3:0] a,
+    input  wire [3:0] b,
+    output wire [7:0] c
+);
+
+    // 2x2 partial products, named <a half>_<b half>.
+    wire [3:0] hi_hi;
+    wire [3:0] lo_hi;
+    wire [3:0] hi_lo;
+    wire [3:0] lo_lo;
+
+    wire [5:0] sum_upper;   // (aH*bH << 2) + aL*bH
+    wire [5:0] sum_lower;   // aH*bL + (aL*bL >> 2)
+    wire       co1, co2, co3;
+
+    mul2x2 u1(.a(a[3:2]), .b(b[3:2]), .c(hi_hi));
+    mul2x2 u2(.a(a[1:0]), .b(b[3:2]), .c(lo_hi));
+    mul2x2 u3(.a(a[3:2]), .b(b[1:0]), .c(hi_lo));
+    mul2x2 u4(.a(a[1:0]), .b(b[1:0]), .c(lo_lo));
+
+    // Each adder's carry-out feeds the next adder's carry-in, exactly as
+    // the stages are chained in this design; co3 is left unused.
+    cla_nbit #(.n(6)) u5(
+        .a ({hi_hi, 2'b0}),
+        .b ({2'b0, lo_hi}),
+        .ci(1'b0),
+        .s (sum_upper),
+        .co(co1));
+    cla_nbit #(.n(6)) u6(
+        .a ({2'b0, hi_lo}),
+        .b ({4'b0, lo_lo[3:2]}),
+        .ci(co1),
+        .s (sum_lower),
+        .co(co2));
+    cla_nbit #(.n(6)) u7(
+        .a (sum_upper),
+        .b (sum_lower),
+        .ci(co2),
+        .s (c[7:2]),
+        .co(co3));
+
+    assign c[1:0] = lo_lo[1:0];
+
+endmodule
+`endif
+`ifndef MUL_8x8_V_
+`define MUL_8x8_V_
+
+`timescale 1ns / 1ps
+
+// 8-bit x 8-bit unsigned multiplier assembled from four mul4x4 blocks and
+// three 12-bit cla_nbit adders.
+//
+// With a = {aH, aL} and b = {bH, bL} split into 4-bit halves:
+//   a * b = (aH*bH << 8) + ((aL*bH + aH*bL) << 4) + aL*bL
+// The four low bits of aL*bL go straight to c[3:0]; everything else is
+// summed at weight 16 and lands in c[15:4].
+//
+//   a, b - unsigned 8-bit operands.
+//   c    - 16-bit product.
+module mul8x8(
+    input  wire [ 7:0] a,
+    input  wire [ 7:0] b,
+    output wire [15:0] c
+);
+
+    // 4x4 partial products, named <a half>_<b half>.
+    wire [7:0] hi_hi;
+    wire [7:0] lo_hi;
+    wire [7:0] hi_lo;
+    wire [7:0] lo_lo;
+
+    wire [11:0] sum_upper;   // (aH*bH << 4) + aL*bH
+    wire [11:0] sum_lower;   // aH*bL + (aL*bL >> 4)
+    wire        co1, co2, co3;
+
+    mul4x4 u1(.a(a[7:4]), .b(b[7:4]), .c(hi_hi));
+    mul4x4 u2(.a(a[3:0]), .b(b[7:4]), .c(lo_hi));
+    mul4x4 u3(.a(a[7:4]), .b(b[3:0]), .c(hi_lo));
+    mul4x4 u4(.a(a[3:0]), .b(b[3:0]), .c(lo_lo));
+
+    // Each adder's carry-out feeds the next adder's carry-in, exactly as
+    // the stages are chained in this design; co3 is left unused.
+    cla_nbit #(.n(12)) u5(
+        .a ({hi_hi, 4'b0}),
+        .b ({4'b0, lo_hi}),
+        .ci(1'b0),
+        .s (sum_upper),
+        .co(co1));
+    cla_nbit #(.n(12)) u6(
+        .a ({4'b0, hi_lo}),
+        .b ({8'b0, lo_lo[7:4]}),
+        .ci(co1),
+        .s (sum_lower),
+        .co(co2));
+    cla_nbit #(.n(12)) u7(
+        .a (sum_upper),
+        .b (sum_lower),
+        .ci(co2),
+        .s (c[15:4]),
+        .co(co3));
+
+    assign c[3:0] = lo_lo[3:0];
+
+endmodule
+`endif
+`ifndef MUL_16x16_V_
+`define MUL_16x16_V_
+
+`timescale 1ns / 1ps
+
+// 16-bit x 16-bit unsigned multiplier assembled from four mul8x8 blocks and
+// three 24-bit cla_nbit adders.
+//
+// With a = {aH, aL} and b = {bH, bL} split into 8-bit halves:
+//   a * b = (aH*bH << 16) + ((aL*bH + aH*bL) << 8) + aL*bL
+// The eight low bits of aL*bL go straight to c[7:0]; everything else is
+// summed at weight 256 and lands in c[31:8].
+//
+// When the PIPELINE macro is defined the module gains clk / rst_n ports
+// (in that order, ahead of a) and the four 8x8 partial products are
+// registered once before the adders, so c lags a and b by one clock.
+// The macro name matches the one tested by the modules that instantiate
+// this multiplier; it must be spelled identically on both sides because
+// the instantiation is positional and the port count depends on it.
+//
+//   clk, rst_n - PIPELINE only: clock and asynchronous active-low reset of
+//                the partial-product register.
+//   a, b       - unsigned 16-bit operands.
+//   c          - 32-bit product.
+module mul16x16(
+`ifdef PIPELINE
+    input         clk,
+    input         rst_n,
+`endif
+    input  [15:0] a,
+    input  [15:0] b,
+    output [31:0] c);
+
+    // tmp1: raw partial products {aH*bH, aL*bH, aH*bL, aL*bL}.
+    // tmp2: what the adders consume (registered copy when pipelined).
+    wire [63:0] tmp1, tmp2;
+    wire [23:0] result1;
+    wire [23:0] result2;
+    wire        co1, co2, co3;
+
+`ifdef PIPELINE
+    // one-stage pipeline
+    reg [63:0] tmp1_reg;
+    always @ (posedge clk or negedge rst_n) begin
+        if (!rst_n) begin
+            tmp1_reg <= 64'b0;
+        end else begin
+            tmp1_reg <= tmp1;
+        end
+    end
+    assign tmp2 = tmp1_reg;
+
+`else
+    assign tmp2 = tmp1;
+
+`endif
+
+    mul8x8 u1(.a(a[15:8]), .b(b[15:8]), .c(tmp1[63:48]));
+    mul8x8 u2(.a(a[7:0]),  .b(b[15:8]), .c(tmp1[47:32]));
+    mul8x8 u3(.a(a[15:8]), .b(b[ 7:0]), .c(tmp1[31:16]));
+    mul8x8 u4(.a(a[7:0]),  .b(b[ 7:0]), .c(tmp1[15:0]));
+
+    // Each adder's carry-out feeds the next adder's carry-in; co3 is unused.
+    cla_nbit #(.n(24)) u5(
+        .a ({tmp2[63:48], 8'b0}),
+        .b ({8'b0, tmp2[47:32]}),
+        .ci(1'b0),
+        .s (result1),
+        .co(co1));
+    cla_nbit #(.n(24)) u6(
+        .a ({8'b0, tmp2[31:16]}),
+        .b ({16'b0, tmp2[15:8]}),
+        .ci(co1),
+        .s (result2),
+        .co(co2));
+    cla_nbit #(.n(24)) u7(
+        .a (result1),
+        .b (result2),
+        .ci(co2),
+        .s (c[31:8]),
+        .co(co3));
+
+    assign c[7:0] = tmp2[7:0];
+
+endmodule
+`endif
+`ifndef ALIGNMENT_V_
+`define ALIGNMENT_V_
+
+`timescale 1ns / 1ps
+
+// Aligns the significand of the smaller operand to the exponent of the
+// bigger one ahead of a floating-point addition.
+//
+//   bigger, smaller - 15-bit {exponent[4:0], mantissa[9:0]} fields; the
+//                     exponent is taken from bits [14:10].
+//   aligned_small   - {1'b1, smaller[9:0]} shifted right by the 5-bit
+//                     difference bigger_exponent - smaller_exponent.
+//                     Bits shifted out are dropped.
+module alignment (
+    input  wire [14:0] bigger,
+    input  wire [14:0] smaller,
+    output wire [10:0] aligned_small
+);
+
+    wire [4:0] bigger_exponent  = bigger[14:10];
+    wire [4:0] smaller_exponent = smaller[14:10];
+
+    // Two's complement of the smaller exponent, so the adder subtracts.
+    wire [4:0] neg_smaller_exponent = ~smaller_exponent + 1'b1;
+
+    wire [4:0] shift_bits;
+    wire       c1;             // adder carry-out, not used
+
+    cla_nbit #(.n(5)) u1(
+        .a (bigger_exponent),
+        .b (neg_smaller_exponent),
+        .ci(1'b0),
+        .s (shift_bits),
+        .co(c1));
+
+    // A leading 1 is always prepended to the stored mantissa before shifting.
+    wire [10:0] small_significand = {1'b1, smaller[9:0]};
+
+    assign aligned_small = small_significand >> shift_bits;
+
+endmodule
+`endif
+`ifndef SYSTOLIC_V_
+`define SYSTOLIC_V_
+
+
+`timescale 1ns / 1ps
+`default_nettype none
+
+// row
+// 3 x 3
+// 3 x 3 grid of PE instances (PE00 .. PE22, named PE<row><col>).
+//
+// A operands enter at the left edge: a00 -> PE00, a01 -> PE10, a02 -> PE20,
+// and each PE hands its o_A to the i_A of the PE on its right.
+// B operands enter at the top edge: b00 -> PE00, b01 -> PE01, b02 -> PE02,
+// and each PE hands its o_B to the i_B of the PE below it.
+// Every PE drives one W-bit o_C, and the nine results are packed into o_C.
+//
+// Parameters:
+//   W - width of one operand / one result element.
+//   N - used only to size the ports.
+//       NOTE(review): the PE grid, the a0x/b0x slices and the o_C packing
+//       are written out by hand for N = 3; other values of N are not
+//       handled by this code.
+// Ports:
+//   i_clk, i_rst, i_en, i_mode - passed unchanged to every PE.
+//   i_A  - three W-bit A operands, element k in bits [k*W +: W].
+//   i_B  - three W-bit B operands, element k in bits [k*W +: W].
+//   o_C  - nine W-bit results in row-major order, c00 in the lowest W bits.
+//   debug_pe_a / debug_pe_b - the six horizontal / six vertical inter-PE
+//          buses, concatenated with the first-listed bus in the top bits.
+module systolic #(
+ parameter W = 16,
+ parameter N = 3
+) (
+ input wire i_clk,
+ input wire i_rst,
+ input wire i_en,
+ input wire i_mode,
+ input wire [ W * N - 1 : 0] i_A,
+ input wire [ W * N - 1 : 0] i_B,
+ output wire [ W * N * N - 1 : 0] o_C,
+
+ // debug
+ output wire [ W * N * 2 - 1 : 0] debug_pe_a,
+ output wire [ W * N * 2 - 1 : 0] debug_pe_b
+);
+
+ // Width of one packed result element; currently the same as the operand width.
+ //localparam O_VEC_WIDTH = 2 * W;
+ localparam O_VEC_WIDTH = W;
+
+ // Edge inputs sliced out of i_A / i_B.
+ wire [W - 1 : 0] a00, a01, a02, b00, b01, b02;
+ // pe_a_<src>_<dst>: A bus from PE<src> to its right-hand neighbour PE<dst>.
+ wire [W - 1 : 0] pe_a_00_01, pe_a_01_02, pe_a_10_11, pe_a_11_12, pe_a_20_21, pe_a_21_22;
+ // pe_b_<src>_<dst>: B bus from PE<src> to the PE<dst> directly below it.
+ wire [W - 1 : 0] pe_b_00_10, pe_b_01_11, pe_b_02_12, pe_b_10_20, pe_b_11_21, pe_b_12_22;
+
+ // c<row><col>: result of PE<row><col>.
+ wire [O_VEC_WIDTH - 1 : 0] c00, c01, c02, c10, c11, c12, c20, c21, c22;
+
+ assign a00 = i_A[0 * W +: W];
+ assign a01 = i_A[1 * W +: W];
+ assign a02 = i_A[2 * W +: W];
+
+ assign b00 = i_B[0 * W +: W];
+ assign b01 = i_B[1 * W +: W];
+ assign b02 = i_B[2 * W +: W];
+
+ // Row 0: B comes from the module inputs; o_A of the last column is left open.
+ PE #(.W(W)) PE00(.i_clk(i_clk),.i_rst(i_rst),.i_en(i_en), .i_mode(i_mode), .i_A(a00), .i_B(b00), .o_A(pe_a_00_01),.o_B(pe_b_00_10),.o_C(c00));
+ PE #(.W(W)) PE01(.i_clk(i_clk),.i_rst(i_rst),.i_en(i_en), .i_mode(i_mode), .i_A(pe_a_00_01), .i_B(b01), .o_A(pe_a_01_02),.o_B(pe_b_01_11),.o_C(c01));
+ PE #(.W(W)) PE02(.i_clk(i_clk),.i_rst(i_rst),.i_en(i_en), .i_mode(i_mode), .i_A(pe_a_01_02), .i_B(b02), .o_A(), .o_B(pe_b_02_12),.o_C(c02));
+
+ // Row 1: B comes from the o_B of row 0.
+ PE #(.W(W)) PE10(.i_clk(i_clk),.i_rst(i_rst),.i_en(i_en), .i_mode(i_mode), .i_A(a01), .i_B(pe_b_00_10),.o_A(pe_a_10_11),.o_B(pe_b_10_20),.o_C(c10));
+ PE #(.W(W)) PE11(.i_clk(i_clk),.i_rst(i_rst),.i_en(i_en), .i_mode(i_mode), .i_A(pe_a_10_11),.i_B(pe_b_01_11),.o_A(pe_a_11_12),.o_B(pe_b_11_21),.o_C(c11));
+ PE #(.W(W)) PE12(.i_clk(i_clk),.i_rst(i_rst),.i_en(i_en), .i_mode(i_mode), .i_A(pe_a_11_12),.i_B(pe_b_02_12),.o_A(), .o_B(pe_b_12_22),.o_C(c12));
+
+ // Row 2: B comes from the o_B of row 1; o_B of the bottom row is left open.
+ PE #(.W(W)) PE20(.i_clk(i_clk),.i_rst(i_rst),.i_en(i_en), .i_mode(i_mode), .i_A(a02), .i_B(pe_b_10_20),.o_A(pe_a_20_21),.o_B(),.o_C(c20));
+ PE #(.W(W)) PE21(.i_clk(i_clk),.i_rst(i_rst),.i_en(i_en), .i_mode(i_mode), .i_A(pe_a_20_21),.i_B(pe_b_11_21),.o_A(pe_a_21_22),.o_B(),.o_C(c21));
+ PE #(.W(W)) PE22(.i_clk(i_clk),.i_rst(i_rst),.i_en(i_en), .i_mode(i_mode), .i_A(pe_a_21_22),.i_B(pe_b_12_22),.o_A(), .o_B(),.o_C(c22));
+
+
+ // https://stackoverflow.com/questions/18067571/indexing-vectors-and-arrays-with
+ // https://standards.ieee.org/ieee/1800/6700/
+ // a_vect[ 0 +: 8] // == a_vect[ 7 : 0]
+ //assign o_C[1 * O_VEC_WIDTH - 1 -: O_VEC_WIDTH] = c00;
+
+ // Pack the nine results row-major: element (row, col) sits at index 3*row + col.
+ assign o_C[0 * O_VEC_WIDTH +: O_VEC_WIDTH] = c00;
+ assign o_C[1 * O_VEC_WIDTH +: O_VEC_WIDTH] = c01;
+ assign o_C[2 * O_VEC_WIDTH +: O_VEC_WIDTH] = c02;
+ assign o_C[3 * O_VEC_WIDTH +: O_VEC_WIDTH] = c10;
+ assign o_C[4 * O_VEC_WIDTH +: O_VEC_WIDTH] = c11;
+ assign o_C[5 * O_VEC_WIDTH +: O_VEC_WIDTH] = c12;
+ assign o_C[6 * O_VEC_WIDTH +: O_VEC_WIDTH] = c20;
+ assign o_C[7 * O_VEC_WIDTH +: O_VEC_WIDTH] = c21;
+ assign o_C[8 * O_VEC_WIDTH +: O_VEC_WIDTH] = c22;
+
+ // Debug taps: pe_a_00_01 / pe_b_00_10 end up in the most significant W bits.
+ assign debug_pe_a = {pe_a_00_01, pe_a_01_02, pe_a_10_11, pe_a_11_12, pe_a_20_21, pe_a_21_22};
+ assign debug_pe_b = {pe_b_00_10, pe_b_01_11, pe_b_02_12, pe_b_10_20, pe_b_11_21, pe_b_12_22};
+
+endmodule
+`endif
+`ifndef CLA_NBIT_V_
+`define CLA_NBIT_V_
+
+`timescale 1ns / 1ps
+
+// Carry Look-ahead adder (CLA)
+// n-bit adder expressed with generate (g) and propagate (p) terms.
+//
+// Carry into bit i+1 is g[i] | (p[i] & c[i]); each carry is derived from
+// the previous one, starting at c[0] = ci.
+//
+// Parameter:
+//   n  - operand width in bits.
+// Ports:
+//   a, b - n-bit addends.
+//   ci   - carry in.
+//   s    - n-bit sum (a + b + ci, truncated to n bits).
+//   co   - carry out of the top bit.
+module cla_nbit #(
+    parameter n = 4
+) (
+    input  wire [n-1:0] a,
+    input  wire [n-1:0] b,
+    input  wire         ci,
+    output wire [n-1:0] s,
+    output wire         co
+);
+
+    wire [n-1:0] g = a & b;   // bit position generates a carry
+    wire [n-1:0] p = a | b;   // bit position passes an incoming carry on
+    wire [  n:0] c;           // c[i] is the carry into bit i
+
+    assign c[0] = ci;
+    assign s    = a ^ b ^ c[n-1:0];
+    assign co   = c[n];
+
+    genvar i; /* i - generate index variable */
+
+    generate
+        for (i = 0; i < n; i = i + 1) begin : addbit
+            assign c[i + 1] = g[i] | (p[i] & c[i]);
+        end
+    endgenerate
+
+endmodule
+`endif
+`ifndef INT_FP_ADD_V_
+`define INT_FP_ADD_V_
+
+
+`timescale 1ns / 1ps
+
+// Dual-mode 16-bit adder that shares one 11-bit adder between an integer
+// path and a floating-point path.
+//
+//   mode = 0 : c = a + b as plain 16-bit integers. The shared 11-bit adder
+//              sums bits [10:0]; a separate 5-bit adder sums bits [15:11]
+//              using the 11-bit adder's carry. The final carry is dropped.
+//   mode = 1 : a and b are treated as {sign, exponent[4:0], mantissa[9:0]}.
+//              The operand with the smaller [14:0] field is aligned to the
+//              other, the significands are added (or subtracted when the
+//              signs differ), and add_normalizer produces the result.
+//
+// Ports:
+//   clk, rst_n - present only when PIPELINE is defined; clock and
+//                asynchronous active-low reset of the pipeline registers.
+//   mode       - selects the path described above.
+//   a, b       - operands.
+//   c          - result.
+module int_fp_add (
+`ifdef PIPELINE
+ input clk,
+ input rst_n,
+`endif
+ input mode,
+ input [15:0] a,
+ input [15:0] b,
+ output [15:0] c
+);
+
+ wire [10:0] adder_input_1,adder_input_2,aligned_small,adder_output;
+ wire if_sub,a_sign, b_sign, c_sign,c1, c2;
+ // NOTE(review): normalized_out is declared but never driven or read in this module.
+ wire [15:0] normalized_out;
+
+ // only used in INT8 MAC mode
+ wire [4:0] higher_add,higher_a,higher_b;
+
+ wire [15:0] result;
+ reg [14:0] bigger, smaller;
+ reg a_larger_b;
+
+`ifdef PIPELINE
+ reg [14:0] bigger_reg, smaller_reg;
+ reg [10:0] adder_output_reg;
+ wire [14:0] bigger_tmp, smaller_tmp;
+ wire [10:0] adder_output_tmp;
+`endif
+
+
+ assign a_sign = a[15];
+ assign b_sign = b[15];
+ // Operands of opposite sign turn the significand addition into a subtraction.
+ assign if_sub = (a_sign == b_sign) ? 1'b0 : 1'b1;
+ // The result takes the sign of the operand with the larger [14:0] field
+ // (b's sign when the two fields are equal).
+ assign c_sign = a_larger_b ? a_sign : b_sign;
+ // Upper five integer bits; forced to zero outside integer mode.
+ assign higher_a = (mode == 1'b0) ? a[15:11] : 5'b0;
+ assign higher_b = (mode == 1'b0) ? b[15:11] : 5'b0;
+ // Shared adder inputs: low integer bits in mode 0, otherwise the bigger
+ // significand (with a leading 1) and the aligned smaller significand,
+ // two's-complemented when subtracting.
+ assign adder_input_1 = (mode==1'b0) ? a[10:0] :{1'b1,bigger[9:0]};
+ assign adder_input_2 = (mode==1'b0) ? b[10:0] : (if_sub ? ~aligned_small + 1'b1 : aligned_small);
+ assign c = (mode == 1'b0) ? {higher_add,adder_output} : result;
+
+ //compare two number regardless sign
+ always @(*) begin
+ if (a[14:0] > b[14:0]) begin
+ bigger = a[14:0];
+ smaller = b[14:0];
+ a_larger_b = 1'b1;
+ end else begin
+ bigger = b[14:0];
+ smaller = a[14:0];
+ a_larger_b = 1'b0;
+ end
+ end
+
+`ifdef PIPELINE
+ // Pipeline registers for the sorted operands and the adder output.
+ always @ (posedge clk or negedge rst_n) begin
+ if (!rst_n) begin
+ bigger_reg <= 15'b0;
+ smaller_reg <= 15'b0;
+ adder_output_reg <= 11'b0;
+ end else begin
+ bigger_reg <= bigger;
+ smaller_reg <= smaller;
+ adder_output_reg <= adder_output;
+ end
+ end
+ assign bigger_tmp = bigger_reg[14:0];
+ assign smaller_tmp = smaller_reg[14:0];
+ assign adder_output_tmp = adder_output_reg[10:0];
+`endif
+
+`ifdef PIPELINE
+ // align small number
+ alignment u1(bigger_tmp,smaller_tmp,aligned_small);
+`else
+ // align small number
+ alignment u1(bigger,smaller,aligned_small);
+`endif
+
+ // Shared 11-bit adder; c1 is its carry-out.
+ cla_nbit #(.n(11)) u2(adder_input_1,adder_input_2,1'b0,adder_output,c1);
+
+ // This 5 bit adder only used in INT8 MAC mode
+ cla_nbit #(.n(5)) u3(higher_a,higher_b,c1,higher_add,c2);
+
+`ifdef PIPELINE
+ // NOTE(review): here the normalizer receives the registered adder output
+ // together with the unregistered c_sign, bigger[14:10], c1 and if_sub --
+ // confirm these are meant to come from different pipeline stages.
+ add_normalizer u4(c_sign,bigger[14:10],adder_output_tmp,result,c1,if_sub);
+`else
+ add_normalizer u4(c_sign,bigger[14:10],adder_output,result,c1,if_sub);
+`endif
+
+endmodule
+`endif
+`ifndef INT_FP_MUL_V_
+`define INT_FP_MUL_V_
+
+
+// Dual-mode 16-bit multiplier sharing one mul16x16 between a signed 8-bit
+// integer path and a half-precision floating-point path.
+//
+//   mode = 0 : a[7:0] and b[7:0] are two's-complement integers. Their
+//              magnitudes are multiplied and the product is returned as a
+//              16-bit two's-complement value.
+//   mode = 1 : a and b are {sign, exponent[4:0], mantissa[9:0]}.
+//              sign     = a[15] ^ b[15]
+//              exponent = ea + eb + 5'b10001 (the bias subtraction, mod 32)
+//              mantissa = product of the two 11-bit significands (leading 1
+//                         prepended), normalised by mul_normalizer.
+//              A zero operand (either sign) forces c to 16'b0. On underflow
+//              c is {sign, 15'b0}; on overflow c is {sign, 5'b11111, 10'b0}.
+//
+// Ports:
+//   clk, rst_n - present only when PIPELINE is defined; clock and
+//                asynchronous active-low reset of the product register.
+//   mode       - selects the path described above.
+//   a, b       - operands.
+//   c          - result.
+//   error      - overflow | underflow of the exponent arithmetic.
+module int_fp_mul (
+`ifdef PIPELINE
+    input         clk,
+    input         rst_n,
+`endif
+    input         mode,
+    input  [15:0] a,
+    input  [15:0] b,
+    output [15:0] c,
+    output        error // valid in fp16 mode
+);
+
+    wire [15:0] c_tmp;
+    wire        c_sign, a_zero, b_zero;
+    wire [ 4:0] sum_exponent, biased_sum_exponent;
+    wire [15:0] multiplier_input1, multiplier_input2;
+
+    wire [31:0] multiplier_output;
+    wire [14:0] normalized_out;
+    wire [21:0] mantissa_prod;
+    wire        c1, c2, underflow, overflow;
+
+    // Integer-mode helpers.
+    wire [ 7:0] int_mag_a, int_mag_b;
+    wire        int_negative;
+    wire [15:0] int_product;
+
+    // c1 / c2 are the carries of the two exponent additions below.
+    assign overflow  = (c1 && c2 && ~biased_sum_exponent[4]) ? 1'b1 : 1'b0;
+    assign underflow = (~c1 && ~c2 && biased_sum_exponent[4]) ? 1'b1 : 1'b0;
+
+    // Zero test ignores the sign bit so that -0 (16'h8000) is also treated
+    // as zero instead of being multiplied with an implied leading 1.
+    assign a_zero = ~(|a[14:0]);
+    assign b_zero = ~(|b[14:0]);
+    assign c_sign = a[15] ^ b[15];
+
+    // Magnitudes of the 8-bit operands. The negation is done on all 8 bits
+    // so that -128 (8'h80) yields 128 rather than wrapping to 0.
+    assign int_mag_a = a[7] ? (~a[7:0] + 8'd1) : a[7:0];
+    assign int_mag_b = b[7] ? (~b[7:0] + 8'd1) : b[7:0];
+
+    assign multiplier_input1 = mode ? {5'b0, 1'b1, a[9:0]} : {8'b0, int_mag_a};
+    assign multiplier_input2 = mode ? {5'b0, 1'b1, b[9:0]} : {8'b0, int_mag_b};
+
+    // Sign of the integer product. The magnitude product is negated over
+    // the full 16 bits, so a zero product stays 16'h0000 even when the
+    // operand signs differ (e.g. -1 * 0).
+    assign int_negative = a[7] ^ b[7];
+    assign int_product  = int_negative ? (~multiplier_output[15:0] + 16'd1)
+                                       : multiplier_output[15:0];
+
+    assign c = mode ? ((a_zero | b_zero) ? 16'b0 : c_tmp) : int_product;
+    //error detect
+    assign c_tmp = (~error) ? {c_sign, normalized_out}
+                            : (underflow ? {c_sign, 15'b0000_0000_0000_000}
+                                         : {c_sign, 5'b1111_1, 10'b0000_0000_00});
+
+    assign error = overflow | underflow;
+
+
+`ifdef PIPELINE
+
+    reg [31:0] multiplier_output_tmp;
+
+    always @ (posedge clk or negedge rst_n) begin
+        if (!rst_n) begin
+            multiplier_output_tmp <= 32'b0;
+        end else begin
+            multiplier_output_tmp <= multiplier_output;
+        end
+    end
+
+    assign mantissa_prod = multiplier_output_tmp[21:0];
+    mul16x16 u1(clk,rst_n,multiplier_input1,multiplier_input2,multiplier_output);
+
+`else
+
+    assign mantissa_prod = multiplier_output[21:0];
+    mul16x16 u1(multiplier_input1,multiplier_input2,multiplier_output);
+
+`endif
+
+    cla_nbit #(.n(5)) u2(a[14:10],b[14:10],1'b0,sum_exponent,c1); // add exponent
+    cla_nbit #(.n(5)) u3(sum_exponent, 5'b10001,1'b0,biased_sum_exponent,c2); // minus bias
+    mul_normalizer u4(biased_sum_exponent,mantissa_prod,normalized_out);
+
+endmodule
+`endif
+`ifndef MUL_NORMALIZER_V_
+`define MUL_NORMALIZER_V_
+
+`timescale 1ns / 1ps
+
+// Normalises the 22-bit product of two 11-bit significands.
+//
+// If the top product bit (bit 21) is set, the exponent is incremented and
+// the mantissa is taken from bits [20:11]; otherwise the exponent is passed
+// through and the mantissa is taken from bits [19:10].
+//
+//   exponent      - 5-bit exponent before normalisation.
+//   mantissa_prod - 22-bit significand product.
+//   result        - {exponent[4:0], mantissa[9:0]} after normalisation.
+//
+// No rounding and no overflow/underflow detection: lower product bits are
+// dropped and the exponent increment wraps at 5 bits.
+module mul_normalizer (
+    input  wire [ 4:0] exponent,
+    input  wire [21:0] mantissa_prod,
+    output wire [14:0] result
+);
+
+    assign result = mantissa_prod[21]
+                  ? {exponent + 1'b1, mantissa_prod[20:11]}
+                  : {exponent,        mantissa_prod[19:10]};
+
+endmodule
+`endif
// SPDX-FileCopyrightText: 2020 Efabless Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
@@ -104,6 +1711,11 @@
assign clk = (~la_oenb[64]) ? la_data_in[64]: wb_clk_i;
assign rst = (~la_oenb[65]) ? la_data_in[65]: wb_rst_i;
+ wire cs;
+ // Assuming LA probes [66] are for controlling cs (data ready)
+ assign cs = (~la_oenb[66]) ? la_data_in[66] : 0;
+
+ /*
counter #(
.BITS(BITS)
) counter(
@@ -118,6 +1730,15 @@
.la_input(la_data_in[63:32]),
.count(count)
);
+ */
+
+ interface_top interface_inst(
+ .clk(clk),
+ .rst(rst),
+ .cs(1'b1),
+ .data_in(la_data_in[63:32]),
+ .data_out(count)
+ );
endmodule
@@ -163,3 +1784,121 @@
endmodule
`default_nettype wire
+
+`default_nettype none
+
+// Looks like we need our own definition of a transactional memory
+// interface (SPI-like).
+// Inputs:    two 3 x 3 matrices -- 18 words in
+// Output:    one 3 x 3 matrix   --  9 words out
+// Operation: GEMM
+
+// Word-serial front end for the matrix-multiply core.
+//
+// A small state machine collects 2 * N * N = 18 words of W = 16 bits from
+// data_in[15:0] into input_registers, then hands the two packed matrices to
+// the control instance and waits for it to report completion.
+//
+//   IDLE    - waits for cs; mat_en is driven low.
+//   LOAD    - stores data_in[W-1:0] at word index addr_cnter; once
+//             addr_cnter reaches 2*N*N - 1 it raises mat_en, rewinds the
+//             counter and moves to PROCESS.
+//   PROCESS - stays here until done_o is high, then returns to IDLE.
+//
+// Ports:
+//   clk      - clock.
+//   rst      - synchronous, active-high reset of state, counters, data_out.
+//   cs       - data ready; starts a load when sampled in IDLE.
+//   data_in  - input word; only the low W bits are stored.
+//   data_out - bit 0 is done_o registered on clk, the other bits are 0.
+module interface_top (
+    input  wire        clk,
+    input  wire        rst,
+    input  wire        cs, // data ready
+    input  wire [31:0] data_in,
+    output reg  [31:0] data_out
+);
+
+    localparam W = 16;
+    localparam N = 3;
+
+    // State encodings. These are elaboration-time constants rather than
+    // continuously-assigned nets, so the case items below are true constants.
+    localparam [2:0] IDLE    = 3'b000;
+    localparam [2:0] LOAD    = 3'b001;
+    localparam [2:0] PROCESS = 3'b011;
+
+    wire done_o;
+
+    // scratch pad: words 0 .. N*N-1 hold A, words N*N .. 2*N*N-1 hold B
+    reg  [2 * W * N * N - 1 : 0] input_registers;
+    wire [    W * N * N - 1 : 0] C_mat;
+
+    wire [W * N * N - 1 : 0] A_mat;
+    wire [W * N * N - 1 : 0] B_mat;
+
+    assign A_mat = input_registers[        0 +: W * N * N];
+    assign B_mat = input_registers[W * N * N +: W * N * N];
+
+    reg [2:0] state;
+    reg [2:0] next_state;
+
+    // memory counter: index of the word written during LOAD
+    reg [4:0] addr_cnter;
+    reg [4:0] next_addr_cnter;
+
+    // Enable handed to the control block. Declared ahead of the instance
+    // below: this file is compiled under `default_nettype none, so a name
+    // must be declared before it appears in a port connection.
+    // TODO: might be redundant
+    reg mat_en;
+
+    // mode selection
+    // First try on FP16: i_mode is tied high.
+    control #(.W(W), .N(N)) control_inst(
+        .i_clk (clk),
+        .i_rst (rst),
+        .i_en  (mat_en),
+        .i_mode(1'b1),
+        .i_A   (A_mat),
+        .i_B   (B_mat),
+        .o_C   (C_mat),
+        .o_done(done_o)
+    );
+
+    // NOTE(review): next_state and next_addr_cnter are written both here
+    // (on reset) and in the combinational block below. That is kept as-is
+    // to leave the simulated timing untouched, but a signal driven from two
+    // always blocks is generally rejected by synthesis tools -- confirm and
+    // rework together with the block below.
+    always @(posedge clk) begin
+        if (rst) begin
+            state           <= IDLE;
+            next_state      <= IDLE;
+            addr_cnter      <= 5'b0;
+            next_addr_cnter <= 5'b0;
+            data_out        <= 32'b0;
+        end
+        else begin
+            state      <= next_state;
+            addr_cnter <= next_addr_cnter;
+            data_out   <= {31'b0, done_o};
+        end
+    end
+
+    // NOTE(review): not every signal is assigned on every path (next_state
+    // in IDLE without cs, mat_en in PROCESS, next_addr_cnter outside LOAD,
+    // input_registers outside LOAD, and there is no default branch), so
+    // these signals hold their previous value. input_registers in particular
+    // is storage written from a combinational block. Moving it into the
+    // clocked block would change the cycle in which the last word and
+    // mat_en become visible to control_inst, so it is left unchanged here.
+    always @(*) begin
+        case (state)
+            IDLE: begin
+                if (cs) begin
+                    next_state = LOAD;
+                end
+                mat_en = 1'b0;
+            end
+            LOAD: begin
+                input_registers[addr_cnter * W +: W] = data_in[W - 1 : 0];
+
+                if (addr_cnter >= 2 * N * N - 1) begin
+                    mat_en          = 1'b1;
+                    next_state      = PROCESS;
+                    next_addr_cnter = 0;
+                end
+                else begin
+                    next_addr_cnter = addr_cnter + 1;
+                end
+            end
+            PROCESS: begin
+                next_state = done_o ? IDLE : PROCESS;
+            end
+        endcase
+    end
+
+endmodule
+
+`default_nettype wire