verilog/rtl/user_proj_example.v - third_party/shuttle/sky130/mpw-007/slot-037 - Git at Google

 `ifndef ADD_NORMALIZER_V_
 `define ADD_NORMALIZER_V_

 `timescale 1ns / 1ps

 module add_normalizer (
   input             sign,
   input      [ 4:0] exponent,
   input      [10:0] mantissa_add,
   output reg [15:0] result,
   input             if_carray,
   input             if_sub
 );

   reg [4:0] number_of_zero_lead;
   reg [10:0] norm_mantissa_add;
   reg [9:0] mantissa_tmp;

   wire [4:0] shift_left_exp;
   wire c1;

   always @ (*) begin
     if (mantissa_add[10:4] == 7'b0000_001) begin
       number_of_zero_lead = 5'd6;
       norm_mantissa_add   = (mantissa_add << 4'd6);
     end else if (mantissa_add[10:5] == 6'b0000_01) begin
       number_of_zero_lead = 5'd5;
       norm_mantissa_add   = (mantissa_add << 4'd5);
     end else if (mantissa_add[10:6] == 5'b0000_1) begin
       number_of_zero_lead = 5'd4;
       norm_mantissa_add   = (mantissa_add << 4'd4);
     end else if (mantissa_add[10:7] == 4'b0001) begin
       number_of_zero_lead = 5'd3;
       norm_mantissa_add   = (mantissa_add << 4'd3);
     end else if (mantissa_add[10:8] == 3'b001) begin
       number_of_zero_lead = 5'd2;
       norm_mantissa_add   = (mantissa_add << 4'd2);
     end else if (mantissa_add[10:9] == 2'b01) begin
       number_of_zero_lead = 5'd1;
       norm_mantissa_add   = (mantissa_add << 4'd1);
     end else begin
       number_of_zero_lead = 5'd0;
       norm_mantissa_add   = mantissa_add[10:0];
     end
   end

   always @(*) begin
     result[15]      = sign;
     if (!if_sub) begin
       result[14:10] = if_carray ? exponent + 1'b1 : exponent;
       result[9:0]   = if_carray ? mantissa_add[10:1] : mantissa_add[9:0];
     end else begin
       result[14:10] = shift_left_exp;
       result[9:0]   = norm_mantissa_add[9:0];
     end
   end

   cla_nbit #(.n(5)) u1(exponent,~number_of_zero_lead+1'b1,1'b0,shift_left_exp,c1);

 endmodule

 `endif
 `ifndef CONTROL_V_
 `define CONTROL_V_


 `timescale 1ns / 1ps
 `default_nettype none

 // Really 3x3, a done output??


 // 6 logic cycles + 2 (buffered?) delay cycles
 module control #(
   parameter W = 16,
   parameter N = 3
 ) (
   input  wire                         i_clk,
   input  wire                         i_rst,
   input  wire                         i_en,
   input  wire                         i_mode,
   input  wire [    W * N * N - 1 : 0] i_A,
   input  wire [    W * N * N - 1 : 0] i_B,
   output wire [    W * N * N - 1 : 0] o_C,
   output wire                         o_done,

   // debug
   output wire [    W * N * 2 - 1 : 0] debug_pe_a,
   output wire [    W * N * 2 - 1 : 0] debug_pe_b
 );

   reg [3 : 0] states, next_states;

   reg  [W - 1 : 0] a00, a01, a02;
   wire [W - 1 : 0] a01_q, a02_q;

   reg  [W - 1 : 0] b00, b01, b02;
   wire [W - 1 : 0] b01_q, b02_q;

   wire [W * N - 1 : 0] A_in;
   wire [W * N - 1 : 0] B_in;

   assign A_in = {a02_q, a01_q, a00};
   assign B_in = {b02_q, b01_q, b00};

   // 0000 is the idle / standby state
   always @(posedge i_clk) begin
     if (i_rst | ~i_en) begin
       states <= 4'b0000;
     end
     else if (i_en) begin
       states <= next_states;
     end
   end

   always @(*) begin
     if (states == 4'b1001) begin
       // Done: Force waiting
       // This can also be done by switching off input
       next_states = 4'b1001;
     end else begin
       next_states = states + 4'b0001;
     end
   end

   assign o_done = (states == 4'b1001);

   always @(*) begin
     case (states)
       4'b0001: begin
         a00 = i_A[    W - 1 : 0 * W];
         a01 = i_A[2 * W - 1 : 1 * W];
         a02 = i_A[3 * W - 1 : 2 * W];


         b00 = i_B[    W - 1 : 0 * W];
         b01 = i_B[2 * W - 1 : 1 * W];
         b02 = i_B[3 * W - 1 : 2 * W];
       end
       4'b0010: begin
         a00 = i_A[4 * W - 1 : 3 * W];
         a01 = i_A[5 * W - 1 : 4 * W];
         a02 = i_A[6 * W - 1 : 5 * W];

         b00 = i_B[4 * W - 1 : 3 * W];
         b01 = i_B[5 * W - 1 : 4 * W];
         b02 = i_B[6 * W - 1 : 5 * W];
       end
       4'b0011: begin
         a00 = i_A[7 * W - 1 : 6 * W];
         a01 = i_A[8 * W - 1 : 7 * W];
         a02 = i_A[9 * W - 1 : 8 * W];

         b00 = i_B[7 * W - 1 : 6 * W];
         b01 = i_B[8 * W - 1 : 7 * W];
         b02 = i_B[9 * W - 1 : 8 * W];
       end
       default: begin
 	a00 = 0;
 	a01 = 0;
 	a02 = 0;

 	b00 = 0;
 	b01 = 0;
 	b02 = 0;
       end
     endcase
   end

   systolic #(.W(W), .N(N)) sys(.i_clk(i_clk), .i_rst(i_rst), .i_en(i_en), .i_mode(i_mode), .i_A(A_in), .i_B(B_in), .o_C(o_C), .debug_pe_a(debug_pe_a), .debug_pe_b(debug_pe_b));

   delay2 #(.WIDTH(W), .DEPTH(1)) delayA1(.clk(i_clk), .reset(i_rst), .data_in(a01), .data_out(a01_q));
   delay2 #(.WIDTH(W), .DEPTH(2)) delayA2(.clk(i_clk), .reset(i_rst), .data_in(a02), .data_out(a02_q));

   delay2 #(.WIDTH(W), .DEPTH(1)) delayB1(.clk(i_clk), .reset(i_rst), .data_in(b01), .data_out(b01_q));
   delay2 #(.WIDTH(W), .DEPTH(2)) delayB2(.clk(i_clk), .reset(i_rst), .data_in(b02), .data_out(b02_q));


 endmodule
 `default_nettype wire
 `endif
 // No pipelined/piplined MAC
 // Version: 1.0

 // Description:

 // Function : mac_out = in_a * in_b + in_c.  Both work for INT8 and FP16 mode. Default INT8 and FP16 are signed number
 // Exception : error detection for overflow and underflow in FP16 mode
 `ifndef MAC_UNIT_V_
 `define MAC_UNIT_V_


 `timescale 1ns / 1ps

 module mac_unit
 (
 `ifdef PIPELINE
   input            clk,
   input            rst_n,
 `endif
   input     [15:0] in_a, // multiplier input1
   input     [15:0] in_b, // multiplier input2
   input     [15:0] in_c, // adder input2 ; adder input1 = in_a*in_b
   input 	   mode,
   //output    [15:0] mac_out,
   output    [15:0] mac_out,
   output 	   error
 );

   wire [15:0] mul_out;

   int_fp_add add(
   `ifdef PIPELINE
     .clk   (clk    ),
     .rst_n (rst_n  ),
   `endif
     .mode  (mode   ),
     .a     (mul_out),
     .b     (in_c   ),
     .c     (mac_out)
   );

   int_fp_mul mul(
   `ifdef PIPELINE
     .clk   (clk    ),
     .rst_n (rst_n  ),
   `endif
     .mode  (mode   ),
     .a     (in_a   ),
     .b     (in_b   ),
     .c     (mul_out),
     .error (error  )
   );

 endmodule
 `endif
 `ifndef PE_2_V_
 `define PE_2_V_


 `timescale 1ns / 1ps
 `default_nettype none

 module PE #(
   //parameter W = 32
   parameter W = 16
 ) (
   input  wire                 i_clk,
   input  wire                 i_rst,
   input  wire                 i_en,
   input  wire                 i_mode,
   input  wire [    W - 1 : 0] i_A,
   input  wire [    W - 1 : 0] i_B,
   output wire [    W - 1 : 0] o_A,
   output wire [    W - 1 : 0] o_B,
   //output wire [    W - 1 : 0] o_C
   output reg  [    W - 1 : 0] o_C
 );

   //wire mode;
   //assign mode = 1;

   wire sync_load;
   assign o_A = i_A_buffered;
   assign o_B = i_B_buffered;
   assign sync_load = i_rst | ~i_en;

   wire [W - 1 : 0] i_A_buffered;
   wire [W - 1 : 0] i_B_buffered;

   reg  [15 : 0] accu;
   wire [15 : 0] mac_out;

   // Buffered in MAC
   delay2 #(.WIDTH(W), .DEPTH(1)) delayA(.clk(i_clk), .reset(i_rst), .data_in(i_A), .data_out(i_A_buffered));
   delay2 #(.WIDTH(W), .DEPTH(1)) delayB(.clk(i_clk), .reset(i_rst), .data_in(i_B), .data_out(i_B_buffered));

   always @(posedge i_clk) begin
     if (sync_load) begin
       accu <= 0;
       o_C  <= 0;
     end
     else begin
       accu <= mac_out;
       o_C  <= mac_out;
     end
   end

   // Optional: making it clocked
   mac_unit u0_mac(
     .in_a    (i_A_buffered),
     .in_b    (i_B_buffered),
     .in_c    (accu),
     .mode    (i_mode),
     .mac_out (mac_out)
   );

 endmodule

 `default_nettype wire
 `endif
 `ifndef DELAY_2_V_
 `define DELAY_2_V_

 `timescale 1ns / 1ps
 `default_nettype none

 module delay2 #(
   parameter WIDTH = 16,
   parameter DEPTH = 3
 ) (
   input  wire                 clk,
   input  wire                 reset,
   input  wire [WIDTH - 1 : 0] data_in,
   output wire [WIDTH - 1 : 0] data_out
 );

   wire [WIDTH - 1 : 0] connect_wire [DEPTH : 0];

   assign data_out        = connect_wire[DEPTH];
   assign connect_wire[0] = data_in;

   genvar i;
   generate
     for (i = 1; i <= DEPTH; i = i + 1) begin
       dff #(.WIDTH(WIDTH)) DFF(
         .clk(clk),
         .rst(reset),
         .inp(connect_wire[i-1]),
         .outp(connect_wire[i]));
     end
   endgenerate
 endmodule

 // D flip-flop with synchronous reset
 module dff#(
     parameter WIDTH = 1
   ) (
     input wire clk,
     input wire rst,

     input wire [WIDTH-1:0] inp,
     output reg [WIDTH-1:0] outp
   );

   always @(posedge clk) begin
     outp <= rst ? {WIDTH{1'b0}} : inp;
   end

 endmodule

 `default_nettype wire
 `endif
 `ifndef MUL_2x2_V_
 `define MUL_2x2_V_

 `timescale 1ns / 1ps

 module mul2x2(
   input  [1:0] a,
   input  [1:0] b,
   output [3:0] c
 );

   wire [3:0] tmp;

   assign tmp[0] = a[0] & b[0];
   assign tmp[1] = (a[1]&b[0]) ^ (a[0]&b[1]);
   assign tmp[2] = (a[0]&b[1]) & (a[1]&b[0]) ^ (a[1]&b[1]);
   assign tmp[3] = (a[0]&b[1]) & (a[1]&b[0]) & (a[1]&b[1]);
   assign c 	= {tmp[3],tmp[2],tmp[1],tmp[0]};

 endmodule
 `endif
 `ifndef MUL_4x4_V_
 `define MUL_4x4_V_

 `timescale 1ns / 1ps

 module mul4x4(
   input  [3:0] a,
   input  [3:0] b,
   output [7:0] c
 );

   wire [15:0] tmp1;
   wire [ 5:0] result1;
   wire [ 5:0] result2;
   wire 	      co1,co2,co3;

   mul2x2 u1(a[3:2],b[3:2],tmp1[15:12]);
   mul2x2 u2(a[1:0],b[3:2],tmp1[11:8]);
   mul2x2 u3(a[3:2],b[1:0],tmp1[7:4]);
   mul2x2 u4(a[1:0],b[1:0],tmp1[3:0]);

   cla_nbit #(.n(6)) u5({tmp1[15:12],2'b0},{2'b0,tmp1[11:8]},1'b0	,result1	,co1);
   cla_nbit #(.n(6)) u6({2'b0,tmp1[7:4]}  ,{4'b0,tmp1[3:2]} ,co1 	,result2	,co2);
   cla_nbit #(.n(6)) u7(result1           ,result2	   ,co2 	,c[7:2] 	,co3);

   assign c[1:0] = tmp1[1:0];

 endmodule
 `endif
 `ifndef MUL_8x8_V_
 `define MUL_8x8_V_

 `timescale 1ns / 1ps

 module mul8x8(
   input  [ 7:0] a,
   input  [ 7:0] b,
   output [15:0] c
 );

   wire [31:0] tmp1;
   wire [11:0] result1;
   wire [11:0] result2;
   wire        co1,co2,co3;

   mul4x4 u1(a[7:4],b[7:4],tmp1[31:24]);
   mul4x4 u2(a[3:0],b[7:4],tmp1[23:16]);
   mul4x4 u3(a[7:4],b[3:0],tmp1[15:8]);
   mul4x4 u4(a[3:0],b[3:0],tmp1[7:0]);

   cla_nbit #(.n(12)) u5({tmp1[31:24],4'b0} ,{4'b0,tmp1[23:16]} ,1'b0 ,result1 ,co1);
   cla_nbit #(.n(12)) u6({4'b0,tmp1[15:8]}  ,{8'b0,tmp1[7:4]}   ,co1  ,result2 ,co2);
   cla_nbit #(.n(12)) u7(result1		   ,result2	       ,co2  ,c[15:4] ,co3);

   assign c[3:0] = tmp1[3:0];

 endmodule
 `endif
 `ifndef MUL_16x16_V_
 `define MUL_16x16_V_

 `timescale 1ns / 1ps

 module mul16x16(
 `ifdef PIPLINE
   input clk,
   input rst_n,
 `endif
   input  [15:0] a,
   input  [15:0] b,
   output [31:0] c);

   wire [63:0] tmp1,tmp2;
   wire [23:0] result1;
   wire [23:0] result2;
   wire co1,co2,co3;

 `ifdef PIPLINE
   // one stage pipline
   reg [63:0] tmp1_reg;
   always @ (posedge clk or negedge rst_n) begin
     if (!rst_n) begin
       tmp1_reg <= 64'b0;
     end else begin
       tmp1_reg <= tmp1;
     end
   end
   assign tmp2 = tmp1_reg;

 `else
   assign tmp2 = tmp1;

 `endif

   mul8x8 u1(a[15:8],b[15:8],tmp1[63:48]);
   mul8x8 u2(a[7:0] ,b[15:8],tmp1[47:32]);
   mul8x8 u3(a[15:8],b[ 7:0],tmp1[31:16]);
   mul8x8 u4(a[7:0] ,b[ 7:0],tmp1[15:0]);

   cla_nbit #(.n(24)) u5({tmp2[63:48],8'b0} ,{8'b0,tmp2[47:32]} ,1'b0 ,result1 ,co1);
   cla_nbit #(.n(24)) u6({8'b0,tmp2[31:16]} ,{16'b0,tmp2[15:8]} ,co1  ,result2 ,co2);
   cla_nbit #(.n(24)) u7(result1            ,result2            ,co2  ,c[31:8] ,co3);

   assign c[7:0] = tmp2[7:0];

 endmodule
 `endif
 `ifndef ALIGNMENT_V_
 `define ALIGNMENT_V_

 `timescale 1ns / 1ps

 module alignment (
   input  [14:0] bigger,
   input  [14:0] smaller,
   output [10:0] aligned_small
 );

   wire c1;
   wire [4:0] bigger_exponent, smaller_exponent, shift_bits;

   assign bigger_exponent  = bigger  [14:10];
   assign smaller_exponent = smaller [14:10];
   assign aligned_small    = ({1'b1,smaller[9:0]} >> shift_bits);

   cla_nbit #(.n(5)) u1(bigger_exponent,~smaller_exponent+1'b1,1'b0,shift_bits,c1);

 endmodule
 `endif
 `ifndef SYSTOLIC_V_
 `define SYSTOLIC_V_


 `timescale 1ns / 1ps
 `default_nettype none

 // row
 // 3 x 3
 module systolic #(
   parameter W = 16,
   parameter N = 3
 ) (
   input  wire                         i_clk,
   input  wire                         i_rst,
   input  wire                         i_en,
   input  wire                         i_mode,
   input  wire [        W * N - 1 : 0] i_A,
   input  wire [        W * N - 1 : 0] i_B,
   output wire [    W * N * N - 1 : 0] o_C,

   // debug
   output wire [    W * N * 2 - 1 : 0] debug_pe_a,
   output wire [    W * N * 2 - 1 : 0] debug_pe_b
 );

   //localparam O_VEC_WIDTH = 2 * W;
   localparam O_VEC_WIDTH = W;

   wire [W - 1 : 0] a00, a01, a02, b00, b01, b02;
   wire [W - 1 : 0] pe_a_00_01, pe_a_01_02, pe_a_10_11, pe_a_11_12, pe_a_20_21, pe_a_21_22;
   wire [W - 1 : 0] pe_b_00_10, pe_b_01_11, pe_b_02_12, pe_b_10_20, pe_b_11_21, pe_b_12_22;

   wire [O_VEC_WIDTH - 1 : 0] c00, c01, c02, c10, c11, c12, c20, c21, c22;

   assign a00 = i_A[0 * W +: W];
   assign a01 = i_A[1 * W +: W];
   assign a02 = i_A[2 * W +: W];

   assign b00 = i_B[0 * W +: W];
   assign b01 = i_B[1 * W +: W];
   assign b02 = i_B[2 * W +: W];

   PE #(.W(W)) PE00(.i_clk(i_clk),.i_rst(i_rst),.i_en(i_en), .i_mode(i_mode), .i_A(a00),        .i_B(b00), .o_A(pe_a_00_01),.o_B(pe_b_00_10),.o_C(c00));
   PE #(.W(W)) PE01(.i_clk(i_clk),.i_rst(i_rst),.i_en(i_en), .i_mode(i_mode), .i_A(pe_a_00_01), .i_B(b01), .o_A(pe_a_01_02),.o_B(pe_b_01_11),.o_C(c01));
   PE #(.W(W)) PE02(.i_clk(i_clk),.i_rst(i_rst),.i_en(i_en), .i_mode(i_mode), .i_A(pe_a_01_02), .i_B(b02), .o_A(),          .o_B(pe_b_02_12),.o_C(c02));

   PE #(.W(W)) PE10(.i_clk(i_clk),.i_rst(i_rst),.i_en(i_en), .i_mode(i_mode), .i_A(a01),       .i_B(pe_b_00_10),.o_A(pe_a_10_11),.o_B(pe_b_10_20),.o_C(c10));
   PE #(.W(W)) PE11(.i_clk(i_clk),.i_rst(i_rst),.i_en(i_en), .i_mode(i_mode), .i_A(pe_a_10_11),.i_B(pe_b_01_11),.o_A(pe_a_11_12),.o_B(pe_b_11_21),.o_C(c11));
   PE #(.W(W)) PE12(.i_clk(i_clk),.i_rst(i_rst),.i_en(i_en), .i_mode(i_mode), .i_A(pe_a_11_12),.i_B(pe_b_02_12),.o_A(),          .o_B(pe_b_12_22),.o_C(c12));

   PE #(.W(W)) PE20(.i_clk(i_clk),.i_rst(i_rst),.i_en(i_en), .i_mode(i_mode), .i_A(a02),       .i_B(pe_b_10_20),.o_A(pe_a_20_21),.o_B(),.o_C(c20));
   PE #(.W(W)) PE21(.i_clk(i_clk),.i_rst(i_rst),.i_en(i_en), .i_mode(i_mode), .i_A(pe_a_20_21),.i_B(pe_b_11_21),.o_A(pe_a_21_22),.o_B(),.o_C(c21));
   PE #(.W(W)) PE22(.i_clk(i_clk),.i_rst(i_rst),.i_en(i_en), .i_mode(i_mode), .i_A(pe_a_21_22),.i_B(pe_b_12_22),.o_A(),          .o_B(),.o_C(c22));


   // https://stackoverflow.com/questions/18067571/indexing-vectors-and-arrays-with
   // https://standards.ieee.org/ieee/1800/6700/
   // a_vect[ 0 +: 8] // == a_vect[ 7 : 0]
   //assign o_C[1 * O_VEC_WIDTH - 1 -: O_VEC_WIDTH] = c00;

   assign o_C[0 * O_VEC_WIDTH +: O_VEC_WIDTH] = c00;
   assign o_C[1 * O_VEC_WIDTH +: O_VEC_WIDTH] = c01;
   assign o_C[2 * O_VEC_WIDTH +: O_VEC_WIDTH] = c02;
   assign o_C[3 * O_VEC_WIDTH +: O_VEC_WIDTH] = c10;
   assign o_C[4 * O_VEC_WIDTH +: O_VEC_WIDTH] = c11;
   assign o_C[5 * O_VEC_WIDTH +: O_VEC_WIDTH] = c12;
   assign o_C[6 * O_VEC_WIDTH +: O_VEC_WIDTH] = c20;
   assign o_C[7 * O_VEC_WIDTH +: O_VEC_WIDTH] = c21;
   assign o_C[8 * O_VEC_WIDTH +: O_VEC_WIDTH] = c22;

   assign debug_pe_a = {pe_a_00_01, pe_a_01_02, pe_a_10_11, pe_a_11_12, pe_a_20_21, pe_a_21_22};
   assign debug_pe_b = {pe_b_00_10, pe_b_01_11, pe_b_02_12, pe_b_10_20, pe_b_11_21, pe_b_12_22};

 endmodule
 `endif
 `ifndef CLA_NBIT_V_
 `define CLA_NBIT_V_

 `timescale 1ns / 1ps

 // Carry Look-ahead adder (CLA)
 module cla_nbit #(
   parameter n = 4
 ) (
   input   [n-1:0] a,
   input   [n-1:0] b,
   input           ci,
   output  [n-1:0] s,
   output          co
 );

   wire [n-1:0] g;
   wire [n-1:0] p;
   wire [  n:0] c;

   assign c[0] = ci;
   assign co   = c[n];

   genvar i;  /* i - generate index variable */

   generate
     for (i = 0; i < n; i = i + 1) begin : addbit
       assign s[i] = a[i] ^ b[i] ^ c[i];
       assign g[i] = a[i] & b[i];
       assign p[i] = a[i] | b[i];
       assign c[i + 1] = g[i] | (p[i] & c[i]);
     end
   endgenerate

 endmodule
 `endif
 `ifndef INT_FP_ADD_V_
 `define INT_FP_ADD_V_


 `timescale 1ns / 1ps

 module int_fp_add (
 `ifdef PIPELINE
   input         clk,
   input         rst_n,
 `endif
   input         mode,
   input  [15:0] a,
   input  [15:0] b,
   output [15:0] c
 );

   wire [10:0] adder_input_1,adder_input_2,aligned_small,adder_output;
   wire if_sub,a_sign, b_sign, c_sign,c1, c2;
   wire [15:0] normalized_out;

   // only used in INT8 MAC mode
   wire [4:0] higher_add,higher_a,higher_b;

   wire [15:0] result;
   reg [14:0] bigger, smaller;
   reg a_larger_b;

 `ifdef PIPELINE
   reg [14:0] bigger_reg, smaller_reg;
   reg [10:0] adder_output_reg;
   wire [14:0] bigger_tmp, smaller_tmp;
   wire [10:0] adder_output_tmp;
 `endif


   assign a_sign        = a[15];
   assign b_sign        = b[15];
   assign if_sub        = (a_sign == b_sign) ? 1'b0 : 1'b1;
   assign c_sign        = a_larger_b ? a_sign : b_sign;
   assign higher_a      = (mode == 1'b0) ? a[15:11] : 5'b0;
   assign higher_b      = (mode == 1'b0) ? b[15:11] : 5'b0;
   assign adder_input_1 = (mode==1'b0) ? a[10:0] :{1'b1,bigger[9:0]};
   assign adder_input_2 = (mode==1'b0) ? b[10:0] : (if_sub ? ~aligned_small + 1'b1 : aligned_small);
   assign c             = (mode == 1'b0) ? {higher_add,adder_output} : result;

   //compare two number regardless sign
   always @(*) begin
     if (a[14:0] > b[14:0]) begin
       bigger = a[14:0];
       smaller = b[14:0];
       a_larger_b = 1'b1;
     end else begin
       bigger = b[14:0];
       smaller = a[14:0];
       a_larger_b = 1'b0;
     end
   end

 `ifdef PIPELINE
     always @ (posedge clk or negedge rst_n) begin
       if (!rst_n) begin
         bigger_reg <= 15'b0;
         smaller_reg <= 15'b0;
         adder_output_reg <= 11'b0;
       end else begin
         bigger_reg <= bigger;
         smaller_reg <= smaller;
         adder_output_reg <= adder_output;
       end
     end
     assign bigger_tmp = bigger_reg[14:0];
     assign smaller_tmp = smaller_reg[14:0];
     assign adder_output_tmp = adder_output_reg[10:0];
 `endif

 `ifdef PIPELINE
   // align small number
   alignment u1(bigger_tmp,smaller_tmp,aligned_small);
 `else
   // align small number
   alignment u1(bigger,smaller,aligned_small);
 `endif

   cla_nbit #(.n(11)) u2(adder_input_1,adder_input_2,1'b0,adder_output,c1);

   // This 5 bit adder only used in INT8 MAC mode
   cla_nbit #(.n(5)) u3(higher_a,higher_b,c1,higher_add,c2);

 `ifdef PIPELINE
   add_normalizer u4(c_sign,bigger[14:10],adder_output_tmp,result,c1,if_sub);
 `else
   add_normalizer u4(c_sign,bigger[14:10],adder_output,result,c1,if_sub);
 `endif

 endmodule
 `endif
 `ifndef INT_FP_MUL_V_
 `define INT_FP_MUL_V_


 module int_fp_mul (
 `ifdef PIPELINE
   input         clk,
   input         rst_n,
 `endif
   input         mode,
   input  [15:0] a,
   input  [15:0] b,
   output [15:0] c,
   output        error // valid in fp16 mode
 );

   wire [15:0] c_tmp;
   wire        c_sign,a_zero,b_zero;
   wire [ 4:0] sum_exponent, biased_sum_exponent;
   wire [15:0] multiplier_input1,multiplier_input2;

   wire [31:0] multiplier_output;
   wire [14:0] normalized_out;
   wire [21:0] mantissa_prod;
   wire c1,c2,underflow,overflow;

   assign overflow = (c1 && c2 && ~biased_sum_exponent[4]) ? 1'b1 :1'b0;
   assign underflow = (~c1 && ~c2 && biased_sum_exponent[4]) ? 1'b1:1'b0;

   assign a_zero = ~(|a);
   assign b_zero = ~(|b);
   assign c_sign = a[15] ^ b[15];
   assign multiplier_input1 = mode ? {5'b0,1'b1,a[9:0]} : ((a[7]==1'b0) ? {9'b0,a[6:0]} : {9'b0,~a[6:0]+1'b1});
   assign multiplier_input2 = mode ? {5'b0,1'b1,b[9:0]} : ((b[7]==1'b0) ? {9'b0,b[6:0]} : {9'b0,~b[6:0]+1'b1});

   assign c = mode ? ((a_zero | b_zero) ? 16'b0 : c_tmp) : ((a[7]^b[7] == 1'b0) ? multiplier_output[15:0] : {1'b1,~multiplier_output[14:0]+1'b1});
   //error detect
   assign c_tmp = (~error) ? {c_sign,normalized_out} : (underflow ? {c_sign,15'b0000_0000_0000_000} : {c_sign,5'b1111_1,10'b0000_0000_00});

   assign error = overflow | underflow;


 `ifdef PIPELINE

   reg [31:0] multiplier_output_tmp;

   always @ (posedge clk or negedge rst_n) begin
     if (!rst_n) begin
       multiplier_output_tmp <= 32'b0;
     end else begin
       multiplier_output_tmp <= multiplier_output;
     end
   end

   assign mantissa_prod = multiplier_output_tmp[21:0];
   mul16x16 u1(clk,rst_n,multiplier_input1,multiplier_input2,multiplier_output);

 `else

   assign mantissa_prod = multiplier_output[21:0];
   mul16x16 u1(multiplier_input1,multiplier_input2,multiplier_output);

 `endif

   cla_nbit #(.n(5)) u2(a[14:10],b[14:10],1'b0,sum_exponent,c1); // add exponent
   cla_nbit #(.n(5)) u3(sum_exponent, 5'b10001,1'b0,biased_sum_exponent,c2); // minus bias
   mul_normalizer u4(biased_sum_exponent,mantissa_prod,normalized_out);

 endmodule
 `endif
 `ifndef MUL_NORMALIZER_V_
 `define MUL_NORMALIZER_V_

 `timescale 1ns / 1ps

 module mul_normalizer (
   input  [ 4:0] exponent,
   input  [21:0] mantissa_prod,
   output [14:0] result
 );

   wire [4:0] result_exponent;
   wire [9:0] result_mantissa;

   assign result_exponent = (mantissa_prod[21]) ? (exponent + 1'b1): exponent;
   assign result_mantissa = (mantissa_prod[21]) ? mantissa_prod[20:11]:mantissa_prod[19:10];
   assign result          = {result_exponent,result_mantissa};

 // No rounding and No overflow/underflow detection

 endmodule
 `endif
 // SPDX-FileCopyrightText: 2020 Efabless Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //      http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // SPDX-License-Identifier: Apache-2.0

 `default_nettype none
 /*
  *-------------------------------------------------------------
  *
  * user_proj_example
  *
  * This is an example of a (trivially simple) user project,
  * showing how the user project can connect to the logic
  * analyzer, the wishbone bus, and the I/O pads.
  *
  * This project generates an integer count, which is output
  * on the user area GPIO pads (digital output only).  The
  * wishbone connection allows the project to be controlled
  * (start and stop) from the management SoC program.
  *
  * See the testbenches in directory "mprj_counter" for the
  * example programs that drive this user project.  The three
  * testbenches are "io_ports", "la_test1", and "la_test2".
  *
  *-------------------------------------------------------------
  */

 module user_proj_example #(
     parameter BITS = 32
 )(
 `ifdef USE_POWER_PINS
     inout vccd1,	// User area 1 1.8V supply
     inout vssd1,	// User area 1 digital ground
 `endif

     // Wishbone Slave ports (WB MI A)
     input wb_clk_i,
     input wb_rst_i,
     input wbs_stb_i,
     input wbs_cyc_i,
     input wbs_we_i,
     input [3:0] wbs_sel_i,
     input [31:0] wbs_dat_i,
     input [31:0] wbs_adr_i,
     output wbs_ack_o,
     output [31:0] wbs_dat_o,

     // Logic Analyzer Signals
     input  [127:0] la_data_in,
     output [127:0] la_data_out,
     input  [127:0] la_oenb,

     // IOs
     input  [`MPRJ_IO_PADS-1:0] io_in,
     output [`MPRJ_IO_PADS-1:0] io_out,
     output [`MPRJ_IO_PADS-1:0] io_oeb,

     // IRQ
     output [2:0] irq
 );
     wire clk;
     wire rst;

     wire [`MPRJ_IO_PADS-1:0] io_in;
     wire [`MPRJ_IO_PADS-1:0] io_out;
     wire [`MPRJ_IO_PADS-1:0] io_oeb;

     wire [31:0] rdata;
     wire [31:0] wdata;
     wire [BITS-1:0] count;

     wire valid;
     wire [3:0] wstrb;
     wire [31:0] la_write;

     // WB MI A
     assign valid = wbs_cyc_i && wbs_stb_i;
     assign wstrb = wbs_sel_i & {4{wbs_we_i}};
     assign wbs_dat_o = rdata;
     assign wdata = wbs_dat_i;

     // IO
     assign io_out = count;
     assign io_oeb = {(`MPRJ_IO_PADS-1){rst}};

     // IRQ
     assign irq = 3'b000;	// Unused

     // LA
     //assign la_data_out = {{(127-BITS){1'b0}}, count};
     // Assuming LA probes [63:32] are for controlling the count register
     assign la_write = ~la_oenb[63:32] & ~{BITS{valid}};
     // Assuming LA probes [65:64] are for controlling the count clk & reset
     assign clk = (~la_oenb[64]) ? la_data_in[64]: wb_clk_i;
     assign rst = (~la_oenb[65]) ? la_data_in[65]: wb_rst_i;

     wire cs;
     // Assuming LA probes [66] are for controlling cs (data ready)
     assign cs = (~la_oenb[66]) ? la_data_in[66] : 0;

     // Assuming LA probes [77] are for controlling wr

     wire wr;
     assign wr = (~la_oenb[77]) ? la_data_in[77] : 0;

     wire [31:0] bank2;
     wire done_o_net;
     //assign bank2 = {{(24){1'b0}}, done_o_net, {(7){1'b0}}};
     assign bank2 = {{(17){1'b0}}, mem_set_done_o_net, {(6){1'b0}}, done_o_net, {(7){1'b0}}};
     assign la_data_out = {{(BITS){1'b0}}, bank2, {(BITS){1'b0}}, count};

     /*
     counter #(
         .BITS(BITS)
     ) counter(
         .clk(clk),
         .reset(rst),
         .ready(wbs_ack_o),
         .valid(valid),
         .rdata(rdata),
         .wdata(wbs_dat_i),
         .wstrb(wstrb),
         .la_write(la_write),
         .la_input(la_data_in[63:32]),
         .count(count)
     );
     */

     wire mem_set_done_o_net;

     interface_top interface_inst(
       .clk(clk),
       .rst(rst),
       .cs(cs),
       .sync(wr),
       .data_addr_i(la_data_in[76:72]),
       .readout_addr(la_data_in[70:67]),
       .data_in(la_data_in[63:32]),
       .data_out(count),
       .done_o(done_o_net),
       .mem_set_done_o(mem_set_done_o_net)
     );

 endmodule

 module counter #(
     parameter BITS = 32
 )(
     input clk,
     input reset,
     input valid,
     input [3:0] wstrb,
     input [BITS-1:0] wdata,
     input [BITS-1:0] la_write,
     input [BITS-1:0] la_input,
     output ready,
     output [BITS-1:0] rdata,
     output [BITS-1:0] count
 );
     reg ready;
     reg [BITS-1:0] count;
     reg [BITS-1:0] rdata;

     always @(posedge clk) begin
         if (reset) begin
             count <= 0;
             ready <= 0;
         end else begin
             ready <= 1'b0;
             if (~|la_write) begin
                 count <= count + 1;
             end
             if (valid && !ready) begin
                 ready <= 1'b1;
                 rdata <= count;
                 if (wstrb[0]) count[7:0]   <= wdata[7:0];
                 if (wstrb[1]) count[15:8]  <= wdata[15:8];
                 if (wstrb[2]) count[23:16] <= wdata[23:16];
                 if (wstrb[3]) count[31:24] <= wdata[31:24];
             end else if (|la_write) begin
                 count <= la_write & la_input;
             end
         end
     end

 endmodule
 `default_nettype wire

 `default_nettype none

 // Looks like we need our own version of transactional memory definition
 // SPI alike
 // 3 x 3, 3 x 3 -- 18 in
 // 3 x 3 9 out
 // GEMM

 module interface_top (
   input  wire        clk,
   input  wire        rst,
   input  wire        sync, // wr
   input  wire        cs,   // data ready, en
   input  wire [ 4:0] data_addr_i,    // input   addr
   input  wire [ 3:0] readout_addr,   // output  addr
   input  wire [31:0] data_in,
   output reg  [31:0] data_out,
   output wire        done_o,
   output wire        mem_set_done_o
 );

   localparam W = 16;
   localparam N = 3;

   //wire done_o;
   //assign data_out = 1;
   reg [31:0] next_data_out;

   // scratch pad
   reg  [2 * W * N * N - 1 : 0] input_registers;
   reg  [2 * W * N * N - 1 : 0] next_input_registers;
   wire [    W * N * N - 1 : 0] C_mat;

   // 16bit - 8
   // 32bit - 9
   // clog2 function
   //reg  [7 : 0] addr_ptr;

   wire [W * N * N - 1 : 0] A_mat;
   wire [W * N * N - 1 : 0] B_mat;

   assign A_mat = input_registers[        0 +: W * N * N];
   assign B_mat = input_registers[W * N * N +: W * N * N];

   // mode selection
   // First try on FP16
   reg [2:0]      state;
   reg [2:0] next_state;

   wire [2:0] IDLE    = 3'b000;
   wire [2:0] LOAD    = 3'b001;
   wire [2:0] PROCESS = 3'b010;

   // Refactor: Moving them inside control
   wire [W * N - 1 : 0] A_in;
   wire [W * N - 1 : 0] B_in;

   control #(.W(W), .N(N)) control_inst(
     .i_clk(clk),
     .i_rst(rst),
     .i_en(cs),
     .i_mode(1'b1),
     .i_A(A_mat),
     .i_B(B_mat),
     .o_C(C_mat),
     .o_done(done_o)
   );

   // memory counter
   // onehot
   reg [17:0] memory_set;
   reg [17:0] next_memory_set;

   // sync in control or PEs might be redundant

   always @(posedge clk) begin
     // TODO: bubble up, as a single signal
     if (rst | sync) begin
       // Force combo circuits!!
       state           <= IDLE;
       data_out        <= 32'b0;
       memory_set      <= 18'b0;
       input_registers <= 288'b0;
     end
     else begin
       state           <= next_state;
       memory_set      <= next_memory_set;
       input_registers <= next_input_registers;
       data_out 	      <= next_data_out;
     end
   end

   // always @(sync) begin
   //   if (sync) begin
   //     next_state <= 0;
   //   end
   // end

   // Should we also use memory counter here?
   always @(*) begin
     case (state)
       IDLE: begin
 	next_state = cs ? PROCESS : IDLE;
       end
       PROCESS: begin
 	next_state = (done_o | ~cs) ? IDLE : PROCESS;
       end
       default: begin
 	next_state = IDLE;
       end
     endcase
   end
   // Tips: Moving comb logic out of the state machine

   // input and output are concurrent
   always @(*) begin
     next_input_registers = input_registers;
     case (data_addr_i)
       5'b00000: begin
 	next_input_registers[5'b00000 * W +: W] = data_in[W - 1 : 0];
       end
       5'b00001: begin
 	next_input_registers[5'b00001 * W +: W] = data_in[W - 1 : 0];
       end
       5'b00010: begin
 	next_input_registers[5'b00010 * W +: W] = data_in[W - 1 : 0];
       end
       5'b00011: begin
 	next_input_registers[5'b00011 * W +: W] = data_in[W - 1 : 0];
       end
       5'b00100: begin
 	next_input_registers[5'b00100 * W +: W] = data_in[W - 1 : 0];
       end
       5'b00101: begin
 	next_input_registers[5'b00101 * W +: W] = data_in[W - 1 : 0];
       end
       5'b00110: begin
 	next_input_registers[5'b00110 * W +: W] = data_in[W - 1 : 0];
       end
       5'b00111: begin
 	next_input_registers[5'b00111 * W +: W] = data_in[W - 1 : 0];
       end
       5'b01000: begin
 	next_input_registers[5'b01000 * W +: W] = data_in[W - 1 : 0];
       end
       5'b01001: begin
 	next_input_registers[5'b01001 * W +: W] = data_in[W - 1 : 0];
       end
       5'b01010: begin
 	next_input_registers[5'b01010 * W +: W] = data_in[W - 1 : 0];
       end
       5'b01011: begin
 	next_input_registers[5'b01011 * W +: W] = data_in[W - 1 : 0];
       end
       5'b01100: begin
 	next_input_registers[5'b01100 * W +: W] = data_in[W - 1 : 0];
       end
       5'b01101: begin
 	next_input_registers[5'b01101 * W +: W] = data_in[W - 1 : 0];
       end
       5'b01110: begin
 	next_input_registers[5'b01110 * W +: W] = data_in[W - 1 : 0];
       end
       5'b01111: begin
 	next_input_registers[5'b01111 * W +: W] = data_in[W - 1 : 0];
       end
       5'b10000: begin
 	next_input_registers[5'b10000 * W +: W] = data_in[W - 1 : 0];
       end
       5'b10001: begin
 	next_input_registers[5'b10001 * W +: W] = data_in[W - 1 : 0];
       end
       // Avoid inferring Latches
       default: begin
 	next_input_registers = input_registers;
       end
     endcase
   end

   // memory set decoder
   always @(*) begin
     next_memory_set = memory_set;
     case (data_addr_i)
       5'b00000: begin
         next_memory_set = memory_set | (18'b1 << 0);
       end
       5'b00001: begin
         next_memory_set = memory_set | (18'b1 << 1);
       end
       5'b00010: begin
         next_memory_set = memory_set | (18'b1 << 2);
       end
       5'b00011: begin
         next_memory_set = memory_set | (18'b1 << 3);
       end
       5'b00100: begin
         next_memory_set = memory_set | (18'b1 << 4);
       end
       5'b00101: begin
         next_memory_set = memory_set | (18'b1 << 5);
       end
       5'b00110: begin
         next_memory_set = memory_set | (18'b1 << 6);
       end
       5'b00111: begin
         next_memory_set = memory_set | (18'b1 << 7);
       end
       5'b01000: begin
         next_memory_set = memory_set | (18'b1 << 8);
       end
       5'b01001: begin
         next_memory_set = memory_set | (18'b1 << 9);
       end
       5'b01010: begin
         next_memory_set = memory_set | (18'b1 << 10);
       end
       5'b01011: begin
         next_memory_set = memory_set | (18'b1 << 11);
       end
       5'b01100: begin
         next_memory_set = memory_set | (18'b1 << 12);
       end
       5'b01101: begin
         next_memory_set = memory_set | (18'b1 << 13);
       end
       5'b01110: begin
         next_memory_set = memory_set | (18'b1 << 14);
       end
       5'b01111: begin
         next_memory_set = memory_set | (18'b1 << 15);
       end
       5'b10000: begin
         next_memory_set = memory_set | (18'b1 << 16);
       end
       5'b10001: begin
         next_memory_set = memory_set | (18'b1 << 17);
       end
       default: begin
         next_memory_set = memory_set;
       end
     endcase
   end

   assign mem_set_done_o = memory_set == {(18){1'b1}};

   // Readout addr
   always @(*) begin
     next_data_out = data_out;
     case (readout_addr)
       4'b0000: begin
         next_data_out[W - 1 : 0] = C_mat[4'b0000 * W +: W];
       end
       4'b0001: begin
         next_data_out[W - 1 : 0] = C_mat[4'b0001 * W +: W];
       end
       4'b0010: begin
         next_data_out[W - 1 : 0] = C_mat[4'b0010 * W +: W];
       end
       4'b0011: begin
         next_data_out[W - 1 : 0] = C_mat[4'b0011 * W +: W];
       end
       4'b0100: begin
         next_data_out[W - 1 : 0] = C_mat[4'b0100 * W +: W];
       end
       4'b0101: begin
         next_data_out[W - 1 : 0] = C_mat[4'b0101 * W +: W];
       end
       4'b0110: begin
         next_data_out[W - 1 : 0] = C_mat[4'b0110 * W +: W];
       end
       4'b0111: begin
         next_data_out[W - 1 : 0] = C_mat[4'b0111 * W +: W];
       end
       4'b1000: begin
         next_data_out[W - 1 : 0] = C_mat[4'b1000 * W +: W];
       end
       // Avoid inferring Latches
       default: begin
 	next_data_out = data_out;
       end
     endcase
   end

 endmodule

 `default_nettype wire