External input ctrl (#6)

* Add end-to-end readout logic and testing

Fix Verilog inference errors

Attempt to add state control

Signed-off-by: ianboyanzhang

Fix errors reported by Yosys

* Fix OpenLane layout config (#5)
diff --git a/README.md b/README.md
index 3077244..02a024e 100644
--- a/README.md
+++ b/README.md
@@ -1,12 +1,4 @@
 # Caravel User Project
 
-[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) [![UPRJ_CI](https://github.com/efabless/caravel_project_example/actions/workflows/user_project_ci.yml/badge.svg)](https://github.com/efabless/caravel_project_example/actions/workflows/user_project_ci.yml) [![Caravel Build](https://github.com/efabless/caravel_project_example/actions/workflows/caravel_build.yml/badge.svg)](https://github.com/efabless/caravel_project_example/actions/workflows/caravel_build.yml)
+MPW 7 Systolic Array submission
 
-| :exclamation: Important Note            |
-|-----------------------------------------|
-
-## Please fill in your project documentation in this README.md file 
-
-Refer to [README](docs/source/quickstart.rst) for a quick start of how to use caravel_user_project
-
-Refer to [README](docs/source/index.rst) for this sample project documentation. 
diff --git a/openlane/user_proj_example/config.tcl b/openlane/user_proj_example/config.tcl
index 216cba6..55c2d7a 100755
--- a/openlane/user_proj_example/config.tcl
+++ b/openlane/user_proj_example/config.tcl
@@ -19,6 +19,7 @@
 set script_dir [file dirname [file normalize [info script]]]
 
 set ::env(DESIGN_NAME) user_proj_example
+#set ::env(DESIGN_NAME) add_normalizer
 
 set ::env(VERILOG_FILES) "\
 	$::env(CARAVEL_ROOT)/verilog/rtl/defines.v \
@@ -28,7 +29,8 @@
 
 set ::env(CLOCK_PORT) "wb_clk_i"
 set ::env(CLOCK_NET) "counter.clk"
-set ::env(CLOCK_PERIOD) "10"
+#set ::env(CLOCK_PERIOD) "10"
+set ::env(CLOCK_PERIOD) "20"
 
 set ::env(FP_SIZING) absolute
 set ::env(DIE_AREA) "0 0 900 600"
@@ -36,7 +38,8 @@
 set ::env(FP_PIN_ORDER_CFG) $script_dir/pin_order.cfg
 
 set ::env(PL_BASIC_PLACEMENT) 0
-set ::env(PL_TARGET_DENSITY) 0.05
+#set ::env(PL_TARGET_DENSITY) 0.05
+set ::env(PL_TARGET_DENSITY) 0.26
 
 # Maximum layer used for routing is metal 4.
 # This is because this macro will be inserted in a top level (user_project_wrapper) 
diff --git a/verilog/dv/la_test2/la_test2.c b/verilog/dv/la_test2/la_test2.c
index 003bf4e..4825db7 100644
--- a/verilog/dv/la_test2/la_test2.c
+++ b/verilog/dv/la_test2/la_test2.c
@@ -29,6 +29,7 @@
 int clk = 0;
 int i;
 
+/*
 uint32_t mat_A[9] = {
   //1.126105
   0x00003c81,
@@ -70,6 +71,34 @@
   //2.339815
   0x000040ae
 };
+*/
+
+uint32_t mat_A[9] = {
+  // 1
+  0x00003c00,
+  0x00003c00,
+  0x00003c00,
+  0x00003c00,
+  0x00003c00,
+  0x00003c00,
+  0x00003c00,
+  0x00003c00,
+  0x00003c00,
+};
+
+uint32_t mat_B[9] = {
+  // 1
+  0x00003c00,
+  0x00003c00,
+  0x00003c00,
+  0x00003c00,
+  0x00003c00,
+  0x00003c00,
+  0x00003c00,
+  0x00003c00,
+  0x00003c00,
+};
+
 
 void main()
 {
@@ -136,13 +165,22 @@
 	reg_mprj_datal = 0xAB600000;
 
 	// Configure LA[64] LA[65] LA[66] as outputs from the cpu
-	reg_la2_oenb = reg_la2_iena = 0x00000007;
+	// Configure LA[67] LA[68] LA[69] LA[70] as outputs from the CPU as readout address select
+	// Configure LA[71] as input to the CPU for o_done signal
+	// Configure LA[72] LA[73] LA[74] LA[75] LA[76] as outputs from the CPU as input matrix address select
+	// Configure LA[77] as output from the CPU as 'control module sync' or write enable. (sync/wr)
+	// Configure LA[78] as input to the CPU as mem_set_done_oc
+	reg_la2_oenb = reg_la2_iena = 0x000003F7F;
+
 	// clk, reset, cs
 	//reg_la2_oenb = reg_la2_iena = 0x00000007; 
 
 	// Set clk & reset to one
 	reg_la2_data = 0x00000003;
 
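+	// Set sync/wr (LA[77]) to one. NOTE(review): LA[77] is bit 13 of reg_la2_data (0x00002000); the mask 0x00001000 used below is bit 12 (LA[76]) - confirm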
+	reg_la2_data = reg_la2_data | 0x00001000;
+
 	// Configure LA[63:32] output from the cpu
 	reg_la1_oenb = reg_la1_iena = 0xFFFFFFFF;
 	reg_la1_data = 0x00000000;
@@ -160,15 +198,18 @@
 	reg_la2_data = reg_la2_data | 0x00000004;
 
 	uint32_t i_mat = 0;
+	uint32_t mat_addr = 0x00000000; 
 
 	// Toggle clk & send mat_A data
 	for (i=0; i<17; i=i+1) {
 	    clk = !clk;
 	    reg_la2_data = 0x00000000 | clk;
 	    reg_la2_data = reg_la2_data | 0x00000004;
+	    reg_la2_data = reg_la2_data | mat_addr;
 	    if (clk == 0) {
 	        reg_la1_data = mat_A[i_mat];
 		i_mat += 1;
+		mat_addr += 0x00000100;
 	    } 
 	}
 
@@ -178,9 +219,11 @@
 	    clk = !clk;
 	    reg_la2_data = 0x00000000 | clk;
 	    reg_la2_data = reg_la2_data | 0x00000004;
+	    reg_la2_data = reg_la2_data | mat_addr;
 	    if (clk == 0) {
 	        reg_la1_data = mat_B[i_mat];
 		i_mat += 1;
+		mat_addr += 0x00000100;
 	    } 
 	}
 
@@ -193,8 +236,12 @@
         while (1){
 		clk = !clk;
 		reg_la2_data = 0x00000000 | clk;
+	    	reg_la2_data = reg_la2_data | 0x00000004;
 
-                if ((reg_la0_data_in & 0x0000FFFF) >= 0x00000015) {
+                if ((reg_la0_data_in & 0x0000FFFF) >= 0x00004200 &&
+		    (reg_la2_data_in & 0x0000FFFF) == 0x00000080) {
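+			// De-assert cs. NOTE(review): the mask below clears bit 12 (LA[76]); cs is LA[66] (0x00000004) and sync/wr is LA[77] (0x00002000) - confirm which is meant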
+			reg_la2_data = reg_la2_data & ~(0x00001000);
                         reg_mprj_datal = 0xAB610000;
                         break;
                 }
diff --git a/verilog/dv/la_test2/la_test2_tb.v b/verilog/dv/la_test2/la_test2_tb.v
index 613bfa3..3c4d790 100644
--- a/verilog/dv/la_test2/la_test2_tb.v
+++ b/verilog/dv/la_test2/la_test2_tb.v
@@ -141,7 +141,7 @@
 		// Repeat cycles of 1000 clock edges as needed to complete testbench
 		repeat (75) begin
 			//repeat (3000) @(posedge clock);
-			repeat (7000) @(posedge clock);
+			repeat (6000) @(posedge clock);
 			// $display("+1000 cycles");
 		end
 		$display("%c[1;31m",27);
diff --git a/verilog/rtl/user_proj_example.v b/verilog/rtl/user_proj_example.v
index 8fa97d2..0b03d4b 100644
--- a/verilog/rtl/user_proj_example.v
+++ b/verilog/rtl/user_proj_example.v
@@ -114,7 +114,7 @@
   end
   
   always @(*) begin
-    if (next_states == 4'b1001) begin
+    if (states == 4'b1001) begin
       // Done: Force waiting
       // This can also be done by switching off input
       next_states = 4'b1001;
@@ -344,810 +344,7 @@
   );
 
   always @(posedge clk) begin
-    outp <= rst ? 0 : inp;
-  end
-
-endmodule
-
-`default_nettype wire
-`endif
-`ifndef MUL_2x2_V_
-`define MUL_2x2_V_
-
-`timescale 1ns / 1ps
-
-module mul2x2(
-  input  [1:0] a,
-  input  [1:0] b,
-  output [3:0] c
-);
-
-  wire [3:0] tmp;
-
-  assign tmp[0] = a[0] & b[0];
-  assign tmp[1] = (a[1]&b[0]) ^ (a[0]&b[1]);
-  assign tmp[2] = (a[0]&b[1]) & (a[1]&b[0]) ^ (a[1]&b[1]);
-  assign tmp[3] = (a[0]&b[1]) & (a[1]&b[0]) & (a[1]&b[1]);
-  assign c 	= {tmp[3],tmp[2],tmp[1],tmp[0]};
-
-endmodule
-`endif
-`ifndef MUL_4x4_V_
-`define MUL_4x4_V_
-
-`timescale 1ns / 1ps
-
-module mul4x4(
-  input  [3:0] a,
-  input  [3:0] b,
-  output [7:0] c
-);
-
-  wire [15:0] tmp1;
-  wire [ 5:0] result1;
-  wire [ 5:0] result2;
-  wire 	      co1,co2,co3;
-
-  mul2x2 u1(a[3:2],b[3:2],tmp1[15:12]);
-  mul2x2 u2(a[1:0],b[3:2],tmp1[11:8]);
-  mul2x2 u3(a[3:2],b[1:0],tmp1[7:4]);
-  mul2x2 u4(a[1:0],b[1:0],tmp1[3:0]);
-
-  cla_nbit #(.n(6)) u5({tmp1[15:12],2'b0},{2'b0,tmp1[11:8]},1'b0	,result1	,co1);
-  cla_nbit #(.n(6)) u6({2'b0,tmp1[7:4]}  ,{4'b0,tmp1[3:2]} ,co1 	,result2	,co2);
-  cla_nbit #(.n(6)) u7(result1           ,result2	   ,co2 	,c[7:2] 	,co3);
-
-  assign c[1:0] = tmp1[1:0];
-
-endmodule
-`endif
-`ifndef MUL_8x8_V_
-`define MUL_8x8_V_
-
-`timescale 1ns / 1ps
-
-module mul8x8(
-  input  [ 7:0] a,
-  input  [ 7:0] b,
-  output [15:0] c
-);
-
-  wire [31:0] tmp1;
-  wire [11:0] result1;
-  wire [11:0] result2;
-  wire        co1,co2,co3;
-
-  mul4x4 u1(a[7:4],b[7:4],tmp1[31:24]);
-  mul4x4 u2(a[3:0],b[7:4],tmp1[23:16]);
-  mul4x4 u3(a[7:4],b[3:0],tmp1[15:8]);
-  mul4x4 u4(a[3:0],b[3:0],tmp1[7:0]);
-
-  cla_nbit #(.n(12)) u5({tmp1[31:24],4'b0} ,{4'b0,tmp1[23:16]} ,1'b0 ,result1 ,co1);
-  cla_nbit #(.n(12)) u6({4'b0,tmp1[15:8]}  ,{8'b0,tmp1[7:4]}   ,co1  ,result2 ,co2);
-  cla_nbit #(.n(12)) u7(result1		   ,result2	       ,co2  ,c[15:4] ,co3);
-
-  assign c[3:0] = tmp1[3:0];
-
-endmodule
-`endif
-`ifndef MUL_16x16_V_
-`define MUL_16x16_V_
-
-`timescale 1ns / 1ps
-
-module mul16x16(
-`ifdef PIPLINE
-  input clk,
-  input rst_n,
-`endif
-  input  [15:0] a,
-  input  [15:0] b,
-  output [31:0] c);
-
-  wire [63:0] tmp1,tmp2;
-  wire [23:0] result1;
-  wire [23:0] result2;
-  wire co1,co2,co3;
-
-`ifdef PIPLINE
-  // one stage pipline
-  reg [63:0] tmp1_reg;
-  always @ (posedge clk or negedge rst_n) begin
-    if (!rst_n) begin
-      tmp1_reg <= 64'b0;
-    end else begin
-      tmp1_reg <= tmp1;
-    end
-  end
-  assign tmp2 = tmp1_reg;
-
-`else 
-  assign tmp2 = tmp1;
-
-`endif
-
-  mul8x8 u1(a[15:8],b[15:8],tmp1[63:48]);
-  mul8x8 u2(a[7:0] ,b[15:8],tmp1[47:32]);
-  mul8x8 u3(a[15:8],b[ 7:0],tmp1[31:16]);
-  mul8x8 u4(a[7:0] ,b[ 7:0],tmp1[15:0]);
-
-  cla_nbit #(.n(24)) u5({tmp2[63:48],8'b0} ,{8'b0,tmp2[47:32]} ,1'b0 ,result1 ,co1);
-  cla_nbit #(.n(24)) u6({8'b0,tmp2[31:16]} ,{16'b0,tmp2[15:8]} ,co1  ,result2 ,co2);
-  cla_nbit #(.n(24)) u7(result1            ,result2            ,co2  ,c[31:8] ,co3);
-
-  assign c[7:0] = tmp2[7:0];
-
-endmodule
-`endif
-`ifndef ALIGNMENT_V_
-`define ALIGNMENT_V_
-
-`timescale 1ns / 1ps
-
-module alignment (
-  input  [14:0] bigger, 
-  input  [14:0] smaller,
-  output [10:0] aligned_small
-);
-
-  wire c1;
-  wire [4:0] bigger_exponent, smaller_exponent, shift_bits;
-
-  assign bigger_exponent  = bigger  [14:10];
-  assign smaller_exponent = smaller [14:10];
-  assign aligned_small    = ({1'b1,smaller[9:0]} >> shift_bits);
-
-  cla_nbit #(.n(5)) u1(bigger_exponent,~smaller_exponent+1'b1,1'b0,shift_bits,c1);
-
-endmodule
-`endif
-`ifndef SYSTOLIC_V_
-`define SYSTOLIC_V_
-
-
-`timescale 1ns / 1ps
-`default_nettype none
-
-// row
-// 3 x 3
-module systolic #(
-  parameter W = 16,
-  parameter N = 3
-) (
-  input  wire                         i_clk,
-  input  wire                         i_rst,
-  input  wire                         i_en,
-  input  wire                         i_mode,
-  input  wire [        W * N - 1 : 0] i_A,
-  input  wire [        W * N - 1 : 0] i_B,
-  output wire [    W * N * N - 1 : 0] o_C,
-
-  // debug
-  output wire [    W * N * 2 - 1 : 0] debug_pe_a,
-  output wire [    W * N * 2 - 1 : 0] debug_pe_b
-);
-
-  //localparam O_VEC_WIDTH = 2 * W;
-  localparam O_VEC_WIDTH = W;
-
-  wire [W - 1 : 0] a00, a01, a02, b00, b01, b02;
-  wire [W - 1 : 0] pe_a_00_01, pe_a_01_02, pe_a_10_11, pe_a_11_12, pe_a_20_21, pe_a_21_22;
-  wire [W - 1 : 0] pe_b_00_10, pe_b_01_11, pe_b_02_12, pe_b_10_20, pe_b_11_21, pe_b_12_22;
-
-  wire [O_VEC_WIDTH - 1 : 0] c00, c01, c02, c10, c11, c12, c20, c21, c22;
-
-  assign a00 = i_A[0 * W +: W];
-  assign a01 = i_A[1 * W +: W];
-  assign a02 = i_A[2 * W +: W];
-
-  assign b00 = i_B[0 * W +: W];
-  assign b01 = i_B[1 * W +: W];
-  assign b02 = i_B[2 * W +: W];
-
-  PE #(.W(W)) PE00(.i_clk(i_clk),.i_rst(i_rst),.i_en(i_en), .i_mode(i_mode), .i_A(a00),        .i_B(b00), .o_A(pe_a_00_01),.o_B(pe_b_00_10),.o_C(c00));
-  PE #(.W(W)) PE01(.i_clk(i_clk),.i_rst(i_rst),.i_en(i_en), .i_mode(i_mode), .i_A(pe_a_00_01), .i_B(b01), .o_A(pe_a_01_02),.o_B(pe_b_01_11),.o_C(c01));
-  PE #(.W(W)) PE02(.i_clk(i_clk),.i_rst(i_rst),.i_en(i_en), .i_mode(i_mode), .i_A(pe_a_01_02), .i_B(b02), .o_A(),          .o_B(pe_b_02_12),.o_C(c02));
-
-  PE #(.W(W)) PE10(.i_clk(i_clk),.i_rst(i_rst),.i_en(i_en), .i_mode(i_mode), .i_A(a01),       .i_B(pe_b_00_10),.o_A(pe_a_10_11),.o_B(pe_b_10_20),.o_C(c10));
-  PE #(.W(W)) PE11(.i_clk(i_clk),.i_rst(i_rst),.i_en(i_en), .i_mode(i_mode), .i_A(pe_a_10_11),.i_B(pe_b_01_11),.o_A(pe_a_11_12),.o_B(pe_b_11_21),.o_C(c11));
-  PE #(.W(W)) PE12(.i_clk(i_clk),.i_rst(i_rst),.i_en(i_en), .i_mode(i_mode), .i_A(pe_a_11_12),.i_B(pe_b_02_12),.o_A(),          .o_B(pe_b_12_22),.o_C(c12));
-
-  PE #(.W(W)) PE20(.i_clk(i_clk),.i_rst(i_rst),.i_en(i_en), .i_mode(i_mode), .i_A(a02),       .i_B(pe_b_10_20),.o_A(pe_a_20_21),.o_B(),.o_C(c20));
-  PE #(.W(W)) PE21(.i_clk(i_clk),.i_rst(i_rst),.i_en(i_en), .i_mode(i_mode), .i_A(pe_a_20_21),.i_B(pe_b_11_21),.o_A(pe_a_21_22),.o_B(),.o_C(c21));
-  PE #(.W(W)) PE22(.i_clk(i_clk),.i_rst(i_rst),.i_en(i_en), .i_mode(i_mode), .i_A(pe_a_21_22),.i_B(pe_b_12_22),.o_A(),          .o_B(),.o_C(c22));
-
-  
-  // https://stackoverflow.com/questions/18067571/indexing-vectors-and-arrays-with
-  // https://standards.ieee.org/ieee/1800/6700/
-  // a_vect[ 0 +: 8] // == a_vect[ 7 : 0]
-  //assign o_C[1 * O_VEC_WIDTH - 1 -: O_VEC_WIDTH] = c00;
-  
-  assign o_C[0 * O_VEC_WIDTH +: O_VEC_WIDTH] = c00;
-  assign o_C[1 * O_VEC_WIDTH +: O_VEC_WIDTH] = c01;
-  assign o_C[2 * O_VEC_WIDTH +: O_VEC_WIDTH] = c02;
-  assign o_C[3 * O_VEC_WIDTH +: O_VEC_WIDTH] = c10;
-  assign o_C[4 * O_VEC_WIDTH +: O_VEC_WIDTH] = c11;
-  assign o_C[5 * O_VEC_WIDTH +: O_VEC_WIDTH] = c12;
-  assign o_C[6 * O_VEC_WIDTH +: O_VEC_WIDTH] = c20;
-  assign o_C[7 * O_VEC_WIDTH +: O_VEC_WIDTH] = c21;
-  assign o_C[8 * O_VEC_WIDTH +: O_VEC_WIDTH] = c22;
-
-  assign debug_pe_a = {pe_a_00_01, pe_a_01_02, pe_a_10_11, pe_a_11_12, pe_a_20_21, pe_a_21_22};
-  assign debug_pe_b = {pe_b_00_10, pe_b_01_11, pe_b_02_12, pe_b_10_20, pe_b_11_21, pe_b_12_22};
-
-endmodule
-`endif
-`ifndef CLA_NBIT_V_
-`define CLA_NBIT_V_
-
-`timescale 1ns / 1ps
-
-// Carry Look-ahead adder (CLA)
-module cla_nbit #(
-  parameter n = 4
-) (
-  input   [n-1:0] a,
-  input   [n-1:0] b,
-  input           ci,
-  output  [n-1:0] s,
-  output          co
-);
-
-  wire [n-1:0] g;
-  wire [n-1:0] p;
-  wire [  n:0] c;
-
-  assign c[0] = ci;
-  assign co   = c[n];
-
-  genvar i;  /* i - generate index variable */
-
-  generate
-    for (i = 0; i < n; i = i + 1) begin : addbit
-      assign s[i] = a[i] ^ b[i] ^ c[i];
-      assign g[i] = a[i] & b[i];
-      assign p[i] = a[i] | b[i];
-      assign c[i + 1] = g[i] | (p[i] & c[i]);
-    end
-  endgenerate
-  
-endmodule
-`endif
-`ifndef INT_FP_ADD_V_
-`define INT_FP_ADD_V_
-
-
-`timescale 1ns / 1ps
-
-module int_fp_add (
-`ifdef PIPELINE
-  input         clk,
-  input         rst_n,
-`endif
-  input         mode,
-  input  [15:0] a,
-  input  [15:0] b,
-  output [15:0] c
-);
-
-  wire [10:0] adder_input_1,adder_input_2,aligned_small,adder_output;
-  wire if_sub,a_sign, b_sign, c_sign,c1, c2;
-  wire [15:0] normalized_out;
-
-  // only used in INT8 MAC mode
-  wire [4:0] higher_add,higher_a,higher_b;
-
-  wire [15:0] result;
-  reg [14:0] bigger, smaller;
-  reg a_larger_b;
-
-`ifdef PIPELINE
-  reg [14:0] bigger_reg, smaller_reg;
-  reg [10:0] adder_output_reg;
-  wire [14:0] bigger_tmp, smaller_tmp;
-  wire [10:0] adder_output_tmp;
-`endif  
-
-
-  assign a_sign        = a[15];
-  assign b_sign        = b[15];
-  assign if_sub        = (a_sign == b_sign) ? 1'b0 : 1'b1;
-  assign c_sign        = a_larger_b ? a_sign : b_sign;
-  assign higher_a      = (mode == 1'b0) ? a[15:11] : 5'b0;
-  assign higher_b      = (mode == 1'b0) ? b[15:11] : 5'b0;
-  assign adder_input_1 = (mode==1'b0) ? a[10:0] :{1'b1,bigger[9:0]};
-  assign adder_input_2 = (mode==1'b0) ? b[10:0] : (if_sub ? ~aligned_small + 1'b1 : aligned_small);
-  assign c             = (mode == 1'b0) ? {higher_add,adder_output} : result;
-
-  //compare two number regardless sign
-  always @(*) begin
-    if (a[14:0] > b[14:0]) begin
-      bigger = a[14:0];
-      smaller = b[14:0];
-      a_larger_b = 1'b1;
-    end else begin 
-      bigger = b[14:0];
-      smaller = a[14:0];
-      a_larger_b = 1'b0;
-    end 
-  end
-
-`ifdef PIPELINE 
-    always @ (posedge clk or negedge rst_n) begin
-      if (!rst_n) begin
-        bigger_reg <= 15'b0;
-        smaller_reg <= 15'b0;
-        adder_output_reg <= 11'b0;
-      end else begin
-        bigger_reg <= bigger;
-        smaller_reg <= smaller;
-        adder_output_reg <= adder_output;
-      end
-    end
-    assign bigger_tmp = bigger_reg[14:0];
-    assign smaller_tmp = smaller_reg[14:0];
-    assign adder_output_tmp = adder_output_reg[10:0];
-`endif
-
-`ifdef PIPELINE
-  // align small number
-  alignment u1(bigger_tmp,smaller_tmp,aligned_small);
-`else 
-  // align small number
-  alignment u1(bigger,smaller,aligned_small);
-`endif
-
-  cla_nbit #(.n(11)) u2(adder_input_1,adder_input_2,1'b0,adder_output,c1);
-
-  // This 5 bit adder only used in INT8 MAC mode
-  cla_nbit #(.n(5)) u3(higher_a,higher_b,c1,higher_add,c2);
-
-`ifdef PIPELINE
-  add_normalizer u4(c_sign,bigger[14:10],adder_output_tmp,result,c1,if_sub);
-`else 
-  add_normalizer u4(c_sign,bigger[14:10],adder_output,result,c1,if_sub);
-`endif
-
-endmodule
-`endif
-`ifndef INT_FP_MUL_V_
-`define INT_FP_MUL_V_
-
-
-module int_fp_mul (
-`ifdef PIPELINE
-  input         clk,
-  input         rst_n,
-`endif
-  input         mode,
-  input  [15:0] a,
-  input  [15:0] b,
-  output [15:0] c,
-  output        error // valid in fp16 mode 
-);
-
-  wire [15:0] c_tmp;
-  wire        c_sign,a_zero,b_zero;
-  wire [ 4:0] sum_exponent, biased_sum_exponent;
-  wire [15:0] multiplier_input1,multiplier_input2;
-
-  wire [31:0] multiplier_output;
-  wire [14:0] normalized_out;
-  wire [21:0] mantissa_prod;
-  wire c1,c2,underflow,overflow;
-
-  assign overflow = (c1 && c2 && ~biased_sum_exponent[4]) ? 1'b1 :1'b0;
-  assign underflow = (~c1 && ~c2 && biased_sum_exponent[4]) ? 1'b1:1'b0;
-
-  assign a_zero = ~(|a);
-  assign b_zero = ~(|b);
-  assign c_sign = a[15] ^ b[15];
-  assign multiplier_input1 = mode ? {5'b0,1'b1,a[9:0]} : ((a[7]==1'b0) ? {9'b0,a[6:0]} : {9'b0,~a[6:0]+1'b1});
-  assign multiplier_input2 = mode ? {5'b0,1'b1,b[9:0]} : ((b[7]==1'b0) ? {9'b0,b[6:0]} : {9'b0,~b[6:0]+1'b1});
-
-  assign c = mode ? ((a_zero | b_zero) ? 16'b0 : c_tmp) : ((a[7]^b[7] == 1'b0) ? multiplier_output[15:0] : {1'b1,~multiplier_output[14:0]+1'b1});
-  //error detect
-  assign c_tmp = (~error) ? {c_sign,normalized_out} : (underflow ? {c_sign,15'b0000_0000_0000_000} : {c_sign,5'b1111_1,10'b0000_0000_00});
-
-  assign error = overflow | underflow; 
-
-    
-`ifdef PIPELINE
-
-  reg [31:0] multiplier_output_tmp;
-
-  always @ (posedge clk or negedge rst_n) begin
-    if (!rst_n) begin
-      multiplier_output_tmp <= 32'b0;
-    end else begin
-      multiplier_output_tmp <= multiplier_output;
-    end
-  end
-
-  assign mantissa_prod = multiplier_output_tmp[21:0];
-  mul16x16 u1(clk,rst_n,multiplier_input1,multiplier_input2,multiplier_output);
-
-`else 
-
-  assign mantissa_prod = multiplier_output[21:0];
-  mul16x16 u1(multiplier_input1,multiplier_input2,multiplier_output);
-
-`endif
-    
-  cla_nbit #(.n(5)) u2(a[14:10],b[14:10],1'b0,sum_exponent,c1); // add exponent
-  cla_nbit #(.n(5)) u3(sum_exponent, 5'b10001,1'b0,biased_sum_exponent,c2); // minus bias
-  mul_normalizer u4(biased_sum_exponent,mantissa_prod,normalized_out);
-
-endmodule
-`endif
-`ifndef MUL_NORMALIZER_V_
-`define MUL_NORMALIZER_V_
-
-`timescale 1ns / 1ps
-
-module mul_normalizer (
-  input  [ 4:0] exponent,
-  input  [21:0] mantissa_prod,
-  output [14:0] result
-);
-
-  wire [4:0] result_exponent;
-  wire [9:0] result_mantissa;
-
-  assign result_exponent = (mantissa_prod[21]) ? (exponent + 1'b1): exponent;
-  assign result_mantissa = (mantissa_prod[21]) ? mantissa_prod[20:11]:mantissa_prod[19:10];
-  assign result          = {result_exponent,result_mantissa};
-
-// No rounding and No overflow/underflow detection
-
-endmodule
-`endif
-`ifndef ADD_NORMALIZER_V_
-`define ADD_NORMALIZER_V_
-
-`timescale 1ns / 1ps
-
-module add_normalizer (
-  input             sign,
-  input      [ 4:0] exponent,
-  input      [10:0] mantissa_add,
-  output reg [15:0] result,
-  input             if_carray,
-  input             if_sub
-);
-
-  reg [4:0] number_of_zero_lead;
-  reg [10:0] norm_mantissa_add;
-  reg [9:0] mantissa_tmp;
-
-  wire [4:0] shift_left_exp;
-  wire c1;
-
-  always @ (*) begin
-    if (mantissa_add[10:4] == 7'b0000_001) begin
-      number_of_zero_lead = 5'd6;
-      norm_mantissa_add   = (mantissa_add << 4'd6);
-    end else if (mantissa_add[10:5] == 6'b0000_01) begin 
-      number_of_zero_lead = 5'd5;
-      norm_mantissa_add   = (mantissa_add << 4'd5);
-    end else if (mantissa_add[10:6] == 5'b0000_1) begin
-      number_of_zero_lead = 5'd4;
-      norm_mantissa_add   = (mantissa_add << 4'd4);
-    end else if (mantissa_add[10:7] == 4'b0001) begin
-      number_of_zero_lead = 5'd3;
-      norm_mantissa_add   = (mantissa_add << 4'd3);
-    end else if (mantissa_add[10:8] == 3'b001) begin
-      number_of_zero_lead = 5'd2;
-      norm_mantissa_add   = (mantissa_add << 4'd2);
-    end else if (mantissa_add[10:9] == 2'b01) begin
-      number_of_zero_lead = 5'd1;
-      norm_mantissa_add   = (mantissa_add << 4'd1);
-    end else begin 
-      number_of_zero_lead = 5'd0;
-      norm_mantissa_add   = mantissa_add[10:0];
-    end 
-  end
-
-  always @(*) begin
-    result[15]      = sign;
-    if (!if_sub) begin 
-      result[14:10] = if_carray ? exponent + 1'b1 : exponent;
-      result[9:0]   = if_carray ? mantissa_add[10:1] : mantissa_add[9:0];
-    end else begin 
-      result[14:10] = shift_left_exp;
-      result[9:0]   = norm_mantissa_add[9:0];
-    end 
-  end
-
-  cla_nbit #(.n(5)) u1(exponent,~number_of_zero_lead+1'b1,1'b0,shift_left_exp,c1);
-
-endmodule
-
-`endif
-`ifndef CONTROL_V_
-`define CONTROL_V_
-
-
-`timescale 1ns / 1ps
-`default_nettype none
-
-// Really 3x3, a done output??
-
-
-// 6 logic cycles + 2 (buffered?) delay cycles
-module control #(
-  parameter W = 16,
-  parameter N = 3
-) (
-  input  wire                         i_clk,
-  input  wire                         i_rst,
-  input  wire                         i_en,
-  input  wire                         i_mode,
-  input  wire [    W * N * N - 1 : 0] i_A,
-  input  wire [    W * N * N - 1 : 0] i_B,
-  output wire [    W * N * N - 1 : 0] o_C,
-  output wire                         o_done,
-  output wire [    W * N     - 1 : 0] A_in,
-  output wire [    W * N     - 1 : 0] B_in,
-
-  // debug
-  output wire [    W * N * 2 - 1 : 0] debug_pe_a,
-  output wire [    W * N * 2 - 1 : 0] debug_pe_b
-);
-  
-  reg [3 : 0] states, next_states;
-
-  reg  [W - 1 : 0] a00, a01, a02;
-  wire [W - 1 : 0] a01_q, a02_q;
-
-  reg  [W - 1 : 0] b00, b01, b02;
-  wire [W - 1 : 0] b01_q, b02_q;
-
-  assign A_in = {a02_q, a01_q, a00};
-  assign B_in = {b02_q, b01_q, b00};  
-
-  // 0000 is the idle / standby state
-  always @(posedge i_clk) begin
-    if (i_rst | ~i_en) begin
-      states <= 4'b0000;
-    end
-    else if (i_en) begin
-      states <= next_states;
-    end
-  end
-  
-  always @(*) begin
-    if (next_states == 4'b1001) begin
-      // Done: Force waiting
-      // This can also be done by switching off input
-      next_states = 4'b1001;
-    end else begin
-      next_states = states + 4'b0001;
-    end
-  end
-
-  assign o_done = (states == 4'b1001);	
-  
-  always @(*) begin
-    case (states)
-      4'b0001: begin
-        a00 = i_A[    W - 1 : 0 * W];
-        a01 = i_A[2 * W - 1 : 1 * W];
-        a02 = i_A[3 * W - 1 : 2 * W];
-       
-        
-        b00 = i_B[    W - 1 : 0 * W];
-        b01 = i_B[2 * W - 1 : 1 * W];
-        b02 = i_B[3 * W - 1 : 2 * W];
-      end
-      4'b0010: begin
-        a00 = i_A[4 * W - 1 : 3 * W];
-        a01 = i_A[5 * W - 1 : 4 * W];
-        a02 = i_A[6 * W - 1 : 5 * W];
-  
-        b00 = i_B[4 * W - 1 : 3 * W];
-        b01 = i_B[5 * W - 1 : 4 * W];
-        b02 = i_B[6 * W - 1 : 5 * W];
-      end
-      4'b0011: begin
-        a00 = i_A[7 * W - 1 : 6 * W];
-        a01 = i_A[8 * W - 1 : 7 * W];
-        a02 = i_A[9 * W - 1 : 8 * W];
-        
-        b00 = i_B[7 * W - 1 : 6 * W];
-        b01 = i_B[8 * W - 1 : 7 * W];
-        b02 = i_B[9 * W - 1 : 8 * W];
-      end
-      default: begin
-	a00 = 0;
-	a01 = 0;
-	a02 = 0;
-	      
-	b00 = 0;
-	b01 = 0;
-	b02 = 0;
-      end
-    endcase
-  end
-
-  systolic #(.W(W), .N(N)) sys(.i_clk(i_clk), .i_rst(i_rst), .i_en(i_en), .i_mode(i_mode), .i_A(A_in), .i_B(B_in), .o_C(o_C), .debug_pe_a(debug_pe_a), .debug_pe_b(debug_pe_b));
-
-  delay2 #(.WIDTH(W), .DEPTH(1)) delayA1(.clk(i_clk), .reset(i_rst), .data_in(a01), .data_out(a01_q));
-  delay2 #(.WIDTH(W), .DEPTH(2)) delayA2(.clk(i_clk), .reset(i_rst), .data_in(a02), .data_out(a02_q));
-
-  delay2 #(.WIDTH(W), .DEPTH(1)) delayB1(.clk(i_clk), .reset(i_rst), .data_in(b01), .data_out(b01_q));
-  delay2 #(.WIDTH(W), .DEPTH(2)) delayB2(.clk(i_clk), .reset(i_rst), .data_in(b02), .data_out(b02_q));
-
-
-endmodule
-`default_nettype wire
-`endif
-// No pipelined/piplined MAC
-// Version: 1.0
-
-// Description:
-
-// Function : mac_out = in_a * in_b + in_c.  Both work for INT8 and FP16 mode. Default INT8 and FP16 are signed number
-// Exception : error detection for overflow and underflow in FP16 mode
-`ifndef MAC_UNIT_V_
-`define MAC_UNIT_V_
-
-
-`timescale 1ns / 1ps
-
-module mac_unit
-(
-`ifdef PIPELINE
-  input            clk,
-  input            rst_n,
-`endif
-  input     [15:0] in_a, // multiplier input1
-  input     [15:0] in_b, // multiplier input2
-  input     [15:0] in_c, // adder input2 ; adder input1 = in_a*in_b
-  input 	   mode,
-  //output    [15:0] mac_out,
-  output    [15:0] mac_out,
-  output 	   error
-);
-
-  wire [15:0] mul_out;
-
-  int_fp_add add(
-  `ifdef PIPELINE
-    .clk   (clk    ),
-    .rst_n (rst_n  ),
-  `endif 
-    .mode  (mode   ),
-    .a     (mul_out),
-    .b     (in_c   ),
-    .c     (mac_out)
-  );
-
-  int_fp_mul mul(
-  `ifdef PIPELINE
-    .clk   (clk    ),
-    .rst_n (rst_n  ),
-  `endif 
-    .mode  (mode   ),
-    .a     (in_a   ),
-    .b     (in_b   ),
-    .c     (mul_out),
-    .error (error  )
-  );
-
-endmodule
-`endif
-`ifndef PE_2_V_
-`define PE_2_V_
-
-
-`timescale 1ns / 1ps
-`default_nettype none
-
-module PE #(
-  //parameter W = 32
-  parameter W = 16
-) (
-  input  wire                 i_clk,
-  input  wire                 i_rst,
-  input  wire                 i_en,
-  input  wire                 i_mode,
-  input  wire [    W - 1 : 0] i_A,
-  input  wire [    W - 1 : 0] i_B,
-  output wire [    W - 1 : 0] o_A,
-  output wire [    W - 1 : 0] o_B,
-  //output wire [    W - 1 : 0] o_C
-  output reg  [    W - 1 : 0] o_C
-);
-
-  //wire mode;
-  //assign mode = 1;
-
-  wire sync_load;
-  assign o_A = i_A_buffered;
-  assign o_B = i_B_buffered;
-  assign sync_load = i_rst | ~i_en;
-
-  wire [W - 1 : 0] i_A_buffered;
-  wire [W - 1 : 0] i_B_buffered;
-
-  reg  [15 : 0] accu;
-  wire [15 : 0] mac_out;
-
-  // Buffered in MAC
-  delay2 #(.WIDTH(W), .DEPTH(1)) delayA(.clk(i_clk), .reset(i_rst), .data_in(i_A), .data_out(i_A_buffered));
-  delay2 #(.WIDTH(W), .DEPTH(1)) delayB(.clk(i_clk), .reset(i_rst), .data_in(i_B), .data_out(i_B_buffered));
-
-  always @(posedge i_clk) begin
-    if (sync_load) begin
-      accu <= 0;
-      o_C  <= 0;
-    end
-    else begin
-      accu <= mac_out;
-      o_C  <= mac_out;
-    end
-  end
-
-  // Optional: making it clocked
-  mac_unit u0_mac(
-    .in_a    (i_A_buffered),
-    .in_b    (i_B_buffered),
-    .in_c    (accu),
-    .mode    (i_mode),
-    .mac_out (mac_out)
-  );
-
-endmodule
-
-`default_nettype wire
-`endif
-`ifndef DELAY_2_V_
-`define DELAY_2_V_
-
-`timescale 1ns / 1ps
-`default_nettype none
-
-module delay2 #(
-  parameter WIDTH = 16,
-  parameter DEPTH = 3
-) (
-  input  wire                 clk,
-  input  wire                 reset,
-  input  wire [WIDTH - 1 : 0] data_in,
-  output wire [WIDTH - 1 : 0] data_out
-);
-
-  wire [WIDTH - 1 : 0] connect_wire [DEPTH : 0];
-
-  assign data_out        = connect_wire[DEPTH];
-  assign connect_wire[0] = data_in;
-
-  genvar i;
-  generate
-    for (i = 1; i <= DEPTH; i = i + 1) begin
-      dff #(.WIDTH(WIDTH)) DFF(
-        .clk(clk),
-        .rst(reset),
-        .inp(connect_wire[i-1]),
-        .outp(connect_wire[i]));
-    end
-  endgenerate
-endmodule
-
-// D flip-flop with synchronous reset
-module dff#(
-    parameter WIDTH = 1
-  ) (
-    input wire clk,
-    input wire rst,
-
-    input wire [WIDTH-1:0] inp,
-    output reg [WIDTH-1:0] outp
-  );
-
-  always @(posedge clk) begin
-    outp <= rst ? 0 : inp;
+    outp <= rst ? {WIDTH{1'b0}} : inp;
   end
 
 endmodule
@@ -1704,7 +901,7 @@
     assign irq = 3'b000;	// Unused
 
     // LA
-    assign la_data_out = {{(127-BITS){1'b0}}, count};
+    //assign la_data_out = {{(127-BITS){1'b0}}, count};
     // Assuming LA probes [63:32] are for controlling the count register  
     assign la_write = ~la_oenb[63:32] & ~{BITS{valid}};
     // Assuming LA probes [65:64] are for controlling the count clk & reset  
@@ -1715,6 +912,17 @@
     // Assuming LA probes [66] are for controlling cs (data ready)
     assign cs = (~la_oenb[66]) ? la_data_in[66] : 0;
 
+    // Assuming LA probes [77] are for controlling wr
+    
+    wire wr;
+    assign wr = (~la_oenb[77]) ? la_data_in[77] : 0;
+
+    wire [31:0] bank2;
+    wire done_o_net;
+    //assign bank2 = {{(24){1'b0}}, done_o_net, {(7){1'b0}}};
+    assign bank2 = {{(17){1'b0}}, mem_set_done_o_net, {(6){1'b0}}, done_o_net, {(7){1'b0}}};
+    assign la_data_out = {{(BITS){1'b0}}, bank2, {(BITS){1'b0}}, count};
+
     /*
     counter #(
         .BITS(BITS)
@@ -1732,12 +940,19 @@
     );
     */
 
+    wire mem_set_done_o_net;
+
     interface_top interface_inst(
       .clk(clk),
       .rst(rst),
-      .cs(1'b1),
+      .cs(cs),
+      .sync(wr),
+      .data_addr_i(la_data_in[76:72]),
+      .readout_addr(la_data_in[70:67]),
       .data_in(la_data_in[63:32]),
-      .data_out(count)
+      .data_out(count),
+      .done_o(done_o_net),
+      .mem_set_done_o(mem_set_done_o_net)
     );
 
 endmodule
@@ -1796,25 +1011,32 @@
 module interface_top (
   input  wire        clk,
   input  wire        rst,
-  input  wire        cs,  // data ready
+  input  wire        sync, // wr
+  input  wire        cs,   // data ready, en
+  input  wire [ 4:0] data_addr_i,    // input   addr
+  input  wire [ 3:0] readout_addr,   // output  addr
   input  wire [31:0] data_in,
-  output reg  [31:0] data_out
+  output reg  [31:0] data_out,
+  output wire        done_o,
+  output wire        mem_set_done_o
 );
 
   localparam W = 16;
   localparam N = 3;
 
-  wire done_o;
+  //wire done_o;
   //assign data_out = 1;
+  reg [31:0] next_data_out;
 
   // scratch pad
   reg  [2 * W * N * N - 1 : 0] input_registers;
+  reg  [2 * W * N * N - 1 : 0] next_input_registers;
   wire [    W * N * N - 1 : 0] C_mat;
 
   // 16bit - 8
   // 32bit - 9
   // clog2 function
-  reg  [7 : 0] addr_ptr;
+  //reg  [7 : 0] addr_ptr;
 
   wire [W * N * N - 1 : 0] A_mat;
   wire [W * N * N - 1 : 0] B_mat;
@@ -1824,13 +1046,12 @@
 
   // mode selection
   // First try on FP16
-
   reg [2:0]      state;
   reg [2:0] next_state;
 
   wire [2:0] IDLE    = 3'b000;
   wire [2:0] LOAD    = 3'b001;
-  wire [2:0] PROCESS = 3'b011;
+  wire [2:0] PROCESS = 3'b010;
 
   // Refactor: Moving them inside control
   wire [W * N - 1 : 0] A_in;
@@ -1839,7 +1060,7 @@
   control #(.W(W), .N(N)) control_inst(
     .i_clk(clk),
     .i_rst(rst),
-    .i_en(mat_en),
+    .i_en(cs),
     .i_mode(1'b1),
     .i_A(A_mat),
     .i_B(B_mat),
@@ -1848,53 +1069,216 @@
   );
 
   // memory counter
-  reg [4:0]      addr_cnter;
-  reg [4:0] next_addr_cnter;
+  // one flag bit per input slot; each decode ORs in a one-hot mask (18'b1 << addr)
+  reg [17:0] memory_set;
+  reg [17:0] next_memory_set;
 
-  //assign data_out = A_mat[W * 3 +: W];
-  //assign data_out = done_o;
-  //assign data_out[2:0] = state[2:0];
-
-  // TODO: might be redundant
-  reg mat_en;
+  // sync in control or PEs might be redundant
 
   always @(posedge clk) begin
-    if (rst) begin
+    // TODO: combine rst and sync into a single reset signal generated higher up the hierarchy
+    if (rst | sync) begin
+      // Force combo circuits!!
       state           <= IDLE;
-      next_state      <= IDLE;
-      addr_cnter      <= 5'b0;
-      next_addr_cnter <= 5'b0;
       data_out        <= 32'b0;
+      memory_set      <= 18'b0;
+      input_registers <= 288'b0;
     end
     else begin
-      state      <= next_state;
-      addr_cnter <= next_addr_cnter;
-      data_out   <= {31'b0, done_o};
+      state           <= next_state;
+      memory_set      <= next_memory_set;
+      input_registers <= next_input_registers;
+      data_out 	      <= next_data_out;
     end
   end
 
+  // always @(sync) begin
+  //   if (sync) begin
+  //     next_state <= 0;
+  //   end
+  // end
+
+  // Should we also use memory counter here?
   always @(*) begin
     case (state)
       IDLE: begin
-	if (cs) begin
-	  next_state = LOAD;
-	end
-	mat_en = 1'b0;
-      end
-      LOAD: begin
-        input_registers[addr_cnter * W +: W] = data_in[W - 1 : 0];
-
-	if (addr_cnter >= 2 * N * N - 1) begin
-	  mat_en = 1'b1;
-	  next_state = PROCESS;
-	  next_addr_cnter = 0;
-	end
-	else begin
-          next_addr_cnter = addr_cnter + 1;
-	end
+	next_state = cs ? PROCESS : IDLE;
       end
       PROCESS: begin
-	next_state = done_o ? IDLE : PROCESS;
+	next_state = (done_o | ~cs) ? IDLE : PROCESS;
+      end
+      default: begin
+	next_state = IDLE;
+      end
+    endcase
+  end
+  // Tips: Moving comb logic out of the state machine
+
+  // Input write decoder (input and output are concurrent): data_addr_i selects which W-bit slot of input_registers takes data_in[W-1:0].
+  always @(*) begin
+    next_input_registers = input_registers;  // default: hold every slot
+    case (data_addr_i)  // NOTE(review): no write-enable qualifies this decode - confirm that is intended
+      5'b00000: begin
+	next_input_registers[5'b00000 * W +: W] = data_in[W - 1 : 0];
+      end
+      5'b00001: begin
+	next_input_registers[5'b00001 * W +: W] = data_in[W - 1 : 0];
+      end
+      5'b00010: begin
+	next_input_registers[5'b00010 * W +: W] = data_in[W - 1 : 0];
+      end
+      5'b00011: begin
+	next_input_registers[5'b00011 * W +: W] = data_in[W - 1 : 0];
+      end
+      5'b00100: begin
+	next_input_registers[5'b00100 * W +: W] = data_in[W - 1 : 0];
+      end
+      5'b00101: begin
+	next_input_registers[5'b00101 * W +: W] = data_in[W - 1 : 0];
+      end
+      5'b00110: begin
+	next_input_registers[5'b00110 * W +: W] = data_in[W - 1 : 0];
+      end
+      5'b00111: begin
+	next_input_registers[5'b00111 * W +: W] = data_in[W - 1 : 0];
+      end
+      5'b01000: begin
+	next_input_registers[5'b01000 * W +: W] = data_in[W - 1 : 0];
+      end
+      5'b01001: begin
+	next_input_registers[5'b01001 * W +: W] = data_in[W - 1 : 0];
+      end
+      5'b01010: begin
+	next_input_registers[5'b01010 * W +: W] = data_in[W - 1 : 0];
+      end
+      5'b01011: begin
+	next_input_registers[5'b01011 * W +: W] = data_in[W - 1 : 0];
+      end
+      5'b01100: begin
+	next_input_registers[5'b01100 * W +: W] = data_in[W - 1 : 0];
+      end
+      5'b01101: begin
+	next_input_registers[5'b01101 * W +: W] = data_in[W - 1 : 0];
+      end
+      5'b01110: begin
+	next_input_registers[5'b01110 * W +: W] = data_in[W - 1 : 0];
+      end
+      5'b01111: begin
+	next_input_registers[5'b01111 * W +: W] = data_in[W - 1 : 0];
+      end
+      5'b10000: begin
+	next_input_registers[5'b10000 * W +: W] = data_in[W - 1 : 0];
+      end
+      5'b10001: begin
+	next_input_registers[5'b10001 * W +: W] = data_in[W - 1 : 0];
+      end
+      // Addresses 18-31 map to no slot: hold the current contents (avoids inferring latches)
+      default: begin
+	next_input_registers = input_registers;
+      end
+    endcase
+  end
+
+  // Memory-set decoder: sticky per-slot flags; bit k of next_memory_set is set when data_addr_i == k (k = 0..17).
+  always @(*) begin
+    next_memory_set = memory_set;  // default: keep the flags already set
+    case (data_addr_i)
+      5'b00000: begin
+        next_memory_set = memory_set | (18'b1 << 0);
+      end
+      5'b00001: begin
+        next_memory_set = memory_set | (18'b1 << 1);
+      end
+      5'b00010: begin
+        next_memory_set = memory_set | (18'b1 << 2);
+      end
+      5'b00011: begin
+        next_memory_set = memory_set | (18'b1 << 3);
+      end
+      5'b00100: begin
+        next_memory_set = memory_set | (18'b1 << 4);
+      end
+      5'b00101: begin
+        next_memory_set = memory_set | (18'b1 << 5);
+      end
+      5'b00110: begin
+        next_memory_set = memory_set | (18'b1 << 6);
+      end
+      5'b00111: begin
+        next_memory_set = memory_set | (18'b1 << 7);
+      end
+      5'b01000: begin
+        next_memory_set = memory_set | (18'b1 << 8);
+      end
+      5'b01001: begin
+        next_memory_set = memory_set | (18'b1 << 9);
+      end
+      5'b01010: begin
+        next_memory_set = memory_set | (18'b1 << 10);
+      end
+      5'b01011: begin
+        next_memory_set = memory_set | (18'b1 << 11);
+      end
+      5'b01100: begin
+        next_memory_set = memory_set | (18'b1 << 12);
+      end
+      5'b01101: begin
+        next_memory_set = memory_set | (18'b1 << 13);
+      end
+      5'b01110: begin
+        next_memory_set = memory_set | (18'b1 << 14);
+      end
+      5'b01111: begin
+        next_memory_set = memory_set | (18'b1 << 15);
+      end
+      5'b10000: begin
+        next_memory_set = memory_set | (18'b1 << 16);
+      end
+      5'b10001: begin
+        next_memory_set = memory_set | (18'b1 << 17);
+      end
+      default: begin  // addresses 18-31: no flag to set
+        next_memory_set = memory_set;
+      end
+    endcase
+  end
+
+  assign mem_set_done_o = memory_set == {(18){1'b1}};  // high while all 18 flags in memory_set are set
+
+  // Readout mux: readout_addr (0..8) selects which W-bit element of C_mat is loaded into next_data_out[W-1:0].
+  always @(*) begin
+    next_data_out = data_out;  // default: hold; bits above W-1 are never written in this block
+    case (readout_addr)
+      4'b0000: begin
+        next_data_out[W - 1 : 0] = C_mat[4'b0000 * W +: W];
+      end
+      4'b0001: begin
+        next_data_out[W - 1 : 0] = C_mat[4'b0001 * W +: W];
+      end
+      4'b0010: begin
+        next_data_out[W - 1 : 0] = C_mat[4'b0010 * W +: W];
+      end
+      4'b0011: begin
+        next_data_out[W - 1 : 0] = C_mat[4'b0011 * W +: W];
+      end
+      4'b0100: begin
+        next_data_out[W - 1 : 0] = C_mat[4'b0100 * W +: W];
+      end
+      4'b0101: begin
+        next_data_out[W - 1 : 0] = C_mat[4'b0101 * W +: W];
+      end
+      4'b0110: begin
+        next_data_out[W - 1 : 0] = C_mat[4'b0110 * W +: W];
+      end
+      4'b0111: begin
+        next_data_out[W - 1 : 0] = C_mat[4'b0111 * W +: W];
+      end
+      4'b1000: begin
+        next_data_out[W - 1 : 0] = C_mat[4'b1000 * W +: W];
+      end
+      // Addresses 9-15 select no element: hold the previous value (avoids inferring latches)
+      default: begin
+	next_data_out = data_out;
       end
     endcase
   end