Merge pull request #9 from WebKingdom/8-create-test-bench-for-top-level-module

Create test bench for top level module
diff --git a/gds/user_project_wrapper.gds b/gds/user_project_wrapper.gds
index cfce4ea..597f672 100644
--- a/gds/user_project_wrapper.gds
+++ b/gds/user_project_wrapper.gds
Binary files differ
diff --git a/openlane/btc_miner_top/config.tcl b/openlane/btc_miner_top/config.tcl
new file mode 100755
index 0000000..36b0bc6
--- /dev/null
+++ b/openlane/btc_miner_top/config.tcl
@@ -0,0 +1,96 @@
+# SPDX-FileCopyrightText: 2020 Efabless Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# SPDX-License-Identifier: Apache-2.0
+
+set ::env(PDK) "sky130A"
+set ::env(STD_CELL_LIBRARY) "sky130_fd_sc_hd"
+
+set script_dir [file dirname [file normalize [info script]]]
+
+set ::env(DESIGN_NAME) btc_miner_top
+
+set ::env(VERILOG_FILES) "\
+	$::env(CARAVEL_ROOT)/verilog/rtl/defines.v \
+	$script_dir/../../verilog/rtl/btc_miner_top.v \
+	$script_dir/../../verilog/rtl/sha256.v \
+	$script_dir/../../verilog/rtl/sha256_core.v \
+	$script_dir/../../verilog/rtl/sha256_k_constants.v \
+	$script_dir/../../verilog/rtl/sha256_w_mem.v"
+
+set ::env(DESIGN_IS_CORE) 0
+
+set ::env(CLOCK_PORT) "wb_clk_i"
+set ::env(CLOCK_NET) "miner_ctrl.clk"
+set ::env(CLOCK_PERIOD) "10"
+
+# always got: "There are hold violations in the design at the typical corner" when FP_SIZING was absolute... 
+# no matter what PL or GLB parameters I set. tried increasing both HOLD_MAX_BUFFER_PERCENT and HOLD_SLACK_MARGIN to 80% and 0.3ns
+set ::env(FP_SIZING) absolute
+# max area in wrapper: 0 0 2920 3520
+set ::env(DIE_AREA) "0 0 20000 20000"
+
+set ::env(FP_PIN_ORDER_CFG) $script_dir/pin_order.cfg
+
+set ::env(PL_BASIC_PLACEMENT) 0
+set ::env(PL_TARGET_DENSITY) 0.7
+set ::env(FP_CORE_UTIL) 80
+# with 10%: detailed placement failed and had setup violations
+# with 50%: detailed placement failed and had setup violations
+# with 100% and 0.7: "Utilization exceeds 100%." Ran out of space?
+# with 90% and 0.7: "Use a higher -density or re-floorplan with a larger core area. Suggested target density: 0.92" 
+# with 95% and 0.9: "Use a higher -density or re-floorplan with a larger core area. Suggested target density: 0.98"
+
+# DIE_AREA: "0 0 2920 3520" and FP_SIZING: relative
+# with 98% and 0.98: "Utilization exceeds 100%." (Chip area: 158121.651200) (PlaceInstsArea: 158121651200) (NonPlaceInstsArea: 3992579200) (CoreArea: 160567747200)
+# with 95% and 1: "Detailed placement failed." "Error: resizer.tcl, 78 DPL-0036" 
+# with 95% and 0.98: "Use a higher -density or re-floorplan with a larger core area." (PlaceInstsArea: 158121651200) (NonPlaceInstsArea: 4046380800) (Util(%): 98.13) (CoreArea: 165175916800)
+
+# DIE_AREA: "0 0 2920 3520" and FP_SIZING: absolute
+# with 95% and 0.98: "Detailed placement failed." "Error: resizer.tcl, 78 DPL-0036"
+
+# Not sure exactly how FP_SIZING absolute vs. relative works, or how DIE_AREA affects the overall size and constraints
+
+# set ::env(ROUTING_CORES) 4
+set ::env(PL_RANDOM_GLB_PLACEMENT) 0
+# set ::env(PL_RESIZER_ALLOW_SETUP_VIOS) 1
+# set ::env(GLB_RESIZER_ALLOW_SETUP_VIOS) 1
+
+set ::env(PL_RESIZER_HOLD_MAX_BUFFER_PERCENT) 80
+set ::env(GLB_RESIZER_HOLD_MAX_BUFFER_PERCENT) 80
+# set ::env(PL_RESIZER_HOLD_SLACK_MARGIN) 0.1ns
+# set ::env(GLB_RESIZER_HOLD_SLACK_MARGIN) 0.1ns
+
+set ::env(PL_RESIZER_SETUP_MAX_BUFFER_PERCENT) 80 ;# must live in ::env; any other array name is silently ignored by the flow
+set ::env(GLB_RESIZER_SETUP_MAX_BUFFER_PERCENT) 80 ;# same setup-fix buffer budget for the post-global-routing resizer pass
+# set ::env(PL_RESIZER_SETUP_SLACK_MARGIN) 0.05ns
+# set ::env(GLB_RESIZER_SETUP_SLACK_MARGIN) 0.05ns
+
+# set ::env(CTS_TARGET_SKEW) 200
+
+# Maximum layer used for routing is metal 4.
+# This is because this macro will be inserted in a top level (user_project_wrapper) 
+# where the PDN is planned on metal 5. So, to avoid having shorts between routes
+# in this macro and the top level metal 5 stripes, we have to restrict routes to metal4.  
+# 
+# set ::env(GLB_RT_MAXLAYER) 5
+
+set ::env(RT_MAX_LAYER) {met4}
+
+# You can draw more power domains if you need to 
+set ::env(VDD_NETS) [list {vccd1}]
+set ::env(GND_NETS) [list {vssd1}]
+
+set ::env(DIODE_INSERTION_STRATEGY) 4 
+# If you're going to use multiple power domains, then disable cvc run.
+set ::env(RUN_CVC) 1
diff --git a/openlane/btc_miner_top/pin_order.cfg b/openlane/btc_miner_top/pin_order.cfg
new file mode 100644
index 0000000..2fda806
--- /dev/null
+++ b/openlane/btc_miner_top/pin_order.cfg
@@ -0,0 +1,10 @@
+#BUS_SORT
+
+#S
+wb_.*
+wbs_.*
+la_.*
+irq.*
+
+#N
+io_.*
diff --git a/openlane/user_adder/config.json b/openlane/user_adder/config.json
deleted file mode 100644
index a817807..0000000
--- a/openlane/user_adder/config.json
+++ /dev/null
@@ -1,22 +0,0 @@
-{
-    "PDK"                      : "sky130A",
-    "STD_CELL_LIBRARY"         : "sky130_fd_sc_hd",
-    "CARAVEL_ROOT"             : "../../caravel",
-    "CLOCK_NET"                : "add_sub_accum.clk",
-    "CLOCK_PERIOD"             : "10",
-    "CLOCK_PORT"               : "wb_clk_i",
-    "DESIGN_IS_CORE"           : "0",
-    "DESIGN_NAME"              : "user_adder",
-    "DIE_AREA"                 : "0 0 1200 900",
-    "DIODE_INSERTION_STRATEGY" : "4",
-    "FP_CORE_UTIL"             : "5",
-    "FP_PIN_ORDER_CFG"         : "pin_order.cfg",
-    "FP_SIZING"                : "relative",
-    "GLB_RT_MAXLAYER"          : "5",
-    "GND_NETS"                 : "vssd1",
-    "PL_BASIC_PLACEMENT"       : "1",
-    "PL_TARGET_DENSITY"        : "0.5",
-    "RUN_CVC"                  : "1",
-    "VDD_NETS"                 : "vccd1",
-    "VERILOG_FILES"            : ["../../caravel/verilog/rtl/defines.v", "../../verilog/rtl/user_adder.v"]
-}
diff --git a/verilog/dv/adder_test1/RTL-adder_test1.vcd b/verilog/dv/adder_test1/RTL-adder_test1.vcd
index 0582aed..7139f59 100644
--- a/verilog/dv/adder_test1/RTL-adder_test1.vcd
+++ b/verilog/dv/adder_test1/RTL-adder_test1.vcd
Binary files differ
diff --git a/verilog/dv/btc_miner_top_test1/RTL-btc_miner_top_test1.vcd b/verilog/dv/btc_miner_top_test1/RTL-btc_miner_top_test1.vcd
new file mode 100644
index 0000000..32b1e94
--- /dev/null
+++ b/verilog/dv/btc_miner_top_test1/RTL-btc_miner_top_test1.vcd
Binary files differ
diff --git a/verilog/dv/btc_miner_top_test1/btc_miner_top_test1.c b/verilog/dv/btc_miner_top_test1/btc_miner_top_test1.c
index a9f7841..6177449 100644
--- a/verilog/dv/btc_miner_top_test1/btc_miner_top_test1.c
+++ b/verilog/dv/btc_miner_top_test1/btc_miner_top_test1.c
@@ -19,19 +19,75 @@
 #include <defs.h>
 #include <stub.c>
 
+// constants
+#define ADDR_CTRL 0x08
+#define CTRL_INIT_BIT 0
+#define CTRL_NEXT_BIT 1
+#define CTRL_MODE_BIT 2
+
+#define ADDR_STATUS 0x09
+#define STATUS_READY_BIT 0
+#define STATUS_VALID_BIT 1
+
+#define ADDR_BLOCK0 0x10
+#define ADDR_BLOCK15 0x1f
+
+#define ADDR_DIGEST0 0x20
+#define ADDR_DIGEST6 0x26
+#define ADDR_DIGEST7 0x27
+
+#define MODE_SHA_224 0
+#define MODE_SHA_256 1
+
+
 /*
-	Wishbone Test:
-		- Configures MPRJ lower 8-IO pins as outputs
-		- Checks counter value through the wishbone port
+	Miner test 1
+    - checks if automated state machine works as expected
 */
 
+// void *memcpy(void *dest, const void *src, uint32_t n)
+// {
+//     for (uint32_t i = 0; i < n; i++)
+//     {
+//         ((char*)dest)[i] = ((char*)src)[i];
+//     }
+// }
+
+
+// void *memcpy (void *dest, const void *src, uint32_t len)
+// {
+//   char *d = dest;
+//   const char *s = src;
+//   while (len--)
+//     *d++ = *s++;
+//   return dest;
+// }
+
+
 void main()
 {
-
     // boolean for validating all tests
     uint32_t testsPassed = 1;
-    // previous result
-    uint32_t prevResult = 0x0000000F;
+
+    // could put into array
+    uint32_t hash_out0 = 0;
+    uint32_t hash_out1 = 0;
+    uint32_t hash_out2 = 0;
+    uint32_t hash_out3 = 0;
+    uint32_t hash_out4 = 0;
+    uint32_t hash_out5 = 0;
+    uint32_t hash_out6 = 0;
+    uint32_t hash_out7 = 0;
+
+    // SHA info
+    // uint32_t index = 0;
+    // const uint32_t sha256_input[] = {
+    //     0x00000001, 0x00000002, 0x00000003, 0x00000004,
+    //     0x00000005, 0x00000006, 0x00000007, 0x00000008,
+    //     0x00000009, 0x0000000A, 0x0000000B, 0x0000000C,
+    //     0x0000000D, 0x0000000E, 0x0000000F, 0x00000010
+    // };
+
 
 	/* 
 	IO Control Registers
@@ -80,86 +136,171 @@
     reg_mprj_xfer = 1;
     while (reg_mprj_xfer == 1);
 
-    // TODO set up testbench
-    // LA probes [31:0] input to the CPU
+    // LA probes [31:0] input to MGMT from USER
     reg_la0_oenb = reg_la0_iena = 0x00000000;    // [31:0]
-    // LA probes [63:32] output from the CPU
-    reg_la1_oenb = reg_la1_iena = 0xFFFFFFFF;    // [63:32]
-    // LA probes [94:64]  input to the CPU and [95:94] output from CPU (for nAdd_Sub and use_prev_result)
-	reg_la2_oenb = reg_la2_iena = 0xC0000000;    // [95:64]
-
-    // set prev_result for testing
-    reg_la1_data = prevResult;
-    // LA probes [63:32] input to the CPU (disable counter writes)
-    // reg_la1_oenb = reg_la1_iena = 0x00000000;    // [63:32]
-
-    // set nAdd_sub to 0 -> add operation
-    reg_la2_data = 0x00000000;
+    // LA probes [63:32] input to MGMT from USER
+    reg_la1_oenb = reg_la1_iena = 0x00000000;    // [63:32]
+    // LA probes [95:64]  input to MGMT from USER
+	reg_la2_oenb = reg_la2_iena = 0x00000000;    // [95:64]
+    // LA probes [127:96] output from MGMT into USER
+	reg_la3_oenb = reg_la3_iena = 0xFFFF3FFF;    // [127:96]
 
     // Flag start of the test
-	reg_mprj_datal = 0xAB600000;
+	reg_mprj_datal = 0xFEEDFEED;
     // reg_mprj_datah = 0x00000000;
 
-    // Set initial value of adder
-    reg_mprj_slave = 0x00FF00FF;
-    if (reg_mprj_slave == 0x000001FE)
+    // set control information to SHA256: sha_mode, sha_init, auto_ctrl, and start_ctrl
+    // *init bit starts sha_core, but only write to control register after reading in 512-bit input!
+    reg_la3_data = 0x00050C00;
+    
+    // TODO could put in loop?
+    reg_mprj_slave = 0x0FAB0FAB;
+    // reg_mprj_slave = sha256_input[index];
+    // index++;
+    // sha_addr == ADDR_BLOCK0 && sha_we && sha_cs && sha_read_data == 0
+    if (((reg_la2_data & 0x000000FF) == 0x10) && ((reg_la2_data & 0x00000F00) == 0x300))
     {
-        prevResult = reg_mprj_slave;
+        // Write 1st input to sha module
         testsPassed = testsPassed & 1;
     }
     else
     {
-        prevResult = 0xBAD0BAD0;
+        // did not read input
         testsPassed = testsPassed & 0;
+        reg_mprj_datal = 0xBAD0BAD0;
     }
 
-    // set prev_result on LA
-    reg_la1_data = prevResult;
+    // set control information to SHA256: disable start_ctrl
+    reg_la3_data = 0x00050800;
 
-    // set nAdd_sub and previous result computation flags
-    reg_la2_data = 0x40000000;
-
-    // set previous result as input to adder (0x1FE)
-    reg_mprj_slave = prevResult;
-
-    // continue adding previous result to itself
-    while (reg_mprj_slave < 0x07F80000)
+    reg_mprj_slave = 0x0000F00D;
+    // reg_mprj_slave = sha256_input[index];
+    // index++;
+    // sha_addr == ADDR_BLOCK1 && sha_we && sha_cs && sha_read_data == 0
+    if (((reg_la2_data & 0x000000FF) == 0x11) && ((reg_la2_data & 0x00000F00) == 0x300))
     {
-        if (reg_mprj_slave == (prevResult + prevResult))
-        {
-            // set previous result to adder output
-            prevResult = reg_mprj_slave;
-            testsPassed = testsPassed & 1;
+        // Write 2nd input to sha module
+        testsPassed = testsPassed & 1;
+    }
+    else
+    {
+        // did not read input
+        testsPassed = testsPassed & 0;
+        reg_mprj_datal = 0xBAD0BAD0;
+    }
 
-            // set prev_result on LA
-            reg_la1_data = prevResult;
-            // set previous result as input to adder
-            reg_mprj_slave = prevResult;
-        }
-        else
-        {
-            // set previous result to adder output
-            prevResult = reg_mprj_slave;
-            testsPassed = testsPassed & 0;
+    reg_mprj_slave = 0x0001F00D;
+    reg_mprj_slave = 0x0002F00D;
+    reg_mprj_slave = 0x0003F00D;
+    reg_mprj_slave = 0x0004F00D;
+    reg_mprj_slave = 0x0005F00D;
+    reg_mprj_slave = 0x0006F00D;
+    reg_mprj_slave = 0x0007F00D;
+    reg_mprj_slave = 0x0008F00D;
+    reg_mprj_slave = 0x0009F00D;
+    reg_mprj_slave = 0x000AF00D;
+    reg_mprj_slave = 0x000BF00D;
+    reg_mprj_slave = 0x000CF00D;
+    reg_mprj_slave = 0x000DF00D;
+    reg_mprj_slave = 0x000EF00D;
 
-            // set prev_result on LA
-            reg_la1_data = 0xBAD0BAD0;
-            break;
-        }
+    // read valid output hash (digest)
+    hash_out0 = reg_mprj_slave;
+    if (((reg_la2_data & 0x000000FF) == 0x20) && ((reg_la2_data & 0x00000F00) == 0x100))
+    {
+        testsPassed = testsPassed & 1;
+    }
+    else
+    {
+        testsPassed = testsPassed & 0;
+        reg_mprj_datal = 0xBAD0BAD0;
+    }
+
+    hash_out1 = reg_mprj_slave;
+    if (((reg_la2_data & 0x000000FF) == (0x20 + 1)) && ((reg_la2_data & 0x00000F00) == 0x100))
+    {
+        testsPassed = testsPassed & 1;
+    }
+    else
+    {
+        testsPassed = testsPassed & 0;
+        reg_mprj_datal = 0xBAD0BAD0;
+    }
+
+    hash_out2 = reg_mprj_slave;
+    if (((reg_la2_data & 0x000000FF) == (0x20 + 2)) && ((reg_la2_data & 0x00000F00) == 0x100))
+    {
+        testsPassed = testsPassed & 1;
+    }
+    else
+    {
+        testsPassed = testsPassed & 0;
+        reg_mprj_datal = 0xBAD0BAD0;
+    }
+
+    hash_out3 = reg_mprj_slave;
+    if (((reg_la2_data & 0x000000FF) == (0x20 + 3)) && ((reg_la2_data & 0x00000F00) == 0x100))
+    {
+        testsPassed = testsPassed & 1;
+    }
+    else
+    {
+        testsPassed = testsPassed & 0;
+        reg_mprj_datal = 0xBAD0BAD0;
+    }
+
+    hash_out4 = reg_mprj_slave;
+    if (((reg_la2_data & 0x000000FF) == (0x20 + 4)) && ((reg_la2_data & 0x00000F00) == 0x100))
+    {
+        testsPassed = testsPassed & 1;
+    }
+    else
+    {
+        testsPassed = testsPassed & 0;
+        reg_mprj_datal = 0xBAD0BAD0;
+    }
+
+    hash_out5 = reg_mprj_slave;
+    if (((reg_la2_data & 0x000000FF) == (0x20 + 5)) && ((reg_la2_data & 0x00000F00) == 0x100))
+    {
+        testsPassed = testsPassed & 1;
+    }
+    else
+    {
+        testsPassed = testsPassed & 0;
+        reg_mprj_datal = 0xBAD0BAD0;
+    }
+
+    hash_out6 = reg_mprj_slave;
+    if (((reg_la2_data & 0x000000FF) == (0x20 + 6)) && ((reg_la2_data & 0x00000F00) == 0x100))
+    {
+        testsPassed = testsPassed & 1;
+    }
+    else
+    {
+        testsPassed = testsPassed & 0;
+        reg_mprj_datal = 0xBAD0BAD0;
+    }
+
+    hash_out7 = reg_mprj_slave;
+    if (((reg_la2_data & 0x000000FF) == 0x27) && ((reg_la2_data & 0x00000F00) == 0x100))
+    {
+        testsPassed = testsPassed & 1;
+    }
+    else
+    {
+        testsPassed = testsPassed & 0;
+        reg_mprj_datal = 0xBAD0BAD0;
     }
 
 
     if (testsPassed)
     {
-        // set previous result to adder output
-        prevResult = reg_mprj_slave;
-        // set prev_result on LA
-        reg_la1_data = prevResult;
-
-        reg_mprj_datal = 0xAB610000;
+        // Successfully ended test
+        reg_mprj_datal = 0xDEADDEAD;
     }
     else
     {
-        reg_mprj_datal = 0xAB620000;
+        reg_mprj_datal = 0xBAD0BAD0;
     }
 }
diff --git a/verilog/dv/btc_miner_top_test1/btc_miner_top_test1_tb.v b/verilog/dv/btc_miner_top_test1/btc_miner_top_test1_tb.v
index 44e0630..531afe1 100644
--- a/verilog/dv/btc_miner_top_test1/btc_miner_top_test1_tb.v
+++ b/verilog/dv/btc_miner_top_test1/btc_miner_top_test1_tb.v
@@ -49,15 +49,15 @@
 		$dumpvars(0, btc_miner_top_test1_tb);
 
 		// Repeat cycles of 1000 clock edges as needed to complete testbench
-		repeat (220) begin
+		repeat (100) begin
 			repeat (1000) @(posedge clock);
 			// $display("+1000 cycles");
 		end
 		$display("%c[1;31m",27);
 		`ifdef GL
-			$display ("Monitor: Timeout, Adder Test 2 (GL) Failed");
+			$display ("Monitor: Timeout, Miner Test 1 (GL) Failed");
 		`else
-			$display ("Monitor: Timeout, Adder Test 2 (RTL) Failed");
+			$display ("Monitor: Timeout, Miner Test 1 (RTL) Failed");
 		`endif
 		$display("%c[0m",27);
 		$finish;
@@ -65,13 +65,13 @@
 
 	// TODO change finish conditions
 	initial begin
-	  wait(checkbits == 16'hAB60);
-		$display("Monitor: Adder Test 2 Started");
-		wait(checkbits == 16'hAB61);
+	  wait(checkbits == 16'hFEED);
+		$display("Monitor: Miner Test 1 Started");
+		wait(checkbits == 16'hDEAD);
 		`ifdef GL
-				$display("Monitor: Adder Test 2 (GL) Passed");
+				$display("Monitor: Miner Test 1 (GL) Passed");
 		`else
-				$display("Monitor: Adder Test 2 (RTL) Passed");
+				$display("Monitor: Miner Test 1 (RTL) Passed");
 		`endif
 			$finish;
 	end
diff --git a/verilog/rtl/btc_miner_top.v b/verilog/rtl/btc_miner_top.v
index 0744621..91c963d 100644
--- a/verilog/rtl/btc_miner_top.v
+++ b/verilog/rtl/btc_miner_top.v
@@ -86,11 +86,10 @@
   wire [7:0] o_sha_address;
   wire [BITS-1:0] o_sha_read_data;
 
-  reg [127:0] la_data_out;  // TODO? ensure LA muxing does not require register
-
   // TODO use top 32-bits of LA to control muxing and other variables like starting state machine
   wire [5:0] la_sel;
   assign la_sel = la_data_in[127:122];
+  wire [75:0] la_data_out_w;
 
   // WB MI A
   assign valid = wbs_cyc_i && wbs_stb_i; 
@@ -112,22 +111,29 @@
   assign la_write2 = ~la_oenb[95:64] & ~{BITS{valid}};
   assign la_write3 = ~la_oenb[127:96] & ~{BITS{valid}};
 
+  assign la_data_out_w = {o_idle, o_error, o_sha_we, o_sha_cs, o_sha_address, o_sha_read_data, rdata};
+
   // Assuming LA probes [111:110] are for controlling the reset & clock
   assign clk = (~la_oenb[110]) ? la_data_in[110] : wb_clk_i;
   assign rst = (~la_oenb[111]) ? la_data_in[111] : wb_rst_i;
 
   // TODO more LA muxing
-  always @(la_data_in || la_oenb || la_sel || o_idle || o_error || o_sha_we || o_sha_cs || o_sha_address || o_sha_read_data || rdata) begin
-    case (la_sel)
-      6'b000000:
-        la_data_out <= {{(127-((2*BITS)-12)){1'b0}}, {o_idle, o_error, o_sha_we, o_sha_cs, o_sha_address, o_sha_read_data, rdata}};
+  assign la_data_out = (la_sel == 6'b000000) ? {{52{1'b0}}, la_data_out_w} : ((la_sel == 6'b000001) ? {{52{1'b0}}, la_data_out_w} : {{52{1'b0}}, la_data_out_w});
+  // always @(clk || la_data_in || la_oenb || la_sel || la_data_out_w) begin
+  //   if (rst) begin
+  //     la_data_out <= 0;
+  //   end else begin
+  //     case (la_sel)
+  //     6'b000000:
+  //       la_data_out <= {{52{1'b0}}, la_data_out_w};
 
-      default:
-        begin
-          la_data_out <= {{(127-((2*BITS)-12)){1'b0}}, {o_idle, o_error, o_sha_we, o_sha_cs, o_sha_address, o_sha_read_data, rdata}};
-        end
-    endcase
-  end
+  //     default:
+  //       begin
+  //         la_data_out <= {{52{1'b0}}, la_data_out_w};
+  //       end
+  //   endcase
+  //   end
+  // end
 
   // module for controlling the sha module
   miner_ctrl #(
@@ -146,7 +152,7 @@
     .idle(o_idle),
     .reg_sha_cs(o_sha_cs),
     .reg_sha_we(o_sha_we),
-    .reg_sha_address(o_sha_address),
+    .sha_address(o_sha_address),
     .sha_read_data(o_sha_read_data)
   );
 
@@ -170,7 +176,7 @@
   output wire idle,
   output reg reg_sha_cs,
   output reg reg_sha_we,
-  output reg [7:0] reg_sha_address,
+  output wire [7:0] sha_address,
   output wire [BITS-1:0] sha_read_data  // output from sha256
 );
 
@@ -202,7 +208,7 @@
   // localparam ADDR_BLOCK11   = 8'h1b;
   // localparam ADDR_BLOCK12   = 8'h1c;
   // localparam ADDR_BLOCK13   = 8'h1d;
-  // localparam ADDR_BLOCK14   = 8'h1e;
+  localparam ADDR_BLOCK14   = 8'h1e;
   localparam ADDR_BLOCK15   = 8'h1f;
 
   localparam ADDR_DIGEST0   = 8'h20;
@@ -219,14 +225,14 @@
 
   // enum logic [1:0] {WAIT_IN=2'b00, READ_IN=2'b01, WAIT_COMPUTE=2'b10, CHECK=2'b11, WRITE_OUT=} state;
   // enum integer unsigned {WAIT_IN=0, READ_IN=1, WAIT_COMPUTE=2, INCR_NONCE=3, WRITE_OUT=4} state;
-  localparam WAIT_IN=0, WRITE_CTRL=1, READ_IN=2, WAIT_COMPUTE=3, WRITE_OUT=4;
+  localparam WAIT_IN=0, READ_IN=1, WRITE_CTRL=2, WAIT_COMPUTE=3, WRITE_OUT=4;
 
   reg [2:0] state;
 
   wire start_ctrl;
   wire sha_cs;
   wire sha_we;
-  wire [7:0] sha_address;
+  reg [7:0] reg_sha_address;
 
   // sha_mode, sha_next, sha_init. Map to ADDR_CTRL register [2:0]
   wire [2:0] sha_ctrl_bits;
@@ -239,28 +245,33 @@
   wire auto_ctrl;
 
   assign idle = (state == WAIT_IN) ? 1'b1 : 1'b0;
-  assign start_ctrl = la_input3[10] & la_write3[10];
+
+  // * la_write is 0 when valid is 1 !!
+  // assign start_ctrl = la_input3[10] & la_write3[10];
+  assign start_ctrl = la_input3[10];  
 
   // automated and manual control
   assign read_status_flag = sha_cs && !sha_we && (sha_address == ADDR_STATUS);
   assign sha_in_ready = read_status_flag ? sha_read_data[STATUS_READY_BIT] : 1'b0;
   assign sha_digest_valid = read_status_flag ? sha_read_data[STATUS_VALID_BIT] : 1'b0;
 
-  assign auto_ctrl = la_input3[11] & la_write3[11];
+  // assign auto_ctrl = la_input3[11] & la_write3[11];
+  assign auto_ctrl = la_input3[11];
 
-  assign sha_cs = auto_ctrl ? reg_sha_cs : (la_input3[8] & la_write3[8]);
-  assign sha_we = auto_ctrl ? reg_sha_we : (la_input3[9] & la_write3[9]);
-  assign sha_address = auto_ctrl ? reg_sha_address : (la_input3[7:0] & la_write3[7:0]);
-  assign sha_ctrl_bits = la_input3[18:16] & la_write3[18:16];
+  // assign sha_cs = auto_ctrl ? reg_sha_cs : (la_input3[8] & la_write3[8]);
+  // assign sha_we = auto_ctrl ? reg_sha_we : (la_input3[9] & la_write3[9]);
+  // assign sha_address = auto_ctrl ? reg_sha_address : (la_input3[7:0] & la_write3[7:0]);
+  // assign sha_ctrl_bits = la_input3[18:16] & la_write3[18:16];
+  assign sha_cs = auto_ctrl ? reg_sha_cs : la_input3[8];
+  assign sha_we = auto_ctrl ? reg_sha_we : la_input3[9];
+  assign sha_address = auto_ctrl ? reg_sha_address : la_input3[7:0];
+  assign sha_ctrl_bits = la_input3[18:16];
 
-  // need to count to 640/32 = 20 (decimal). Only to 19 b/c nonce is last 32-bits
-  integer unsigned count;
 
   always @(posedge clk) begin
     if (rst) begin
-      ready <= 0;
       rdata <= 0;
-      count <= 0;
+      ready <= 0;
       reg_sha_cs <= 0;
       reg_sha_we <= 0;
       reg_sha_address <= 0;
@@ -269,20 +280,46 @@
     end else if (auto_ctrl) begin
       ready <= 1'b0;
 
-      // state machine for controlling miner and I/O
+      // state machine for controlling SHA module and I/O
       case (state)
         WAIT_IN: begin
           // wait for LA start input and for sha module to be ready
           reg_sha_cs <= 1'b1;
           reg_sha_we <= 1'b0;
           reg_sha_address <= ADDR_STATUS;
-          sha_write_data <= {{(BITS-3){1'b0}}, {sha_ctrl_bits}};
 
           if (start_ctrl && sha_in_ready) begin
             reg_sha_cs <= 1'b1;
             reg_sha_we <= 1'b1;
-            reg_sha_address <= ADDR_CTRL;
-            state <= WRITE_CTRL;
+            state <= READ_IN;
+          end
+        end
+
+        READ_IN: begin
+          // read in 512-bit input to sha module through WB
+          reg_sha_cs <= 1'b1;
+          reg_sha_we <= 1'b1;
+
+          if (valid && !ready) begin
+            ready <= 1'b1;
+            sha_write_data <= wdata;
+
+            if (wb_wr_mask == 4'b1111) begin
+              // read up to the last address
+              if (sha_address == ADDR_BLOCK14) begin
+                // new addr will be ADDR_BLOCK15
+                reg_sha_address <= reg_sha_address + 1;
+                state <= WRITE_CTRL;
+              end else begin
+                // check if 1st write coming from WAIT_IN
+                if (sha_address == ADDR_STATUS) begin
+                  reg_sha_address <= ADDR_BLOCK0;
+                end else begin
+                  reg_sha_address <= reg_sha_address + 1;
+                end
+              end
+            end
+
           end
         end
 
@@ -295,7 +332,7 @@
           // write and read back to ensure CTRL is set
           if (reg_sha_we == 1'b0) begin
             if (sha_read_data[2:0] == sha_ctrl_bits) begin
-              state <= READ_IN;
+              state <= WAIT_COMPUTE;
             end
             reg_sha_we <= 1'b1;
           end else begin
@@ -303,33 +340,6 @@
           end
         end
 
-        READ_IN: begin
-          // read in 512 bit input to sha module
-          reg_sha_cs <= 1'b1;
-          reg_sha_we <= 1'b1;
-
-          if (valid && !ready) begin
-            ready <= 1'b1;
-            sha_write_data <= wdata;
-
-            if (wb_wr_mask == 4'b1111) begin
-              // read up to the last address
-              if (sha_address == ADDR_BLOCK15) begin
-                state <= WAIT_COMPUTE;
-              end else begin
-                // check if 1st write coming from WRITE_CTRL
-                if (sha_address == ADDR_CTRL) begin
-                  reg_sha_address <= ADDR_BLOCK0;
-                end else begin
-                  reg_sha_address <= reg_sha_address + 1;
-                end
-              end
-            end
-
-          end
-        end
-        // TODO? could do a check and read back 512-bit block to ensure it is correct
-
         WAIT_COMPUTE: begin
           // read status register to determine when done
           reg_sha_cs <= 1'b1;
@@ -343,7 +353,7 @@
         end
 
         WRITE_OUT: begin
-          // TODO?
+          // write valid 256-bit digest to WB
           reg_sha_cs <= 1'b1;
           reg_sha_we <= 1'b0;
 
@@ -354,10 +364,12 @@
             if (wb_wr_mask == 4'b0000) begin
               rdata <= sha_read_data;
               
-              if ((sha_ctrl_bits[2] == MODE_SHA_256) && sha_address == ADDR_DIGEST7) begin
+              if ((sha_ctrl_bits[2] == MODE_SHA_256) && (sha_address == ADDR_DIGEST7)) begin
+                reg_sha_address <= ADDR_STATUS;
                 state <= WAIT_IN;
               end else if ((sha_ctrl_bits[2] == MODE_SHA_224) && (sha_address == ADDR_DIGEST6)) begin
                 // only read 7 words from digest reg
+                reg_sha_address <= ADDR_STATUS;
                 state <= WAIT_IN;
               end else begin
                 reg_sha_address <= reg_sha_address + 1;