more simulation and attempt to make user_project_wrapper
diff --git a/ol_templates/Makefile b/ol_templates/Makefile
index 4e58532..7913275 100644
--- a/ol_templates/Makefile
+++ b/ol_templates/Makefile
@@ -52,7 +52,7 @@
 	@echo -n >../openlane/user_proj_example/macro_placement.cfg
 	@for r in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15; do \
 		for c in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15; do \
-		echo "blk.column\["$$c"\].row\["$$r"\].yc "$$(printf %.3f $$((40480+80960*(15-$$c)))e-3)" "$$(printf %.3f $$((86400+86400*(15-$$r)))e-3)" N" >> ../openlane/user_proj_example/macro_placement.cfg; \
+		echo "blk.column\["$$c"\].row\["$$r"\].yc "$$(printf %.3f $$((220000+65480*$$c))e-3)" "$$(printf %.3f $$((87000+76160*$$r))e-3)" S" >> ../openlane/user_proj_example/macro_placement.cfg; \
 		done \
 	done
 
diff --git a/ol_templates/config_block2.tcl b/ol_templates/config_block2.tcl
index b26a5a4..6f8265b 100644
--- a/ol_templates/config_block2.tcl
+++ b/ol_templates/config_block2.tcl
@@ -16,8 +16,8 @@
 set ::env(EXTRA_GDS_FILES) "\
         $script_dir/../../gds/morphle_ycell.gds"
 
-set ::env(PDN_CFG) $script_dir/pdn.tcl
-set ::env(FP_PDN_CORE_RING) 1
+#set ::env(PDN_CFG) $script_dir/pdn.tcl
+#set ::env(FP_PDN_CORE_RING) 1
 
 set ::unit 3
 set ::env(FP_IO_VEXTEND) [expr 2*$::unit]
@@ -29,7 +29,7 @@
 set ::env(FP_IO_HTHICKNESS_MULT) 4
 
 set ::env(PL_OPENPHYSYN_OPTIMIZATIONS) 0
-set ::env(DIODE_INSERTION_STRATEGY) 0
+set ::env(DIODE_INSERTION_STRATEGY) 3
 
 # Need to fix a FastRoute bug for this to work, but it's good
 # for a sense of "isolation"
diff --git a/verilog/morphle/yblock.v b/verilog/morphle/yblock.v
index 7b7c465..12a0640 100644
--- a/verilog/morphle/yblock.v
+++ b/verilog/morphle/yblock.v
@@ -98,9 +98,9 @@
              // cbitin, cbitout,
              .cbitin(vcbit[y][x]), .cbitout(vcbit[y+1][x]),
              // hempty, vempty, (R, U)
-             .hempty(he2[x][y]), .vempty(ve2[y][x]),
+             .hempty(he2[x+1][y]), .vempty(ve2[y][x]),
              // hempty2, vempty2, (L, D)
-             .hempty2(he[x+1][y]), .vempty2(ve[y+1][x]),
+             .hempty2(he[x][y]), .vempty2(ve[y+1][x]),
              // uempty, uin, uout,
              .uempty(ve[y][x]),
              .uin(vs[y][2*x+1:2*x]),
@@ -110,11 +110,11 @@
              .din(vb[y+1][2*x+1:2*x]),
              .dout(vs[y+1][2*x+1:2*x]),
              // lempty, lin, lout,
-             .lempty(he2[x+1][y]),
+             .lempty(he[x+1][y]),
              .lin(hs[x+1][2*y+1:2*y]),
              .lout(hb[x+1][2*y+1:2*y]),
              // rempty, rin, rout
-             .rempty(he[x][y]),
+             .rempty(he2[x][y]),
              .rin(hb[x][2*y+1:2*y]),
              .rout(hs[x][2*y+1:2*y])
              );
@@ -142,13 +142,13 @@
   assign vb[BLOCKHEIGHT] = din;
   assign dout = vs[BLOCKHEIGHT];
   // RIGHT
-  assign he[0] = rempty;
-  assign rhempty = he2[0];   
+  assign he2[0] = rempty;
+  assign rhempty = he[0];   
   assign hb[0] = rin;
   assign rout = hs[0];
   // LEFT
-  assign he2[BLOCKWIDTH] = lempty;
-  assign lhempty = he[BLOCKWIDTH];  
+  assign he[BLOCKWIDTH] = lempty;
+  assign lhempty = he2[BLOCKWIDTH];  
   assign hs[BLOCKWIDTH] = lin;
   assign lout = hb[BLOCKWIDTH];
   
diff --git a/verilog/morphle/ycell.v b/verilog/morphle/ycell.v
index e10f099..3f1a45e 100644
--- a/verilog/morphle/ycell.v
+++ b/verilog/morphle/ycell.v
@@ -177,7 +177,7 @@
   wire [1:0] vout;
   wire [1:0] vback;
 
-  wire [1:0] hmatch = {(vback[1]&hmatch1)|(vback[0]&hmatch0),(vback[1]&~hmatch1&hmatch0)|(vback[0]&~hmatch0&hmatch1)};
+  wire [1:0] hmatch = {(vback[1]&vmatch1)|(vback[0]&vmatch0),(vback[1]&~vmatch1&vmatch0)|(vback[0]&~vmatch0&vmatch1)};
   ycfsm hfsm (.reset(hreset), .in(hin), .match(hmatch), .out(hout));
   wire [1:0] bhout = hbypass ? hin : hout;
   assign rout = bhout;
@@ -185,7 +185,7 @@
   assign hback = (rempty | hempty) ? bhout : rin; // don't propagate when rightmost or empty
   assign lout = hback;
   
-  wire [1:0] vmatch = {(hback[1]&vmatch1)|(hback[0]&vmatch0),(hback[1]&~vmatch1&vmatch0)|(hback[0]&~vmatch0&vmatch1)};
+  wire [1:0] vmatch = {(hback[1]&hmatch1)|(hback[0]&hmatch0),(hback[1]&~hmatch1&hmatch0)|(hback[0]&~hmatch0&hmatch1)};
   ycfsm vfsm (.reset(vreset), .in(vin), .match(vmatch), .out(vout));
   wire [1:0] bvout = vbypass ? vin : vout;
   assign dout = bvout;
diff --git a/verilog/mtests/test004.tv b/verilog/mtests/test004.tv
index d133429..feca9a9 100644
--- a/verilog/mtests/test004.tv
+++ b/verilog/mtests/test004.tv
@@ -90,6 +90,7 @@
 10A0000
 00F0000  // another 1 - configured to 101 (0)
 0010000  // out of reset
+0215502  // left empty, uin = 0, din = 0 ==> rout = 1
 00B0000  // reset
 20E0000  // clock in a 1 (thinks it is 011 = |)
 20A0000
@@ -110,7 +111,7 @@
 0010000
 0010041  // lin = 0 ==> rout = 0
 0010000
-0010014  // rin = 0 ==> lout = 0
+0010114  // rin = 0 ==> lout = 0, dout = 0
 0010000
 0011482  // din = 0, lin = 1 (what we are looking for) ==> rout = 1, dout = 0
 0011400  // lin = empty, but previous result was supposed to be latched
diff --git a/verilog/mtests/test005.tv b/verilog/mtests/test005.tv
index 195030f..16ab648 100644
--- a/verilog/mtests/test005.tv
+++ b/verilog/mtests/test005.tv
@@ -168,5 +168,26 @@
 A_2600_00000000_0000_00000000  // lsb bit
 B_2600_00000000_0000_00000000
 0_0000_00000000_0000_00000000  // normal operation!
-4_0000_15A80000_0000_15000000  // inject E0 00 11 1E EE EE EE EE and print
-4_0000_00000000_0000_00000000  // normal operation and print
+0_0000_15A80000_0000_15000000  // inject E0 00 11 1E EE EE EE EE
+0_0000_15A80000_0000_15540000  // inject E0 00 11 1E EE EE EE EE
+0_0000_00000000_0000_00540000  // normal operation
+0_0000_00A80000_0000_00000000  // clear outputs
+0_0000_16A80000_0000_16000000  // inject E0 01 11 1E EE EE EE EE
+0_0000_16A80000_0000_16580000  // inject E0 01 11 1E EE EE EE EE
+0_0000_00000000_0000_00580000  // normal operation
+0_0000_00A80000_0000_00000000  // clear outputs
+0_0000_19A80000_0000_19000000  // inject E0 10 11 1E EE EE EE EE
+0_0000_19A80000_0000_19640000  // inject E0 10 11 1E EE EE EE EE
+0_0000_00000000_0000_00640000  // normal operation
+0_0000_29000000_0000_29000000  // inject E1 10 EE EE EE EE EE EE
+0_0000_29000000_0000_29000000  // inject E1 10 EE EE EE EE EE EE
+0_0000_00000000_0000_00000000  // normal operation
+0_0000_00A80000_0000_00580000  // get pending outputs
+0_0000_00A80000_0000_00580000  // get pending outputs
+0_0000_00000000_0000_00000000  // everyone is empty again
+4_0000_02080000_0000_02000000  // inject EE E1 EE 1E EE EE EE EE
+4_0000_02080000_0000_02000000  // inject EE E1 EE 1E EE EE EE EE
+4_0000_0A280000_0000_0A000000  // inject EE 11 E1 1E EE EE EE EE
+4_0000_0A280000_0000_0A000000  // inject EE 11 E1 1E EE EE EE EE
+4_0000_2AA80000_0000_2A000000  // inject E1 11 11 1E EE EE EE EE
+4_0000_2AA80000_0000_2A580000  // inject E1 11 11 1E EE EE EE EE
diff --git a/verilog/mtests/test005upblock.v b/verilog/mtests/test005upblock.v
index 93f4619..b757b35 100644
--- a/verilog/mtests/test005upblock.v
+++ b/verilog/mtests/test005upblock.v
@@ -172,7 +172,7 @@
         end
         $display("  ");
         for (c = 15; c > 1; c = c - 1) begin // left to right
-          $write("  %b  ", DUT.blk.he2[c][r]);
+          $write("  %b  ", DUT.blk.he[c][r]);
           if (cfg[r][c] == 3'b000) $write(".");
           else if (cfg[r][c] == 3'b001) $write("+");
           else if (cfg[r][c] == 3'b010) $write("-");
@@ -182,7 +182,7 @@
           else if (cfg[r][c] == 3'b110) $write("Y");
           else if (cfg[r][c] == 3'b111) $write("N");
           else $write("?");
-          $write("  %b", DUT.blk.he2[c][r]);
+          $write("  %b", DUT.blk.he[c][r]);
         end
         $display("  ");
         for (c = 15; c > 1; c = c - 1) begin // left to right