Multiplex main RAM, cartridge, and video RAM access on the same bus
diff --git a/verilog/rtl/README.md b/verilog/rtl/README.md
new file mode 100644
index 0000000..09a7886
--- /dev/null
+++ b/verilog/rtl/README.md
@@ -0,0 +1,32 @@
+## Memory Access / Multiplexing
+
+Due to the limited pin count, cartridge/main memory access and VRAM access are multiplexed over the same bus.
+
+The four T-cycles (CT 0-3) of each M-cycle are divided as follows:
+
+- 0: Cartridge address setup (for external latching)
+- 1: VRAM access
+- 2: WRAM or cartridge RW
+- 3: VRAM access
+
+Diagram:
+
+```
+CT       |  0  |  1  |  2  |  3  |
+          ___________             ___            
+CLK  ____|           |___________|     
+     _    __    __    __    __    __    
+CK    |__|  |__|  |__|  |__|  |__|  |
+     ____       _____________________
+EALE     |_____|
+                      _____
+ECS  ________________|     |_________
+     ________________       _________
+WR                   |_____|
+     ____ _____ _____ _____ _____ ___
+ADDR ____X_WR__X_VR1_X_WR__X_VR2_X___
+     ____       _____ _____ _____ 
+DATA ____>-----<_VR1_X_WR__X_VR2_>---
+```
+
+Note that the VRAM/WRAM separation is based on the functional unit, not on the address: only the PPU accesses VRAM, during cycles 1 and 3. If the CPU accesses VRAM, the access still happens on cycle 2.
diff --git a/verilog/rtl/async_ram.v b/verilog/rtl/async_ram.v
new file mode 100644
index 0000000..89765e6
--- /dev/null
+++ b/verilog/rtl/async_ram.v
@@ -0,0 +1,23 @@
+`timescale 1ns / 1ps
+`default_nettype wire
+module async_ram #( // Byte-wide RAM model: clocked write, combinational read
+    parameter integer WORDS = 8192, // Number of 8-bit words in the array
+    parameter ABITS = 13 // Address width; not derived from WORDS, set both together
+)(
+    input clka, // Write clock
+    input wea, // Write enable, sampled on the rising edge of clka
+    input [ABITS - 1:0] addra, // Address, shared by the read and write paths
+    input [7:0] dina, // Write data
+    output [7:0] douta // Read data, follows addra without a clock
+);
+
+    reg [7:0] ram [0:WORDS-1];
+    
+    always@(posedge clka) begin // Synchronous write port
+        if (wea)
+            ram[addra] <= dina;
+    end
+    
+    assign douta = ram[addra]; // Asynchronous read: no output register
+
+endmodule
diff --git a/verilog/rtl/boy.v b/verilog/rtl/boy.v
index 69c2d65..b437e09 100644
--- a/verilog/rtl/boy.v
+++ b/verilog/rtl/boy.v
@@ -21,7 +21,8 @@
     input wire rst, // Async Reset Input
     input wire clk, // 4.19MHz Clock Input
     output wire phi, // 1.05MHz Reference Clock Output
-    // Cartridge interface
+    output wire [1:0] ct, // 0-3T cycle number
+    // CPU/ DMA bus interface
     output wire [15:0] a, // Address Bus
     output wire [7:0] dout,  // Data Bus
     input wire [7:0] din,
@@ -38,6 +39,12 @@
     // Sound output
     output reg [15:0] left,
     output reg [15:0] right,
+    // PPU bus interface
+    output wire [12:0] ppu_a,
+    output wire ppu_wr,
+    output wire ppu_rd,
+    output wire [7:0] ppu_dout,
+    input wire [7:0] ppu_din,
     // Debug interface
     output wire done,
     output wire fault
@@ -49,17 +56,18 @@
     reg  [7:0]  cpu_din;           // CPU Data Bus, to CPU
     wire [7:0]  cpu_dout;          // CPU Data Bus, from CPU
     wire [15:0] cpu_a;             // CPU Address Bus
+    wire [15:0] cpu_a_early;       // CPU Address Unbuffered
     wire [4:0]  cpu_int_en;        // CPU Interrupt Enable input
     wire [4:0]  cpu_int_flags_in;  // CPU Interrupt Flags input
     wire [4:0]  cpu_int_flags_out; // CPU Interrupt Flags output
-    wire [1:0]  cpu_ct;            // 0-3 T cycle number inside one M cycle
     
     cpu cpu(
         .clk(clk),
         .rst(rst),
         .phi(phi),
-        .ct(cpu_ct),
+        .ct(ct),
         .a(cpu_a),
+        .a_early(cpu_a_early),
         .dout(cpu_dout),
         .din(cpu_din),
         .rd(cpu_rd),
@@ -89,16 +97,15 @@
     wire dma_rd; // DMA Memory Write Enable
     wire dma_wr; // DMA Memory Read Enable
     wire [15:0] dma_a; // Main Address Bus
-    reg  [7:0]  dma_din; // Main Data Bus
+    wire [7:0]  dma_din; // Main Data Bus
     wire [7:0]  dma_dout;
     wire [7:0]  dma_mmio_dout;
     reg dma_mmio_wr; // actually wire
-    wire dma_occupy_extbus; // 0x0000 - 0x7FFF, 0xA000 - 0xFFFF
-    wire dma_occupy_vidbus; // 0x8000 - 0x9FFF
-    wire dma_occupy_oambus; // 0xFE00 - 0xFE9F
+    wire dma_occupy_bus;
     dma dma(
         .clk(clk),
         .rst(rst),
+        .ct(ct),
         .dma_rd(dma_rd),
         .dma_wr(dma_wr),
         .dma_a(dma_a),
@@ -107,10 +114,9 @@
         .mmio_wr(dma_mmio_wr),
         .mmio_din(cpu_dout),
         .mmio_dout(dma_mmio_dout),
-        .dma_occupy_extbus(dma_occupy_extbus),
-        .dma_occupy_vidbus(dma_occupy_vidbus),
-        .dma_occupy_oambus(dma_occupy_oambus)
+        .dma_occupy_bus(dma_occupy_bus)
     );
+    assign dma_din = din;
 
     // Interrupt
     // int_req is the request signal from peripherals.
@@ -171,12 +177,6 @@
     // PPU
     wire [7:0] ppu_mmio_dout;
     reg ppu_mmio_wr; // actually wire
-    wire [15:0] vram_a;
-    wire [7:0] vram_dout;
-    //wire [7:0] vram_din;
-    wire vram_rd;
-    wire vram_wr;
-    reg vram_cpu_wr;
     wire [15:0] oam_a;
     wire [7:0] oam_dout;
     wire [7:0] oam_din;
@@ -184,28 +184,20 @@
     wire oam_wr;
     reg oam_cpu_wr;
 
-    assign vram_a = (dma_occupy_vidbus) ? (dma_a) : (cpu_a);
-    //assign vram_din = (dma_occupy_vidbus) ? (dma_dout) : (cpu_dout);
-    assign vram_rd = (dma_occupy_vidbus) ? (dma_rd) : (cpu_rd);
-    assign vram_wr = (dma_occupy_vidbus) ? (1'b0) : (vram_cpu_wr);
-    assign oam_a = (dma_occupy_oambus) ? (dma_a) : (cpu_a);
-    assign oam_din = (dma_occupy_oambus) ? (dma_dout) : (cpu_dout);
-    assign oam_rd = (dma_occupy_oambus) ? (1'b0) : (cpu_rd);
-    assign oam_wr = (dma_occupy_oambus) ? (dma_wr) : (oam_cpu_wr);
+    assign oam_a = (dma_occupy_bus) ? (dma_a) : (cpu_a);
+    assign oam_din = (dma_occupy_bus) ? (dma_dout) : (cpu_dout);
+    assign oam_rd = (dma_occupy_bus) ? (1'b0) : (cpu_rd);
+    assign oam_wr = (dma_occupy_bus) ? (dma_wr) : (oam_cpu_wr);
 
     ppu ppu(
         .clk(clk),
         .rst(rst),
+        .ct(ct),
         .mmio_a(cpu_a), // mmio bus is always accessable to CPU
         .mmio_dout(ppu_mmio_dout),
         .mmio_din(cpu_dout),
         .mmio_rd(cpu_rd),
         .mmio_wr(ppu_mmio_wr),
-        .vram_a(vram_a),
-        .vram_dout(vram_dout),
-        .vram_din(cpu_dout), // DMA never writes to VRAM
-        .vram_rd(vram_rd),
-        .vram_wr(vram_wr),
         .oam_a(oam_a),
         .oam_dout(oam_dout),
         .oam_din(oam_din),
@@ -220,6 +212,11 @@
         .valid(valid),
         .hs(hs), // Horizontal Sync, Low Active
         .vs(vs),  // Vertical Sync, Low Active
+        .vram_a(ppu_a),
+        .vram_dout(ppu_din),
+        .vram_din(ppu_dout),
+        .vram_rd(ppu_rd),
+        .vram_wr(ppu_wr),
         // Ignore the debugging interface
         /* verilator lint_off PINCONNECTEMPTY */
         .scx(),
@@ -235,7 +232,7 @@
     timer timer(
         .clk(clk),
         .rst(rst),
-        .ct(cpu_ct),
+        .ct(ct),
         .a(cpu_a),
         .dout(timer_dout),
         .din(cpu_dout),
@@ -308,25 +305,6 @@
         .d(brom_dout)
     );
 
-    // Work RAM
-    wire [7:0] wram_dout;
-    wire [12:0] wram_a;
-    wire wram_wr;
-    reg wram_cpu_wr; // actually wire
-
-    assign wram_a = (dma_occupy_extbus) ? (dma_a[12:0]) : (cpu_a[12:0]);
-    assign wram_wr = (dma_occupy_extbus) ? (1'b0) : (wram_cpu_wr);
-
-    singleport_ram #(
-        .WORDS(8192)
-    ) br_wram (
-        .clka(clk),
-        .wea(wram_wr),
-        .addra(wram_a), 
-        .dina(cpu_dout), // DMA never writes to Work RAM
-        .douta(wram_dout)
-    );
-
     // Keypad
     wire [7:0] keypad_reg;
     reg keypad_reg_wr; // actually wire
@@ -345,12 +323,12 @@
           ((keypad_high[0] == 1'b1) ? (key[3:0]) : 4'h0)); 
     assign int_key_req = (keypad_reg[3:0] != 4'hf) ? (1'b1) : (1'b0);
 
-    // External Bus
+    // External Bus (this includes CPU/DMA access to WRAM, VRAM, and cartridge)
     reg ext_cpu_wr;  // wire
-    assign a = (dma_occupy_extbus) ? (dma_a) : (cpu_a);
+    assign a = (dma_occupy_bus) ? (dma_a) : (cpu_a_early);
     assign dout = cpu_dout; // DMA never writes to external bus
-    assign wr = (dma_occupy_extbus) ? (1'b0) : (ext_cpu_wr);
-    assign rd = (dma_occupy_extbus) ? (dma_rd) : (cpu_rd);
+    assign wr = (dma_occupy_bus) ? (1'b0) : (ext_cpu_wr);
+    assign rd = (dma_occupy_bus) ? (dma_rd) : (cpu_rd);
 
     // Bus Multiplexing, CPU
     always @(*) begin
@@ -364,9 +342,7 @@
         high_ram_wr = 1'b0;
         sound_wr = 1'b0;
         ppu_mmio_wr = 1'b0;
-        vram_cpu_wr = 1'b0;
         oam_cpu_wr = 1'b0;
-        wram_cpu_wr = 1'b0;
         ext_cpu_wr = 1'b0;
         // -- These are exclusive to CPU --
         if (cpu_a == 16'hffff) begin  // 0xFFFF - IE
@@ -418,23 +394,9 @@
             cpu_din = brom_dout;
         end 
         // -- These are shared between CPU and DMA --
-        else if (cpu_a >= 16'h8000 && cpu_a <= 16'h9fff) begin // VRAM
-            vram_cpu_wr = cpu_wr;
-            cpu_din = (dma_occupy_vidbus) ? (8'hff) : (vram_dout);
-        end
-        else if (cpu_a >= 16'hfe00 && cpu_a <= 16'hfe9f) begin // OAM
-            oam_cpu_wr = cpu_wr;
-            cpu_din = (dma_occupy_oambus) ? (8'hff) : (oam_dout);
-        end
-        else if ((cpu_a >= 16'hc000 && cpu_a <= 16'hdfff) ||
-                 (cpu_a >= 16'he000 && cpu_a <= 16'hfdff)) begin // WRAM
-            wram_cpu_wr = cpu_wr;
-            cpu_din = (dma_occupy_extbus) ? (8'hff) : (wram_dout);
-        end
-        else if ((cpu_a <= 16'h7fff) ||
-                 (cpu_a >= 16'ha000 && cpu_a <= 16'hbfff)) begin // External
+        else if (cpu_a <= 16'hfdff) begin // External/ Work RAM/ Video RAM
             ext_cpu_wr = cpu_wr;
-            cpu_din = (dma_occupy_extbus) ? (8'hff) : (din);
+            cpu_din = (dma_occupy_bus) ? (8'hff) : (din);
         end
         else begin
             // Unmapped area
@@ -442,18 +404,4 @@
         end
     end
 
-    // Bus Multiplexing, DMA
-    always @(*) begin
-        if (dma_a >= 16'h8000 && dma_a <= 16'h9fff) begin // VRAM
-            dma_din = vram_dout;
-        end
-        else if ((dma_a >= 16'hc000 && dma_a <= 16'hdfff) ||
-                 (dma_a >= 16'he000 && dma_a <= 16'hfdff)) begin // WRAM
-            dma_din = wram_dout;
-        end
-        else begin
-            dma_din = din;
-        end
-    end
-
 endmodule
diff --git a/verilog/rtl/chip.v b/verilog/rtl/chip.v
new file mode 100644
index 0000000..ffceb0c
--- /dev/null
+++ b/verilog/rtl/chip.v
@@ -0,0 +1,149 @@
+`timescale 1ns / 1ps
+`default_nettype none
+//////////////////////////////////////////////////////////////////////////////////
+// Company: 
+// Engineer: Wenting Zhang
+// 
+// Create Date:    18:48:36 02/14/2018 
+// Design Name: 
+// Module Name:    chip 
+// Project Name: 
+// Target Devices: 
+// Tool versions: 
+// Description: 
+//   Chip top level
+// Additional Comments: 
+//   Wraps up the VerilogBoy and expose signals to be connected to the pad frame
+//////////////////////////////////////////////////////////////////////////////////
+module chip( // Wraps boy and time-multiplexes its CPU/DMA and PPU buses onto one external bus
+    input wire clk, // 4 MHz clock input
+    input wire rst, // Active high reset (NOTE(review): called sync here, but the boy port comment says async - confirm)
+    output reg [15:0] a, // Address bus
+    output reg [7:0] dout, // Data bus to be written
+    input wire [7:0] din, // Data bus read
+    output reg doe, // Data bus output enable
+    output reg wr, // High active write enable
+    output reg cale, // Cartridge address latch enable
+    output reg cs, // Cartridge chip select
+    output wire hsync, // LCD horizontal sync
+    output wire vsync, // LCD vertical sync
+    output wire pvalid, // LCD pixel valid/ clock gate
+    output wire [1:0] pixel, // LCD pixel output
+    input wire skey, // Serial key input (not referenced inside this module)
+    output wire audiol, // Audio left output (not driven inside this module)
+    output wire audior, // Audio right output (not driven inside this module)
+    input wire mode, // Test mode (not referenced inside this module)
+    // For testbench only
+    output wire done,
+    output wire fault
+);
+    wire [1:0] ct; // T-cycle index (0-3) from boy; selects the bus phase below
+    wire [15:0] cpu_a; // CPU/DMA address from boy
+    wire [7:0] cpu_dout; // CPU write data
+    reg [7:0] cpu_din; // Read data returned to CPU/DMA (driven by the mux below)
+    wire cpu_wr; // CPU write strobe
+    wire cpu_rd; // CPU/DMA read strobe (not referenced by the mux below)
+    wire [15:0] ppu_a; // PPU VRAM address, extended to 16 bits
+    wire [7:0] ppu_dout; // PPU write data (not referenced by the mux below)
+    reg [7:0] ppu_din; // Read data returned to the PPU (driven by the mux below)
+    wire ppu_wr; // PPU write strobe (not referenced by the mux below)
+    wire ppu_rd; // PPU read strobe (not referenced by the mux below)
+    wire [15:0] left; // Sound samples from boy; not forwarded to audiol/audior here
+    wire [15:0] right;
+
+    boy boy(
+        .rst(rst), // Async Reset Input
+        .clk(clk), // 4.19MHz Clock Input
+        .phi(), // 1.05MHz Reference Clock Output
+        .ct(ct), // 0-3T cycle number
+        // CPU/ DMA bus interface
+        .a(cpu_a), // Address Bus
+        .dout(cpu_dout),  // Data Bus
+        .din(cpu_din),
+        .wr(cpu_wr), // Write Enable
+        .rd(cpu_rd), // Read Enable
+        // Keyboard input
+        .key(8'b0),
+        // LCD output
+        .hs(hsync), // Horizontal Sync Output
+        .vs(vsync), // Vertical Sync Output
+        .cpl(), // Pixel Data Latch
+        .pixel(pixel), // Pixel Data
+        .valid(pvalid),
+        // Sound output
+        .left(left),
+        .right(right),
+        // Video RAM interface
+        .ppu_a(ppu_a[12:0]),
+        .ppu_wr(ppu_wr),
+        .ppu_rd(ppu_rd),
+        .ppu_din(ppu_din),
+        .ppu_dout(ppu_dout),
+        // Debug interface
+        .done(done),
+        .fault(fault)
+    );
+
+    assign ppu_a[15:13] = 3'b100; // Place the 13-bit PPU address in 0x8000-0x9FFF
+
+    // Internal SRAM (WRAM + VRAM, 16KB)
+    // Address
+    // 1000 xxxx xxxx xxxx VRAM
+    // 1001 xxxx xxxx xxxx VRAM
+    // 1100 xxxx xxxx xxxx WRAM
+    // 1101 xxxx xxxx xxxx WRAM
+
+    wire addr_is_ram = ((cpu_a >= 16'h8000) && (cpu_a <= 16'h9fff)) ||
+            ((cpu_a >= 16'hc000) && (cpu_a <= 16'hdfff)); // NOTE(review): addr_is_ram is not used in this module
+    wire addr_is_cart = ((cpu_a <= 16'h7fff) || // Cart ROM
+            ((cpu_a >= 16'ha000) && (cpu_a <= 16'hbfff))); // Cart RAM
+
+    reg addr_is_cart_reg; // Cart decode captured in phase 0, drives cs in phase 2
+    reg [15:0] cpu_a_reg; // CPU/DMA address captured in phase 0, driven out in phase 2
+    always @(posedge clk) begin
+        if (ct == 2'b00) begin
+            addr_is_cart_reg <= addr_is_cart;
+            cpu_a_reg <= cpu_a;
+        end
+    end
+
+    // Bus multiplexing
+    always @(*) begin
+        if (ct == 2'b00) begin
+            // CPU/ DMA address output
+            a = cpu_a;
+            dout = 8'hff;
+            doe = 1'b0;
+            wr = 1'b0;
+            cale = 1'b1; // Only phase in which the external address latch is enabled
+            cs = 1'b0;
+            cpu_din = 8'hff;
+            ppu_din = 8'hff;
+        end
+        else if ((ct == 2'b01) || (ct == 2'b11)) begin
+            // VRAM access (read only)
+            a = ppu_a;
+            dout = 8'hff;
+            doe = 1'b0;
+            wr = 1'b0;
+            cale = 1'b0;
+            cs = 1'b0;
+            cpu_din = 8'hff;
+            ppu_din = din;
+        end
+        else begin
+            // CPU/ DMA access (RW)
+            a = cpu_a_reg;
+            dout = cpu_dout;
+            doe = !cpu_wr; // NOTE(review): doe is 0 in the read-only phases but is the inverse of wr here - confirm intended polarity
+            wr = cpu_wr;
+            cale = 1'b0;
+            cs = addr_is_cart_reg; // Select the cartridge only for cart ROM/RAM addresses
+            cpu_din = din;
+            ppu_din = 8'hff;
+        end
+    end
+
+
+endmodule
+`default_nettype wire
diff --git a/verilog/rtl/cpu.v b/verilog/rtl/cpu.v
index 6b59982..cd6df6b 100644
--- a/verilog/rtl/cpu.v
+++ b/verilog/rtl/cpu.v
@@ -21,6 +21,7 @@
     output reg phi,
     output wire [1:0] ct,
     output reg [15:0] a,
+    output wire [15:0] a_early,
     output reg [7:0] dout,
     input [7:0] din,
     output reg rd,
@@ -454,6 +455,12 @@
             end
             2'b01: begin
                 // Read in progress
+                if (bus_op == 2'b10) begin
+                    // Write cycle
+                    wr <= 1;
+                    dout <= db_wr;
+                end
+                // Otherwise wait for next cycle for read
             end
             2'b10: begin
                 if (bus_op == 2'b10) begin
@@ -501,6 +508,8 @@
         end
     end
 
+    assign a_early = ab_wr; // For external latching
+
     // CT - FSM / Instruction Execution
     reg  [1:0] alu_src_a_ct;
     reg  [2:0] alu_src_b_ct;
diff --git a/verilog/rtl/dma.v b/verilog/rtl/dma.v
index 5aaadaa..d384a53 100644
--- a/verilog/rtl/dma.v
+++ b/verilog/rtl/dma.v
@@ -25,6 +25,7 @@
     input  wire        clk,
     //input  wire        phi,
     input  wire        rst,
+    input  wire [1:0]  ct,
     output reg         dma_rd,
     output reg         dma_wr,
     //output wire        dma_rd_comb,
@@ -35,9 +36,7 @@
     input  wire        mmio_wr,
     input  wire [7:0]  mmio_din,
     output wire [7:0]  mmio_dout,
-    output wire        dma_occupy_extbus,
-    output wire        dma_occupy_vidbus,
-    output wire        dma_occupy_oambus
+    output wire        dma_occupy_bus
     );
 
     // DMA data blocks /////////////////////////////////////////////////////////
@@ -49,11 +48,7 @@
 
     reg cpu_mem_disable;
 
-    assign dma_occupy_extbus = cpu_mem_disable & 
-            ((dma_start_addr <= 8'h7f) || (dma_start_addr >= 8'ha0));
-    assign dma_occupy_vidbus = cpu_mem_disable &
-            ((dma_start_addr >= 8'h80) && (dma_start_addr <= 8'h9f));
-    assign dma_occupy_oambus = cpu_mem_disable;
+    assign dma_occupy_bus = cpu_mem_disable;
 
    // DMA transfer logic //////////////////////////////////////////////////////
    
@@ -95,16 +90,11 @@
                 if (mmio_wr) begin
                     // Transfer starts on next cycle
                     state <= DMA_DELAY;
-                    count <= 8'd3; // Delay before start
                 end
-                else
-                    count <= 8'd0;
+                count <= 8'd0;
             end
             DMA_DELAY: begin
-                if (count != 8'd0) begin
-                    count <= count - 1;
-                end
-                else begin
+                if (ct == 2'b11) begin
                     state <= DMA_TRANSFER_READ_ADDR;
                 end
             end
@@ -116,7 +106,7 @@
                 dma_rd <= 1'b1;
                 if (mmio_wr) begin // Allow re-triggering
                     state <= DMA_DELAY;
-                    count <= 8'd3; // Delay before start
+                    count <= 8'd0;
                 end
                 else
                     state <= DMA_TRANSFER_READ_DATA;
@@ -134,7 +124,7 @@
                 dma_wr <= 1'b1;
                 if (mmio_wr) begin // Allow re-triggering
                     state <= DMA_DELAY;
-                    count <= 8'd3; // Delay before start
+                    count <= 8'd0;
                 end
                 else
                     state <= DMA_TRANSFER_WRITE_WAIT;
@@ -143,7 +133,7 @@
                 // Wait
                 if (mmio_wr) begin // Allow re-triggering
                     state <= DMA_DELAY;
-                    count <= 8'd3; // Delay before start
+                    count <= 8'd0; // Delay before start
                 end
                 else
                 if (count == 8'h9f) begin
diff --git a/verilog/rtl/ppu.v b/verilog/rtl/ppu.v
index bf4e1e8..01ab72e 100644
--- a/verilog/rtl/ppu.v
+++ b/verilog/rtl/ppu.v
@@ -53,18 +53,13 @@
 module ppu(
     input clk,
     input rst,
+    input wire [1:0] ct,
     // MMIO Bus, 0xFF40 - 0xFF4B, always visible to CPU
     input wire [15:0] mmio_a,
     output reg [7:0]  mmio_dout,
     input wire [7:0]  mmio_din,
     input wire        mmio_rd,
     input wire        mmio_wr,
-    // VRAM Bus, 0x8000 - 0x9FFF
-    input wire [15:0] vram_a,
-    output wire [7:0] vram_dout,
-    input wire [7:0]  vram_din,
-    input wire        vram_rd,
-    input wire        vram_wr,
     // OAM Bus,  0xFE00 - 0xFE9F
     input wire [15:0] oam_a,
     output wire [7:0] oam_dout,
@@ -82,6 +77,12 @@
     output reg valid, // Pixel Valid
     output reg hs, // Horizontal Sync, High Valid
     output reg vs, // Vertical Sync, High Valid
+    // Video RAM interface
+    output wire [12:0] vram_a,
+    output wire vram_wr,
+    output wire vram_rd,
+    output wire [7:0] vram_din,
+    input wire [7:0] vram_dout,
     //Debug output
     output [7:0] scx,
     output [7:0] scy,
@@ -140,11 +141,7 @@
     wire vram_addr_int_sel; // 0 - BG, 1 - OBJ
     
     assign vram_addr_int = (vram_addr_int_sel == 1'b1) ? (vram_addr_obj) : (vram_addr_bg);
-    
-    wire vram_access_ext = ((reg_mode == PPU_MODE_H_BLANK)||
-                            (reg_mode == PPU_MODE_V_BLANK)||
-                            (reg_mode == PPU_MODE_OAM_SEARCH));
-    wire vram_access_int = ~vram_access_ext;
+
     wire oam_access_ext = ((reg_mode == PPU_MODE_H_BLANK)||
                            (reg_mode == PPU_MODE_V_BLANK));
     
@@ -187,25 +184,12 @@
     assign oam_dout = (oam_access_ext) ? (oam_data_out_byte) : (8'hFF);
 
     // 8 bit WR, 8 bit RD, 8KB VRAM
-    wire        vram_we;
-    wire [12:0] vram_addr;
-    wire [7:0]  vram_data_in;
     wire [7:0]  vram_data_out;
-    
-    singleport_ram #(
-        .WORDS(8192)
-    ) br_vram (
-        .clka(~clk),
-        .wea(vram_we),
-        .addra(vram_addr[12:0]),
-        .dina(vram_data_in),
-        .douta(vram_data_out));
-        
-    assign vram_addr_ext = vram_a[12:0];
-    assign vram_addr = (vram_access_ext) ? (vram_addr_ext) : (vram_addr_int);
-    assign vram_data_in = vram_din;
-    assign vram_we = (vram_wr)&(vram_access_ext);
-    assign vram_dout = (vram_access_ext) ? (vram_data_out) : (8'hFF);
+
+    assign vram_a = vram_addr_int;
+    assign vram_wr = 1'b0; // PPU doesn't write to VRAM
+    assign vram_din = 8'd0;
+    assign vram_data_out = vram_dout;
     
     // Pixel Pipeline
     
@@ -450,7 +434,8 @@
     assign vram_addr_int_sel = 
         ((r_state == S_OAMRDB) || (r_state == S_OFRD0A) || (r_state == S_OFRD0B)
             || (r_state == S_OFRD1A) || (r_state == S_OFRD1B)) ? 1'b1 : 1'b0;
-        
+    assign vram_rd = (r_state == S_FTIDB) || (r_state == S_FRD0B) ||
+        (r_state == S_FRD1B) || (r_state == S_OFRD0B) || (r_state == S_OFRD1B);    
     
     // Current mode logic, based on current state
     always @ (posedge clk)
@@ -712,6 +697,8 @@
             end
         end
     end
+
+    wire ram_ready = ((ct == 2'b00) || (ct == 2'b10));
     
     // Next State Logic
     // Since new state get updated during posedge
@@ -733,20 +720,20 @@
                 ) : (S_IDLE);
             S_OAMX: r_next_state = (reg_lcd_en) ? (S_OAMY) : (S_IDLE);
             S_OAMY: r_next_state = (reg_lcd_en) ? ((oam_search_count == (PPU_OAM_SEARCH_LENGTH - 1'b1)) ? (S_FTIDA) : (S_OAMX)) : (S_IDLE);
-            S_FTIDA: r_next_state = (reg_lcd_en) ? ((h_pix_output == (PPU_H_OUTPUT - 1'b1)) ? (S_BLANK) : ((window_trigger) ? (S_SWW) : (S_FTIDB))) : (S_IDLE);
+            S_FTIDA: r_next_state = (reg_lcd_en) ? ((h_pix_output == (PPU_H_OUTPUT - 1'b1)) ? (S_BLANK) : ((window_trigger) ? (S_SWW) : (ram_ready ? S_FTIDB : S_FTIDA))) : (S_IDLE);
             S_FTIDB: r_next_state = (reg_lcd_en) ? ((h_pix_output == (PPU_H_OUTPUT - 1'b1)) ? (S_BLANK) : ((window_trigger) ? (S_SWW) : (S_FRD0A))) : (S_IDLE);
-            S_FRD0A: r_next_state = (reg_lcd_en) ? ((h_pix_output == (PPU_H_OUTPUT - 1'b1)) ? (S_BLANK) : ((window_trigger) ? (S_SWW) : (S_FRD0B))) : (S_IDLE);
+            S_FRD0A: r_next_state = (reg_lcd_en) ? ((h_pix_output == (PPU_H_OUTPUT - 1'b1)) ? (S_BLANK) : ((window_trigger) ? (S_SWW) : (ram_ready ? S_FRD0B : S_FRD0A))) : (S_IDLE);
             S_FRD0B: r_next_state = (reg_lcd_en) ? ((h_pix_output == (PPU_H_OUTPUT - 1'b1)) ? (S_BLANK) : ((window_trigger) ? (S_SWW) : (S_FRD1A))) : (S_IDLE);
-            S_FRD1A: r_next_state = (reg_lcd_en) ? ((h_pix_output == (PPU_H_OUTPUT - 1'b1)) ? (S_BLANK) : ((window_trigger) ? (S_SWW) : (S_FRD1B))) : (S_IDLE);
+            S_FRD1A: r_next_state = (reg_lcd_en) ? ((h_pix_output == (PPU_H_OUTPUT - 1'b1)) ? (S_BLANK) : ((window_trigger) ? (S_SWW) : (ram_ready ? S_FRD1B : S_FRD1A))) : (S_IDLE);
             S_FRD1B: r_next_state = (reg_lcd_en) ? ((h_pix_output == (PPU_H_OUTPUT - 1'b1)) ? (S_BLANK) : ((window_trigger) ? (S_SWW) : ((pf_empty != PF_FULL) ? (S_FTIDA) : (S_FWAITA)))) : (S_IDLE); // If fifo not full, no wait state is needed
             S_FWAITA: r_next_state = (reg_lcd_en) ? ((h_pix_output == (PPU_H_OUTPUT - 1'b1)) ? (S_BLANK) : ((window_trigger) ? (S_SWW) : (S_FWAITB))) : (S_IDLE);
             S_FWAITB: r_next_state = (reg_lcd_en) ? ((h_pix_output == (PPU_H_OUTPUT - 1'b1)) ? (S_BLANK) : ((window_trigger) ? (S_SWW) : (S_FTIDA))) : (S_IDLE);
             S_SWW: r_next_state = (reg_lcd_en) ? ((h_pix_output == (PPU_H_OUTPUT - 1'b1)) ? (S_BLANK) : (S_FTIDA)) : (S_IDLE);
             S_OAMRDA: r_next_state = (reg_lcd_en) ? (S_OAMRDB) : (S_IDLE);
             S_OAMRDB: r_next_state = (reg_lcd_en) ? (S_OFRD0A) : (S_IDLE);
-            S_OFRD0A: r_next_state = (reg_lcd_en) ? (S_OFRD0B) : (S_IDLE);
+            S_OFRD0A: r_next_state = (reg_lcd_en) ? (ram_ready ? S_OFRD0B : S_OFRD0A) : (S_IDLE);
             S_OFRD0B: r_next_state = (reg_lcd_en) ? (S_OFRD1A) : (S_IDLE);
-            S_OFRD1A: r_next_state = (reg_lcd_en) ? (S_OFRD1B) : (S_IDLE);
+            S_OFRD1A: r_next_state = (reg_lcd_en) ? (ram_ready ? S_OFRD1B : S_OFRD1A) : (S_IDLE);
             S_OFRD1B: r_next_state = (reg_lcd_en) ? (S_OWB) : (S_IDLE);
             S_OWB: r_next_state = (reg_lcd_en) ? (r_next_backup) : (S_IDLE);
             default: r_next_state = S_IDLE;
diff --git a/verilog/rtl/simtop.v b/verilog/rtl/simtop.v
new file mode 100644
index 0000000..64a5070
--- /dev/null
+++ b/verilog/rtl/simtop.v
@@ -0,0 +1,90 @@
+`timescale 1ns / 1ps
+`default_nettype wire
+////////////////////////////////////////////////////////////////////////////////
+// Company: 
+// Engineer: Wenting Zhang
+// 
+// Create Date:    17:30:26 02/08/2018 
+// Module Name:    simtop
+// Project Name:   VerilogBoy
+// Description: 
+//   Top-level wrapper for RTL simulation
+////////////////////////////////////////////////////////////////////////////////
+module simtop( // Simulation wrapper: chip plus a 16KB SRAM model and a cartridge address latch
+    input wire clk,
+    input wire rst,
+    // Cartridge interface
+    output reg [15:0] a, // Latched cartridge address
+    output wire [7:0] dout,
+    input wire [7:0] din,
+    output wire wr,
+    output wire rd,
+    // Keyboard input
+    input wire [7:0] key, // Not referenced inside this module
+    // LCD output
+    output wire hs,
+    output wire vs,
+    output wire [1:0] pixel,
+    output wire valid,
+    // For testbench only
+    output wire done,
+    output wire fault
+    );
+
+    wire [15:0] bus_a; // Multiplexed address bus from chip
+    wire [7:0] bus_dout; // Write data from chip
+    wire [7:0] bus_din; // Read data returned to chip
+    wire bus_doe; // Output enable from chip; not used by this wrapper
+    wire bus_wr;
+    wire bus_cale; // Cartridge address latch enable from chip
+    wire bus_cs; // Cartridge chip select from chip
+    wire skey;
+
+    chip chip(
+        .clk(clk),
+        .rst(rst),
+        .a(bus_a),
+        .dout(bus_dout),
+        .din(bus_din),
+        .doe(bus_doe),
+        .wr(bus_wr),
+        .cale(bus_cale),
+        .cs(bus_cs),
+        .hsync(hs),
+        .vsync(vs),
+        .pvalid(valid),
+        .pixel(pixel),
+        .skey(skey),
+        .audiol(),
+        .audior(),
+        .mode(1'b0),
+        .done(done),
+        .fault(fault)
+    );
+
+    wire sram_we;
+    wire [7:0] sram_dout;
+    async_ram #(.WORDS(16384), .ABITS(14)) sram(
+        .clka(clk),
+        .wea(sram_we),
+        .addra({bus_a[14], bus_a[12:0]}), // bit 14 picks the half: 0 for 0x8000-0x9FFF, 1 for 0xC000-0xDFFF
+        .dina(bus_dout),
+        .douta(sram_dout)
+    );
+    assign sram_we = bus_wr & !bus_cs; // Write SRAM only when the cartridge is not selected
+
+    // Cartridge address latch, clocked here; a transparent latch is the alternative
+    always @(posedge clk) begin
+        if (bus_cale)
+            a <= bus_a;
+    end
+
+    assign dout = bus_dout;
+    assign bus_din = bus_cs ? din : sram_dout; // Cartridge data when selected, SRAM data otherwise
+    assign wr = bus_cs & bus_wr; // Cartridge write only while the cartridge is selected
+    assign rd = ~bus_wr; // Always enable output
+
+    // Key parallel to serial
+    assign skey = 1'b0; // Tied low: the key input is not forwarded to chip
+
+endmodule
diff --git a/verilog/sim/Makefile b/verilog/sim/Makefile
index 415ae0a..eb56330 100644
--- a/verilog/sim/Makefile
+++ b/verilog/sim/Makefile
@@ -8,7 +8,7 @@
 VROOT := $(VERILATOR_ROOT)
 VINCD := $(VROOT)/include
 RTLOBJDIR := $(RTLDIR)/obj_dir
-RTLOBJ := $(RTLOBJDIR)/Vboy__ALL.a
+RTLOBJ := $(RTLOBJDIR)/Vsimtop__ALL.a
 
 CC = g++
 CXX = g++
diff --git a/verilog/sim/bootrom.mif b/verilog/sim/bootrom.mif
new file mode 120000
index 0000000..e4e13d4
--- /dev/null
+++ b/verilog/sim/bootrom.mif
@@ -0,0 +1 @@
+../rtl/bootrom.mif
\ No newline at end of file
diff --git a/verilog/sim/main.cpp b/verilog/sim/main.cpp
index c181510..a7d33bd 100644
--- a/verilog/sim/main.cpp
+++ b/verilog/sim/main.cpp
@@ -31,7 +31,7 @@
 
 #include "verilated.h"
 #include "verilated_vcd_c.h"
-#include "Vboy.h"
+#include "Vsimtop.h"
 
 #include "memsim.h"
 #include "mbcsim.h"
@@ -46,11 +46,11 @@
 #define CON_BASE 0x20000000
 
 // Verilator related
-Vboy *core;
+Vsimtop *core;
 VerilatedVcdC *trace;
 
 #define CONCAT(a,b) a##b
-#define SIGNAL(x) CONCAT(core->boy__DOT__,x)
+#define SIGNAL(x) CONCAT(core->simtop__DOT__chip__DOT__boy__DOT__,x)
 
 // this only applies to quiet mode.
 const uint64_t CYCLE_LIMIT = 32768;
@@ -141,7 +141,7 @@
             (SIGNAL(cpu__DOT__next == 0))) {
             // Instruction just finished executing
             fprintf(it, "Time %ld\nPC = %04x, F = %c%c%c%c, A = %02x, SP = %02x%02x\nB = %02x, C = %02x, D = %02x, E = %02x, H = %02x, L = %02x\n",
-                10 * tickcount,
+                10 * (tickcount - 1), // Make timing compatible with old traces
                 SIGNAL(cpu__DOT__pc),
                 ((SIGNAL(cpu__DOT__flags)) & 0x8) ? 'Z' : '-',
                 ((SIGNAL(cpu__DOT__flags)) & 0x4) ? 'N' : '-',
@@ -176,7 +176,7 @@
     // Initialize testbench
     Verilated::commandArgs(argc, argv);
 
-    core = new Vboy;
+    core = new Vsimtop;
     Verilated::traceEverOn(true);
 
     if (argc < 2) {
@@ -231,8 +231,8 @@
         mbc = new MBCSIM();
     }
     else {
-        cartrom = new MEMSIM(0x0000, 32768, 0);
-        cartram = new MEMSIM(0xa000, 8192, 0);
+        cartrom = new MEMSIM(0x0000, 32768);
+        cartram = new MEMSIM(0xa000, 8192);
     }
 
     if (!quiet) {
diff --git a/verilog/sim/memsim.cpp b/verilog/sim/memsim.cpp
index 0cbf3b0..bf79380 100644
--- a/verilog/sim/memsim.cpp
+++ b/verilog/sim/memsim.cpp
@@ -2,7 +2,7 @@
 // VerilogBoy simulator
 // Copyright 2022 Wenting Zhang
 //
-// memsim.cpp: A memory simulation model with simple delay control
+// memsim.cpp: An async memory simulation model
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -29,15 +29,10 @@
 #include <assert.h>
 #include "memsim.h"
 
-MEMSIM::MEMSIM(uint16_t base, size_t len, size_t delay) {
+MEMSIM::MEMSIM(uint16_t base, size_t len) {
     this->base = base;
     this->len = len;
-    this->delay = delay;
     mem = (uint8_t *)malloc(len);
-    delay_count = 0;
-    last_wr = 0;
-    last_rd = 0;
-    last_data = 0;
 }
 
 MEMSIM::~MEMSIM(void) {
@@ -58,35 +53,25 @@
 }
 
 void MEMSIM::apply(uint8_t wr_data, uint16_t address, 
-    uint8_t wr_enable, uint8_t rd_enable, uint8_t &rd_data) {
+    uint8_t wr, uint8_t rd, uint8_t &rd_data) {
 
-    if (delay_count == 0) {
-        if ((address >= base) && (address < (base + len))) {
-            if (last_wr && !wr_enable) {
-                mem[address - base] = last_data;
-                delay_count = delay;
+    if ((address >= base) && (address < (base + len))) {
+        if (wr) {
+            mem[address - base] = wr_data;
 #ifdef __DEBUG
-            printf("MEMBUS W[%04x] = %02x\n",
-                address,
-                last_data);
+        printf("MEMBUS W[%04x] = %02x\n",
+            address,
+            wr_data);
 #endif
-            } 
-            else if (!last_rd && rd_enable) {
-                rd_data = mem[address - base];
-                delay_count = delay;
-#ifdef __DEBUG
-            printf("MEMBUS R[%04x] = %02x\n",
-                address,
-                rd_data);
-#endif
-            }
         } 
-        last_rd = rd_enable;
-        last_wr = wr_enable;
-        last_data = wr_data;
-    } 
-    else {
-        delay_count --;
+        else if (rd) {
+            rd_data = mem[address - base];
+#ifdef __DEBUG
+        printf("MEMBUS R[%04x] = %02x\n",
+            address,
+            rd_data);
+#endif
+        }
     }
 }
 
diff --git a/verilog/sim/memsim.h b/verilog/sim/memsim.h
index 8ce05d2..8457d32 100644
--- a/verilog/sim/memsim.h
+++ b/verilog/sim/memsim.h
@@ -26,18 +26,13 @@
 
 class MEMSIM {
 public:
-    MEMSIM(uint16_t base, size_t len, size_t delay);
+    MEMSIM(uint16_t base, size_t len);
     ~MEMSIM(void);
     void load(char *fname);
-    void apply(uint8_t wr_data, uint16_t address, uint8_t wr_enable,
-            uint8_t rd_enable, uint8_t &rd_data);
+    void apply(uint8_t wr_data, uint16_t address, uint8_t wr,
+            uint8_t rd, uint8_t &rd_data);
 private:
     uint16_t base;
     uint8_t *mem;
     uint16_t len;
-    int delay;
-    int delay_count;
-    uint8_t last_wr;
-    uint8_t last_rd;
-    uint8_t last_data;
 };
diff --git a/verilog/sim/rtl.mk b/verilog/sim/rtl.mk
index fdc9c72..024e74e 100644
--- a/verilog/sim/rtl.mk
+++ b/verilog/sim/rtl.mk
@@ -1,4 +1,4 @@
-TARGET ?= boy
+TARGET ?= simtop
 all: $(TARGET)
 
 VOBJ := obj_dir