Import verilog files to rtl & tb
diff --git a/.gitignore b/.gitignore
index f4e486c..b2a927b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
-precheck_results
\ No newline at end of file
+precheck_results
+*~
diff --git a/verilog/dv/tb/alu_tb.v b/verilog/dv/tb/alu_tb.v
new file mode 100644
index 0000000..c87ec05
--- /dev/null
+++ b/verilog/dv/tb/alu_tb.v
@@ -0,0 +1,68 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+module alu_tb();
+// Stimulus-only testbench for `alu`: drives the inputs and prints every change with $monitor (nothing is checked automatically).
+parameter DATA_WIDTH = 16;
+
+reg [3:0] opcode;
+reg [DATA_WIDTH-1:0] in1;
+reg [DATA_WIDTH-1:0] in2;
+reg carry;
+wire [DATA_WIDTH-1:0] out;
+wire carry_out;
+
+alu #(
+   .DATA_WIDTH(DATA_WIDTH)
+) alu_dut (
+   .opcode(opcode),
+   .in1(in1),
+   .in2(in2),
+   .carry(carry),
+   .out(out),
+   .carry_out(carry_out)
+);
+
+integer i;
+
+initial begin
+   $monitor("time=%4t op=%4b in1=%16b in2=%16b carry=%1b out=%16b carry_out=%1b", $time, opcode, in1, in2, carry, out, carry_out);
+   // phase 1: fixed operands, every 4-bit opcode, first with carry=0 and 10 time units later with carry=1
+   in1 = 16'b0011001100110011;
+   //in2 = 16'b0000111100001111;
+   in2 = 16'b101;
+   for (i=0; i<16; i=i+1) begin
+      #10 opcode=i;
+          carry = 0;
+      #10 carry = 1;
+   end
+   // phase 2: opcode 15, in2=0, carry=1; in1 steps through 18 values in each of four ranges, 10 time units apart
+   opcode = 15;
+   in2 = 0;
+   carry = 1;
+   for (i=0; i<18; i=i+1) begin   // in1 = 0 .. 17
+       in1 = i;
+       #10;
+   end
+   for (i=17; i>=0; i=i-1) begin   // in1 = 16'h7fee .. 16'h7fff (counting up to the largest value with bit 15 clear)
+       in1 = ~i & {1'b0, {(15){1'b1}}};
+       #10;
+   end
+   for (i=0; i<18; i=i+1) begin   // in1 = 16'h8000 .. 16'h8011 (bit 15 set)
+       in1 = i | {1'b1, {(15){1'b0}}};
+       #10;
+   end
+   for (i=17; i>=0; i=i-1) begin   // in1 = 16'hffee .. 16'hffff (~i truncated to 16 bits)
+       in1 = ~i;
+       #10;
+   end
+
+   #10 $stop;
+end
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/dv/tb/cpu_core_tb.v b/verilog/dv/tb/cpu_core_tb.v
new file mode 100644
index 0000000..7883ed8
--- /dev/null
+++ b/verilog/dv/tb/cpu_core_tb.v
@@ -0,0 +1,140 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+module cpu_core_tb();
+// Runs four short hand-assembled programs on a cpu_core wired to a single-core mem_mesh; the only output is a $monitor trace.
+parameter DATA_WIDTH = 16;
+parameter PC_WIDTH = 8;
+parameter ADDR_WIDTH = 8;
+parameter SPREAD_WIDTH = 2;
+parameter INSTR_WIDTH = 32;
+
+reg clk;
+reg rst_n;
+
+wire [INSTR_WIDTH-1:0] opcode;
+wire [PC_WIDTH-1:0] progctr;
+wire mem_we;
+wire [ADDR_WIDTH-1:0] mem_waddr;
+wire [SPREAD_WIDTH-1:0] mem_wspread;
+wire [DATA_WIDTH-1:0] mem_wdata;
+wire [ADDR_WIDTH-1:0] mem_raddr;
+wire [DATA_WIDTH-1:0] mem_rdata;
+wire debug_stopped;
+wire [DATA_WIDTH-1:0] debug_rdata;
+// core under test; prng and debug inputs are tied to constants (debug_sel fixed at 6, debug writes disabled)
+cpu_core #(
+   .DATA_WIDTH(DATA_WIDTH),
+   .PC_WIDTH(PC_WIDTH),
+   .ADDR_WIDTH(ADDR_WIDTH),
+   .SPREAD_WIDTH(SPREAD_WIDTH),
+   .INSTR_WIDTH(INSTR_WIDTH)
+) cpu_core_dut (
+   .clk(clk),
+   .rst_n(rst_n),
+   .opcode(opcode),
+   .mem_rdata(mem_rdata),
+   .prng_in(16'd0),
+   .debug_mode(2'd0),
+   .debug_sel(4'd6),
+   .debug_we(1'd0),
+   .debug_wdata(16'd0),
+   .progctr(progctr),
+   .mem_we(mem_we),
+   .mem_waddr(mem_waddr),
+   .mem_wspread(mem_wspread),
+   .mem_wdata(mem_wdata),
+   .mem_raddr(mem_raddr),
+   .debug_stopped(debug_stopped),
+   .debug_rdata(debug_rdata)
+);
+
+wire io_dummy_active;
+wire [DATA_WIDTH-1:0] io_dummy_data;
+// NOTE(review): mem_mesh_tb.v connects io_active_in/io_active_out/io_data_in/io_data_out instead of io_dir/io_active/io_data - confirm which port list matches mem_mesh.
+mem_mesh #(
+   .CORES(1),
+   .DEPTH(16),
+   .DATA_WIDTH(DATA_WIDTH),
+   .ADDR_WIDTH(ADDR_WIDTH),
+   .SPREAD_LAYERS(0),
+   .SPREAD_WIDTH(SPREAD_WIDTH),
+   .USE_IO(0),
+   .IO_PORTS(1),
+   .IO_FIRST(0)
+) mem_mesh_dut (
+   .clk(clk),
+   .rst_n(rst_n),
+   .we(mem_we),
+   .waddr(mem_waddr),
+   .wspread(mem_wspread),
+   .wdata(mem_wdata),
+   .raddr(mem_raddr),
+   .rdata(mem_rdata),
+   .io_dir(1'b1),
+   .io_active(io_dummy_active),
+   .io_data(io_dummy_data)
+);
+
+always #5 clk = ~clk;   // clock toggles every 5 time units
+
+reg [3:0] round;   // index of the test program currently running
+wire [INSTR_WIDTH-1:0] noop  = 32'b000_000_0_00_0000_000_0000000000000000;
+wire [INSTR_WIDTH-1:0] progmem ['h100:0];
+// progmem entries that have no assign below are left undriven
+localparam n_tests = 4;
+// each test owns a 16-entry window of progmem: test 1 at 'h00, test 2 at 'h10, test 3 at 'h20, test 4 at 'h30
+// test 1
+assign progmem['h00] = 32'b100_000_1_00_0011_001_0000000001001001;   // reg1 = 73
+assign progmem['h01] = 32'b100_000_1_00_0011_010_0000000001001010;   // reg2 = 74
+assign progmem['h02] = 32'b000_001_1_00_1010_011_0000000000000000;   // jmp reg1 + reg2
+
+// test 2
+assign progmem['h10] = 32'b100_000_1_00_0011_001_0000000011110011;   // reg1 = 243
+assign progmem['h11] = 32'b000_000_1_11_0011_111_0000000000010000;   // mem[1] = reg1
+assign progmem['h12] = 32'b100_000_1_00_0011_100_0000000000000000;   // t = mem[0]
+assign progmem['h13] = 32'b011_111_1_11_1010_111_0000000000000000;   // mem[0] = t+1
+assign progmem['h14] = 32'b100_000_1_00_0011_100_0000000000000000;   // t = mem[0]
+assign progmem['h15] = 32'b011_000_1_00_0011_100_0000000000000000;   // t = mem[t]
+assign progmem['h16] = 32'b011_000_1_00_0011_011_0000000000000000;   // jmp t
+
+// test 3
+assign progmem['h20] = 32'b110_100_1_00_1011_001_0000000000010111;   // reg1 = timer - 23
+assign progmem['h21] = 32'b000_010_1_01_0011_011_0000000000000000;   // jmp (reg1 < 0) ? 0 : pc
+assign progmem['h22] = 32'b100_000_1_00_0011_011_0000000000101100;   // jmp 44
+
+// test 4
+assign progmem['h30] = 32'b100_000_1_00_0011_010_0000000010001000;   // reg2 = 136
+assign progmem['h31] = 32'b001_000_1_11_0011_111_0000000000100000;   // mem[2] = reg2
+assign progmem['h32] = 32'b100_000_1_00_0011_100_0000000000000010;   // t = mem[2]
+assign progmem['h33] = 32'b100_000_1_10_0011_010_0000000000010001;   // reg1 = t; reg2 = 17
+assign progmem['h34] = 32'b000_001_1_00_1010_011_0000000000000000;   // jmp reg1 + reg2
+// instruction fetch: while out of reset and progctr < 16 the core sees progmem[round*16 + progctr], otherwise noop
+assign opcode = rst_n ? (progctr < 16 ? progmem[round << 4 | progctr] : noop) : noop;
+// once progctr reaches 16 or more the current test is over: reset the core for 12 time units and start the next round, or finish after the last one
+always @ (posedge clk) begin
+   if (progctr >= 16) begin
+      rst_n = 0;
+      if (round + 1 >= n_tests) $finish;
+      round = round + 1;
+      $display("");
+      #12 rst_n = 1;
+   end
+end
+
+initial begin
+   $monitor("time=%4t round=%1x rstn=%1b ct=%2d op=%32b new_pc=%8b(%2x) reg1=%16b we=%1b wa=%8b ws=%2b wd=%16b ra=%8b rd=%16b dd=%16b",
+   $time, round, rst_n, cpu_core_dut.timer, opcode, progctr, progctr, cpu_core_dut.reg1,
+   mem_we, mem_waddr, mem_wspread, mem_wdata, mem_raddr, mem_rdata, debug_rdata);
+   round = 0;
+   clk = 0;
+   rst_n = 0;
+   #12 rst_n = 1;   // release the initial reset after 12 time units
+end
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/dv/tb/debug_mux_tb.v b/verilog/dv/tb/debug_mux_tb.v
new file mode 100644
index 0000000..5127f03
--- /dev/null
+++ b/verilog/dv/tb/debug_mux_tb.v
@@ -0,0 +1,99 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+module debug_mux_tb();
+// Stimulus-only testbench for `debug_mux` with 4 cores: applies four sel/addr/we/wdata combinations and prints signals with $monitor.
+parameter CORES=4;
+parameter LOG_CORES=2;
+parameter DATA_WIDTH=8;
+
+reg [LOG_CORES-1:0] sel;
+reg [4:0] addr;
+reg we;
+reg [DATA_WIDTH-1:0] wdata;
+wire [DATA_WIDTH-1:0] rdata;
+reg reg_stopped[CORES-1:0];
+reg [DATA_WIDTH-1:0] reg_rdata[CORES-1:0];
+wire [1:0] cpu_mode[CORES-1:0];
+wire [3:0] reg_sel[CORES-1:0];
+wire reg_we[CORES-1:0];
+wire [DATA_WIDTH-1:0] reg_wdata[CORES-1:0];
+// flat buses connected to the DUT ports; the per-core arrays above are packed into / unpacked from these
+wire [CORES-1:0] reg_stopped_raw;
+wire [CORES*DATA_WIDTH-1:0] reg_rdata_raw;
+wire [CORES*2-1:0] cpu_mode_raw;
+wire [CORES*4-1:0] reg_sel_raw;
+wire [CORES-1:0] reg_we_raw;
+wire [CORES*DATA_WIDTH-1:0] reg_wdata_raw;
+
+debug_mux #(
+   .CORES(CORES),
+   .LOG_CORES(LOG_CORES),
+   .DATA_WIDTH(DATA_WIDTH)
+) debug_mux_dut (
+   .sel(sel),
+   .addr(addr),
+   .we(we),
+   .wdata(wdata),
+   .rdata(rdata),
+   .reg_stopped(reg_stopped_raw),
+   .reg_rdata(reg_rdata_raw),
+   .cpu_mode(cpu_mode_raw),
+   .reg_sel(reg_sel_raw),
+   .reg_we(reg_we_raw),
+   .reg_wdata(reg_wdata_raw)
+);
+
+// core k occupies bit k (1-bit signals) or the k-th slice (multi-bit signals) of each flat bus
+generate genvar core;
+for (core=0; core<CORES; core=core+1) begin:g_core
+   assign reg_stopped_raw[core] = reg_stopped[core];
+   assign reg_rdata_raw[core*DATA_WIDTH +: DATA_WIDTH] = reg_rdata[core];
+   assign cpu_mode[core] = cpu_mode_raw[core*2 +: 2];
+   assign reg_sel[core] = reg_sel_raw[core*4 +: 4];
+   assign reg_we[core] = reg_we_raw[core];
+   assign reg_wdata[core] = reg_wdata_raw[core*DATA_WIDTH +: DATA_WIDTH];
+end
+endgenerate
+
+initial begin
+   $monitor("time=%4t SEL=%b ADDR=%b WE=%b WDATA=%b rdata=%b, ST0=%b RD0=%b cm0=%b s0=%b we0=%b wd0=%b ST1=%b RD1=%b cm1=%b s1=%b we1=%b wd1=%b",
+               $time, sel, addr, we, wdata, rdata, reg_stopped[0], reg_rdata[0], cpu_mode[0], reg_sel[0], reg_we[0], reg_wdata[0],
+                                                   reg_stopped[1], reg_rdata[1], cpu_mode[1], reg_sel[1], reg_we[1], reg_wdata[1]);
+   sel = 0;   // only cores 0 and 1 are shown by the monitor; sel alternates between them below
+   addr = 5'b01100;
+   we = 0;
+   wdata = 8'b10101010;
+   reg_stopped[0] = 1;
+   reg_stopped[1] = 0;
+   reg_stopped[2] = 1;
+   reg_stopped[3] = 0;
+   reg_rdata[0] = 8'b11110000;
+   reg_rdata[1] = 8'b11100001;
+   reg_rdata[2] = 8'b11000011;
+   reg_rdata[3] = 8'b10000111;
+   // step 2: core 1, write enabled, same address and data
+   #10
+   sel = 1;
+   we = 1;
+   // step 3: core 0, read, address with bit 4 set
+   #10
+   sel = 0;
+   we = 0;
+   addr = 5'b10000;
+   // step 4: core 1, write of 8'b00000011 to the same address
+   #10
+   sel = 1;
+   we = 1;
+   wdata = 8'b00000011;
+
+   #10
+   $stop;
+end
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/dv/tb/entropy_pool_tb.v b/verilog/dv/tb/entropy_pool_tb.v
new file mode 100644
index 0000000..b501729
--- /dev/null
+++ b/verilog/dv/tb/entropy_pool_tb.v
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+module entropy_pool_tb();
+// Stimulus-only testbench for `entropy_pool`: feeds two non-zero e_word pulses and prints the pool state and e_bit every clock.
+parameter WIDTH = 16;
+
+reg clk;
+reg rst_n;
+reg [WIDTH-1:0] e_word;
+wire e_bit;
+
+entropy_pool #(
+   .WIDTH(WIDTH)
+) entropy_pool_dut (
+   .clk(clk),
+   .rst_n(rst_n),
+   .e_word(e_word),
+   .e_bit(e_bit)
+);
+
+always #5 clk = ~clk;   // clock toggles every 5 time units
+
+reg strobe;
+always @(posedge clk) strobe = ~strobe;   // force a $monitor strobe every clock cycle
+
+initial begin
+   $monitor("time %4t s %1b ew %16b es %15b eb %1b", $time, strobe, e_word, entropy_pool_dut.e_pool_mod, e_bit);
+   clk = 0;
+   rst_n = 0;
+   strobe = 0;
+   #10   // e_word is not assigned until reset is released below
+   rst_n = 1;
+   e_word = 16'b0111110000111001;   // first pulse: held for 10 time units, then zero for 100
+   #10
+   e_word = 0;
+   #100
+   e_word = 16'b1010101010101010;   // second pulse: held for 10 time units, then zero for 200
+   #10
+   e_word = 0;
+   #200
+   $stop;
+end
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/dv/tb/instr_mem_tb.v b/verilog/dv/tb/instr_mem_tb.v
new file mode 100644
index 0000000..64167ff
--- /dev/null
+++ b/verilog/dv/tb/instr_mem_tb.v
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+module instr_mem_tb();
+// Stimulus-only testbench for `instr_mem`: a clocked process walks the read and write addresses while the initial block re-asserts reset twice.
+parameter PC_WIDTH = 4;
+parameter INSTR_WIDTH = 8;
+parameter DEPTH = 16;
+
+reg clk;
+reg rst_n;
+reg [PC_WIDTH-1:0] raddr;
+wire [INSTR_WIDTH-1:0] rdata;
+reg we;
+reg [PC_WIDTH-1:0] waddr;
+reg [INSTR_WIDTH-1:0] wdata;
+
+instr_mem #(
+   .INSTR_WIDTH(INSTR_WIDTH),
+   .PC_WIDTH(PC_WIDTH),
+   .DEPTH(DEPTH)
+) instr_mem_dut (
+   .clk(clk),
+   .rst_n(rst_n),
+   .raddr(raddr),
+   .rdata(rdata),
+   .we(we),
+   .waddr(waddr),
+   .wdata(wdata)
+);
+
+always #5 clk = ~clk;   // clock toggles every 5 time units
+
+initial begin
+   $monitor("time=%4t rstn=%1b we=%1b waddr=%4b wdata=%8b raddr=%4b rdata=%8b", $time, rst_n, we, waddr, wdata, raddr, rdata);
+   clk <= 0;
+   rst_n <= 0;
+   we <= 0;
+   waddr <= 0;
+   wdata <= 1;
+   raddr <= 0;
+   #500 $display("");
+   rst_n <= 0;   // reset again at time 500; the clocked block below releases it on the next rising edge
+   #500 $display("");
+   rst_n <= 0;   // and once more at time 1000
+   #500 $finish;
+end
+// per rising edge: release reset if it is asserted; otherwise toggle we, advance raddr, and advance waddr/wdata on the edges where we was 1
+always @(posedge clk) begin
+   if (!rst_n) begin
+      rst_n <= 1;
+   end else begin
+      if (we) begin
+         waddr <= waddr + 1;
+         wdata <= wdata + 1;
+      end
+      we <= !we;
+      raddr <= raddr + 1;
+   end
+end
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/dv/tb/io_filter_rev_tb.v b/verilog/dv/tb/io_filter_rev_tb.v
new file mode 100644
index 0000000..085d2e1
--- /dev/null
+++ b/verilog/dv/tb/io_filter_rev_tb.v
@@ -0,0 +1,102 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+module io_filter_rev_tb();
+// Stimulus-only testbench for `io_filter_rev` with 4 pins and 4+2 ports: drives pins/ports and prints the raw DUT buses with $monitor.
+parameter IO_PINS = 4;
+parameter DATA_WIDTH = 8;
+
+reg clk;
+reg rst_n;
+reg [IO_PINS-1:0] pin_dir;
+wire [IO_PINS-1:0] pin_data_in_raw;
+wire [IO_PINS-1:0] pin_data_out_raw;
+wire [IO_PINS+2-1:0] port_active_in_raw;
+wire [IO_PINS+2-1:0] port_active_out_raw;
+wire [(IO_PINS+2)*DATA_WIDTH-1:0] port_data_in_raw;
+wire [(IO_PINS+2)*DATA_WIDTH-1:0] port_data_out_raw;
+
+io_filter_rev #(
+   .IO_PINS(IO_PINS),
+   .DATA_WIDTH(DATA_WIDTH)
+) io_filter_rev_dut (
+   .clk(clk),
+   .rst_n(rst_n),
+   .pin_dir(pin_dir),
+   .pin_data_in(pin_data_in_raw),
+   .pin_data_out(pin_data_out_raw),
+   .port_active_in(port_active_in_raw),
+   .port_active_out(port_active_out_raw),
+   .port_data_in(port_data_in_raw),
+   .port_data_out(port_data_out_raw)
+);
+
+// The testbench acts as the "external world" for the io filter, so it simulates both the cpu/memory part
+// and the peripherals. An "output" message is one sent from the cpu/memory to the peripherals which means
+// the testbench ports act as output and the pins act as input. Conversely, an "input" message is one coming
+// from the peripherals to the cpu/memory where testbench pins will act as output and ports as input.
+
+wire [IO_PINS-1:0] pin_data_out;
+reg [IO_PINS+2-1:0] port_active_out;
+reg [(IO_PINS+2)*DATA_WIDTH-1:0] port_data_out;
+
+reg [IO_PINS-1:0] pin_data_in;
+wire [IO_PINS+2-1:0] port_active_in;
+wire [(IO_PINS+2)*DATA_WIDTH-1:0] port_data_in;
+// per-pin wiring between the testbench-side signals and the raw DUT buses
+generate genvar pin;
+for (pin=0; pin<IO_PINS; pin=pin+1) begin:g_pin
+   // output
+   assign port_active_out_raw[pin] = port_active_out[pin];
+   assign port_data_out_raw[pin*DATA_WIDTH +: DATA_WIDTH] = port_data_out[pin*DATA_WIDTH +: DATA_WIDTH];
+   assign pin_data_out[pin] = pin_data_out_raw[pin];   // one bit per iteration, so each bit of pin_data_out has a single driver
+   // input
+   assign pin_data_in_raw[pin] = pin_data_in[pin];
+   assign port_active_in[pin] = port_active_in_raw[pin];
+   assign port_data_in[pin*DATA_WIDTH +: DATA_WIDTH] = port_data_in_raw[pin*DATA_WIDTH +: DATA_WIDTH];
+end
+endgenerate
+// output: of the two extra ports, index IO_PINS follows the testbench register and index IO_PINS+1 is tied to zero
+assign port_active_out_raw[IO_PINS +: 2] = {1'b0, port_active_out[IO_PINS]};
+assign port_data_out_raw[IO_PINS*DATA_WIDTH +: 2*DATA_WIDTH] = {{(DATA_WIDTH){1'b0}}, port_data_out[IO_PINS*DATA_WIDTH +: DATA_WIDTH]};
+// input: of the two extra ports, index IO_PINS+1 is taken from the raw bus and index IO_PINS reads as zero
+assign port_active_in[IO_PINS +: 2] = {port_active_in_raw[IO_PINS+1], 1'b0};
+assign port_data_in[IO_PINS*DATA_WIDTH +: 2*DATA_WIDTH] = {port_data_in_raw[(IO_PINS+1)*DATA_WIDTH +: DATA_WIDTH], {(DATA_WIDTH){1'b0}}};
+
+always #5 clk = ~clk;   // clock toggles every 5 time units
+
+initial begin
+   $monitor("time %4d pin_data_in %4b pin_data_out %4b port_active_in %6b port_active_out %6b port_data_in %24b port_data_out %24b",
+               $time, pin_data_in_raw, pin_data_out_raw, port_active_in_raw, port_active_out_raw, port_data_in_raw, port_data_out_raw);
+   clk <= 0;
+   rst_n <= 0;
+   #40   // reset held for 40 time units; the stimulus registers get their first values as it is released
+   rst_n <= 1;
+   pin_dir <= 4'b1010;
+   port_active_out <= 6'b0;
+   port_data_out <= 48'b0;
+   pin_data_in <= 4'b0;
+   #40
+   pin_data_in <= 4'b0001;   // raise pin 0
+   #40
+   port_active_out <= 6'b000100;   // port 2 active for 10 time units
+   port_data_out <= 48'b00000000_00000000_00000000_11111111_00000000_11111111;
+   #10 port_active_out <= 6'b0;
+   #30
+   port_active_out <= 6'b010000;   // port 4 (index IO_PINS) active for 10 time units
+   port_data_out <= 48'b00000000_00000001_00000000_00000000_00000000_00000000;
+   #10 port_active_out <= 6'b0;
+   #30
+   port_active_out <= 6'b010001;   // ports 0 and 4 active together for 10 time units
+   port_data_out <= 48'b00000000_00000011_00000000_00000000_00000000_00000000;
+   #10 port_active_out <= 6'b0;
+   #30
+   $stop;
+end
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/dv/tb/io_filter_tb.v b/verilog/dv/tb/io_filter_tb.v
new file mode 100644
index 0000000..0bf940e
--- /dev/null
+++ b/verilog/dv/tb/io_filter_tb.v
@@ -0,0 +1,102 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+module io_filter_tb();
+// Stimulus-only testbench for `io_filter` with 4 pins and 4+2 ports: drives pins/ports and prints the raw DUT buses with $monitor.
+parameter IO_PINS = 4;
+parameter DATA_WIDTH = 8;
+
+reg clk;
+reg rst_n;
+reg [IO_PINS-1:0] pin_dir;
+wire [IO_PINS-1:0] pin_data_in_raw;
+wire [IO_PINS-1:0] pin_data_out_raw;
+wire [IO_PINS+2-1:0] port_active_in_raw;
+wire [IO_PINS+2-1:0] port_active_out_raw;
+wire [(IO_PINS+2)*DATA_WIDTH-1:0] port_data_in_raw;
+wire [(IO_PINS+2)*DATA_WIDTH-1:0] port_data_out_raw;
+
+io_filter #(
+   .IO_PINS(IO_PINS),
+   .DATA_WIDTH(DATA_WIDTH)
+) io_filter_dut (
+   .clk(clk),
+   .rst_n(rst_n),
+   .pin_dir(pin_dir),
+   .pin_data_in(pin_data_in_raw),
+   .pin_data_out(pin_data_out_raw),
+   .port_active_in(port_active_in_raw),
+   .port_active_out(port_active_out_raw),
+   .port_data_in(port_data_in_raw),
+   .port_data_out(port_data_out_raw)
+);
+
+// The testbench acts as the "external world" for the io filter, so it simulates both the cpu/memory part
+// and the peripherals. An "output" message is one sent from the cpu/memory to the peripherals which means
+// the testbench ports act as output and the pins act as input. Conversely, an "input" message is one coming
+// from the peripherals to the cpu/memory where testbench pins will act as output and ports as input.
+
+wire [IO_PINS-1:0] pin_data_out;
+reg [IO_PINS+2-1:0] port_active_out;
+reg [(IO_PINS+2)*DATA_WIDTH-1:0] port_data_out;
+
+reg [IO_PINS-1:0] pin_data_in;
+wire [IO_PINS+2-1:0] port_active_in;
+wire [(IO_PINS+2)*DATA_WIDTH-1:0] port_data_in;
+// per-pin wiring between the testbench-side signals and the raw DUT buses
+generate genvar pin;
+for (pin=0; pin<IO_PINS; pin=pin+1) begin:g_pin
+   // output
+   assign port_active_out_raw[pin] = port_active_out[pin];
+   assign port_data_out_raw[pin*DATA_WIDTH +: DATA_WIDTH] = port_data_out[pin*DATA_WIDTH +: DATA_WIDTH];
+   assign pin_data_out[pin] = pin_data_out_raw[pin];   // one bit per iteration, so each bit of pin_data_out has a single driver
+   // input
+   assign pin_data_in_raw[pin] = pin_data_in[pin];
+   assign port_active_in[pin] = port_active_in_raw[pin];
+   assign port_data_in[pin*DATA_WIDTH +: DATA_WIDTH] = port_data_in_raw[pin*DATA_WIDTH +: DATA_WIDTH];
+end
+endgenerate
+// output: of the two extra ports, index IO_PINS follows the testbench register and index IO_PINS+1 is tied to zero
+assign port_active_out_raw[IO_PINS +: 2] = {1'b0, port_active_out[IO_PINS]};
+assign port_data_out_raw[IO_PINS*DATA_WIDTH +: 2*DATA_WIDTH] = {{(DATA_WIDTH){1'b0}}, port_data_out[IO_PINS*DATA_WIDTH +: DATA_WIDTH]};
+// input: of the two extra ports, index IO_PINS+1 is taken from the raw bus and index IO_PINS reads as zero
+assign port_active_in[IO_PINS +: 2] = {port_active_in_raw[IO_PINS+1], 1'b0};
+assign port_data_in[IO_PINS*DATA_WIDTH +: 2*DATA_WIDTH] = {port_data_in_raw[(IO_PINS+1)*DATA_WIDTH +: DATA_WIDTH], {(DATA_WIDTH){1'b0}}};
+
+always #5 clk = ~clk;   // clock toggles every 5 time units
+
+initial begin
+   $monitor("time %4d pin_data_in %4b pin_data_out %4b port_active_in %6b port_active_out %6b port_data_in %24b port_data_out %24b",
+               $time, pin_data_in_raw, pin_data_out_raw, port_active_in_raw, port_active_out_raw, port_data_in_raw, port_data_out_raw);
+   clk <= 0;
+   rst_n <= 0;
+   #40   // reset held for 40 time units; the stimulus registers get their first values as it is released
+   rst_n <= 1;
+   pin_dir <= 4'b0101;
+   port_active_out <= 6'b0;
+   port_data_out <= 48'b0;
+   pin_data_in <= 4'b0;
+   #40
+   pin_data_in <= 4'b1000;   // raise pin 3
+   #40
+   port_active_out <= 6'b000100;   // port 2 active for 10 time units
+   port_data_out <= 48'b00000000_00000000_00000000_11111111_00000000_11111111;
+   #10 port_active_out <= 6'b0;
+   #30
+   port_active_out <= 6'b010000;   // port 4 (index IO_PINS) active for 10 time units
+   port_data_out <= 48'b00000000_00000001_00000000_00000000_00000000_00000000;
+   #10 port_active_out <= 6'b0;
+   #30
+   port_active_out <= 6'b010001;   // ports 0 and 4 active together for 10 time units
+   port_data_out <= 48'b00000000_00000011_00000000_00000000_00000000_00000000;
+   #10 port_active_out <= 6'b0;
+   #30
+   $stop;
+end
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/dv/tb/io_pads_tb.v b/verilog/dv/tb/io_pads_tb.v
new file mode 100644
index 0000000..d05df8f
--- /dev/null
+++ b/verilog/dv/tb/io_pads_tb.v
@@ -0,0 +1,150 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+module io_pads_tb();
+// Stimulus-only testbench for `io_pads`: exercises the logic-analyzer overrides, the cfg write port and the pin/pad paths; output is a $monitor trace.
+parameter IO_PINS=16;
+parameter IO_PADS=38;
+parameter LOGIC_PROBES=128;
+parameter FIRST_PAD=12;
+
+reg wb_clk_i;
+reg wb_rst_i;
+reg [LOGIC_PROBES-1:0] la_data_in;
+wire [LOGIC_PROBES-1:0] la_data_out;
+reg [LOGIC_PROBES-1:0] la_oenb;
+reg [IO_PADS-1:0] io_in;
+wire [IO_PADS-1:0] io_out;
+wire [IO_PADS-1:0] io_oeb;
+wire clk;
+wire rst_hard_n;
+wire rst_soft_n;
+wire rst_prng_n;
+wire [IO_PINS-1:0] pin_dir;
+wire [IO_PINS-1:0] pin_data_in;
+reg [IO_PINS-1:0] pin_data_out;
+reg cfg_we;
+reg cfg_addr;
+reg [IO_PINS-1:0] cfg_wdata;
+
+io_pads #(
+   .IO_PINS(IO_PINS),
+   .IO_PADS(IO_PADS),
+   .LOGIC_PROBES(LOGIC_PROBES),
+   .FIRST_PAD(FIRST_PAD)
+) io_pads_dut (
+   .wb_clk_i(wb_clk_i),
+   .wb_rst_i(wb_rst_i),
+   .la_data_in(la_data_in),
+   .la_data_out(la_data_out),
+   .la_oenb(la_oenb),
+   .io_in(io_in),
+   .io_out(io_out),
+   .io_oeb(io_oeb),
+   .clk(clk),
+   .rst_hard_n(rst_hard_n),
+   .rst_soft_n(rst_soft_n),
+   .rst_prng_n(rst_prng_n),
+   .pin_dir(pin_dir),
+   .pin_data_in(pin_data_in),
+   .pin_data_out(pin_data_out),
+   .cfg_we(cfg_we),
+   .cfg_addr(cfg_addr),
+   .cfg_wdata(cfg_wdata)
+);
+
+always #5 wb_clk_i = ~wb_clk_i;   // wishbone clock toggles every 5 time units
+
+initial begin
+   $monitor("time %4t lado %b io %b ioe %b clk %b rh %b rs %b rp %b pd %b pi %b pm %b sd %b",
+      $time, la_data_out, io_out, io_oeb, clk, rst_hard_n, rst_soft_n, rst_prng_n, pin_dir, pin_data_in, io_pads_dut.programming, io_pads_dut.saved_dir);
+   wb_clk_i = 0;
+   wb_rst_i = 1;
+   la_data_in = 128'b0;
+   la_oenb = ~128'b0;   // every la_oenb bit starts at 1
+   io_in = 38'b0;
+   pin_data_out = 16'b0;
+   cfg_we = 0;
+   cfg_addr = 0;
+   cfg_wdata = 16'b0;
+   #10
+   wb_rst_i = 0;
+   #30
+   $display("clock & reset tests");   // presumably la bit 0 overrides the clock and bits 3:1 the three resets - confirm against io_pads
+   la_oenb[0] = 0;
+   #30
+   la_data_in[0] = 1;
+   #30
+   la_data_in[0] = 0;
+   #30
+   la_oenb[0] = 1;
+   la_oenb[1] = 0;
+   la_data_in[1] = 0;
+   #30
+   la_oenb[1] = 1;
+   #30
+   la_oenb[2] = 0;
+   #30
+   la_oenb[2] = 1;
+   #30
+   la_oenb[3] = 0;
+   #30
+   la_oenb[3] = 1;
+   #30
+   wb_rst_i = 1;
+   #30
+   wb_rst_i = 0;
+   #30
+   la_oenb[3:1] = 3'b000;   // bits 3:1 together, as tested one at a time above; slice width matches the 3-bit literal
+   la_data_in[3:1] = 3'b111;
+   wb_rst_i = 1;
+   #30
+   la_oenb[3:1] = 3'b111;   // back to 1 on exactly the bits cleared above, so no la_oenb bit is left low
+   la_data_in[3:1] = 3'b000;
+   wb_rst_i = 0;
+   #10
+   $display("wb mux config test");
+   cfg_we = 1;   // three cfg writes 10 time units apart: addr 0 <- 1, addr 0 <- 0, addr 1 <- 16'b1111111100000000
+   cfg_addr = 0;
+   cfg_wdata = 1;
+   #10
+   cfg_wdata = 0;
+   #10
+   cfg_addr = 1;
+   cfg_wdata = 16'b1111111100000000;
+   #10
+   cfg_we = 0;
+   #10
+   $display("io pin & pad tests");
+   $display("%d", io_pads_dut.LA_PAD);
+   io_in = 38'b111010101010101010111111111111;
+   #10
+   pin_data_out = 16'b1100110011001100;
+   #10
+   la_oenb[8 +: 8] = 8'b00000000;   // three 8-bit la windows (at bits 8, 24 and 52) are each enabled for 10 time units, then released
+   la_data_in[8 +: 8] = 8'b00001111;
+   #10
+   la_oenb[8 +: 8] = 8'b11111111;
+   la_data_in[8 +: 8] = 8'b00000000;
+   #10
+   la_oenb[24 +: 8] = 8'b00000000;
+   la_data_in[24 +: 8] = 8'b11110000;
+   #10
+   la_oenb[24 +: 8] = 8'b11111111;
+   la_data_in[24 +: 8] = 8'b00000000;
+   #10
+   la_oenb[52 +: 8] = 8'b00000000;
+   la_data_in[52 +: 8] = 8'b11110000;
+   #10
+   la_oenb[52 +: 8] = 8'b11111111;
+   la_data_in[52 +: 8] = 8'b00000000;
+   #10
+   $stop;
+end
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/dv/tb/mcu_tb.v b/verilog/dv/tb/mcu_tb.v
new file mode 100644
index 0000000..1c475c3
--- /dev/null
+++ b/verilog/dv/tb/mcu_tb.v
@@ -0,0 +1,154 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+module mcu_tb();
+// Stimulus-only testbench for the 2-core `mcu`: loads a 3-instruction program into each core over wishbone, then changes the input pins and prints the output pins.
+parameter CORES = 2;
+parameter LOG_CORES = 1;
+parameter MEM_DEPTH = 16;
+parameter DATA_WIDTH = 16;
+parameter PC_WIDTH = 3;
+parameter ADDR_WIDTH = 4;
+parameter INSTR_WIDTH = 32;
+parameter INSTR_DEPTH = 4;
+parameter IN_PINS = 4;
+parameter OUT_PINS = 4;
+parameter IO_PADS = 38;
+parameter FIRST_PAD = 12;
+parameter LOGIC_PROBES = 128;
+parameter WB_WIDTH = 32;
+parameter IO_PINS = IN_PINS + OUT_PINS;
+
+reg clk;
+wire wb_clk_i = clk;
+reg wb_rst_i;
+reg wb_stb_i;
+reg wb_cyc_i;
+reg wb_we_i;
+reg [WB_WIDTH-1:0] wb_adr_i;
+reg [WB_WIDTH-1:0] wb_dat_i;
+wire wbs_ack_o;
+wire [WB_WIDTH-1:0] wbs_dat_o;
+reg [LOGIC_PROBES-1:0] la_data_in;
+wire [LOGIC_PROBES-1:0] la_data_out;
+reg [LOGIC_PROBES-1:0] la_oenb;
+wire [IO_PADS-1:0] io_in;
+wire [IO_PADS-1:0] io_out;
+wire [IO_PADS-1:0] io_oeb;
+
+mcu #(
+   .CORES(CORES),
+   .LOG_CORES(LOG_CORES),
+   .MEM_DEPTH(MEM_DEPTH),
+   .DATA_WIDTH(DATA_WIDTH),
+   .PC_WIDTH(PC_WIDTH),
+   .ADDR_WIDTH(ADDR_WIDTH),
+   .INSTR_WIDTH(INSTR_WIDTH),
+   .INSTR_DEPTH(INSTR_DEPTH),
+   .IO_PINS(IO_PINS),
+   .IO_PADS(IO_PADS),
+   .FIRST_PAD(FIRST_PAD),
+   .LOGIC_PROBES(LOGIC_PROBES),
+   .WB_WIDTH(WB_WIDTH)
+) mcu_dut (
+   .wb_clk_i(wb_clk_i),
+   .wb_rst_i(wb_rst_i),
+   .wb_stb_i(wb_stb_i),
+   .wb_cyc_i(wb_cyc_i),
+   .wb_we_i(wb_we_i),
+   .wb_adr_i(wb_adr_i),
+   .wb_dat_i(wb_dat_i),
+   .wbs_ack_o(wbs_ack_o),
+   .wbs_dat_o(wbs_dat_o),
+   .la_data_in(la_data_in),
+   .la_data_out(la_data_out),
+   .la_oenb(la_oenb),
+   .io_in(io_in),
+   .io_out(io_out),
+   .io_oeb(io_oeb)
+);
+// input pins sit on pads FIRST_PAD .. FIRST_PAD+IN_PINS-1 (all other io_in bits are zero); output pins are read from the next OUT_PINS pads
+reg [IN_PINS-1:0] pin_data_in;
+assign io_in = {{(IO_PADS - IN_PINS - FIRST_PAD){1'b0}}, pin_data_in, {(FIRST_PAD){1'b0}}};
+
+wire [OUT_PINS-1:0] pin_data_out = io_out[FIRST_PAD + IN_PINS +: OUT_PINS];
+
+always #5 clk = ~clk;   // clock toggles every 5 time units, so each #10 step below is one clock period
+
+initial begin
+   $monitor("time %4t rh %1b rs %1b wwei %1b wai %32b pdi %4b pdo %4b",
+               $time, la_data_out[1], la_data_out[2], wb_we_i, wb_adr_i, pin_data_in, pin_data_out);
+   // power up
+   clk = 0;
+   wb_rst_i = 1;
+   wb_stb_i = 0;
+   wb_cyc_i = 0;
+   wb_we_i = 0;
+   wb_adr_i = 0;
+   wb_dat_i = 0;
+   la_data_in = {(LOGIC_PROBES){1'b0}};
+   la_oenb = {(LOGIC_PROBES){1'b1}};
+   pin_data_in = 4'b0000;
+   #10
+   // wishbone reset off, start communications
+   wb_rst_i = 0;
+   wb_stb_i = 1;
+   wb_cyc_i = 1;
+   wb_we_i = 1;
+   // programming mode
+   wb_adr_i = 32'b01_000000000000000000000000000000;           // set programming mode
+   wb_dat_i = 32'b00000000000000000000000000000001;            // to 1
+   #10
+   // send code for cpu core 0
+   wb_adr_i = 32'b00_00000000000000000000000000_0_000;         // address 0:
+   wb_dat_i = 32'b100_000_1_00_0011_100_0000000000001111;      // read value from memory cell 15 (joined input)
+   #10
+   wb_adr_i = 32'b00_00000000000000000000000000_0_001;         // address 1:
+   wb_dat_i = 32'b011_000_1_11_0011_111_0000000000000001;      // write value to memory cell 0, spread 1
+   #10
+   wb_adr_i = 32'b00_00000000000000000000000000_0_010;         // address 2:
+   wb_dat_i = 32'b100_000_1_00_0011_011_0000000000000000;      // jump to address 0
+   #10
+   // send code for cpu core 1
+   wb_adr_i = 32'b00_00000000000000000000000000_1_000;         // address 0:
+   wb_dat_i = 32'b100_000_1_00_0011_100_0000000000000000;      // read value from memory cell 0
+   #10
+   wb_adr_i = 32'b00_00000000000000000000000000_1_001;         // address 1:
+   wb_dat_i = 32'b011_000_1_11_0011_111_0000000011100010;      // write value to memory cell 14, spread 2 (joined output)
+   #10
+   wb_adr_i = 32'b00_00000000000000000000000000_1_010;         // address 2:
+   wb_dat_i = 32'b100_000_1_00_0011_011_0000000000000000;      // jump to address 0
+   #10
+   // set pin directions
+   wb_adr_i = 32'b01_000000000000000000000000000001;           // set pin directions
+   wb_dat_i = 32'b00000000000000000000000011110000;            // first 4 pins are inputs, next 4 pins are outputs
+   #10
+   // exit programming mode
+   wb_adr_i = 32'b01_000000000000000000000000000000;           // set programming mode
+   wb_dat_i = 32'b00000000000000000000000000000000;            // to 0
+   #10
+   // stop wishbone communications
+   wb_we_i = 0;
+   wb_cyc_i = 0;
+   wb_stb_i = 0;
+   // set input pins
+   pin_data_in = 4'b0011;
+   // wait for data to appear on output pins
+   #100
+   // change input pins
+   pin_data_in = 4'b1001;
+   // wait for data to appear on output pins
+   #100
+   // change input pins
+   pin_data_in = 4'b1100;
+   // wait for data to appear on output pins
+   #100
+   $stop;
+end
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/dv/tb/mem_mesh_tb.v b/verilog/dv/tb/mem_mesh_tb.v
new file mode 100644
index 0000000..9b81a5f
--- /dev/null
+++ b/verilog/dv/tb/mem_mesh_tb.v
@@ -0,0 +1,170 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+module mem_mesh_tb();
+// Stimulus-only testbench for an 8-core `mem_mesh` with io ports: core 2 writes address 8 with increasing spread, then io port 3 is driven as an input.
+parameter CORES = 8;
+parameter DEPTH = 256;
+parameter DATA_WIDTH = 16;
+parameter ADDR_WIDTH = 8;
+parameter SPREAD_LAYERS = 3;
+parameter SPREAD_WIDTH = 3;
+parameter USE_IO = 1;
+parameter IO_PORTS = 16;
+parameter IO_FIRST = 5;
+
+reg clk;
+reg rst_n;
+reg we[CORES-1:0];
+reg [ADDR_WIDTH-1:0] waddr[CORES-1:0];
+reg [SPREAD_WIDTH-1:0] wspread[CORES-1:0];
+reg [DATA_WIDTH-1:0] wdata[CORES-1:0];
+reg [ADDR_WIDTH-1:0] raddr[CORES-1:0];
+wire [DATA_WIDTH-1:0] rdata[CORES-1:0];
+
+// io directions are according to the cpu & memory, so they are
+// reversed from the point of view of the testbench / external world
+reg io_dir[IO_PORTS-1:0];
+reg io_receiving[IO_PORTS-1:0];
+wire io_sending[IO_PORTS-1:0];
+reg [DATA_WIDTH-1:0] io_input[IO_PORTS-1:0];
+wire [DATA_WIDTH-1:0] io_output[IO_PORTS-1:0];
+// flat buses connected to the DUT ports; the per-core and per-port arrays above are packed into / unpacked from these
+wire [CORES-1:0] we_raw;
+wire [CORES*ADDR_WIDTH-1:0] waddr_raw;
+wire [CORES*SPREAD_WIDTH-1:0] wspread_raw;
+wire [CORES*DATA_WIDTH-1:0] wdata_raw;
+wire [CORES*ADDR_WIDTH-1:0] raddr_raw;
+wire [CORES*DATA_WIDTH-1:0] rdata_raw;
+
+wire [IO_PORTS-1:0] io_active_in_raw;
+wire [IO_PORTS-1:0] io_active_out_raw;
+wire [IO_PORTS*DATA_WIDTH-1:0] io_data_in_raw;
+wire [IO_PORTS*DATA_WIDTH-1:0] io_data_out_raw;
+
+generate genvar core;
+for (core=0; core<CORES; core=core+1) begin:g_core
+   assign we_raw[core] = we[core];
+   assign waddr_raw[core*ADDR_WIDTH +: ADDR_WIDTH] = waddr[core];
+   assign wspread_raw[core*SPREAD_WIDTH +: SPREAD_WIDTH] = wspread[core];
+   assign wdata_raw[core*DATA_WIDTH +: DATA_WIDTH] = wdata[core];
+   assign raddr_raw[core*ADDR_WIDTH +: ADDR_WIDTH] = raddr[core];
+   assign rdata[core] = rdata_raw[core*DATA_WIDTH +: DATA_WIDTH];
+end
+endgenerate
+// io_dir[port]=1: only the DUT's out side is observed and its in side is fed zeros; io_dir[port]=0: the testbench drives the in side and the observed out side reads as zero
+generate genvar port;
+for (port=0; port<IO_PORTS; port=port+1) begin:g_port
+   assign io_active_in_raw[port] = io_dir[port] ? 1'b0 : io_receiving[port];
+   assign io_sending[port] = io_dir[port] ? io_active_out_raw[port] : 1'b0;
+   assign io_data_in_raw[port*DATA_WIDTH +: DATA_WIDTH] = io_dir[port] ? {(DATA_WIDTH){1'b0}} : io_input[port];
+   assign io_output[port] = io_dir[port] ? io_data_out_raw[port*DATA_WIDTH +: DATA_WIDTH] : {(DATA_WIDTH){1'b0}};
+end
+endgenerate
+
+mem_mesh #(
+   .CORES(CORES),
+   .DEPTH(DEPTH),
+   .DATA_WIDTH(DATA_WIDTH),
+   .ADDR_WIDTH(ADDR_WIDTH),
+   .SPREAD_LAYERS(SPREAD_LAYERS),
+   .SPREAD_WIDTH(SPREAD_WIDTH),
+   .USE_IO(USE_IO),
+   .IO_PORTS(IO_PORTS),
+   .IO_FIRST(IO_FIRST)
+) mem_mesh_dut (
+   .clk(clk),
+   .rst_n(rst_n),
+   .we(we_raw),
+   .waddr(waddr_raw),
+   .wspread(wspread_raw),
+   .wdata(wdata_raw),
+   .raddr(raddr_raw),
+   .rdata(rdata_raw),
+   .io_active_in(io_active_in_raw),
+   .io_active_out(io_active_out_raw),
+   .io_data_in(io_data_in_raw),
+   .io_data_out(io_data_out_raw)
+);
+
+always #5 clk = ~clk;   // clock toggles every 5 time units
+
+integer i;   // loop index shared by the clocked block and the initial block below
+
+// for synchronization checking
+reg io_sending_reg[IO_PORTS-1:0];
+reg [DATA_WIDTH-1:0] io_output_reg[IO_PORTS-1:0];
+// sample the io outputs on every rising clock edge; the monitor prints these registered copies
+always @(posedge clk) begin
+   for (i=0; i<IO_PORTS; i=i+1) begin
+      io_sending_reg[i] <= io_sending[i];
+      io_output_reg[i] <= io_output[i];
+   end
+end
+
+initial begin
+   raddr[2] = 8;   // only cores 2, 3, 6 and 7 get a read address; they are the ones shown by the monitor
+   raddr[3] = 8;
+   raddr[6] = 8;
+   raddr[7] = 192;
+   wspread[2] = 0;
+   io_input[3] = 0;
+   $monitor("time=%t mem[2][8]=%d mem[3][8]=%d mem[6][8]=%d mem[7][192]=%d io_dir[3]=%d io_sending[3]=%d io_out[3]=%d",
+               $time, rdata[2], rdata[3], rdata[6], rdata[7], io_dir[3], io_sending_reg[3], io_output_reg[3]);
+
+   for (i=0; i<CORES; i=i+1) begin
+      we[i] = 0;
+   end
+
+   for (i=0; i<IO_PORTS; i=i+1) begin
+      io_dir[i] = 1;
+   end
+
+   clk = 0;
+   rst_n = 1;
+   #10 rst_n = 0;   // reset is low from time 10 to time 20
+   #10 rst_n = 1;
+   // core 2 writes address 8 five times, 20 time units apart, with wspread 0..4 and data 100..500; we[2] stays 1 afterwards
+   #20
+   we[2] = 1;
+   waddr[2] = 8;
+   wspread[2] = 0;
+   wdata[2] = 100;
+
+   #20
+   waddr[2] = 8;
+   wspread[2] = 1;
+   wdata[2] = 200;
+
+   #20
+   waddr[2] = 8;
+   wspread[2] = 2;
+   wdata[2] = 300;
+
+   #20
+   waddr[2] = 8;
+   wspread[2] = 3;
+   wdata[2] = 400;
+
+   #20
+   waddr[2] = 8;
+   wspread[2] = 4;
+   wdata[2] = 500;
+   // switch io port 3 to io_dir 0, first idle, then presenting the value 1234
+   #20
+   io_dir[3] = 0;
+   io_receiving[3] = 0;
+
+   #20
+   io_receiving[3] = 1;
+   io_input[3] = 1234;
+
+   #20 $stop;
+end
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/dv/tb/pin_compress_tb.v b/verilog/dv/tb/pin_compress_tb.v
new file mode 100644
index 0000000..6cb29ea
--- /dev/null
+++ b/verilog/dv/tb/pin_compress_tb.v
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+module pin_compress_tb ();   // stimulus-only testbench for pin_compress: prints the result, asserts nothing
+
+parameter WIDTH = 16;
+
+reg [WIDTH-1:0] data;      // stimulus driven into the dut's data port
+reg [WIDTH-1:0] mask;      // stimulus driven into the dut's mask port
+wire [WIDTH-1:0] result;   // dut output under observation
+
+pin_compress #(
+   .WIDTH(WIDTH)
+) pin_compress_dut (
+   .data(data),
+   .mask(mask),
+   .result(result)
+);
+
+initial begin
+   data <= 16'b1001110100110101;
+   mask <= 16'b0100100101000101;
+   #10                                       // wait 10 time units before sampling the output
+   $display("%16b", result);                 // first printed line: what the dut produced
+   $display("%16b", 16'b0000000000011011);   // second line: hard-coded reference, presumably the expected result; compare by eye
+end
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/dv/tb/pin_decompress_tb.v b/verilog/dv/tb/pin_decompress_tb.v
new file mode 100644
index 0000000..09bb432
--- /dev/null
+++ b/verilog/dv/tb/pin_decompress_tb.v
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+module pin_decompress_tb ();   // stimulus-only testbench for pin_decompress: prints the result, asserts nothing
+
+parameter WIDTH = 16;
+
+reg [WIDTH-1:0] data;      // stimulus driven into the dut's data port
+reg [WIDTH-1:0] mask;      // stimulus driven into the dut's mask port
+wire [WIDTH-1:0] result;   // dut output under observation
+
+pin_decompress #(
+   .WIDTH(WIDTH)
+) pin_decompress_dut (
+   .data(data),
+   .mask(mask),
+   .result(result)
+);
+
+initial begin
+   data <= 16'b0000000000001011;
+   mask <= 16'b0101000101000101;
+   #10                                       // wait 10 time units before sampling the output
+   $display("%16b", result);                 // first printed line: what the dut produced
+   $display("%16b", 16'b0000000100000101);   // second line: hard-coded reference, presumably the expected result; compare by eye
+end
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/dv/tb/prng_tb.v b/verilog/dv/tb/prng_tb.v
new file mode 100644
index 0000000..d010787
--- /dev/null
+++ b/verilog/dv/tb/prng_tb.v
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+module prng_tb();   // trace testbench for prng: clocks the dut for a short while and prints its internals
+
+parameter STATE_BITS = 4;          // all four parameters are passed straight through to the dut
+parameter POLYNOMIAL = 4'b1100;
+parameter STATE_INIT = 4'b0000;
+parameter OUTPUT_BITS = 2;
+
+reg clk;
+reg rst_n;                       // driven low at time 0 and raised 10 time units later (see initial block)
+wire [OUTPUT_BITS-1:0] random;   // dut output
+
+prng #(
+   .STATE_BITS(STATE_BITS),
+   .POLYNOMIAL(POLYNOMIAL),
+   .STATE_INIT(STATE_INIT),
+   .OUTPUT_BITS(OUTPUT_BITS)
+) prng_dut (
+   .clk(clk),
+   .rst_n(rst_n),
+   .entropy(1'b0),               // entropy input tied to constant 0 for this test
+   .random(random)
+);
+
+always #5 clk = ~clk;            // toggles every 5 time units, i.e. a period of 10
+
+initial begin   // monitor columns: time, dut state, new_state of g_shift[0] and g_shift[1] (hierarchical refs into the dut), random
+   $monitor("%4d %4b %4b %4b %2b", $time, prng_dut.state, prng_dut.g_shift[0].new_state, prng_dut.g_shift[1].new_state, random);
+   clk <= 0;
+   rst_n <= 0;
+   #10 rst_n <= 1;               // release reset
+   #100 $stop;                   // halt the simulation 100 time units after reset release
+end
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/dv/tb/prng_wrap_tb.v b/verilog/dv/tb/prng_wrap_tb.v
new file mode 100644
index 0000000..00a909a
--- /dev/null
+++ b/verilog/dv/tb/prng_wrap_tb.v
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+module prng_wrap_tb();   // trace testbench for prng_wrap: prints two internals once, then monitors the output
+
+parameter INDEX = 0;             // both parameters are passed straight through to the dut
+parameter OUTPUT_BITS = 128;
+
+reg clk;
+reg rst_n;                       // driven low at time 0 and raised 10 time units later (see initial block)
+wire [OUTPUT_BITS-1:0] random;   // dut output
+
+prng_wrap #(
+   .INDEX(INDEX),
+   .OUTPUT_BITS(OUTPUT_BITS)
+) prng_wrap_dut (
+   .clk(clk),
+   .rst_n(rst_n),
+   .entropy(1'b0),               // entropy input tied to constant 0 for this test
+   .random(random)
+);
+
+always #5 clk = ~clk;            // toggles every 5 time units, i.e. a period of 10
+
+initial begin
+   clk <= 0;
+   rst_n <= 0;
+   #10 rst_n <= 1;               // release reset
+   $display("%8x", prng_wrap_dut.prng_inst.POLYNOMIAL);       // hierarchical reference into the wrapped prng instance
+   $display("%8x", prng_wrap_dut.prng_inst.scrambled_init);   // printed once, at the moment reset is released
+   $monitor("%4d %128b", $time, random);                      // from here on, print the output whenever it changes
+   #200 $stop;                   // halt the simulation 200 time units after reset release
+end
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/dv/tb/prog_mux_tb.v b/verilog/dv/tb/prog_mux_tb.v
new file mode 100644
index 0000000..115abc8
--- /dev/null
+++ b/verilog/dv/tb/prog_mux_tb.v
@@ -0,0 +1,81 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+module prog_mux_tb();   // trace testbench for prog_mux: issues a few writes and prints what cores 0 and 1 receive
+
+parameter CORES=4;
+parameter LOG_CORES=2;
+parameter PC_WIDTH=4;
+parameter INSTR_WIDTH=8;
+
+reg we;                                        // controller-side stimulus
+reg [LOG_CORES-1:0] sel;
+reg [PC_WIDTH-1:0] waddr;
+reg [INSTR_WIDTH-1:0] wdata;
+wire [CORES-1:0] cwe_raw;                      // flat per-core buses coming out of the dut
+wire [CORES*PC_WIDTH-1:0] cwaddr_raw;
+wire [CORES*INSTR_WIDTH-1:0] cwdata_raw;
+
+prog_mux #(
+   .CORES(CORES),
+   .LOG_CORES(LOG_CORES),
+   .PC_WIDTH(PC_WIDTH),
+   .INSTR_WIDTH(INSTR_WIDTH)
+) prog_mux_dut (
+   .we(we),
+   .sel(sel),
+   .waddr(waddr),
+   .wdata(wdata),
+   .cwe(cwe_raw),
+   .cwaddr(cwaddr_raw),
+   .cwdata(cwdata_raw)
+);
+
+wire cwe[CORES-1:0];                           // same signals, unpacked into one array entry per core
+wire [PC_WIDTH-1:0] cwaddr[CORES-1:0];
+wire [INSTR_WIDTH-1:0] cwdata[CORES-1:0];
+
+generate genvar core;
+for (core=0; core<CORES; core=core+1) begin:g_core
+   assign cwe[core] = cwe_raw[core];
+   assign cwaddr[core] = cwaddr_raw[core*PC_WIDTH +: PC_WIDTH];          // slice number 'core' of the flat address bus
+   assign cwdata[core] = cwdata_raw[core*INSTR_WIDTH +: INSTR_WIDTH];    // slice number 'core' of the flat data bus
+end
+endgenerate
+
+initial begin   // only cores 0 and 1 are printed; cores 2 and 3 are never selected by this stimulus
+   $monitor("time=%4t we=%d sel=%d waddr=%d wdata=%d cwe0=%d cwaddr0=%d cwdata0=%d cwe1=%d cwaddr1=%d cwdata1=%d",
+               $time, we, sel, waddr, wdata, cwe[0], cwaddr[0], cwdata[0], cwe[1], cwaddr[1], cwdata[1]);
+
+   we = 0;       // sel, waddr and wdata are still unassigned (x) during this first interval
+
+   #10           // write to core 0
+   we = 1;
+   sel = 0;
+   waddr = 3;
+   wdata = 11;
+
+   #10           // write enable dropped, other inputs unchanged
+   we = 0;
+
+   #10           // write to core 1
+   we = 1;
+   sel = 1;
+   waddr = 5;
+   wdata = 25;
+
+   #10           // switch back to core 0 while we stays high
+   sel = 0;
+   waddr = 0;
+   wdata = 1;
+
+   #10
+   $stop;
+end
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/dv/tb/wb_mux_tb.v b/verilog/dv/tb/wb_mux_tb.v
new file mode 100644
index 0000000..484bc15
--- /dev/null
+++ b/verilog/dv/tb/wb_mux_tb.v
@@ -0,0 +1,117 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+module wb_mux_tb();   // trace testbench for wb_mux: one read and one write per leading address field, no clock involved
+
+parameter LOG_CORES = 3;
+parameter PC_WIDTH = 8;
+parameter INSTR_WIDTH = 32;
+parameter DATA_WIDTH = 16;
+parameter IO_PINS = 16;
+parameter WB_WIDTH = 32;
+
+reg wb_stb_i;                         // the wb_* regs are the stimulus driven by the initial block
+reg wb_cyc_i;
+reg wb_we_i;
+reg [WB_WIDTH-1:0] wb_adr_i;
+reg [WB_WIDTH-1:0] wb_dat_i;
+wire wbs_ack_o;                       // every wire below is only observed through $monitor
+wire [WB_WIDTH-1:0] wbs_dat_o;
+wire prog_we;
+wire [LOG_CORES-1:0] prog_sel;
+wire [PC_WIDTH-1:0] prog_waddr;
+wire [INSTR_WIDTH-1:0] prog_wdata;
+wire pads_we;
+wire pads_waddr;                      // single bit wide
+wire [IO_PINS-1:0] pads_wdata;
+wire [LOG_CORES-1:0] debug_sel;
+wire [4:0] debug_addr;
+wire debug_we;
+wire [DATA_WIDTH-1:0] debug_wdata;
+reg [DATA_WIDTH-1:0] debug_rdata;     // driven by the testbench with a constant pattern
+wire [WB_WIDTH-1:0] entropy_word;
+
+wb_mux #(
+   .LOG_CORES(LOG_CORES),
+   .PC_WIDTH(PC_WIDTH),
+   .INSTR_WIDTH(INSTR_WIDTH),
+   .DATA_WIDTH(DATA_WIDTH),
+   .IO_PINS(IO_PINS),
+   .WB_WIDTH(WB_WIDTH)
+) wb_mux_dut (
+   .wb_stb_i(wb_stb_i),
+   .wb_cyc_i(wb_cyc_i),
+   .wb_we_i(wb_we_i),
+   .wb_adr_i(wb_adr_i),
+   .wb_dat_i(wb_dat_i),
+   .wbs_ack_o(wbs_ack_o),
+   .wbs_dat_o(wbs_dat_o),
+   .prog_we(prog_we),
+   .prog_sel(prog_sel),
+   .prog_waddr(prog_waddr),
+   .prog_wdata(prog_wdata),
+   .pads_we(pads_we),
+   .pads_waddr(pads_waddr),
+   .pads_wdata(pads_wdata),
+   .debug_sel(debug_sel),
+   .debug_addr(debug_addr),
+   .debug_we(debug_we),
+   .debug_wdata(debug_wdata),
+   .debug_rdata(debug_rdata),
+   .entropy_word(entropy_word)
+);
+
+initial begin   // monitor groups, separated by '/': wishbone reply, prog_*, pads_* (printed as awe/aa/awd), debug_*, entropy_word
+   $monitor("time %4t / wa %1b wdo %32b / pwe %1b ps %3b pwa %8b pwd %32b / awe %1b aa %1b awd %16b / ds %3b da %5b dwe %1b dwd %16b / ew %32b",
+      $time, wbs_ack_o, wbs_dat_o, prog_we, prog_sel, prog_waddr, prog_wdata, pads_we, pads_waddr, pads_wdata,
+      debug_sel, debug_addr, debug_we, debug_wdata, entropy_word);
+   // before cycle
+   wb_stb_i = 0;
+   wb_cyc_i = 0;
+   wb_we_i = 0;
+   wb_adr_i = 0;
+   wb_dat_i = 32'b11111111111111111111111111111111;   // write data stays all ones for the whole test
+   debug_rdata = 16'b1111000010101010;                 // constant pattern offered on the debug read-data input
+   #10
+   // prog read (no effect)
+   wb_stb_i = 1;
+   wb_cyc_i = 1;
+   wb_adr_i = 32'b00_0000000000000000000_101_11011011;   // leading field 00; trailing fields 101 / 11011011 have the widths of ps and pwa, presumably core select and program address
+   #10
+   // prog write
+   wb_we_i = 1;
+   #10
+   // pads read (no effect)
+   wb_we_i = 0;
+   wb_adr_i = 32'b01_000000000000000000000000000001;     // leading field 01, lowest bit set
+   #10
+   // pads write
+   wb_we_i = 1;
+   #10
+   // debug read
+   wb_we_i = 0;
+   wb_adr_i = 32'b10_0000000000000000000000_010_01010;   // leading field 10; trailing fields 010 / 01010 have the widths of ds and da
+   #10
+   // debug write
+   wb_we_i = 1;
+   #10
+   // entropy read (no effect)
+   wb_we_i = 0;
+   wb_adr_i = 32'b11_000000000000000000000000000000;     // leading field 11
+   #10
+   // entropy write
+   wb_we_i = 1;
+   #10
+   // after cycle
+   wb_stb_i = 0;
+   wb_cyc_i = 0;
+   #10
+   $stop;
+end
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/rtl/alu.v b/verilog/rtl/alu.v
new file mode 100644
index 0000000..0d7c26c
--- /dev/null
+++ b/verilog/rtl/alu.v
@@ -0,0 +1,154 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+/*
+Fully combinatorial arithmetic logic unit
+
+Opcode matrix:
+0000  and   in1 & in2               out set to result, carry_out set to |result
+0001  or    in1 | in2               out set to result, carry_out set to &result
+0010  xor   in1 ^ in2               out set to result, carry_out set to ^result
+0011  mux   carry ? in2 : in1       out set to result, carry_out set to highest bit of result
+0100  nand  ~(in1 & in2)            out set to result, carry_out set to &result
+0101  nor   ~(in1 | in2)            out set to result, carry_out set to |result
+0110  nxor  ~(in1 ^ in2)            out set to result, carry_out set to ~^~result
+0111  nmux  ~(carry ? in2 : in1)    out set to result, carry_out set to highest bit of result
+1000  rcl   in1 << in2              carry shifted in, carry_out shifted out
+1001  rcr   in1 >> in2              carry shifted in, carry_out shifted out
+1010  add   in1 + in2 + carry       {carry_out, out} set to result
+1011  sub   in1 - in2 - carry       {carry_out, out} set to result
+1100  mul   in1 * in2               out set to low W bits of result, carry_out set if high W bits are nonzero
+1101  mulh  in1 * in2               out set to high W bits of result, carry_out set if high W bits are nonzero
+1110  muld  in1 * {1, in2}          {carry_out, out} set to high W+1 bits of result
+1111  log   clog2(in1 + carry)      out set to result, carry_out set if in1 + carry is a power of 2
+
+There is no division opcode, but `muld` was included for the "division by invariant multiplication" algorithm.
+Division by a constant can be compiled to a `muld` followed by an `rcr`.
+*/
+
+module alu #(parameter DATA_WIDTH=16) (
+   input [3:0] opcode,                 // selects which of the 16 results below is forwarded
+   input [DATA_WIDTH-1:0] in1,
+   input [DATA_WIDTH-1:0] in2,
+   input carry,                        // carry input; how it is used depends on the operation
+   output [DATA_WIDTH-1:0] out,
+   output carry_out
+);
+
+   wire [DATA_WIDTH-1:0] op_out[15:0];   // all 16 results are computed in parallel, one slot per opcode
+   wire op_carry[15:0];                  // matching carry_out candidate for each opcode
+
+   wire [DATA_WIDTH-1:0] and_out = in1 & in2;   // opcode 0: and
+   wire and_carry = |and_out;                   // set when the result is nonzero
+   assign op_out[0] = and_out;
+   assign op_carry[0] = and_carry;
+
+   wire [DATA_WIDTH-1:0] or_out = in1 | in2;    // opcode 1: or
+   wire or_carry = &or_out;                     // set when the result is all ones
+   assign op_out[1] = or_out;
+   assign op_carry[1] = or_carry;
+
+   wire [DATA_WIDTH-1:0] xor_out = in1 ^ in2;   // opcode 2: xor
+   wire xor_carry = ^xor_out;                   // parity of the result
+   assign op_out[2] = xor_out;
+   assign op_carry[2] = xor_carry;
+
+   wire [DATA_WIDTH-1:0] mux_out = carry ? in2 : in1;   // opcode 3: mux, carry selects in2 over in1
+   wire mux_carry = mux_out[DATA_WIDTH-1];              // highest bit of the result
+   assign op_out[3] = mux_out;
+   assign op_carry[3] = mux_carry;
+
+   wire [DATA_WIDTH-1:0] nand_out = ~and_out;   // opcodes 4-7 invert both the result and the carry of opcodes 0-3
+   wire nand_carry = ~and_carry;
+   assign op_out[4] = nand_out;
+   assign op_carry[4] = nand_carry;
+
+   wire [DATA_WIDTH-1:0] nor_out = ~or_out;     // opcode 5: nor
+   wire nor_carry = ~or_carry;
+   assign op_out[5] = nor_out;
+   assign op_carry[5] = nor_carry;
+
+   wire [DATA_WIDTH-1:0] nxor_out = ~xor_out;   // opcode 6: nxor
+   wire nxor_carry = ~xor_carry;
+   assign op_out[6] = nxor_out;
+   assign op_carry[6] = nxor_carry;
+
+   wire [DATA_WIDTH-1:0] nmux_out = ~mux_out;   // opcode 7: nmux
+   wire nmux_carry = ~mux_carry;
+   assign op_out[7] = nmux_out;
+   assign op_carry[7] = nmux_carry;
+
+   wire [DATA_WIDTH-1:0] rcl_out;               // opcode 8: rcl, shift left by in2 with one guard bit on each side
+   wire rcl_carry, rcl_ignore;
+   assign {rcl_carry, rcl_out, rcl_ignore} = {1'b0, in1, carry} << in2;   // carry is the first bit shifted in; the bit landing above in1 is carry_out
+   assign op_out[8] = rcl_out;
+   assign op_carry[8] = rcl_carry;
+
+   wire [DATA_WIDTH-1:0] rcr_out;               // opcode 9: rcr, mirror image of rcl
+   wire rcr_carry, rcr_ignore;
+   assign {rcr_ignore, rcr_out, rcr_carry} = {carry, in1, 1'b0} >> in2;   // carry is the first bit shifted in; the bit landing below in1 is carry_out
+   assign op_out[9] = rcr_out;
+   assign op_carry[9] = rcr_carry;
+
+   wire [DATA_WIDTH-1:0] add_out;               // opcode 10: add with carry in and carry out
+   wire add_carry;
+   assign {add_carry, add_out} = in1 + in2 + carry;   // the DATA_WIDTH+1 bit target keeps the overflow bit
+   assign op_out[10] = add_out;
+   assign op_carry[10] = add_carry;
+
+   wire [DATA_WIDTH-1:0] sub_out;               // opcode 11: subtract with borrow in and borrow out
+   wire sub_carry;
+   assign {sub_carry, sub_out} = in1 - in2 - carry;   // the extra top bit is set when the difference wraps below zero
+   assign op_out[11] = sub_out;
+   assign op_carry[11] = sub_carry;
+
+   wire [DATA_WIDTH-1:0] mulh_out;              // opcodes 12 and 13 share one double-width product
+   wire [DATA_WIDTH-1:0] mul_out;
+   assign {mulh_out, mul_out} = in1 * in2;
+   wire mul_carry = |mulh_out;                  // set when the product does not fit in DATA_WIDTH bits
+   wire mulh_carry = mul_carry;
+   assign op_out[12] = mul_out;
+   assign op_carry[12] = mul_carry;
+   assign op_out[13] = mulh_out;
+   assign op_carry[13] = mulh_carry;
+
+   wire [DATA_WIDTH-1:0] muld_out;              // opcode 14: multiply in1 by in2 with an extra 1 bit prepended
+   wire [DATA_WIDTH-1:0] muld_ignore;           // low DATA_WIDTH bits of the product are discarded
+   wire muld_carry;
+   assign {muld_carry, muld_out, muld_ignore} = in1 * {1'b1, in2};
+   assign op_out[14] = muld_out;
+   assign op_carry[14] = muld_carry;
+
+   wire [DATA_WIDTH-1:0] in1c = in1 + carry;    // opcode 15: log of x = in1 + carry; in1c is x truncated to DATA_WIDTH bits
+   wire [DATA_WIDTH-1:0] in1d = in1 - (!carry); // x - 1, truncated to DATA_WIDTH bits
+   wire [DATA_WIDTH-1:0] log_bits;              // index of the highest set bit of in1d, found by the halving search below
+   localparam LOG_WIDTH = $clog2(DATA_WIDTH);
+   assign log_bits[DATA_WIDTH-1:LOG_WIDTH] = 0; // only the low LOG_WIDTH bits are computed by the search
+   generate genvar i;
+   for (i=LOG_WIDTH-1; i>=0; i=i-1) begin:g_bit // one stage per result bit, most significant bit first
+      wire [(1<<(i+1))-1:0] subseq;             // search window of this stage, 2^(i+1) bits wide
+      if (i == LOG_WIDTH-1) begin:i_first
+         assign subseq = in1d;                  // the first stage looks at the whole value
+      end else begin:i_nfirst
+         wire [i+1:0] index = {log_bits[i+1], {(i+1){1'b0}}};    // 0 or 2^(i+1): lower or upper half of the previous window
+         assign subseq = g_bit[i+1].subseq[index +: 1<<(i+1)];   // keep the upper half if it held a set bit, else the lower half
+      end
+      assign log_bits[i] = |subseq[1<<i +: 1<<i];                // result bit = upper half of this window is nonzero
+   end
+   endgenerate
+   wire in1nz = in1c || carry;                  // false only when in1 and carry are both zero
+   wire in1no = |in1d;                          // x - 1 is nonzero
+   wire [DATA_WIDTH-1:0] log_out = in1nz ? (log_bits + in1no) : -1;   // highest-bit index of x-1, plus one unless x-1 is zero; all ones when in1nz is false
+   wire log_carry = in1nz && !(in1c & in1d);    // x & (x-1) == 0 is the power-of-two test
+   assign op_out[15] = log_out;
+   assign op_carry[15] = log_carry;
+
+   assign out = op_out[opcode];                 // final selection among the 16 parallel results
+   assign carry_out = op_carry[opcode];
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/rtl/cpu_core.v b/verilog/rtl/cpu_core.v
new file mode 100644
index 0000000..912b50d
--- /dev/null
+++ b/verilog/rtl/cpu_core.v
@@ -0,0 +1,316 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+/*
+Central processing unit (single core)
+
+Has two general-purpose registers and a carry flag and executes an instruction on every clock cycle.
+Fetches instructions via the progctr (out) and opcode (in) ports. Each opcode instructs the cpu to
+take two values from registers, memory or other sources, feed them through the ALU and put the
+results in a register or memory cell or use it as a jump target.
+
+Opcode structure assumes INSTR_WIDTH=32. Changing it requires substantial edits to the code below.
+
+Opcodes have 32 bits and use the following format:
+AAA BBB C DD EEEE FFF GGGGGGGGGGGGGGGG
+A = source for ALU input 1
+B = source for ALU input 2
+C = reset carry flag used as ALU input
+D = extra options, see below
+E = ALU opcode
+F = target for ALU result
+G = immediate value, can be used as a source
+
+Possible values for sources A & B:
+000   use register 1
+001   use register 2
+010   use program counter
+011   read value from memory address previously specified
+100   use immediate value
+101   use high (A) or low (B) 8 bits of immediate value
+110   use timer (A) or prng (B)
+111   use cpu number (A) or the constant 1 (B)
+
+Possible values for target F:
+000   ignore
+001   set register 1
+010   set register 2
+011   set program counter (jump)
+100   set memory read address
+101   set memory write address
+110   set spread value for memory write
+111   write value to memory address previously specified
+
+Possible values for ALU opcode E and how they use/set the carry flag are described in the
+ALU source header.
+
+Extra options in D were chosen to make classic Random Access Machine operations more
+succinct. They are:
+00    business as usual
+01    set carry to highest bit of input 1 (specified as source A)
+      then replace input 1 with the immediate value;
+      also toggle this carry flag if C was set (and don't clear it, of course)
+10    read value from memory and store it in register 1
+      (if the instruction uses register 1 as the target, store it in register 2 instead)
+11    set memory write address/spread/data based on the immediate value
+      (if write data is set in this operation, it also triggers a memory write)
+      if F==101 (address set from alu out): D ssss ddddddddddd = immediate
+      if F==110 (spread set from alu out):  D aaaaaaaa ddddddd = immediate
+      otherwise:                            A aaaaaaaaaaa ssss = immediate
+      if the D or A bit is present, use register 1 for data/address instead
+      and use the rest of the immediate value for the other part (aaa/sss/ddd)
+
+Example opcodes to implement Random Access Machine instructions:
+
+* M[i] = 0              // set memory slot i to zero
+  000 000 1 11 0010 111 0iiiiiiiiiii0000
+
+* M[i] = M[i] + 1       // increment value in memory slot i
+  100 000 1 00 0011 100 iiiiiiiiiiiiiiii
+  011 111 1 11 1010 111 0iiiiiiiiiii0000
+
+* M[i] = M[i] - 1       // decrement value in memory slot i
+  100 000 1 00 0011 100 iiiiiiiiiiiiiiii
+  011 111 1 11 1011 111 0iiiiiiiiiii0000
+
+* M[i] = M[i] + M[j]    // add value in memory slot j to memory slot i
+  100 000 1 00 0011 100 jjjjjjjjjjjjjjjj
+  100 000 1 10 0011 100 iiiiiiiiiiiiiiii
+  011 000 1 11 1010 111 0iiiiiiiiiii0000
+
+* M[i] = M[i] - M[j]    // subtract value in memory slot j from memory slot i
+  100 000 1 00 0011 100 jjjjjjjjjjjjjjjj
+  100 000 1 10 0011 100 iiiiiiiiiiiiiiii
+  011 000 1 11 1011 111 0iiiiiiiiiii0000
+
+* M[M[i]] = M[j]        // set memory pointed to by slot i to value in slot j
+  100 000 1 00 0011 100 iiiiiiiiiiiiiiii
+  100 000 1 10 0011 100 jjjjjjjjjjjjjjjj
+  011 000 1 11 0011 111 1000000000000000
+
+* M[i] = M[M[j]]        // set value in slot i to memory pointed to by slot j
+  100 000 1 00 0011 100 jjjjjjjjjjjjjjjj
+  011 000 1 00 0011 100 0000000000000000
+  011 000 1 11 0011 111 0iiiiiiiiiii0000
+
+* if M[i] < 0 goto j    // conditional jump
+  100 000 1 00 0011 100 iiiiiiiiiiiiiiii
+  011 010 1 01 0011 011 jjjjjjjjjjjjjjjj
+
+Parameters:
+DATA_WIDTH = processor word size
+PC_WIDTH = size of program counter, should be <= DATA_WIDTH
+ADDR_WIDTH = size of mem_mesh addresses, should be <= DATA_WIDTH
+SPREAD_WIDTH = size of mem_mesh spread value
+INSTR_WIDTH = combined size of opcode & immediate, should be kept at 32
+CPU_NUM = id number to differentiate cpu cores, can be queried by code running on the processor
+*/
+
+module cpu_core #(parameter DATA_WIDTH=16, PC_WIDTH=8, ADDR_WIDTH=8, SPREAD_WIDTH=3, INSTR_WIDTH=32, CPU_NUM=0) (
+   input clk,                             // clock signal
+   input rst_n,                           // reset, active low
+   input [INSTR_WIDTH-1:0] opcode,        // opcode to be executed & immediate args
+   input [DATA_WIDTH-1:0] mem_rdata,      // connected to 'rdata' of memory module
+   input [DATA_WIDTH-1:0] prng_in,        // random number from prng
+   input [1:0] debug_mode,                // debug: 00 = no change, 01 = single step, 10 = run, 11 = stop
+   input [3:0] debug_sel,                 // debug: cpu status register to query or modify
+   input debug_we,                        // debug: modify selected status register
+   input [DATA_WIDTH-1:0] debug_wdata,    // debug: new value of selected status register
+   output [PC_WIDTH-1:0] progctr,         // program counter
+   output mem_we,                         // +-
+   output [ADDR_WIDTH-1:0] mem_waddr,     // | connected to
+   output [SPREAD_WIDTH-1:0] mem_wspread, // | corresponding ports
+   output [DATA_WIDTH-1:0] mem_wdata,     // | of memory module
+   output [ADDR_WIDTH-1:0] mem_raddr,     // +-
+   output debug_stopped,                  // debug: read back whether core is stopped
+   output [DATA_WIDTH-1:0] debug_rdata    // debug: current value of selected status register
+);
+
+reg [DATA_WIDTH-1:0] reg1;      // general-purpose registers
+reg [DATA_WIDTH-1:0] reg2;
+reg carry;                      // carry flag
+reg [DATA_WIDTH-1:0] pc;        // register for program counter
+reg [DATA_WIDTH-1:0] timer;     // clock ticks since last reset
+reg [ADDR_WIDTH-1:0] raddr;     // next read address
+reg we;                         // write to memory on next cycle
+reg [ADDR_WIDTH-1:0] waddr;     // next write address
+reg [SPREAD_WIDTH-1:0] wspread; // next write spread
+reg [DATA_WIDTH-1:0] wdata;     // next write data
+reg stopped;                    // cpu core is stopped
+
+assign progctr = pc;            // pc is DATA_WIDTH bits wide, the port keeps its low PC_WIDTH bits
+assign mem_we = we;
+assign mem_waddr = waddr;
+assign mem_wspread = wspread;
+assign mem_wdata = wdata;
+assign mem_raddr = raddr;
+
+// opcode subdivision
+wire [2:0] op_in1;     // input 1 source
+wire [2:0] op_in2;     // input 2 source
+wire op_rst_carry;     // reset carry flag
+wire [1:0] op_extra;   // extra steps before alu processing
+wire [3:0] op_alu;     // send this opcode (and in1, in2, carry) to the alu
+wire [2:0] op_target;  // target for alu result
+wire [15:0] op_immed;  // hardcoded value(s) to use as an input source
+assign {op_in1, op_in2, op_rst_carry, op_extra, op_alu, op_target, op_immed} = opcode;
+
+wire op_extra_carry = op_extra == 1;   // set carry based on in1, replace in1 with immediate
+wire op_extra_rdata = op_extra == 2;   // copy rdata to reg1 (or reg2 if reg1 is the target)
+wire op_extra_waddr = op_extra == 3;   // fill waddr & wspread from immediate
+
+wire [DATA_WIDTH-1:0] next_pc = pc + 1;   // default successor of pc; also the value read by source 010
+
+wire [DATA_WIDTH-1:0] sources1[7:0];      // candidate values for alu input 1, indexed by op_in1
+assign sources1[0] = reg1;
+assign sources1[1] = reg2;
+assign sources1[2] = next_pc;
+assign sources1[3] = mem_rdata;
+assign sources1[4] = op_immed;
+assign sources1[5] = op_immed[15:8];
+assign sources1[6] = timer;
+assign sources1[7] = CPU_NUM;
+
+wire [DATA_WIDTH-1:0] sources2[7:0];      // candidate values for alu input 2, indexed by op_in2
+assign sources2[0] = reg1;
+assign sources2[1] = reg2;
+assign sources2[2] = next_pc;
+assign sources2[3] = mem_rdata;
+assign sources2[4] = op_immed;
+assign sources2[5] = op_immed[7:0];
+assign sources2[6] = prng_in;
+assign sources2[7] = 1;
+
+wire [DATA_WIDTH-1:0] in1_orig = sources1[op_in1];                   // data to use as alu input 1, unless overridden by op_extra_carry
+wire in1_oh = in1_orig[DATA_WIDTH-1];                                // highest bit of in1_orig
+wire [DATA_WIDTH-1:0] in1 = op_extra_carry ? op_immed : in1_orig;    // data to use as alu input 1
+wire [DATA_WIDTH-1:0] in2 = sources2[op_in2];                        // data to use as alu input 2
+wire carry_def = op_rst_carry ? 0 : carry;                           // carry to use as alu input, unless overridden by op_extra_carry
+wire carry_ovr = op_rst_carry ? ~in1_oh : in1_oh;                    // override value if op_extra_carry is set
+wire alu_cin = op_extra_carry ? carry_ovr : carry_def;               // consolidated carry input for alu
+
+wire [DATA_WIDTH-1:0] alu_out;                                       // data output from alu
+wire alu_cout;                                                       // carry output from alu
+
+alu #(
+   .DATA_WIDTH(DATA_WIDTH)
+) alu_dut (
+   .opcode(op_alu),
+   .in1(in1),
+   .in2(in2),
+   .carry(alu_cin),
+   .out(alu_out),
+   .carry_out(alu_cout)
+);
+
+wire op_target_reg1    = op_target == 1;
+wire op_target_reg2    = op_target == 2;
+wire op_target_pc      = op_target == 3;
+wire op_target_raddr   = op_target == 4;
+wire op_target_waddr   = op_target == 5;
+wire op_target_wspread = op_target == 6;
+wire op_target_wdata   = op_target == 7;
+
+// extract values from immediate to prepare for op_extra_waddr case
+wire immed_ovr = op_immed[15];
+wire [DATA_WIDTH-1:0] s_hi4  = immed_ovr ? op_immed[14:0] : op_immed[14:11];
+wire [DATA_WIDTH-1:0] d_lo11 = immed_ovr ? reg1 : op_immed[10:0];
+wire [DATA_WIDTH-1:0] a_hi8  = immed_ovr ? op_immed[14:0] : op_immed[14:7];
+wire [DATA_WIDTH-1:0] d_lo7  = immed_ovr ? reg1 : op_immed[6:0];
+wire [DATA_WIDTH-1:0] a_hi11 = immed_ovr ? reg1 : op_immed[14:4];
+wire [DATA_WIDTH-1:0] s_lo4  = immed_ovr ? op_immed[14:0] : op_immed[3:0];
+
+// update target with alu output
+// if op_extra_rdata is set, also write mem_rdata to reg1 (if target is reg1, use reg2 instead)
+// if op_extra_waddr is set, also fill waddr & wspread with immediate (if target is waddr/wspread, replace with wdata)
+wire [DATA_WIDTH-1:0] reg1_mod = op_target_reg1 ? alu_out : (op_extra_rdata ? mem_rdata : reg1);
+wire [DATA_WIDTH-1:0] reg2_mod = op_target_reg2 ? alu_out : ((op_extra_rdata && op_target_reg1) ? mem_rdata : reg2);
+wire [DATA_WIDTH-1:0] pc_mod = op_target_pc ? alu_out : next_pc;
+wire [DATA_WIDTH-1:0] raddr_mod = op_target_raddr ? alu_out : raddr;
+wire [DATA_WIDTH-1:0] waddr_mod = op_target_waddr ? alu_out :
+                          (op_extra_waddr ? (op_target_wspread ? a_hi8 : a_hi11) : waddr);
+wire [DATA_WIDTH-1:0] wspread_mod = op_target_wspread ? alu_out :
+                          (op_extra_waddr ? (op_target_waddr ? s_hi4 : s_lo4) : wspread);
+wire [DATA_WIDTH-1:0] wdata_mod = op_target_wdata ? alu_out :
+                          (op_extra_waddr ? (op_target_wspread ? d_lo7 : (op_target_waddr ? d_lo11 : wdata)) : wdata);
+wire we_mod = op_target_wdata || (op_extra_waddr && (op_target_waddr || op_target_wspread));
+
+// debug interface: 16 read-only views of the core state, selected by debug_sel
+wire [DATA_WIDTH-1:0] debug_reg[15:0];
+assign debug_reg[0] = pc;
+assign debug_reg[1] = opcode[31:16];
+assign debug_reg[2] = opcode[15:0];
+assign debug_reg[3] = reg1;
+assign debug_reg[4] = reg2;
+assign debug_reg[5] = carry;
+assign debug_reg[6] = alu_out;
+assign debug_reg[7] = alu_cout;
+assign debug_reg[8] = timer;
+assign debug_reg[9] = prng_in;
+assign debug_reg[10] = raddr;
+assign debug_reg[11] = mem_rdata;
+assign debug_reg[12] = we;
+assign debug_reg[13] = waddr;
+assign debug_reg[14] = wspread;
+assign debug_reg[15] = wdata;
+assign debug_rdata = debug_reg[debug_sel];
+assign debug_stopped = stopped;
+wire stopped_mod = debug_mode[1] ? debug_mode[0] : stopped;   // next stopped state: 10 = run, 11 = stop, 0x = keep; declared explicitly because of `default_nettype none
+
+// sequential logic (synchronous, active-low reset)
+always @ (posedge clk) begin
+   if (!rst_n) begin
+      reg1 <= 0;
+      reg2 <= 0;
+      carry <= 0;
+      pc <= 0;
+      timer <= 0;
+      raddr <= 0;
+      we <= 0;
+      waddr <= 0;
+      wspread <= 0;
+      wdata <= 0;
+      stopped <= 0;
+   end else begin
+      stopped <= stopped_mod;   // latched on every cycle: inside the run branch below a stop request (mode 11) could never be stored
+      if (debug_we) begin
+         // don't run instructions on cycles with debug writes
+         case (debug_sel)
+            // wires can't be changed, only regs
+            0: pc <= debug_wdata;
+            // opcode high & low skipped
+            3: reg1 <= debug_wdata;
+            4: reg2 <= debug_wdata;
+            5: carry <= debug_wdata;
+            // alu_out & alu_cout skipped
+            8: timer <= debug_wdata;
+            // prng_in skipped
+            10: raddr <= debug_wdata;
+            // mem_rdata skipped
+            12: we <= debug_wdata;
+            13: waddr <= debug_wdata;
+            14: wspread <= debug_wdata;
+            15: wdata <= debug_wdata;
+         endcase
+      end else if (!stopped_mod || debug_mode == 2'b01) begin
+         // running or single stepping: commit the results of the current instruction
+         reg1 <= reg1_mod;
+         reg2 <= reg2_mod;
+         carry <= alu_cout;
+         pc <= pc_mod;
+         timer <= timer + 1;
+         raddr <= raddr_mod;
+         we <= we_mod;
+         waddr <= waddr_mod;
+         wspread <= wspread_mod;
+         wdata <= wdata_mod;
+      end
+   end
+end
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/rtl/debug_mux.v b/verilog/rtl/debug_mux.v
new file mode 100644
index 0000000..68717cf
--- /dev/null
+++ b/verilog/rtl/debug_mux.v
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+/*
+Fully combinatorial debugging multiplexer
+
+Sends messages to cpu cores to run, stop or single step.
+Queries or modifies registers and status flags.
+*/
+
+module debug_mux #(parameter CORES=8, LOG_CORES=3, DATA_WIDTH=16) (
+   input [LOG_CORES-1:0] sel,                // controller interface
+   input [4:0] addr,                         // 0xxxx affects status register xxxx, 10000 affects running/stopped state
+   input we,
+   input [DATA_WIDTH-1:0] wdata,
+   output [DATA_WIDTH-1:0] rdata,
+   input [CORES-1:0] reg_stopped,            // interface towards cpu cores
+   input [CORES*DATA_WIDTH-1:0] reg_rdata,
+   output [CORES*2-1:0] cpu_mode,
+   output [CORES*4-1:0] reg_sel,
+   output [CORES-1:0] reg_we,
+   output [CORES*DATA_WIDTH-1:0] reg_wdata
+);
+
+wire reg_stopped_i[CORES-1:0];                 // per-core array views of the flat ports above
+wire [DATA_WIDTH-1:0] reg_rdata_i[CORES-1:0];
+wire [1:0] cpu_mode_i[CORES-1:0];
+wire [3:0] reg_sel_i[CORES-1:0];
+wire reg_we_i[CORES-1:0];
+wire [DATA_WIDTH-1:0] reg_wdata_i[CORES-1:0];
+
+wire cc_mode;                                  // top address bit: access targets the running/stopped state
+wire [3:0] cc_sel;                             // low four address bits: status register number
+assign {cc_mode, cc_sel} = addr;
+assign rdata = cc_mode ? reg_stopped_i[sel] : reg_rdata_i[sel];   // read path: stopped flag or register data of the selected core
+
+generate genvar core;
+for(core=0; core<CORES; core=core+1) begin:g_core
+   assign reg_stopped_i[core] = reg_stopped[core];                          // unpack inputs coming from the cores
+   assign reg_rdata_i[core] = reg_rdata[core*DATA_WIDTH +: DATA_WIDTH];
+   assign cpu_mode[core*2 +: 2] = cpu_mode_i[core];                         // pack outputs going to the cores
+   assign reg_sel[core*4 +: 4] = reg_sel_i[core];
+   assign reg_we[core] = reg_we_i[core];
+   assign reg_wdata[core*DATA_WIDTH +: DATA_WIDTH] = reg_wdata_i[core];
+
+   wire cur = sel == core;                                                  // this core is the one addressed by sel
+   assign cpu_mode_i[core] = (cur && we && cc_mode) ? wdata : 2'b00;        // mode command = low 2 bits of wdata, only on a state write to this core
+   assign reg_sel_i[core] = (cur && !cc_mode) ? cc_sel : 4'b0000;           // register number goes to the addressed core only, for reads and writes alike
+   assign reg_we_i[core] = cur && we && !cc_mode;                           // register write strobe, never raised together with a mode command
+   assign reg_wdata_i[core] = (cur && we && !cc_mode) ? wdata : 0;          // write data is zeroed for every core that is not being written
+end
+endgenerate
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/rtl/entropy_pool.v b/verilog/rtl/entropy_pool.v
new file mode 100644
index 0000000..c9e984c
--- /dev/null
+++ b/verilog/rtl/entropy_pool.v
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+/*
+Simple entropy pool, shifting a single bit into prng's in each clock cycle
+*/
+
+module entropy_pool #(parameter WIDTH=16) (
+   input clk,
+   input rst_n,             // synchronous reset, active low
+   input[WIDTH-1:0] e_word, // word mixed into the pool on every clock cycle
+   output e_bit             // lowest bit of the current pool contents
+);
+
+reg[WIDTH-1:0] e_pool;      // pool state
+wire[WIDTH-1:0] e_pool_mod; // next pool state
+assign {e_pool_mod, e_bit} = {1'b0, e_pool} ^ {e_word, 1'b0};   // e_bit = e_pool[0]; next state = (e_pool >> 1) ^ e_word
+
+always @(posedge clk) begin
+   if(!rst_n)
+      e_pool <= 0;          // pool is emptied on reset
+   else
+      e_pool <= e_pool_mod; // shift one bit out and xor the incoming word in
+end
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/rtl/instr_mem.v b/verilog/rtl/instr_mem.v
new file mode 100644
index 0000000..61638a9
--- /dev/null
+++ b/verilog/rtl/instr_mem.v
@@ -0,0 +1,38 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+/*
+Instruction memory
+*/
+
+module instr_mem #(parameter PC_WIDTH=8, INSTR_WIDTH=32, DEPTH=128) (
+   input clk,
+   input rst_n,                        // synchronous reset, active low
+   input [PC_WIDTH-1:0] raddr,         // read address
+   output [INSTR_WIDTH-1:0] rdata,     // word stored at raddr
+   input we,                           // write enable
+   input [PC_WIDTH-1:0] waddr,         // write address
+   input [INSTR_WIDTH-1:0] wdata       // word to store at waddr
+);
+
+reg [INSTR_WIDTH-1:0] mem[DEPTH-1:0];  // NOTE(review): the defaults give 128 words but an 8-bit address can reach 256; confirm addresses stay below DEPTH
+
+assign rdata = mem[raddr];             // combinational read, not registered
+
+integer i;                             // loop index for the reset loop only
+always @ (posedge clk) begin
+   if (!rst_n) begin
+      for (i=0; i<DEPTH; i=i+1) begin  // reset clears every word
+         mem[i] <= {(INSTR_WIDTH){1'b0}};
+      end
+   end else begin
+      if (we) mem[waddr] <= wdata;     // one word written per clock edge while we is high
+   end
+end
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/rtl/io_filter.v b/verilog/rtl/io_filter.v
new file mode 100644
index 0000000..91d670c
--- /dev/null
+++ b/verilog/rtl/io_filter.v
@@ -0,0 +1,104 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+/*
+IO filter
+
+Interfaces the io pins of the chip to the io ports of mem_mesh.
+
+An io port is created for each individual pin where the lowest bit sent on the port is forwarded
+to the pin and a bit coming from the pin is stretched to the full port width.
+
+Two additional io ports are created by joining together all input pins and all output pins respectively,
+right-aligned and zero-padded.
+
+Pins send and receive continuous streams of bits while io ports only fire on changes.
+Writing ports corresponding to individual pins override bits of the joined output port.
+
+We assume IO_PINS <= DATA_WIDTH. Alternatively we could modify the code to use more than one joined
+port per direction.
+*/
+
+module io_filter #(parameter IO_PINS=16, DATA_WIDTH=16) (
+   input clk,                              // clock
+   input rst_n,                            // synchronous reset, active low
+   input [IO_PINS-1:0] pin_dir,            // 0=input, 1=output
+   input [IO_PINS-1:0] pin_data_in,        // input for both mem_mesh & io_filter
+   output [IO_PINS-1:0] pin_data_out,      // output for both mem_mesh & io_filter
+   output [IO_PINS+2-1:0] port_active_in,  // input for mem_mesh, output for io_filter
+   input [IO_PINS+2-1:0] port_active_out,  // output for mem_mesh, input for io_filter
+   output [(IO_PINS+2)*DATA_WIDTH-1:0] port_data_in,   // DATA_WIDTH bits per port, driven by io_filter
+   input [(IO_PINS+2)*DATA_WIDTH-1:0] port_data_out    // DATA_WIDTH bits per port, read by io_filter
+);
+
+reg [IO_PINS-1:0] saved_in;                // pin values sampled on the previous clock edge
+reg [IO_PINS-1:0] saved_out;               // values currently presented on pin_data_out
+
+// input
+wire [IO_PINS-1:0] input_indiv = pin_data_in;                              // select input pins
+wire [IO_PINS-1:0] input_indiv_active = pin_data_in ^ saved_in;            // a pin is active if it changed from the last state
+wire input_joined_active = |input_indiv_active;                            // update the joined port if any of the pins changed
+wire [IO_PINS-1:0] input_joined;
+pin_compress #(                                                            // compress input bits together
+   .WIDTH(IO_PINS)
+) comp (
+   .data(input_indiv),
+   .mask(~pin_dir),
+   .result(input_joined)
+);
+
+// joined ports: index IO_PINS+1 carries the joined input, index IO_PINS is the joined output port (never active towards mem_mesh)
+assign port_active_in[IO_PINS +: 2] = {input_joined_active, 1'b0};         // assign the joined ports & their active states
+assign port_data_in[IO_PINS*DATA_WIDTH +: 2*DATA_WIDTH] = {input_joined, {(DATA_WIDTH){1'b0}}};
+// output: per-pin values & active flags, filled in by the generate loop below
+wire [IO_PINS-1:0] output_indiv;
+wire [IO_PINS-1:0] output_indiv_active;
+generate genvar pin;
+   for (pin=0; pin<IO_PINS; pin=pin+1) begin:g_pin
+      // input
+      assign port_active_in[pin] = input_indiv_active[pin];                // assign the individual ports & their active states
+      assign port_data_in[pin*DATA_WIDTH +: DATA_WIDTH] = {(DATA_WIDTH){input_indiv[pin]}};
+      // output
+      assign pin_data_out[pin] = saved_out[pin];                           // output pins keep their state between writes
+      assign output_indiv_active[pin] = port_active_out[pin];              // get pins & their active states from the individual output ports
+      assign output_indiv[pin] = port_data_out[pin*DATA_WIDTH];            // only the lowest bit of the port word is used
+   end
+endgenerate
+
+// output: joined port (only the low IO_PINS bits of its word are used)
+wire [IO_PINS-1:0] output_joined = port_data_out[IO_PINS*DATA_WIDTH +: DATA_WIDTH]; // get pins & their active state from the joined output port
+wire output_joined_active = port_active_out[IO_PINS];
+wire [IO_PINS-1:0] output_decomp;
+pin_decompress #(                                                          // decompress output pins to their respective bit positions
+   .WIDTH(IO_PINS)
+) decomp (
+   .data(output_joined),
+   .mask(pin_dir),
+   .result(output_decomp)
+);
+
+// consolidate pins set through joined & individual ports (individual ports have priority)
+wire [IO_PINS-1:0] output_mixed = (output_indiv_active & output_indiv) | (~output_indiv_active & output_decomp);
+wire [IO_PINS-1:0] output_mixed_active = output_indiv_active | {(IO_PINS){output_joined_active}};
+
+integer i;
+always @(posedge clk) begin
+   if (!rst_n) begin
+      saved_in <= 0;
+      saved_out <= 0;
+   end else begin
+      for (i=0; i<IO_PINS; i=i+1) begin
+         // active outputs change the saved state in order to keep being sent
+         if (output_mixed_active[i]) saved_out[i] <= output_mixed[i];
+         // inputs are only active for a single cycle while they differ from their saved state
+         saved_in[i] <= input_indiv[i];
+      end
+   end
+end
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/rtl/io_filter_rev.v b/verilog/rtl/io_filter_rev.v
new file mode 100644
index 0000000..67a0fc4
--- /dev/null
+++ b/verilog/rtl/io_filter_rev.v
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+/*
+IO filter with reversed pin order
+*/
+
+module io_filter_rev #(parameter IO_PINS=16, DATA_WIDTH=16) (
+   input clk,
+   input rst_n,
+   input [IO_PINS-1:0] pin_dir,            // 0=input, 1=output
+   input [IO_PINS-1:0] pin_data_in,        // input for both mem_mesh & io_filter
+   output [IO_PINS-1:0] pin_data_out,      // output for both mem_mesh & io_filter
+   output [IO_PINS+2-1:0] port_active_in,  // input for mem_mesh, output for io_filter
+   input [IO_PINS+2-1:0] port_active_out,  // output for mem_mesh, input for io_filter
+   output [(IO_PINS+2)*DATA_WIDTH-1:0] port_data_in,
+   input [(IO_PINS+2)*DATA_WIDTH-1:0] port_data_out
+);
+
+wire [IO_PINS-1:0] pin_dir_rev;            // pin_dir with the bit order mirrored
+wire [IO_PINS-1:0] pin_data_in_rev;        // pin_data_in with the bit order mirrored
+wire [IO_PINS-1:0] pin_data_out_rev;       // output of the wrapped filter, before mirroring back
+
+generate genvar idx;
+   for (idx=0; idx<IO_PINS; idx=idx+1) begin:g_mirror
+      localparam MIRROR = IO_PINS-1-idx;   // position of the same pin on the other side
+      assign pin_data_out[idx] = pin_data_out_rev[MIRROR];
+      assign pin_data_in_rev[idx] = pin_data_in[MIRROR];
+      assign pin_dir_rev[idx] = pin_dir[MIRROR];
+   end
+endgenerate
+
+io_filter #(
+   .DATA_WIDTH(DATA_WIDTH),
+   .IO_PINS(IO_PINS)
+) io_filter_inst (
+   .clk(clk),
+   .rst_n(rst_n),
+   .pin_data_out(pin_data_out_rev),
+   .pin_data_in(pin_data_in_rev),
+   .pin_dir(pin_dir_rev),
+   .port_data_out(port_data_out),
+   .port_data_in(port_data_in),
+   .port_active_out(port_active_out),
+   .port_active_in(port_active_in)
+);
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/rtl/io_pads.v b/verilog/rtl/io_pads.v
new file mode 100644
index 0000000..4089b74
--- /dev/null
+++ b/verilog/rtl/io_pads.v
@@ -0,0 +1,102 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+/*
+Connection to Caravel IO pads & logic analyzer
+
+IO_PINS = logical pins accessible for the program running on the cpu cores
+IO_PADS = pads made available by Caravel for user projects (maps to MPRJ_IO_PADS)
+LOGIC_PROBES = logic analyzer probes
+FIRST_PAD = map pin 0 to pad FIRST_PAD, pin 1 to pad FIRST_PAD+1 etc.
+*/
+
+module io_pads #(parameter IO_PINS=16, IO_PADS=38, LOGIC_PROBES=128, FIRST_PAD=12) (
+   // Caravel interface
+   input wb_clk_i,                          // wishbone clock
+   input wb_rst_i,                          // wishbone reset, active high
+   input [LOGIC_PROBES-1:0] la_data_in,     // values written by the logic analyzer
+   output [LOGIC_PROBES-1:0] la_data_out,   // values read back by the logic analyzer
+   input [LOGIC_PROBES-1:0] la_oenb,        // per probe: 0 = logic analyzer overrides the signal
+   input [IO_PADS-1:0] io_in,               // pad input values
+   output [IO_PADS-1:0] io_out,             // pad output values
+   output [IO_PADS-1:0] io_oeb,             // pad direction, 1 = input
+   // MCU interface
+   output clk,
+   output rst_hard_n,                       // follows wb_rst_i only
+   output rst_soft_n,                       // also asserted while in programming mode
+   output rst_prng_n,                       // follows wb_rst_i only
+   // IO filter interface
+   output [IO_PINS-1:0] pin_dir,
+   output [IO_PINS-1:0] pin_data_in,
+   input [IO_PINS-1:0] pin_data_out,
+   // Wishbone multiplexer interface
+   input cfg_we,                            // write strobe
+   input cfg_addr,                          // 0 = programming flag, 1 = pin directions
+   input [IO_PINS-1:0] cfg_wdata            // write data, only bit 0 is used for the programming flag
+);
+
+reg programming;
+reg [IO_PINS-1:0] saved_dir;
+
+// allow logic analyzer probes to override clock & reset signals
+assign clk = la_oenb[0] ? wb_clk_i : la_data_in[0];
+assign rst_hard_n = la_oenb[1] ? !wb_rst_i : la_data_in[1];
+assign rst_soft_n = la_oenb[2] ? (!wb_rst_i & !programming) : la_data_in[2];
+assign rst_prng_n = la_oenb[3] ? !wb_rst_i : la_data_in[3];
+
+localparam LA_DIR = 4;                 // index of logic analyzer probes for pin directions
+localparam LA_PIN = LA_DIR + IO_PINS;  // index of logic analyzer probes for pin values
+localparam LA_PAD = LA_PIN + IO_PINS;  // index of logic analyzer probes for pad values
+localparam LA_END = LA_PAD + IO_PADS;  // index of first unused logic analyzer probe
+localparam LA_REM = LOGIC_PROBES - LA_END;  // unused logic analyzer probes
+
+localparam PAD_REM = IO_PADS - IO_PINS - FIRST_PAD;   // unused pads remaining after the last io pin
+
+// while programming, all pins are inputs, otherwise they follow the saved_dir array
+// the logic analyzer can override everything
+assign pin_dir = (la_oenb[LA_DIR +: IO_PINS] & (rst_soft_n ? saved_dir : 0)) |
+                 (~la_oenb[LA_DIR +: IO_PINS] & la_data_in[LA_DIR +: IO_PINS]);
+
+// pin values are read from corresponding pads as long as the pin direction is set to input
+assign pin_data_in = (la_oenb[LA_PIN +: IO_PINS] & ~pin_dir & io_in[FIRST_PAD +: IO_PINS]) |
+                     (~la_oenb[LA_PIN +: IO_PINS] & la_data_in[LA_PIN +: IO_PINS]);
+
+// configure pad directions according to pin directions, pads not matched to pins are marked as inputs
+assign io_oeb = (la_oenb[LA_PAD +: IO_PADS] & {{(PAD_REM){1'b1}}, ~pin_dir, {(FIRST_PAD){1'b1}}}) |
+                (~la_oenb[LA_PAD +: IO_PADS] & {(IO_PADS){1'b0}});
+
+// pin values are written to corresponding pads, zeroes are written to unassigned pads (they are inputs anyway)
+assign io_out = (la_oenb[LA_PAD +: IO_PADS] & {{(PAD_REM){1'b0}}, pin_dir & pin_data_out, {(FIRST_PAD){1'b0}}}) |
+                (~la_oenb[LA_PAD +: IO_PADS] & la_data_in[LA_PAD +: IO_PADS]);
+
+// logic analyzer probes can also read back the same signals and values
+assign la_data_out[0] = clk;
+assign la_data_out[1] = rst_hard_n;
+assign la_data_out[2] = rst_soft_n;
+assign la_data_out[3] = rst_prng_n;
+assign la_data_out[LA_DIR +: IO_PINS] = pin_dir;
+assign la_data_out[LA_PIN +: IO_PINS] = pin_data_out;
+assign la_data_out[LA_PAD +: IO_PADS] = io_in;
+assign la_data_out[LA_END +: LA_REM] = {(LA_REM){1'b0}};
+
+// change programming mode & pin directions from the wishbone multiplexer
+always @(posedge clk) begin
+   if (!rst_hard_n) begin
+      programming <= 0;
+      saved_dir <= {(IO_PINS){1'b0}};
+   end else begin
+      if (cfg_we) begin
+         case (cfg_addr)
+            0: programming <= cfg_wdata[0];   // explicit bit select, programming is a single bit
+            1: saved_dir <= cfg_wdata;
+         endcase
+      end
+   end
+end
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/rtl/mcu.v b/verilog/rtl/mcu.v
new file mode 100644
index 0000000..f488b00
--- /dev/null
+++ b/verilog/rtl/mcu.v
@@ -0,0 +1,402 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+/*
+Microcontroller unit
+
+Combines the cpu cores with their corresponding instruction memories and prng's,
+the memory mesh, io filter and programming multiplexer into a single package
+
+       ||||||                                              |||   |||
+  +--------------+                                       +-----------+
+  |              |=======================================| pads & la |
+  |              |                   +--------------+    +-----------+
+  |    wb mux    |===================| entropy pool |=+       |||
+  |              |   +-----------+   +--------------+ |  +-----------+
+  |              |===| debug mux |                    |  | io filter |
+  +--------------+   +-----------+           +------+ |  +-----------+
+    ||||                  ||| +----------+ +=| prng |=+       |||
+  +------+  +-----------+ ||+=| cpu core |=+ +------+ |  +-----------+
+  |      |==| instr mem |=====|   w/alu  |===============|           |
+  |      |  +-----------+ ||  +----------+   +------+ |  |           |
+  |      |                ||  +----------+ +=| prng |=+  |           |
+  | prog |  +-----------+ |+==| cpu core |=+ +------+ |  |           |
+  |  mux |==| instr mem |=====|   w/alu  |===============| mem mesh  |
+  |      |  +-----------+ |   +----------+   +------+ |  |           |
+  |      |                |   +----------+ +=| prng |=+  |           |
+  |      |  +-----------+ +===| cpu core |=+ +------+    |           |
+  |      |==| instr mem |=====|   w/alu  |===============|           |
+  +------+  +-----------+     +----------+               +-----------+
+
+*/
+
+module mcu #(parameter
+   CORES = 8,                             // number of cpu cores
+   LOG_CORES = 3,                         // clog2(CORES)
+   MEM_DEPTH = 256,                       // number of memory mesh cells per cpu core
+   DATA_WIDTH = 16,                       // machine word size
+   PC_WIDTH = 8,                          // program counter size, should be at least clog2(INSTR_DEPTH)+clog2(CORES)
+   ADDR_WIDTH = 8,                        // memory mesh address width, should be at least clog2(MEM_DEPTH)
+   INSTR_WIDTH = 32,                      // opcode width including args, should be fixed at 32 or opcode handling needs to be changed
+   INSTR_DEPTH = 32,                      // minimum number of instructions in program memory (some cores will have a multiple of it)
+   IO_PINS = 16,                          // number of io pins usable by code on cpu cores
+   IO_PADS = 38,                          // number of caravel io pads
+   FIRST_PAD = 12,                        // map io pin 0 to caravel io pad FIRST_PAD
+   LOGIC_PROBES = 128,                    // number of caravel logic analyzer probes
+   WB_WIDTH = 32                          // wishbone bus width, fixed to 32
+)(
+   input wb_clk_i,                        // wishbone clock
+   input wb_rst_i,                        // wb reset, active high
+   input wb_stb_i,                        // wb strobe
+   input wb_cyc_i,                        // wb cycle
+   input wb_we_i,                         // wb write enable
+   input [WB_WIDTH-1:0] wb_adr_i,         // wb address
+   input [WB_WIDTH-1:0] wb_dat_i,         // wb input data
+   output wbs_ack_o,                      // wb acknowledge
+   output [WB_WIDTH-1:0] wbs_dat_o,       // wb output data
+   input [LOGIC_PROBES-1:0] la_data_in,   // logic analyzer probes input
+   output [LOGIC_PROBES-1:0] la_data_out, // la probes output
+   input [LOGIC_PROBES-1:0] la_oenb,      // la probes direction, 0=input (write by la), 1=output (read by la)
+   input [IO_PADS-1:0] io_in,             // io pads input
+   output [IO_PADS-1:0] io_out,           // io pads output
+   output [IO_PADS-1:0] io_oeb            // io pads direction, 0=output (write by mcu), 1=input (read by mcu)
+);
+
+localparam SPREAD_LAYERS = LOG_CORES;                 // number of spread layers in the memory mesh
+localparam SPREAD_WIDTH = $clog2(2 + SPREAD_LAYERS);  // width of the per-core write spread field
+localparam MEM_IO_PORTS = 2 + IO_PINS;                // one io port per pin plus the two joined ports
+localparam MEM_IO_FIRST = MEM_DEPTH - MEM_IO_PORTS;   // io ports are mapped to the topmost memory cells
+
+// clock and reset signals, set by io_pads using wb_clk_i, wb_rst_i and logic probes
+wire clk;
+wire rst_hard_n;
+wire rst_soft_n;
+wire rst_prng_n;
+
+// between io pads and io filter
+wire [IO_PINS-1:0] pin_dir;                           // pads > iof
+wire [IO_PINS-1:0] pin_data_in;                       // pads > iof
+wire [IO_PINS-1:0] pin_data_out;                      // pads < iof
+
+// between cpu core and corresponding instruction memory
+wire [INSTR_WIDTH-1:0] opcode[CORES-1:0];             // cpu < im
+wire [PC_WIDTH-1:0] progctr[CORES-1:0];               // cpu > im
+
+// between cpu core and memory mesh (unpacked versions for cpu cores)
+wire [DATA_WIDTH-1:0] mem_rdata[CORES-1:0];           // cpu < mesh
+wire mem_we[CORES-1:0];                               // cpu > mesh
+wire [ADDR_WIDTH-1:0] mem_waddr[CORES-1:0];           // cpu > mesh
+wire [SPREAD_WIDTH-1:0] mem_wspread[CORES-1:0];       // cpu > mesh
+wire [DATA_WIDTH-1:0] mem_wdata[CORES-1:0];           // cpu > mesh
+wire [ADDR_WIDTH-1:0] mem_raddr[CORES-1:0];           // cpu > mesh
+
+// between cpu core and memory mesh (packed versions for memory mesh)
+wire [CORES*DATA_WIDTH-1:0] mem_rdata_raw;            // cpu < mesh
+wire [CORES-1:0] mem_we_raw;                          // cpu > mesh
+wire [CORES*ADDR_WIDTH-1:0] mem_waddr_raw;            // cpu > mesh
+wire [CORES*SPREAD_WIDTH-1:0] mem_wspread_raw;        // cpu > mesh
+wire [CORES*DATA_WIDTH-1:0] mem_wdata_raw;            // cpu > mesh
+wire [CORES*ADDR_WIDTH-1:0] mem_raddr_raw;            // cpu > mesh
+
+// between cpu core and corresponding prng
+wire [DATA_WIDTH-1:0] prng_random[CORES-1:0];         // cpu < prng
+
+// between instruction memory and programming multiplexer (unpacked versions for instruction memory)
+wire im_we[CORES-1:0];                                // im < pmux
+wire [PC_WIDTH-1:0] im_waddr[CORES-1:0];              // im < pmux
+wire [INSTR_WIDTH-1:0] im_wdata[CORES-1:0];           // im < pmux
+
+// between instruction memory and programming multiplexer (packed versions for programming multiplexer)
+wire [CORES-1:0] im_we_raw;                           // im < pmux
+wire [CORES*PC_WIDTH-1:0] im_waddr_raw;               // im < pmux
+wire [CORES*INSTR_WIDTH-1:0] im_wdata_raw;            // im < pmux
+
+// between memory mesh and io filter
+wire [MEM_IO_PORTS-1:0] mem_io_active_in;             // mesh < iof
+wire [MEM_IO_PORTS-1:0] mem_io_active_out;            // mesh > iof
+wire [MEM_IO_PORTS*DATA_WIDTH-1:0] mem_io_data_in;    // mesh < iof
+wire [MEM_IO_PORTS*DATA_WIDTH-1:0] mem_io_data_out;   // mesh > iof
+
+// between debugging multiplexer and cpu core (unpacked versions for cpu core)
+wire [1:0] debug_cpu_mode[CORES-1:0];                 // dmux > cpu
+wire [3:0] debug_reg_sel[CORES-1:0];                  // dmux > cpu
+wire debug_reg_we[CORES-1:0];                         // dmux > cpu
+wire [DATA_WIDTH-1:0] debug_reg_wdata[CORES-1:0];     // dmux > cpu
+wire debug_reg_stopped[CORES-1:0];                    // dmux < cpu
+wire [DATA_WIDTH-1:0] debug_reg_rdata[CORES-1:0];     // dmux < cpu
+
+// between debugging multiplexer and cpu core (packed versions for debugging multiplexer)
+wire [CORES*2-1:0] debug_cpu_mode_raw;                // dmux > cpu
+wire [CORES*4-1:0] debug_reg_sel_raw;                 // dmux > cpu
+wire [CORES-1:0] debug_reg_we_raw;                    // dmux > cpu
+wire [CORES*DATA_WIDTH-1:0] debug_reg_wdata_raw;      // dmux > cpu
+wire [CORES-1:0] debug_reg_stopped_raw;               // dmux < cpu
+wire [CORES*DATA_WIDTH-1:0] debug_reg_rdata_raw;      // dmux < cpu
+
+// between wishbone multiplexer and programming multiplexer
+wire prog_we;                                         // wbmux > pmux
+wire [LOG_CORES-1:0] prog_sel;                        // wbmux > pmux
+wire [PC_WIDTH-1:0] prog_waddr;                       // wbmux > pmux
+wire [INSTR_WIDTH-1:0] prog_wdata;                    // wbmux > pmux
+
+// between wishbone multiplexer and io pads
+wire pads_we;                                         // wbmux > pads
+wire pads_waddr;                                      // wbmux > pads
+wire [IO_PINS-1:0] pads_wdata;                        // wbmux > pads
+
+// between wishbone multiplexer and debugging multiplexer
+wire [LOG_CORES-1:0] debug_sel;                       // wbmux > dmux
+wire [4:0] debug_addr;                                // wbmux > dmux
+wire debug_we;                                        // wbmux > dmux
+wire [DATA_WIDTH-1:0] debug_wdata;                    // wbmux > dmux
+wire [DATA_WIDTH-1:0] debug_rdata;                    // wbmux < dmux
+
+// between wishbone multiplexer and entropy pool
+wire [WB_WIDTH-1:0] entropy_word;                     // wbmux > ep
+
+// between entropy pool and prng's
+wire entropy_bit;                                     // ep > prng
+
+// repeat for each cpu core
+generate genvar core;
+for(core=0; core<CORES; core=core+1) begin:g_core
+
+   // add the cpu core itself
+   cpu_core #(
+      .DATA_WIDTH(DATA_WIDTH),
+      .PC_WIDTH(PC_WIDTH),
+      .ADDR_WIDTH(ADDR_WIDTH),
+      .SPREAD_WIDTH(SPREAD_WIDTH),
+      .INSTR_WIDTH(INSTR_WIDTH),
+      .CPU_NUM(core)
+   ) cpu_core_inst (
+      .clk(clk),
+      .rst_n(rst_soft_n),                 // soft reset: cores are also held in reset while programming
+      .opcode(opcode[core]),
+      .mem_rdata(mem_rdata[core]),
+      .prng_in(prng_random[core]),
+      .debug_mode(debug_cpu_mode[core]),
+      .debug_sel(debug_reg_sel[core]),
+      .debug_we(debug_reg_we[core]),
+      .debug_wdata(debug_reg_wdata[core]),
+      .progctr(progctr[core]),
+      .mem_we(mem_we[core]),
+      .mem_waddr(mem_waddr[core]),
+      .mem_wspread(mem_wspread[core]),
+      .mem_wdata(mem_wdata[core]),
+      .mem_raddr(mem_raddr[core]),
+      .debug_stopped(debug_reg_stopped[core]),
+      .debug_rdata(debug_reg_rdata[core])
+   );
+
+   // add corresponding instruction memory
+   localparam CORES_RNDUP = 1 << LOG_CORES;
+   localparam DEPTH_MULT = (core + CORES_RNDUP) & ~(core + CORES_RNDUP-1);
+   // e.g. for 8 cores, depths are multiplied by 8, 1, 2, 1, 4, 1, 2, 1
+   // so that we have a few cores that accept longer programs but the total
+   // memory required is still kept reasonably low
+   instr_mem #(
+      .PC_WIDTH(PC_WIDTH),
+      .INSTR_WIDTH(INSTR_WIDTH),
+      .DEPTH(INSTR_DEPTH * DEPTH_MULT)
+   ) instr_mem_inst (
+      .clk(clk),
+      .rst_n(rst_hard_n),                 // hard reset only: programming mode does not clear the program
+      .raddr(progctr[core]),
+      .rdata(opcode[core]),
+      .we(im_we[core]),
+      .waddr(im_waddr[core]),
+      .wdata(im_wdata[core])
+   );
+
+   // add its own pseudorandom number generator
+   prng_wrap #(
+      .INDEX(core),
+      .OUTPUT_BITS(DATA_WIDTH)
+   ) prng_inst (
+      .clk(clk),
+      .rst_n(rst_prng_n),                 // prng's have their own reset line, shared with the entropy pool
+      .entropy(entropy_bit),
+      .random(prng_random[core])
+   );
+
+   // convert memory mesh inputs: unpacked to packed
+   assign mem_we_raw[core] = mem_we[core];
+   assign mem_waddr_raw[core*ADDR_WIDTH +: ADDR_WIDTH] = mem_waddr[core];
+   assign mem_wspread_raw[core*SPREAD_WIDTH +: SPREAD_WIDTH] = mem_wspread[core];
+   assign mem_wdata_raw[core*DATA_WIDTH +: DATA_WIDTH] = mem_wdata[core];
+   assign mem_raddr_raw[core*ADDR_WIDTH +: ADDR_WIDTH] = mem_raddr[core];
+
+   // convert memory mesh outputs: packed to unpacked
+   assign mem_rdata[core] = mem_rdata_raw[core*DATA_WIDTH +: DATA_WIDTH];
+
+   // convert programming multiplexer outputs: packed to unpacked
+   assign im_we[core] = im_we_raw[core];
+   assign im_waddr[core] = im_waddr_raw[core*PC_WIDTH +: PC_WIDTH];
+   assign im_wdata[core] = im_wdata_raw[core*INSTR_WIDTH +: INSTR_WIDTH];
+
+   // convert debugging multiplexer inputs: unpacked to packed
+   assign debug_reg_stopped_raw[core] = debug_reg_stopped[core];
+   assign debug_reg_rdata_raw[core*DATA_WIDTH +: DATA_WIDTH] = debug_reg_rdata[core];
+
+   // convert debugging multiplexer outputs: packed to unpacked
+   assign debug_cpu_mode[core] = debug_cpu_mode_raw[core*2 +: 2];
+   assign debug_reg_sel[core] = debug_reg_sel_raw[core*4 +: 4];
+   assign debug_reg_we[core] = debug_reg_we_raw[core];
+   assign debug_reg_wdata[core] = debug_reg_wdata_raw[core*DATA_WIDTH +: DATA_WIDTH];
+
+end
+endgenerate
+
+// add the memory mesh, with a packed bus towards the cpu cores
+mem_mesh #(
+   .CORES(CORES),
+   .DEPTH(MEM_DEPTH),
+   .DATA_WIDTH(DATA_WIDTH),
+   .ADDR_WIDTH(ADDR_WIDTH),
+   .SPREAD_LAYERS(SPREAD_LAYERS),
+   .SPREAD_WIDTH(SPREAD_WIDTH),
+   .USE_IO(1),                            // enable the io port logic of the mesh
+   .IO_PORTS(MEM_IO_PORTS),
+   .IO_FIRST(MEM_IO_FIRST)
+) mem_mesh_inst (
+   .clk(clk),
+   .rst_n(rst_soft_n),
+   .we(mem_we_raw),
+   .waddr(mem_waddr_raw),
+   .wspread(mem_wspread_raw),
+   .wdata(mem_wdata_raw),
+   .raddr(mem_raddr_raw),
+   .rdata(mem_rdata_raw),
+   .io_active_in(mem_io_active_in),
+   .io_active_out(mem_io_active_out),
+   .io_data_in(mem_io_data_in),
+   .io_data_out(mem_io_data_out)
+);
+
+// add the io filter connected to the memory mesh
+io_filter_rev #(
+   .IO_PINS(IO_PINS),
+   .DATA_WIDTH(DATA_WIDTH)
+) io_filter_inst (
+   .clk(clk),
+   .rst_n(rst_soft_n),
+   .pin_dir(pin_dir),
+   .pin_data_in(pin_data_in),
+   .pin_data_out(pin_data_out),
+   .port_active_in(mem_io_active_in),
+   .port_active_out(mem_io_active_out),
+   .port_data_in(mem_io_data_in),
+   .port_data_out(mem_io_data_out)
+);
+
+// add the programming multiplexer, with a packed bus towards instruction memories
+prog_mux #(
+   .CORES(CORES),
+   .LOG_CORES(LOG_CORES),
+   .PC_WIDTH(PC_WIDTH),
+   .INSTR_WIDTH(INSTR_WIDTH)
+) prog_mux_inst (
+   .we(prog_we),
+   .sel(prog_sel),
+   .waddr(prog_waddr),
+   .wdata(prog_wdata),
+   .cwe(im_we_raw),
+   .cwaddr(im_waddr_raw),
+   .cwdata(im_wdata_raw)
+);
+
+// add the debugging multiplexer, with a packed bus towards cpu cores
+debug_mux #(
+   .CORES(CORES),
+   .LOG_CORES(LOG_CORES),
+   .DATA_WIDTH(DATA_WIDTH)
+) debug_mux_inst (
+   .sel(debug_sel),
+   .addr(debug_addr),
+   .we(debug_we),
+   .wdata(debug_wdata),
+   .rdata(debug_rdata),
+   .reg_stopped(debug_reg_stopped_raw),
+   .reg_rdata(debug_reg_rdata_raw),
+   .cpu_mode(debug_cpu_mode_raw),
+   .reg_sel(debug_reg_sel_raw),
+   .reg_we(debug_reg_we_raw),
+   .reg_wdata(debug_reg_wdata_raw)
+);
+
+// add the entropy pool
+entropy_pool #(
+   .WIDTH(WB_WIDTH)
+) entropy_pool_inst (
+   .clk(clk),
+   .rst_n(rst_prng_n),
+   .e_word(entropy_word),
+   .e_bit(entropy_bit)
+);
+
+// add the wishbone multiplexer
+wb_mux #(
+   .LOG_CORES(LOG_CORES),
+   .PC_WIDTH(PC_WIDTH),
+   .INSTR_WIDTH(INSTR_WIDTH),
+   .DATA_WIDTH(DATA_WIDTH),
+   .IO_PINS(IO_PINS),
+   .WB_WIDTH(WB_WIDTH)
+) wb_mux_inst (
+   .wb_stb_i(wb_stb_i),
+   .wb_cyc_i(wb_cyc_i),
+   .wb_we_i(wb_we_i),
+   .wb_adr_i(wb_adr_i),
+   .wb_dat_i(wb_dat_i),
+   .wbs_ack_o(wbs_ack_o),
+   .wbs_dat_o(wbs_dat_o),
+   .prog_we(prog_we),
+   .prog_sel(prog_sel),
+   .prog_waddr(prog_waddr),
+   .prog_wdata(prog_wdata),
+   .pads_we(pads_we),
+   .pads_waddr(pads_waddr),
+   .pads_wdata(pads_wdata),
+   .debug_sel(debug_sel),
+   .debug_addr(debug_addr),
+   .debug_we(debug_we),
+   .debug_wdata(debug_wdata),
+   .debug_rdata(debug_rdata),
+   .entropy_word(entropy_word)
+);
+
+// add the io pads & logic analyzer probes
+// (this includes some reset & clock logic as well)
+io_pads #(
+   .IO_PINS(IO_PINS),
+   .IO_PADS(IO_PADS),
+   .LOGIC_PROBES(LOGIC_PROBES),
+   .FIRST_PAD(FIRST_PAD)
+) io_pads_inst (
+   .wb_clk_i(wb_clk_i),
+   .wb_rst_i(wb_rst_i),
+   .la_data_in(la_data_in),
+   .la_data_out(la_data_out),
+   .la_oenb(la_oenb),
+   .io_in(io_in),
+   .io_out(io_out),
+   .io_oeb(io_oeb),
+   .clk(clk),
+   .rst_hard_n(rst_hard_n),
+   .rst_soft_n(rst_soft_n),
+   .rst_prng_n(rst_prng_n),
+   .pin_dir(pin_dir),
+   .pin_data_in(pin_data_in),
+   .pin_data_out(pin_data_out),
+   .cfg_we(pads_we),
+   .cfg_addr(pads_waddr),
+   .cfg_wdata(pads_wdata)
+);
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/rtl/mem_mesh.v b/verilog/rtl/mem_mesh.v
new file mode 100644
index 0000000..0afa424
--- /dev/null
+++ b/verilog/rtl/mem_mesh.v
@@ -0,0 +1,166 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+/*
+Generates a DFF RAM block for each core with a tree-like interconnect mesh between them
+
+Parameters:
+CORES = number of cpu cores, also specifies the number of ram blocks
+DEPTH = number of words per ram block
+DATA_WIDTH = word size, number of bits per memory cell
+ADDR_WIDTH = address bus width, should be clog2(DEPTH)
+SPREAD_LAYERS = number of spread layers, should be clog2(CORES)
+SPREAD_WIDTH = spread bus width, should be clog2(2+SPREAD_LAYERS)
+IO_PORTS = number of io ports, should be <= DEPTH
+IO_FIRST = memory cell mapped to the first io port, should be <= DEPTH - IO_PORTS
+
+A value of wspread > 0 on write operations specifies that the same address should also be written in some
+other memory blocks. In particular, blocks whose number only differ in the lowest wspread bits are affected.
+If several simultaneous write operations affect the same memory cell, writes with higher wspread have
+priority. For writes having equal wspread the core with the lowest number wins.
+
+If USE_IO is set and addresses in [IO_FIRST, IO_FIRST+IO_PORTS) are written with wspread > SPREAD_LAYERS, wdata is also sent to the io bus.
+Incoming data on the io bus is written to the respective cells with maximal spread (affecting all cores).
+*/
+
+module mem_mesh #(parameter CORES=8, DEPTH=256, DATA_WIDTH=16, ADDR_WIDTH=8, SPREAD_LAYERS=3, SPREAD_WIDTH=3, USE_IO=1, IO_PORTS=16, IO_FIRST=240) (
+   input clk,                                   // clock signal
+   input rst_n,                                 // reset, active low
+   input [CORES-1:0] we,                        // write enable
+   input [CORES*ADDR_WIDTH-1:0] waddr,          // write address
+   input [CORES*SPREAD_WIDTH-1:0] wspread,      // write spread
+   input [CORES*DATA_WIDTH-1:0] wdata,          // write data
+   input [CORES*ADDR_WIDTH-1:0] raddr,          // read address
+   output [CORES*DATA_WIDTH-1:0] rdata,         // read data
+   input [IO_PORTS-1:0] io_active_in,           // is receiving data on io bus
+   output [IO_PORTS-1:0] io_active_out,         // is sending data on io bus
+   input [IO_PORTS*DATA_WIDTH-1:0] io_data_in,  // io bus input
+   output [IO_PORTS*DATA_WIDTH-1:0] io_data_out // io bus output
+);
+
+reg [DATA_WIDTH-1:0] mem[CORES-1:0][DEPTH-1:0];       // memory cells
+wire presel[CORES-1:0][DEPTH-1:0];                    // is address selected before spreading
+wire uspread[CORES-1:0][SPREAD_LAYERS+1-1:0];         // is spreading to layer
+wire postsel[CORES-1:0][DEPTH-1:0];                   // is address selected after spreading
+wire [DATA_WIDTH-1:0] postdata[CORES-1:0][DEPTH-1:0]; // data to be written after spreading
+
+generate genvar core, addr, layer, group, spl;
+
+// convert spread to unary
+for (core=0; core<CORES; core=core+1) begin:g_core
+   for(layer=0; layer<=SPREAD_LAYERS; layer=layer+1) begin:g_layer
+      assign uspread[core][layer] = we[core] & wspread[core*SPREAD_WIDTH +: SPREAD_WIDTH] > layer;   // set when the core writes with wspread > layer
+   end
+end
+
+for (addr=0; addr<DEPTH; addr=addr+1) begin:g_cell
+
+   // convert write address to one-hot encoding
+   for (core=0; core<CORES; core=core+1) begin:g_core_m
+      assign presel[core][addr] = we[core] & (waddr[core*ADDR_WIDTH +: ADDR_WIDTH] == addr);
+   end
+
+   // calculate spreading from individual cores to groups of cores
+   for (layer=0; layer<=SPREAD_LAYERS; layer=layer+1) begin:spread
+      localparam GROUPS = CORES >> layer;                     // each group at this layer merges two groups of the layer below
+      wire gsel[GROUPS-1:0];                                  // group has a write to this cell that reaches this layer
+      wire [DATA_WIDTH-1:0] gdata[GROUPS-1:0];                // data of the write selected for the group
+      wire gspread[GROUPS-1:0][SPREAD_LAYERS+1-layer-1:0];    // remaining unary spread of the selected write, index 0 = spreads further up
+      if (layer == 0) begin:i_layerz
+         for (group=0; group<GROUPS; group=group+1) begin:g_group
+            assign gsel[group] = presel[group][addr];
+            assign gdata[group] = {(DATA_WIDTH){we[group]}} & wdata[group*DATA_WIDTH +: DATA_WIDTH];
+            for (spl=0; spl<=SPREAD_LAYERS; spl=spl+1) begin:cspread
+               assign gspread[group][spl] = uspread[group][spl];
+            end
+         end
+      end else begin:i_layernz
+         for (group=0; group<GROUPS; group=group+1) begin:g_group
+            wire gs1 = spread[layer-1].gsel[group*2] & spread[layer-1].gspread[group*2][0];
+            wire gs2 = spread[layer-1].gsel[group*2+1] & spread[layer-1].gspread[group*2+1][0];
+            wire [DATA_WIDTH-1:0] gd1 = spread[layer-1].gdata[group*2];
+            wire [DATA_WIDTH-1:0] gd2 = spread[layer-1].gdata[group*2+1];
+            assign gsel[group] = gs1 | gs2;
+            assign gdata[group] = gs1 ? gd1 : gd2;            // the lower-numbered half wins when both halves spread up
+            for (spl=0; spl<=SPREAD_LAYERS-layer; spl=spl+1) begin:g_spread
+               wire gsp1 = spread[layer-1].gspread[group*2][spl+1];
+               wire gsp2 = spread[layer-1].gspread[group*2+1][spl+1];
+               assign gspread[group][spl] = gs1 ? gsp1 : gsp2;
+            end
+         end
+      end
+   end
+
+   // mix in io logic at the highest spreading level
+   wire gs_i;
+   wire [DATA_WIDTH-1:0] gd_i;
+   if (USE_IO && IO_FIRST <= addr && addr < IO_FIRST + IO_PORTS) begin:i_io
+      localparam io = addr - IO_FIRST;                        // index of the io port mapped to this cell
+      wire gs_o = spread[SPREAD_LAYERS].gsel[0] & spread[SPREAD_LAYERS].gspread[0][0];
+      wire [DATA_WIDTH-1:0] gd_o = {(DATA_WIDTH){gs_o}} & spread[SPREAD_LAYERS].gdata[0];
+      assign io_active_out[io] = gs_o;
+      assign io_data_out[io*DATA_WIDTH +: DATA_WIDTH] = gd_o;
+      assign gs_i = io_active_in[io] ? 1'b1 : spread[SPREAD_LAYERS].gsel[0];   // incoming io data takes precedence over writes by the cores
+      assign gd_i = io_active_in[io] ? io_data_in[io*DATA_WIDTH +: DATA_WIDTH] : spread[SPREAD_LAYERS].gdata[0];
+   end else begin:i_nio
+      assign gs_i = spread[SPREAD_LAYERS].gsel[0];
+      assign gd_i = spread[SPREAD_LAYERS].gdata[0];
+   end
+
+   // calculate spreading back from groups of cores to individual cores
+   for (layer=SPREAD_LAYERS; layer>=0; layer=layer-1) begin:collect
+      localparam GROUPS = CORES >> layer;
+      wire pgsel[GROUPS-1:0];                                 // group receives a write to this cell
+      wire [DATA_WIDTH-1:0] pgdata[GROUPS-1:0];               // data received by the group
+      if (layer == SPREAD_LAYERS) begin:i_layerl
+         assign pgsel[0] = gs_i;
+         assign pgdata[0] = gd_i;
+         for (group=1; group<GROUPS; group=group+1) begin:g_group
+            assign pgsel[group] = spread[layer].gsel[group];
+            assign pgdata[group] = spread[layer].gdata[group];
+         end
+      end else begin:i_layernl
+         for (group=0; group<GROUPS; group=group+1) begin:g_group
+            wire gs = spread[layer].gsel[group];
+            wire [DATA_WIDTH-1:0] gd = spread[layer].gdata[group];
+            wire cgs = collect[layer+1].pgsel[group/2];
+            wire [DATA_WIDTH-1:0] cgd = collect[layer+1].pgdata[group/2];
+            assign pgsel[group] = cgs | gs;
+            assign pgdata[group] = cgs ? cgd : gd;            // a write coming down from the layer above overrides the group's own
+         end
+      end
+   end
+   for (core=0; core<CORES; core=core+1) begin:g_core_c
+      assign postsel[core][addr] = collect[0].pgsel[core];
+      assign postdata[core][addr] = collect[0].pgdata[core];
+   end
+
+   // sequential write logic
+   for (core=0; core<CORES; core=core+1) begin:g_core_w
+      always @(posedge clk) begin
+         if (!rst_n) begin
+            mem[core][addr] <= 0;
+         end else begin
+            if (postsel[core][addr]) begin
+               mem[core][addr] <= postdata[core][addr];
+            end
+         end
+      end
+   end
+
+end
+
+// read logic
+for (core=0; core<CORES; core=core+1) begin:g_core_r
+   wire [ADDR_WIDTH-1:0] craddr = raddr[core*ADDR_WIDTH +: ADDR_WIDTH];
+   assign rdata[core*DATA_WIDTH +: DATA_WIDTH] = mem[core][craddr];   // unclocked read from the core's own block
+end
+
+endgenerate
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/rtl/pin_compress.v b/verilog/rtl/pin_compress.v
new file mode 100644
index 0000000..b508527
--- /dev/null
+++ b/verilog/rtl/pin_compress.v
@@ -0,0 +1,38 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+/*
+Fully combinatorial circuit shifting input bits from the mask bit positions
+
+E.g.
+data   = 1001110100110101
+mask   = 0100100101000101
+          0  1  1 0   1 1
+result = 0000000000011011
+*/
+
+module pin_compress #(parameter WIDTH=16) (
+   input [WIDTH-1:0] data,      // source bits
+   input [WIDTH-1:0] mask,      // 1 = take the data bit at this position
+   output [WIDTH-1:0] result    // selected bits packed at the low end, original order kept
+);
+
+generate genvar layer;
+   for (layer=0; layer<WIDTH; layer=layer+1) begin:comp   // layer k handles bit WIDTH-1-k (MSB first)
+      wire [WIDTH-1:0] sd;   // bits collected so far, newest at bit 0
+      if (layer == 0) begin:i_first
+         assign sd = {{(WIDTH-1){1'b0}}, data[WIDTH-1] & mask[WIDTH-1]};   // seed with the MSB if it is selected
+      end else begin:i_nfirst
+         wire [WIDTH-1:0] sdp = comp[layer-1].sd;   // output of the previous layer
+         assign sd = mask[WIDTH-1-layer] ? {sdp[WIDTH-2:0], data[WIDTH-1-layer]} : sdp;   // selected: shift in at bit 0; else pass through
+      end
+   end
+   assign result = comp[WIDTH-1].sd;   // last layer has seen every bit
+endgenerate
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/rtl/pin_decompress.v b/verilog/rtl/pin_decompress.v
new file mode 100644
index 0000000..bea24eb
--- /dev/null
+++ b/verilog/rtl/pin_decompress.v
@@ -0,0 +1,38 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+/*
+Fully combinatorial circuit shifting input bits to the mask bit positions
+
+E.g.
+data   = 0000000000001011
+mask   = 0101000101000101
+          0 0   1 0   1 1
+result = 0000000100000101
+*/
+
+module pin_decompress #(parameter WIDTH=16) (
+   input [WIDTH-1:0] data,      // packed bits, consumed starting from bit 0
+   input [WIDTH-1:0] mask,      // 1 = this result position receives the next data bit
+   output [WIDTH-1:0] result    // data bits spread to the mask positions, 0 elsewhere
+);
+
+generate genvar layer;
+   for (layer=0; layer<WIDTH; layer=layer+1) begin:decomp   // layer k drives result bit k (LSB first)
+      wire [WIDTH-1:0] sd;   // data bits not yet placed; sd[0] is the next one to place
+      if (layer == 0) begin:i_first
+         assign sd = data;
+      end else begin:i_nfirst
+         wire [WIDTH-1:0] sdp = decomp[layer-1].sd;   // remaining bits as seen by the previous layer
+         assign sd = mask[layer-1] ? sdp >> 1 : sdp;   // drop the bit consumed by the previous position, if any
+      end
+      assign result[layer] = mask[layer] & sd[0];   // unmasked positions always output 0
+   end
+endgenerate
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/rtl/prng.v b/verilog/rtl/prng.v
new file mode 100644
index 0000000..30d647a
--- /dev/null
+++ b/verilog/rtl/prng.v
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+/*
+Pseudorandom number generator using a Fibonacci-style XNOR linear feedback shift register
+
+STATE_BITS = number of bits for prng state
+POLYNOMIAL = bit mask used for feedback, should be chosen so that the prng repeats itself after 2^STATE_BITS-1 cycles
+STATE_INIT = used to seed the prng on reset
+OUTPUT_BITS = number of bits shifted out every clock cycle
+*/
+
+module prng #(parameter STATE_BITS = 4, POLYNOMIAL = 4'b1001, STATE_INIT = 4'b0000, OUTPUT_BITS = 2) (
+   input clk,
+   input rst_n,      // synchronous reset, active low
+   input entropy,    // optional external entropy for more randomness
+   output [OUTPUT_BITS-1:0] random
+);
+
+localparam SCRAMBLE_CYCLES = STATE_BITS;   // number of LFSR steps applied to STATE_INIT to form the reset value
+reg [STATE_BITS-1:0] state;                // current LFSR state
+
+generate genvar shift;
+
+// shift register for generating next OUTPUT_BITS states
+for (shift=0; shift<OUTPUT_BITS; shift=shift+1) begin:g_shift
+   wire [STATE_BITS-1:0] prev_state;
+   wire feedback;
+   if (shift == 0) begin:i_first
+      assign prev_state = state;
+      assign feedback = ^(prev_state & POLYNOMIAL) ^ entropy;   // entropy is mixed into the first step only
+   end else begin:i_nfirst
+      assign prev_state = g_shift[shift-1].new_state;
+      assign feedback = ^(prev_state & POLYNOMIAL);
+   end
+   wire [STATE_BITS-1:0] new_state = {prev_state[STATE_BITS-2:0], ~feedback};   // shift left, inverted parity enters at bit 0
+   assign random[OUTPUT_BITS-shift-1] = prev_state[STATE_BITS-1];   // step 0 supplies the MSB of random
+end
+wire [STATE_BITS-1:0] final_state = g_shift[OUTPUT_BITS-1].new_state;   // state after OUTPUT_BITS steps
+
+// reuse the same shift register to shift out a couple of bits in the beginning so that
+// we can use a very simple seed without affecting the quality of the first few cycles
+// (this happens at synth time, so it's practically free)
+for (shift=0; shift<SCRAMBLE_CYCLES; shift=shift+1) begin:g_scramble
+   wire [STATE_BITS-1:0] prev_state;
+   if (shift == 0) begin:i_first
+      assign prev_state = STATE_INIT;
+   end else begin:i_nfirst
+      assign prev_state = g_scramble[shift-1].new_state;
+   end
+   wire feedback = ^(prev_state & POLYNOMIAL);   // no entropy input here: depends on parameters only
+   wire [STATE_BITS-1:0] new_state = {prev_state[STATE_BITS-2:0], ~feedback};
+end
+wire [STATE_BITS-1:0] scrambled_init = g_scramble[SCRAMBLE_CYCLES-1].new_state;   // reset value of state
+
+endgenerate
+
+always @(posedge clk) begin
+   if (!rst_n) begin
+      state <= scrambled_init;   // reload the scrambled seed
+   end else begin
+      state <= final_state;      // advance OUTPUT_BITS steps per clock
+   end
+end
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/rtl/prng_wrap.v b/verilog/rtl/prng_wrap.v
new file mode 100644
index 0000000..9a635d3
--- /dev/null
+++ b/verilog/rtl/prng_wrap.v
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+/*
+Wrapper for prng with known good polynomials (having a cycle length of 2^32-1 and a minimal bit count)
+
+Different choices of 0 <= INDEX < 256 generate independent prng's. For even more, the table below should be extended.
+*/
+
+module prng_wrap #(parameter INDEX = 0, OUTPUT_BITS = 16) (
+   input clk,
+   input rst_n,
+   input entropy,
+   output [OUTPUT_BITS-1:0] random
+);
+
+localparam STATE_BITS = 32;         // width of each table entry and of the prng state
+localparam POLY_ARRAY_LEN = 256;    // number of entries in POLY_ARRAY (32 rows of 8)
+localparam POLY_ARRAY = {           // one packed vector; the first entry occupies the most significant STATE_BITS
+   32'h80000062, 32'h80000092, 32'h80000106, 32'h80000114, 32'h80000412, 32'h80000414, 32'h80000806, 32'h80000850,
+   32'h8000100C, 32'h80001050, 32'h80001C00, 32'h80002021, 32'h80002204, 32'h80002810, 32'h80004050, 32'h80004201,
+   32'h80008006, 32'h80008042, 32'h80008102, 32'h80008401, 32'h80008500, 32'h80009004, 32'h80010006, 32'h80010048,
+   32'h80010240, 32'h80014004, 32'h80014800, 32'h80020030, 32'h80020102, 32'h80020402, 32'h80022010, 32'h80022100,
+   32'h80030010, 32'h80040022, 32'h80040280, 32'h80042020, 32'h80043000, 32'h80050008, 32'h80060040, 32'h80061000,
+   32'h80080012, 32'h80080120, 32'h80094000, 32'h800A0010, 32'h80100048, 32'h80100820, 32'h801C0000, 32'h80200003,
+   32'h80200060, 32'h80200101, 32'h80202001, 32'h80210001, 32'h80400021, 32'h80401020, 32'h80420010, 32'h80422000,
+   32'h80508000, 32'h80800012, 32'h80801002, 32'h80810004, 32'h80840001, 32'h80900002, 32'h80A01000, 32'h81000021,
+   32'h81000050, 32'h810000C0, 32'h81000220, 32'h81001020, 32'h81003000, 32'h81004040, 32'h81010020, 32'h81010040,
+   32'h81204000, 32'h81400001, 32'h81400008, 32'h81800040, 32'h82000014, 32'h82000024, 32'h82000044, 32'h82000048,
+   32'h82000108, 32'h82000110, 32'h82000410, 32'h82004040, 32'h82010002, 32'h82021000, 32'h82040040, 32'h82040100,
+   32'h82080400, 32'h82200040, 32'h82400800, 32'h82800010, 32'h83000200, 32'h84000050, 32'h840000A0, 32'h84000401,
+   32'h84002100, 32'h84002800, 32'h84006000, 32'h84022000, 32'h840A0000, 32'h84100002, 32'h84100020, 32'h84400020,
+   32'h85000010, 32'h85000040, 32'h85010000, 32'h85040000, 32'h85080000, 32'h86000004, 32'h86002000, 32'h88000102,
+   32'h88000140, 32'h88001002, 32'h88005000, 32'h88020001, 32'h88400020, 32'h89000002, 32'h89000020, 32'h89000400,
+   32'h89004000, 32'h8A000004, 32'h8C000001, 32'h90000028, 32'h90000030, 32'h90004002, 32'h90004080, 32'h90014000,
+   32'h90048000, 32'h90220000, 32'h90800002, 32'h91000020, 32'h92000020, 32'h94000020, 32'h94100000, 32'h94400000,
+   32'h98040000, 32'hA0000048, 32'hA0000084, 32'hA0000410, 32'hA0000480, 32'hA0004020, 32'hA0008001, 32'hA0010004,
+   32'hA0040008, 32'hA0040080, 32'hA0102000, 32'hA0400008, 32'hA0402000, 32'hA0408000, 32'hA1008000, 32'hA2001000,
+   32'hA3000000, 32'hA4000080, 32'hA4000800, 32'hA4100000, 32'hA4800000, 32'hB0004000, 32'hB0008000, 32'hB0080000,
+   32'hB0400000, 32'hC0000005, 32'hC0000018, 32'hC0000140, 32'hC0001080, 32'hC0002008, 32'hC0004200, 32'hC0008002,
+   32'hC0020200, 32'hC0100010, 32'hC0108000, 32'hC0210000, 32'hC0400200, 32'hC2000040, 32'hC2000100, 32'hC2020000,
+   32'hD0000001, 32'hE0000200, 32'h80000057, 32'h8000007A, 32'h800000B9, 32'h800000BA, 32'h8000012D, 32'h8000014E,
+   32'h8000016C, 32'h800001A6, 32'h8000020F, 32'h800002CC, 32'h80000349, 32'h80000370, 32'h80000392, 32'h80000398,
+   32'h80000417, 32'h80000465, 32'h8000046A, 32'h80000478, 32'h800004D4, 32'h8000050B, 32'h80000526, 32'h8000054C,
+   32'h800005C1, 32'h8000060D, 32'h8000060E, 32'h80000629, 32'h80000638, 32'h80000662, 32'h800006B0, 32'h80000748,
+   32'h8000088D, 32'h800008E1, 32'h80000923, 32'h80000931, 32'h80000934, 32'h80000958, 32'h80000A25, 32'h80000A26,
+   32'h80000A54, 32'h80000A92, 32'h80000AC4, 32'h80000B28, 32'h80000B84, 32'h80000C34, 32'h80000C43, 32'h80000CA2,
+   32'h80000D22, 32'h80000D28, 32'h80000E24, 32'h8000100F, 32'h80001027, 32'h80001035, 32'h80001047, 32'h80001071,
+   32'h80001078, 32'h8000108E, 32'h800010C9, 32'h80001126, 32'h80001164, 32'h80001231, 32'h8000140E, 32'h80001485,
+   32'h80001491, 32'h80001560, 32'h80001614, 32'h80001624, 32'h80001684, 32'h80001702, 32'h80001813, 32'h80001851,
+   32'h80001870, 32'h800018C1, 32'h80001928, 32'h80001A06, 32'h80001A12, 32'h80001C50, 32'h80001C88, 32'h80002053
+};
+
+prng #(
+   .STATE_BITS(STATE_BITS),
+   .POLYNOMIAL(POLY_ARRAY[(POLY_ARRAY_LEN-1-(INDEX % POLY_ARRAY_LEN))*STATE_BITS +: STATE_BITS]),   // INDEX 0 picks the first listed entry; INDEX wraps modulo the table length
+   .STATE_INIT(INDEX),   // seed is the index itself; prng scrambles it before use as the reset state
+   .OUTPUT_BITS(OUTPUT_BITS)
+) prng_inst (
+   .clk(clk),
+   .rst_n(rst_n),
+   .entropy(entropy),
+   .random(random)
+);
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/rtl/prog_mux.v b/verilog/rtl/prog_mux.v
new file mode 100644
index 0000000..b358bca
--- /dev/null
+++ b/verilog/rtl/prog_mux.v
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+/*
+Fully combinatorial programming multiplexer
+*/
+
+module prog_mux #(parameter CORES=8, LOG_CORES=3, PC_WIDTH=8, INSTR_WIDTH=32) (
+   input we,                              // write enable for the selected core
+   input [LOG_CORES-1:0] sel,             // index of the core being written
+   input [PC_WIDTH-1:0] waddr,            // write address, shared by all cores
+   input [INSTR_WIDTH-1:0] wdata,         // write data, shared by all cores
+   output [CORES-1:0] cwe,                // per-core write enable, at most one bit set
+   output [CORES*PC_WIDTH-1:0] cwaddr,    // per-core write address, core k in slice k
+   output [CORES*INSTR_WIDTH-1:0] cwdata  // per-core write data, core k in slice k
+);
+
+generate genvar core;
+for (core=0; core<CORES; core=core+1) begin:g_core
+   wire active = we && sel==core;   // high only for the core addressed by sel while we is set
+   assign cwe[core] = active;
+   assign cwaddr[core*PC_WIDTH +: PC_WIDTH] = {(PC_WIDTH){active}} & waddr;         // forced to 0 for non-selected cores
+   assign cwdata[core*INSTR_WIDTH +: INSTR_WIDTH] = {(INSTR_WIDTH){active}} & wdata;   // forced to 0 for non-selected cores
+end
+endgenerate
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/rtl/uprj_netlists.v b/verilog/rtl/uprj_netlists.v
index 3537de8..b7ceb05 100644
--- a/verilog/rtl/uprj_netlists.v
+++ b/verilog/rtl/uprj_netlists.v
@@ -21,8 +21,11 @@
     // Assume default net type to be wire because GL netlists don't have the wire definitions
     `default_nettype wire
     `include "gl/user_project_wrapper.v"
-    `include "gl/user_proj_example.v"
+    `include "gl/user_project.v"
 `else
     `include "user_project_wrapper.v"
-    `include "user_proj_example.v"
-`endif
\ No newline at end of file
+    `include "user_project.v"
+`endif
+
+`default_nettype wire
+
diff --git a/verilog/rtl/user_proj_example.v b/verilog/rtl/user_proj_example.v
deleted file mode 100644
index 26081e9..0000000
--- a/verilog/rtl/user_proj_example.v
+++ /dev/null
@@ -1,165 +0,0 @@
-// SPDX-FileCopyrightText: 2020 Efabless Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-// SPDX-License-Identifier: Apache-2.0
-
-`default_nettype none
-/*
- *-------------------------------------------------------------
- *
- * user_proj_example
- *
- * This is an example of a (trivially simple) user project,
- * showing how the user project can connect to the logic
- * analyzer, the wishbone bus, and the I/O pads.
- *
- * This project generates an integer count, which is output
- * on the user area GPIO pads (digital output only).  The
- * wishbone connection allows the project to be controlled
- * (start and stop) from the management SoC program.
- *
- * See the testbenches in directory "mprj_counter" for the
- * example programs that drive this user project.  The three
- * testbenches are "io_ports", "la_test1", and "la_test2".
- *
- *-------------------------------------------------------------
- */
-
-module user_proj_example #(
-    parameter BITS = 32
-)(
-`ifdef USE_POWER_PINS
-    inout vccd1,	// User area 1 1.8V supply
-    inout vssd1,	// User area 1 digital ground
-`endif
-
-    // Wishbone Slave ports (WB MI A)
-    input wb_clk_i,
-    input wb_rst_i,
-    input wbs_stb_i,
-    input wbs_cyc_i,
-    input wbs_we_i,
-    input [3:0] wbs_sel_i,
-    input [31:0] wbs_dat_i,
-    input [31:0] wbs_adr_i,
-    output wbs_ack_o,
-    output [31:0] wbs_dat_o,
-
-    // Logic Analyzer Signals
-    input  [127:0] la_data_in,
-    output [127:0] la_data_out,
-    input  [127:0] la_oenb,
-
-    // IOs
-    input  [`MPRJ_IO_PADS-1:0] io_in,
-    output [`MPRJ_IO_PADS-1:0] io_out,
-    output [`MPRJ_IO_PADS-1:0] io_oeb,
-
-    // IRQ
-    output [2:0] irq
-);
-    wire clk;
-    wire rst;
-
-    wire [`MPRJ_IO_PADS-1:0] io_in;
-    wire [`MPRJ_IO_PADS-1:0] io_out;
-    wire [`MPRJ_IO_PADS-1:0] io_oeb;
-
-    wire [31:0] rdata; 
-    wire [31:0] wdata;
-    wire [BITS-1:0] count;
-
-    wire valid;
-    wire [3:0] wstrb;
-    wire [31:0] la_write;
-
-    // WB MI A
-    assign valid = wbs_cyc_i && wbs_stb_i; 
-    assign wstrb = wbs_sel_i & {4{wbs_we_i}};
-    assign wbs_dat_o = rdata;
-    assign wdata = wbs_dat_i;
-
-    // IO
-    assign io_out = count;
-    assign io_oeb = {(`MPRJ_IO_PADS-1){rst}};
-
-    // IRQ
-    assign irq = 3'b000;	// Unused
-
-    // LA
-    assign la_data_out = {{(127-BITS){1'b0}}, count};
-    // Assuming LA probes [63:32] are for controlling the count register  
-    assign la_write = ~la_oenb[63:32] & ~{BITS{valid}};
-    // Assuming LA probes [65:64] are for controlling the count clk & reset  
-    assign clk = (~la_oenb[64]) ? la_data_in[64]: wb_clk_i;
-    assign rst = (~la_oenb[65]) ? la_data_in[65]: wb_rst_i;
-
-    counter #(
-        .BITS(BITS)
-    ) counter(
-        .clk(clk),
-        .reset(rst),
-        .ready(wbs_ack_o),
-        .valid(valid),
-        .rdata(rdata),
-        .wdata(wbs_dat_i),
-        .wstrb(wstrb),
-        .la_write(la_write),
-        .la_input(la_data_in[63:32]),
-        .count(count)
-    );
-
-endmodule
-
-module counter #(
-    parameter BITS = 32
-)(
-    input clk,
-    input reset,
-    input valid,
-    input [3:0] wstrb,
-    input [BITS-1:0] wdata,
-    input [BITS-1:0] la_write,
-    input [BITS-1:0] la_input,
-    output ready,
-    output [BITS-1:0] rdata,
-    output [BITS-1:0] count
-);
-    reg ready;
-    reg [BITS-1:0] count;
-    reg [BITS-1:0] rdata;
-
-    always @(posedge clk) begin
-        if (reset) begin
-            count <= 0;
-            ready <= 0;
-        end else begin
-            ready <= 1'b0;
-            if (~|la_write) begin
-                count <= count + 1;
-            end
-            if (valid && !ready) begin
-                ready <= 1'b1;
-                rdata <= count;
-                if (wstrb[0]) count[7:0]   <= wdata[7:0];
-                if (wstrb[1]) count[15:8]  <= wdata[15:8];
-                if (wstrb[2]) count[23:16] <= wdata[23:16];
-                if (wstrb[3]) count[31:24] <= wdata[31:24];
-            end else if (|la_write) begin
-                count <= la_write & la_input;
-            end
-        end
-    end
-
-endmodule
-`default_nettype wire
diff --git a/verilog/rtl/user_project.v b/verilog/rtl/user_project.v
new file mode 100644
index 0000000..3b1b0ed
--- /dev/null
+++ b/verilog/rtl/user_project.v
@@ -0,0 +1,75 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+module user_project (   // top-level user design: forwards the platform ports to a single mcu instance
+`ifdef USE_POWER_PINS
+    inout vccd1,	// User area 1 1.8V supply
+    inout vssd1,	// User area 1 digital ground
+`endif
+
+    // Wishbone Slave ports (WB MI A)
+    input wb_clk_i,
+    input wb_rst_i,
+    input wbs_stb_i,
+    input wbs_cyc_i,
+    input wbs_we_i,
+    input [3:0] wbs_sel_i,    // byte selects, not forwarded to mcu
+    input [31:0] wbs_dat_i,
+    input [31:0] wbs_adr_i,
+    output wbs_ack_o,
+    output [31:0] wbs_dat_o,
+
+    // Logic Analyzer Signals
+    input  [127:0] la_data_in,
+    output [127:0] la_data_out,
+    input  [127:0] la_oenb,
+
+    // IOs
+    input  [`MPRJ_IO_PADS-1:0] io_in,
+    output [`MPRJ_IO_PADS-1:0] io_out,
+    output [`MPRJ_IO_PADS-1:0] io_oeb,
+
+    // IRQ
+    output [2:0] irq
+);
+    
+mcu #(
+    .CORES(4),
+    .LOG_CORES(2),
+    .MEM_DEPTH(32),
+    .DATA_WIDTH(16),
+    .PC_WIDTH(6),
+    .ADDR_WIDTH(5),
+    .INSTR_WIDTH(32),
+    .INSTR_DEPTH(16),
+    .IO_PINS(16),
+    .IO_PADS(`MPRJ_IO_PADS),
+    .FIRST_PAD(12),
+    .LOGIC_PROBES(128),
+    .WB_WIDTH(32)
+) mcu_inst (
+    .wb_clk_i(wb_clk_i),
+    .wb_rst_i(wb_rst_i),
+    .wb_stb_i(wbs_stb_i),     // this module's Wishbone inputs are declared as wbs_*; the wb_* names
+    .wb_cyc_i(wbs_cyc_i),     // used before are undeclared nets and are rejected under
+    .wb_we_i(wbs_we_i),       // `default_nettype none
+    .wb_adr_i(wbs_adr_i),
+    .wb_dat_i(wbs_dat_i),
+    .wbs_ack_o(wbs_ack_o),
+    .wbs_dat_o(wbs_dat_o),
+    .la_data_in(la_data_in),
+    .la_data_out(la_data_out),
+    .la_oenb(la_oenb),
+    .io_in(io_in),
+    .io_out(io_out),
+    .io_oeb(io_oeb)
+);
+
+assign irq = 3'b000;	// unused
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/rtl/user_project_wrapper.v b/verilog/rtl/user_project_wrapper.v
index 5ee1cee..5057915 100644
--- a/verilog/rtl/user_project_wrapper.v
+++ b/verilog/rtl/user_project_wrapper.v
@@ -29,9 +29,7 @@
  *-------------------------------------------------------------
  */
 
-module user_project_wrapper #(
-    parameter BITS = 32
-) (
+module user_project_wrapper (
 `ifdef USE_POWER_PINS
     inout vdda1,	// User area 1 3.3V supply
     inout vdda2,	// User area 2 3.3V supply
@@ -82,7 +80,7 @@
 /* User project is instantiated  here   */
 /*--------------------------------------*/
 
-user_proj_example mprj (
+user_project mprj (
 `ifdef USE_POWER_PINS
 	.vccd1(vccd1),	// User area 1 1.8V power
 	.vssd1(vssd1),	// User area 1 digital ground
@@ -121,3 +119,4 @@
 endmodule	// user_project_wrapper
 
 `default_nettype wire
+
diff --git a/verilog/rtl/wb_mux.v b/verilog/rtl/wb_mux.v
new file mode 100644
index 0000000..e24244e
--- /dev/null
+++ b/verilog/rtl/wb_mux.v
@@ -0,0 +1,99 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+/*
+Wishbone multiplexer to process messages from Caravel
+
+We use wishbone in classic mode with the simplest possible interface:
+- all operations complete in a single cycle
+- input is valid if STB_I && CYC_I is asserted
+- for valid inputs, ACK_O is held asserted
+- if WE_I is asserted, a write operation is performed using ADR_I and DAT_I
+- if WE_I is negated, a read operation is performed using ADR_I with the result in DAT_O
+- all other ports are unused
+
+The wishbone bus width (WB_WIDTH below) is fixed to 32 by the platform and our code
+assumes that all other widths fit into it.
+
+This module (like other muxes in this project) is fully combinatorial.
+Registered logic happens in connected cpu cores, instruction memories and the entropy pool.
+Therefore CLK_I and RST_I are not directly used here. However, they are used in the
+parent module as the main clock and reset signals and thus affect the modules
+connected to the other interfaces.
+*/
+
+module wb_mux #(parameter
+   LOG_CORES=3,
+   PC_WIDTH=8,
+   INSTR_WIDTH=32,
+   DATA_WIDTH=16,
+   IO_PINS=16,
+   WB_WIDTH=32
+) (
+   // wishbone interface
+   //input wb_clk_i,          // wb clock
+   //input wb_rst_i,          // wb reset, active high
+   input wb_stb_i,            // wb strobe signal
+   input wb_cyc_i,            // wb cycle signal, sending on the bus requires wb_stb_i && wb_cyc_i
+   input wb_we_i,             // wb write enable signal, 0=read 1=write
+   input [WB_WIDTH-1:0] wb_adr_i,         // wb address, top two bits select the target interface
+   input [WB_WIDTH-1:0] wb_dat_i,         // wb input data
+   output wbs_ack_o,                      // wb acknowledge
+   output [WB_WIDTH-1:0] wbs_dat_o,       // wb output data
+   // programmer interface
+   output prog_we,
+   output [LOG_CORES-1:0] prog_sel,
+   output [PC_WIDTH-1:0] prog_waddr,
+   output [INSTR_WIDTH-1:0] prog_wdata,
+   // pads & soft reset interface
+   output pads_we,
+   output pads_waddr,                     // single bit: takes address bit 0
+   output [IO_PINS-1:0] pads_wdata,
+   // debugger interface
+   output [LOG_CORES-1:0] debug_sel,
+   output [4:0] debug_addr,               // fixed 5-bit debug address
+   output debug_we,
+   output [DATA_WIDTH-1:0] debug_wdata,
+   input [DATA_WIDTH-1:0] debug_rdata,
+   // entropy pool interface
+   output[WB_WIDTH-1:0] entropy_word
+);
+
+// minimal wishbone logic
+wire valid = wb_stb_i && wb_cyc_i;
+assign wbs_ack_o = valid;   // acknowledged combinationally in the same cycle
+
+// interface selection ("interface" is a reserved word in SystemVerilog, hence the name iface)
+wire[1:0] iface = wb_adr_i[WB_WIDTH-2 +: 2];
+wire if_prog = valid && iface == 2'b00;
+wire if_pads = valid && iface == 2'b01;
+wire if_debug = valid && iface == 2'b10;
+wire if_entropy = valid && iface == 2'b11;
+
+// programmer interface
+assign prog_we = if_prog && wb_we_i;
+assign {prog_sel, prog_waddr} = prog_we ? wb_adr_i[WB_WIDTH-3:0] : 0;   // low address bits = {core select, instruction address}
+assign prog_wdata = prog_we ? wb_dat_i : 0;
+
+// pads interface
+assign pads_we = if_pads && wb_we_i;
+assign pads_waddr = pads_we ? wb_adr_i[WB_WIDTH-3:0] : 0;   // truncated to the 1-bit port
+assign pads_wdata = pads_we ? wb_dat_i : 0;                 // truncated to IO_PINS bits
+
+// debugger interface, input
+assign {debug_sel, debug_addr} = if_debug ? wb_adr_i[WB_WIDTH-3:0] : 0;   // driven for reads and writes alike
+assign debug_we = if_debug && wb_we_i;
+assign debug_wdata = debug_we ? wb_dat_i : 0;
+
+// debugger interface, output
+assign wbs_dat_o = (if_debug && !wb_we_i) ? debug_rdata : 0;   // only debug reads return data; zero-extended to WB_WIDTH
+
+// entropy pool interface
+assign entropy_word = (if_entropy && wb_we_i) ? wb_dat_i : 0;
+
+endmodule
+
+`default_nettype wire
+