Import verilog files to rtl & tb
diff --git a/.gitignore b/.gitignore
index f4e486c..b2a927b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
-precheck_results
\ No newline at end of file
+precheck_results
+*~
diff --git a/verilog/dv/tb/alu_tb.v b/verilog/dv/tb/alu_tb.v
new file mode 100644
index 0000000..c87ec05
--- /dev/null
+++ b/verilog/dv/tb/alu_tb.v
@@ -0,0 +1,68 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+module alu_tb();
+// Print-only testbench for the alu module: drives opcode, in1, in2 and carry and logs every change through $monitor.
+parameter DATA_WIDTH = 16; // the stimulus literals below are written for 16-bit operands
+
+reg [3:0] opcode;
+reg [DATA_WIDTH-1:0] in1;
+reg [DATA_WIDTH-1:0] in2;
+reg carry;
+wire [DATA_WIDTH-1:0] out;
+wire carry_out;
+
+alu #(
+ .DATA_WIDTH(DATA_WIDTH)
+) alu_dut (
+ .opcode(opcode),
+ .in1(in1),
+ .in2(in2),
+ .carry(carry),
+ .out(out),
+ .carry_out(carry_out)
+);
+
+integer i; // loop counter shared by all stimulus loops below
+
+initial begin
+ $monitor("time=%4t op=%4b in1=%16b in2=%16b carry=%1b out=%16b carry_out=%1b", $time, opcode, in1, in2, carry, out, carry_out);
+ // Phase 1: fixed operands; step through all 16 opcode values, each first with carry=0 and 10 time units later with carry=1.
+ in1 = 16'b0011001100110011;
+ //in2 = 16'b0000111100001111;
+ in2 = 16'b101;
+ for (i=0; i<16; i=i+1) begin
+ #10 opcode=i; // opcode and carry are x until the first iteration assigns them at time 10
+ carry = 0;
+ #10 carry = 1;
+ end
+ // Phase 2: opcode 15 with in2=0 and carry=1 while in1 walks through values around 0, 16'h7FFF, 16'h8000 and 16'hFFFF.
+ opcode = 15;
+ in2 = 0;
+ carry = 1;
+ for (i=0; i<18; i=i+1) begin // in1 = 0 .. 17
+ in1 = i;
+ #10;
+ end
+ for (i=17; i>=0; i=i-1) begin // in1 = 16'h7FEE up to 16'h7FFF
+ in1 = ~i & {1'b0, {(15){1'b1}}};
+ #10;
+ end
+ for (i=0; i<18; i=i+1) begin // in1 = 16'h8000 .. 16'h8011
+ in1 = i | {1'b1, {(15){1'b0}}};
+ #10;
+ end
+ for (i=17; i>=0; i=i-1) begin // in1 = 16'hFFEE up to 16'hFFFF (~i truncated to 16 bits)
+ in1 = ~i;
+ #10;
+ end
+
+ #10 $stop;
+end
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/dv/tb/cpu_core_tb.v b/verilog/dv/tb/cpu_core_tb.v
new file mode 100644
index 0000000..7883ed8
--- /dev/null
+++ b/verilog/dv/tb/cpu_core_tb.v
@@ -0,0 +1,140 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+module cpu_core_tb();
+
+parameter DATA_WIDTH = 16;
+parameter PC_WIDTH = 8;
+parameter ADDR_WIDTH = 8;
+parameter SPREAD_WIDTH = 2;
+parameter INSTR_WIDTH = 32;
+
+reg clk;
+reg rst_n;
+
+wire [INSTR_WIDTH-1:0] opcode;
+wire [PC_WIDTH-1:0] progctr;
+wire mem_we;
+wire [ADDR_WIDTH-1:0] mem_waddr;
+wire [SPREAD_WIDTH-1:0] mem_wspread;
+wire [DATA_WIDTH-1:0] mem_wdata;
+wire [ADDR_WIDTH-1:0] mem_raddr;
+wire [DATA_WIDTH-1:0] mem_rdata;
+wire debug_stopped;
+wire [DATA_WIDTH-1:0] debug_rdata;
+
+cpu_core #(
+ .DATA_WIDTH(DATA_WIDTH),
+ .PC_WIDTH(PC_WIDTH),
+ .ADDR_WIDTH(ADDR_WIDTH),
+ .SPREAD_WIDTH(SPREAD_WIDTH),
+ .INSTR_WIDTH(INSTR_WIDTH)
+) cpu_core_dut (
+ .clk(clk),
+ .rst_n(rst_n),
+ .opcode(opcode),
+ .mem_rdata(mem_rdata),
+ .prng_in(16'd0),
+ .debug_mode(2'd0),
+ .debug_sel(4'd6),
+ .debug_we(1'd0),
+ .debug_wdata(16'd0),
+ .progctr(progctr),
+ .mem_we(mem_we),
+ .mem_waddr(mem_waddr),
+ .mem_wspread(mem_wspread),
+ .mem_wdata(mem_wdata),
+ .mem_raddr(mem_raddr),
+ .debug_stopped(debug_stopped),
+ .debug_rdata(debug_rdata)
+);
+
+wire io_dummy_active; // connected only to the mem_mesh io_active port below
+wire [DATA_WIDTH-1:0] io_dummy_data; // connected only to the mem_mesh io_data port below
+// Data memory for the cpu under test: a mem_mesh configured for a single core (CORES=1) with USE_IO=0.
+mem_mesh #(
+ .CORES(1),
+ .DEPTH(16),
+ .DATA_WIDTH(DATA_WIDTH),
+ .ADDR_WIDTH(ADDR_WIDTH),
+ .SPREAD_LAYERS(0),
+ .SPREAD_WIDTH(SPREAD_WIDTH),
+ .USE_IO(0),
+ .IO_PORTS(1),
+ .IO_FIRST(0)
+) mem_mesh_dut (
+ .clk(clk),
+ .rst_n(rst_n),
+ .we(mem_we),
+ .waddr(mem_waddr),
+ .wspread(mem_wspread),
+ .wdata(mem_wdata),
+ .raddr(mem_raddr),
+ .rdata(mem_rdata),
+ .io_dir(1'b1), // NOTE(review): mem_mesh_tb.v connects io_active_in/io_active_out/io_data_in/io_data_out and no io_dir - confirm these three io port names against mem_mesh
+ .io_active(io_dummy_active),
+ .io_data(io_dummy_data)
+);
+
+always #5 clk = ~clk;
+// round selects the active test program; each program owns a 16-entry window of progmem (index = round << 4 | progctr).
+reg [3:0] round;
+wire [INSTR_WIDTH-1:0] noop = 32'b000_000_0_00_0000_000_0000000000000000;
+wire [INSTR_WIDTH-1:0] progmem ['h100:0];
+// progmem is an array of wires: any entry without an assign below is undriven and reads as z.
+localparam n_tests = 4;
+
+// test 1
+assign progmem['h00] = 32'b100_000_1_00_0011_001_0000000001001001; // reg1 = 73
+assign progmem['h01] = 32'b100_000_1_00_0011_010_0000000001001010; // reg2 = 74
+assign progmem['h02] = 32'b000_001_1_00_1010_011_0000000000000000; // jmp reg1 + reg2
+
+// test 2
+assign progmem['h10] = 32'b100_000_1_00_0011_001_0000000011110011; // reg1 = 243
+assign progmem['h11] = 32'b000_000_1_11_0011_111_0000000000010000; // mem[1] = reg1
+assign progmem['h12] = 32'b100_000_1_00_0011_100_0000000000000000; // t = mem[0]
+assign progmem['h13] = 32'b011_111_1_11_1010_111_0000000000000000; // mem[0] = t+1
+assign progmem['h14] = 32'b100_000_1_00_0011_100_0000000000000000; // t = mem[0]
+assign progmem['h15] = 32'b011_000_1_00_0011_100_0000000000000000; // t = mem[t]
+assign progmem['h16] = 32'b011_000_1_00_0011_011_0000000000000000; // jmp t
+
+// test 3
+assign progmem['h20] = 32'b110_100_1_00_1011_001_0000000000010111; // reg1 = timer - 23
+assign progmem['h21] = 32'b000_010_1_01_0011_011_0000000000000000; // jmp (reg1 < 0) ? 0 : pc
+assign progmem['h22] = 32'b100_000_1_00_0011_011_0000000000101100; // jmp 44
+
+// test 4
+assign progmem['h30] = 32'b100_000_1_00_0011_010_0000000010001000; // reg2 = 136
+assign progmem['h31] = 32'b001_000_1_11_0011_111_0000000000100000; // mem[2] = reg2
+assign progmem['h32] = 32'b100_000_1_00_0011_100_0000000000000010; // t = mem[2]
+assign progmem['h33] = 32'b100_000_1_10_0011_010_0000000000010001; // reg1 = t; reg2 = 17
+assign progmem['h34] = 32'b000_001_1_00_1010_011_0000000000000000; // jmp reg1 + reg2
+// Instruction fetch: out of reset and with progctr < 16 the cpu is fed from progmem, in every other case it is fed noop.
+assign opcode = rst_n ? (progctr < 16 ? progmem[round << 4 | progctr] : noop) : noop;
+// A round ends once progctr is 16 or more: assert reset, finish after the last round, else select the next round and release reset 12 time units later.
+always @ (posedge clk) begin
+ if (progctr >= 16) begin
+ rst_n = 0;
+ if (round + 1 >= n_tests) $finish;
+ round = round + 1;
+ $display(""); // blank line separates the rounds in the log
+ #12 rst_n = 1; // this block is suspended during the delay, so clock edges in that window are not examined
+ end
+end
+// Log every change of the observed signals and apply the initial reset (released at time 12).
+initial begin
+ $monitor("time=%4t round=%1x rstn=%1b ct=%2d op=%32b new_pc=%8b(%2x) reg1=%16b we=%1b wa=%8b ws=%2b wd=%16b ra=%8b rd=%16b dd=%16b",
+ $time, round, rst_n, cpu_core_dut.timer, opcode, progctr, progctr, cpu_core_dut.reg1,
+ mem_we, mem_waddr, mem_wspread, mem_wdata, mem_raddr, mem_rdata, debug_rdata);
+ round = 0;
+ clk = 0;
+ rst_n = 0;
+ #12 rst_n = 1;
+end
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/dv/tb/debug_mux_tb.v b/verilog/dv/tb/debug_mux_tb.v
new file mode 100644
index 0000000..5127f03
--- /dev/null
+++ b/verilog/dv/tb/debug_mux_tb.v
@@ -0,0 +1,99 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+module debug_mux_tb();
+
+parameter CORES=4;
+parameter LOG_CORES=2;
+parameter DATA_WIDTH=8;
+
+reg [LOG_CORES-1:0] sel;
+reg [4:0] addr;
+reg we;
+reg [DATA_WIDTH-1:0] wdata;
+wire [DATA_WIDTH-1:0] rdata;
+reg reg_stopped[CORES-1:0];
+reg [DATA_WIDTH-1:0] reg_rdata[CORES-1:0];
+wire [1:0] cpu_mode[CORES-1:0];
+wire [3:0] reg_sel[CORES-1:0];
+wire reg_we[CORES-1:0];
+wire [DATA_WIDTH-1:0] reg_wdata[CORES-1:0];
+
+wire [CORES-1:0] reg_stopped_raw;
+wire [CORES*DATA_WIDTH-1:0] reg_rdata_raw;
+wire [CORES*2-1:0] cpu_mode_raw;
+wire [CORES*4-1:0] reg_sel_raw;
+wire [CORES-1:0] reg_we_raw;
+wire [CORES*DATA_WIDTH-1:0] reg_wdata_raw;
+
+debug_mux #(
+ .CORES(CORES),
+ .LOG_CORES(LOG_CORES),
+ .DATA_WIDTH(DATA_WIDTH)
+) debug_mux_dut (
+ .sel(sel),
+ .addr(addr),
+ .we(we),
+ .wdata(wdata),
+ .rdata(rdata),
+ .reg_stopped(reg_stopped_raw),
+ .reg_rdata(reg_rdata_raw),
+ .cpu_mode(cpu_mode_raw),
+ .reg_sel(reg_sel_raw),
+ .reg_we(reg_we_raw),
+ .reg_wdata(reg_wdata_raw)
+);
+
+
+generate genvar core;
+for (core=0; core<CORES; core=core+1) begin:g_core
+ assign reg_stopped_raw[core] = reg_stopped[core]; // per-core testbench regs are packed into the flat buses
+ assign reg_rdata_raw[core*DATA_WIDTH +: DATA_WIDTH] = reg_rdata[core];
+ assign cpu_mode[core] = cpu_mode_raw[core*2 +: 2]; // the remaining flat buses are sliced into per-core arrays for printing
+ assign reg_sel[core] = reg_sel_raw[core*4 +: 4];
+ assign reg_we[core] = reg_we_raw[core];
+ assign reg_wdata[core] = reg_wdata_raw[core*DATA_WIDTH +: DATA_WIDTH];
+end
+endgenerate
+// Stimulus: four sel/addr/we/wdata combinations, 10 time units apart; $monitor prints the per-core signals of cores 0 and 1 only.
+initial begin
+ $monitor("time=%4t SEL=%b ADDR=%b WE=%b WDATA=%b rdata=%b, ST0=%b RD0=%b cm0=%b s0=%b we0=%b wd0=%b ST1=%b RD1=%b cm1=%b s1=%b we1=%b wd1=%b",
+ $time, sel, addr, we, wdata, rdata, reg_stopped[0], reg_rdata[0], cpu_mode[0], reg_sel[0], reg_we[0], reg_wdata[0],
+ reg_stopped[1], reg_rdata[1], cpu_mode[1], reg_sel[1], reg_we[1], reg_wdata[1]);
+ sel = 0;
+ addr = 5'b01100;
+ we = 0;
+ wdata = 8'b10101010;
+ reg_stopped[0] = 1;
+ reg_stopped[1] = 0;
+ reg_stopped[2] = 1;
+ reg_stopped[3] = 0;
+ reg_rdata[0] = 8'b11110000;
+ reg_rdata[1] = 8'b11100001;
+ reg_rdata[2] = 8'b11000011;
+ reg_rdata[3] = 8'b10000111;
+ // step 2: core 1 selected with write enable, address unchanged
+ #10
+ sel = 1;
+ we = 1;
+ // step 3: back to core 0 without write enable, address bit 4 set
+ #10
+ sel = 0;
+ we = 0;
+ addr = 5'b10000;
+ // step 4: core 1 with write enable and new write data, same address as step 3
+ #10
+ sel = 1;
+ we = 1;
+ wdata = 8'b00000011;
+
+ #10
+ $stop;
+end
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/dv/tb/entropy_pool_tb.v b/verilog/dv/tb/entropy_pool_tb.v
new file mode 100644
index 0000000..b501729
--- /dev/null
+++ b/verilog/dv/tb/entropy_pool_tb.v
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+module entropy_pool_tb();
+// Print-only testbench for entropy_pool: feeds two non-zero e_word values separated by zero words and logs the DUT every clock cycle.
+parameter WIDTH = 16;
+
+reg clk;
+reg rst_n;
+reg [WIDTH-1:0] e_word;
+wire e_bit;
+
+entropy_pool #(
+ .WIDTH(WIDTH)
+) entropy_pool_dut (
+ .clk(clk),
+ .rst_n(rst_n),
+ .e_word(e_word),
+ .e_bit(e_bit)
+);
+
+always #5 clk = ~clk; // clock period is 10 time units
+
+reg strobe;
+always @(posedge clk) strobe = ~strobe; // force a $monitor strobe every clock cycle
+// $monitor prints only when one of its arguments changes, which is why strobe is included in its argument list.
+initial begin
+ $monitor("time %4t s %1b ew %16b es %15b eb %1b", $time, strobe, e_word, entropy_pool_dut.e_pool_mod, e_bit);
+ clk = 0;
+ rst_n = 0;
+ strobe = 0;
+ #10 // e_word has not been assigned yet and is x during this reset interval
+ rst_n = 1;
+ e_word = 16'b0111110000111001; // first non-zero word, held for 10 time units
+ #10
+ e_word = 0;
+ #100
+ e_word = 16'b1010101010101010; // second non-zero word, held for 10 time units
+ #10
+ e_word = 0;
+ #200
+ $stop;
+end
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/dv/tb/instr_mem_tb.v b/verilog/dv/tb/instr_mem_tb.v
new file mode 100644
index 0000000..64167ff
--- /dev/null
+++ b/verilog/dv/tb/instr_mem_tb.v
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+module instr_mem_tb();
+
+parameter PC_WIDTH = 4;
+parameter INSTR_WIDTH = 8;
+parameter DEPTH = 16;
+
+reg clk;
+reg rst_n;
+reg [PC_WIDTH-1:0] raddr;
+wire [INSTR_WIDTH-1:0] rdata;
+reg we;
+reg [PC_WIDTH-1:0] waddr;
+reg [INSTR_WIDTH-1:0] wdata;
+
+instr_mem #(
+ .INSTR_WIDTH(INSTR_WIDTH),
+ .PC_WIDTH(PC_WIDTH),
+ .DEPTH(DEPTH)
+) instr_mem_dut (
+ .clk(clk),
+ .rst_n(rst_n),
+ .raddr(raddr),
+ .rdata(rdata),
+ .we(we),
+ .waddr(waddr),
+ .wdata(wdata)
+);
+
+always #5 clk = ~clk;
+
+initial begin
+ $monitor("time=%4t rstn=%1b we=%1b waddr=%4b wdata=%8b raddr=%4b rdata=%8b", $time, rst_n, we, waddr, wdata, raddr, rdata);
+ clk <= 0;
+ rst_n <= 0; // reset is released by the clocked block below at the next rising clock edge
+ we <= 0;
+ waddr <= 0;
+ wdata <= 1;
+ raddr <= 0;
+ #500 $display(""); // blank line in the log before reset is asserted again
+ rst_n <= 0;
+ #500 $display("");
+ rst_n <= 0; // third reset pulse; waddr, wdata, raddr and we keep their values across these pulses
+ #500 $finish;
+end
+// Clocked driver: releases reset, toggles we every cycle, advances raddr every cycle and waddr/wdata on the cycles where we is 1.
+always @(posedge clk) begin
+ if (!rst_n) begin
+ rst_n <= 1; // a reset request lasts until this edge; nothing else is updated on it
+ end else begin
+ if (we) begin
+ waddr <= waddr + 1; // 4-bit address wraps around after 15
+ wdata <= wdata + 1;
+ end
+ we <= !we;
+ raddr <= raddr + 1;
+ end
+end
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/dv/tb/io_filter_rev_tb.v b/verilog/dv/tb/io_filter_rev_tb.v
new file mode 100644
index 0000000..085d2e1
--- /dev/null
+++ b/verilog/dv/tb/io_filter_rev_tb.v
@@ -0,0 +1,102 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+module io_filter_rev_tb();
+
+parameter IO_PINS = 4;
+parameter DATA_WIDTH = 8;
+
+reg clk;
+reg rst_n;
+reg [IO_PINS-1:0] pin_dir;
+wire [IO_PINS-1:0] pin_data_in_raw;
+wire [IO_PINS-1:0] pin_data_out_raw;
+wire [IO_PINS+2-1:0] port_active_in_raw;
+wire [IO_PINS+2-1:0] port_active_out_raw;
+wire [(IO_PINS+2)*DATA_WIDTH-1:0] port_data_in_raw;
+wire [(IO_PINS+2)*DATA_WIDTH-1:0] port_data_out_raw;
+
+io_filter_rev #(
+ .IO_PINS(IO_PINS),
+ .DATA_WIDTH(DATA_WIDTH)
+) io_filter_rev_dut (
+ .clk(clk),
+ .rst_n(rst_n),
+ .pin_dir(pin_dir),
+ .pin_data_in(pin_data_in_raw),
+ .pin_data_out(pin_data_out_raw),
+ .port_active_in(port_active_in_raw),
+ .port_active_out(port_active_out_raw),
+ .port_data_in(port_data_in_raw),
+ .port_data_out(port_data_out_raw)
+);
+
+// The testbench acts as the "external world" for the io filter, so it simulates both the cpu/memory part
+// and the peripherals. An "output" message is one sent from the cpu/memory to the peripherals which means
+// the testbench ports act as output and the pins act as input. Conversely, an "input" message is one coming
+// from the peripherals to the cpu/memory where testbench pins will act as output and ports as input.
+
+wire [IO_PINS-1:0] pin_data_out;
+reg [IO_PINS+2-1:0] port_active_out;
+reg [(IO_PINS+2)*DATA_WIDTH-1:0] port_data_out;
+
+reg [IO_PINS-1:0] pin_data_in;
+wire [IO_PINS+2-1:0] port_active_in;
+wire [(IO_PINS+2)*DATA_WIDTH-1:0] port_data_in;
+
+generate genvar pin;
+for (pin=0; pin<IO_PINS; pin=pin+1) begin:g_pin
+ // output: the testbench port regs drive the raw port nets; each raw pin output bit is mirrored into pin_data_out
+ assign port_active_out_raw[pin] = port_active_out[pin];
+ assign port_data_out_raw[pin*DATA_WIDTH +: DATA_WIDTH] = port_data_out[pin*DATA_WIDTH +: DATA_WIDTH];
+ assign pin_data_out[pin] = pin_data_out_raw[pin]; // one bit per iteration, so every bit of pin_data_out has exactly one driver
+ // input: the testbench pin reg drives the raw pin net; the raw port nets are mirrored into port_active_in / port_data_in
+ assign pin_data_in_raw[pin] = pin_data_in[pin];
+ assign port_active_in[pin] = port_active_in_raw[pin];
+ assign port_data_in[pin*DATA_WIDTH +: DATA_WIDTH] = port_data_in_raw[pin*DATA_WIDTH +: DATA_WIDTH];
+end
+endgenerate
+// output
+assign port_active_out_raw[IO_PINS +: 2] = {1'b0, port_active_out[IO_PINS]};
+assign port_data_out_raw[IO_PINS*DATA_WIDTH +: 2*DATA_WIDTH] = {{(DATA_WIDTH){1'b0}}, port_data_out[IO_PINS*DATA_WIDTH +: DATA_WIDTH]};
+// input
+assign port_active_in[IO_PINS +: 2] = {port_active_in_raw[IO_PINS+1], 1'b0};
+assign port_data_in[IO_PINS*DATA_WIDTH +: 2*DATA_WIDTH] = {port_data_in_raw[(IO_PINS+1)*DATA_WIDTH +: DATA_WIDTH], {(DATA_WIDTH){1'b0}}};
+
+always #5 clk = ~clk;
+
+initial begin
+ $monitor("time %4d pin_data_in %4b pin_data_out %4b port_active_in %6b port_active_out %6b port_data_in %24b port_data_out %24b",
+ $time, pin_data_in_raw, pin_data_out_raw, port_active_in_raw, port_active_out_raw, port_data_in_raw, port_data_out_raw);
+ clk <= 0;
+ rst_n <= 0;
+ #40 // pin_dir, port_active_out, port_data_out and pin_data_in are unassigned (x) during these 40 time units of reset
+ rst_n <= 1;
+ pin_dir <= 4'b1010;
+ port_active_out <= 6'b0;
+ port_data_out <= 48'b0;
+ pin_data_in <= 4'b0;
+ #40
+ pin_data_in <= 4'b0001; // set pin 0 on the pin side
+ #40
+ port_active_out <= 6'b000100; // port 2 active for 10 time units
+ port_data_out <= 48'b00000000_00000000_00000000_11111111_00000000_11111111; // data bytes of ports 0 and 2 are all ones
+ #10 port_active_out <= 6'b0;
+ #30
+ port_active_out <= 6'b010000; // port index 4 (= IO_PINS) active for 10 time units
+ port_data_out <= 48'b00000000_00000001_00000000_00000000_00000000_00000000; // its data byte is 1
+ #10 port_active_out <= 6'b0;
+ #30
+ port_active_out <= 6'b010001; // ports 0 and 4 active together
+ port_data_out <= 48'b00000000_00000011_00000000_00000000_00000000_00000000; // data byte of port 4 is 3, port 0 is 0
+ #10 port_active_out <= 6'b0;
+ #30
+ $stop;
+end
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/dv/tb/io_filter_tb.v b/verilog/dv/tb/io_filter_tb.v
new file mode 100644
index 0000000..0bf940e
--- /dev/null
+++ b/verilog/dv/tb/io_filter_tb.v
@@ -0,0 +1,102 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+module io_filter_tb();
+
+parameter IO_PINS = 4;
+parameter DATA_WIDTH = 8;
+
+reg clk;
+reg rst_n;
+reg [IO_PINS-1:0] pin_dir;
+wire [IO_PINS-1:0] pin_data_in_raw;
+wire [IO_PINS-1:0] pin_data_out_raw;
+wire [IO_PINS+2-1:0] port_active_in_raw;
+wire [IO_PINS+2-1:0] port_active_out_raw;
+wire [(IO_PINS+2)*DATA_WIDTH-1:0] port_data_in_raw;
+wire [(IO_PINS+2)*DATA_WIDTH-1:0] port_data_out_raw;
+
+io_filter #(
+ .IO_PINS(IO_PINS),
+ .DATA_WIDTH(DATA_WIDTH)
+) io_filter_dut (
+ .clk(clk),
+ .rst_n(rst_n),
+ .pin_dir(pin_dir),
+ .pin_data_in(pin_data_in_raw),
+ .pin_data_out(pin_data_out_raw),
+ .port_active_in(port_active_in_raw),
+ .port_active_out(port_active_out_raw),
+ .port_data_in(port_data_in_raw),
+ .port_data_out(port_data_out_raw)
+);
+
+// The testbench acts as the "external world" for the io filter, so it simulates both the cpu/memory part
+// and the peripherals. An "output" message is one sent from the cpu/memory to the peripherals which means
+// the testbench ports act as output and the pins act as input. Conversely, an "input" message is one coming
+// from the peripherals to the cpu/memory where testbench pins will act as output and ports as input.
+
+wire [IO_PINS-1:0] pin_data_out;
+reg [IO_PINS+2-1:0] port_active_out;
+reg [(IO_PINS+2)*DATA_WIDTH-1:0] port_data_out;
+
+reg [IO_PINS-1:0] pin_data_in;
+wire [IO_PINS+2-1:0] port_active_in;
+wire [(IO_PINS+2)*DATA_WIDTH-1:0] port_data_in;
+
+generate genvar pin;
+for (pin=0; pin<IO_PINS; pin=pin+1) begin:g_pin
+ // output: the testbench port regs drive the raw port nets; each raw pin output bit is mirrored into pin_data_out
+ assign port_active_out_raw[pin] = port_active_out[pin];
+ assign port_data_out_raw[pin*DATA_WIDTH +: DATA_WIDTH] = port_data_out[pin*DATA_WIDTH +: DATA_WIDTH];
+ assign pin_data_out[pin] = pin_data_out_raw[pin]; // one bit per iteration, so every bit of pin_data_out has exactly one driver
+ // input: the testbench pin reg drives the raw pin net; the raw port nets are mirrored into port_active_in / port_data_in
+ assign pin_data_in_raw[pin] = pin_data_in[pin];
+ assign port_active_in[pin] = port_active_in_raw[pin];
+ assign port_data_in[pin*DATA_WIDTH +: DATA_WIDTH] = port_data_in_raw[pin*DATA_WIDTH +: DATA_WIDTH];
+end
+endgenerate
+// output
+assign port_active_out_raw[IO_PINS +: 2] = {1'b0, port_active_out[IO_PINS]};
+assign port_data_out_raw[IO_PINS*DATA_WIDTH +: 2*DATA_WIDTH] = {{(DATA_WIDTH){1'b0}}, port_data_out[IO_PINS*DATA_WIDTH +: DATA_WIDTH]};
+// input
+assign port_active_in[IO_PINS +: 2] = {port_active_in_raw[IO_PINS+1], 1'b0};
+assign port_data_in[IO_PINS*DATA_WIDTH +: 2*DATA_WIDTH] = {port_data_in_raw[(IO_PINS+1)*DATA_WIDTH +: DATA_WIDTH], {(DATA_WIDTH){1'b0}}};
+
+always #5 clk = ~clk;
+
+initial begin
+ $monitor("time %4d pin_data_in %4b pin_data_out %4b port_active_in %6b port_active_out %6b port_data_in %24b port_data_out %24b",
+ $time, pin_data_in_raw, pin_data_out_raw, port_active_in_raw, port_active_out_raw, port_data_in_raw, port_data_out_raw);
+ clk <= 0;
+ rst_n <= 0;
+ #40 // pin_dir, port_active_out, port_data_out and pin_data_in are unassigned (x) during these 40 time units of reset
+ rst_n <= 1;
+ pin_dir <= 4'b0101;
+ port_active_out <= 6'b0;
+ port_data_out <= 48'b0;
+ pin_data_in <= 4'b0;
+ #40
+ pin_data_in <= 4'b1000; // set pin 3 on the pin side
+ #40
+ port_active_out <= 6'b000100; // port 2 active for 10 time units
+ port_data_out <= 48'b00000000_00000000_00000000_11111111_00000000_11111111; // data bytes of ports 0 and 2 are all ones
+ #10 port_active_out <= 6'b0;
+ #30
+ port_active_out <= 6'b010000; // port index 4 (= IO_PINS) active for 10 time units
+ port_data_out <= 48'b00000000_00000001_00000000_00000000_00000000_00000000; // its data byte is 1
+ #10 port_active_out <= 6'b0;
+ #30
+ port_active_out <= 6'b010001; // ports 0 and 4 active together
+ port_data_out <= 48'b00000000_00000011_00000000_00000000_00000000_00000000; // data byte of port 4 is 3, port 0 is 0
+ #10 port_active_out <= 6'b0;
+ #30
+ $stop;
+end
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/dv/tb/io_pads_tb.v b/verilog/dv/tb/io_pads_tb.v
new file mode 100644
index 0000000..d05df8f
--- /dev/null
+++ b/verilog/dv/tb/io_pads_tb.v
@@ -0,0 +1,150 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+module io_pads_tb();
+
+parameter IO_PINS=16;
+parameter IO_PADS=38;
+parameter LOGIC_PROBES=128;
+parameter FIRST_PAD=12;
+
+reg wb_clk_i;
+reg wb_rst_i;
+reg [LOGIC_PROBES-1:0] la_data_in;
+wire [LOGIC_PROBES-1:0] la_data_out;
+reg [LOGIC_PROBES-1:0] la_oenb;
+reg [IO_PADS-1:0] io_in;
+wire [IO_PADS-1:0] io_out;
+wire [IO_PADS-1:0] io_oeb;
+wire clk;
+wire rst_hard_n;
+wire rst_soft_n;
+wire rst_prng_n;
+wire [IO_PINS-1:0] pin_dir;
+wire [IO_PINS-1:0] pin_data_in;
+reg [IO_PINS-1:0] pin_data_out;
+reg cfg_we;
+reg cfg_addr;
+reg [IO_PINS-1:0] cfg_wdata;
+
+io_pads #(
+ .IO_PINS(IO_PINS),
+ .IO_PADS(IO_PADS),
+ .LOGIC_PROBES(LOGIC_PROBES),
+ .FIRST_PAD(FIRST_PAD)
+) io_pads_dut (
+ .wb_clk_i(wb_clk_i),
+ .wb_rst_i(wb_rst_i),
+ .la_data_in(la_data_in),
+ .la_data_out(la_data_out),
+ .la_oenb(la_oenb),
+ .io_in(io_in),
+ .io_out(io_out),
+ .io_oeb(io_oeb),
+ .clk(clk),
+ .rst_hard_n(rst_hard_n),
+ .rst_soft_n(rst_soft_n),
+ .rst_prng_n(rst_prng_n),
+ .pin_dir(pin_dir),
+ .pin_data_in(pin_data_in),
+ .pin_data_out(pin_data_out),
+ .cfg_we(cfg_we),
+ .cfg_addr(cfg_addr),
+ .cfg_wdata(cfg_wdata)
+);
+
+always #5 wb_clk_i = ~wb_clk_i;
+
+initial begin
+ $monitor("time %4t lado %b io %b ioe %b clk %b rh %b rs %b rp %b pd %b pi %b pm %b sd %b",
+ $time, la_data_out, io_out, io_oeb, clk, rst_hard_n, rst_soft_n, rst_prng_n, pin_dir, pin_data_in, io_pads_dut.programming, io_pads_dut.saved_dir);
+ wb_clk_i = 0;
+ wb_rst_i = 1;
+ la_data_in = 128'b0;
+ la_oenb = ~128'b0; // every la_oenb bit starts at 1; the tests below clear individual bits
+ io_in = 38'b0;
+ pin_data_out = 16'b0;
+ cfg_we = 0;
+ cfg_addr = 0;
+ cfg_wdata = 16'b0;
+ #10
+ wb_rst_i = 0;
+ #30
+ $display("clock & reset tests");
+ la_oenb[0] = 0; // probe 0: clear oenb, then pulse its data bit high and low
+ #30
+ la_data_in[0] = 1;
+ #30
+ la_data_in[0] = 0;
+ #30
+ la_oenb[0] = 1;
+ la_oenb[1] = 0; // probes 1, 2 and 3 in turn: oenb cleared for 30 time units each with data 0
+ la_data_in[1] = 0;
+ #30
+ la_oenb[1] = 1;
+ #30
+ la_oenb[2] = 0;
+ #30
+ la_oenb[2] = 1;
+ #30
+ la_oenb[3] = 0;
+ #30
+ la_oenb[3] = 1;
+ #30
+ wb_rst_i = 1; // wishbone reset pulse on its own
+ #30
+ wb_rst_i = 0;
+ #30
+ la_oenb[3:1] = 3'b000; // probes 1..3 together; the 3-bit select matches the 3-bit literal, so probe 4 is left untouched
+ la_data_in[3:1] = 3'b111;
+ wb_rst_i = 1;
+ #30
+ la_oenb[3:1] = 3'b111; // restores exactly the bits cleared above
+ la_data_in[3:1] = 3'b000;
+ wb_rst_i = 0;
+ #10
+ $display("wb mux config test");
+ cfg_we = 1; // three config writes: 1 then 0 to address 0, then 16'hFF00 to address 1
+ cfg_addr = 0;
+ cfg_wdata = 1;
+ #10
+ cfg_wdata = 0;
+ #10
+ cfg_addr = 1;
+ cfg_wdata = 16'b1111111100000000;
+ #10
+ cfg_we = 0;
+ #10
+ $display("io pin & pad tests");
+ $display("%d", io_pads_dut.LA_PAD);
+ io_in = 38'b111010101010101010111111111111; // 30-bit literal, zero-extended to the 38 pads
+ #10
+ pin_data_out = 16'b1100110011001100;
+ #10
+ la_oenb[8 +: 8] = 8'b00000000; // probes 8..15: oenb cleared for 10 time units with data 8'b00001111
+ la_data_in[8 +: 8] = 8'b00001111;
+ #10
+ la_oenb[8 +: 8] = 8'b11111111;
+ la_data_in[8 +: 8] = 8'b00000000;
+ #10
+ la_oenb[24 +: 8] = 8'b00000000; // probes 24..31: same pattern with data 8'b11110000
+ la_data_in[24 +: 8] = 8'b11110000;
+ #10
+ la_oenb[24 +: 8] = 8'b11111111;
+ la_data_in[24 +: 8] = 8'b00000000;
+ #10
+ la_oenb[52 +: 8] = 8'b00000000; // probes 52..59: same pattern with data 8'b11110000
+ la_data_in[52 +: 8] = 8'b11110000;
+ #10
+ la_oenb[52 +: 8] = 8'b11111111;
+ la_data_in[52 +: 8] = 8'b00000000;
+ #10
+ $stop;
+end
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/dv/tb/mcu_tb.v b/verilog/dv/tb/mcu_tb.v
new file mode 100644
index 0000000..1c475c3
--- /dev/null
+++ b/verilog/dv/tb/mcu_tb.v
@@ -0,0 +1,154 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+module mcu_tb();
+
+parameter CORES = 2;
+parameter LOG_CORES = 1;
+parameter MEM_DEPTH = 16;
+parameter DATA_WIDTH = 16;
+parameter PC_WIDTH = 3;
+parameter ADDR_WIDTH = 4;
+parameter INSTR_WIDTH = 32;
+parameter INSTR_DEPTH = 4;
+parameter IN_PINS = 4;
+parameter OUT_PINS = 4;
+parameter IO_PADS = 38;
+parameter FIRST_PAD = 12;
+parameter LOGIC_PROBES = 128;
+parameter WB_WIDTH = 32;
+parameter IO_PINS = IN_PINS + OUT_PINS;
+
+reg clk;
+wire wb_clk_i = clk;
+reg wb_rst_i;
+reg wb_stb_i;
+reg wb_cyc_i;
+reg wb_we_i;
+reg [WB_WIDTH-1:0] wb_adr_i;
+reg [WB_WIDTH-1:0] wb_dat_i;
+wire wbs_ack_o;
+wire [WB_WIDTH-1:0] wbs_dat_o;
+reg [LOGIC_PROBES-1:0] la_data_in;
+wire [LOGIC_PROBES-1:0] la_data_out;
+reg [LOGIC_PROBES-1:0] la_oenb;
+wire [IO_PADS-1:0] io_in;
+wire [IO_PADS-1:0] io_out;
+wire [IO_PADS-1:0] io_oeb;
+
+mcu #(
+ .CORES(CORES),
+ .LOG_CORES(LOG_CORES),
+ .MEM_DEPTH(MEM_DEPTH),
+ .DATA_WIDTH(DATA_WIDTH),
+ .PC_WIDTH(PC_WIDTH),
+ .ADDR_WIDTH(ADDR_WIDTH),
+ .INSTR_WIDTH(INSTR_WIDTH),
+ .INSTR_DEPTH(INSTR_DEPTH),
+ .IO_PINS(IO_PINS),
+ .IO_PADS(IO_PADS),
+ .FIRST_PAD(FIRST_PAD),
+ .LOGIC_PROBES(LOGIC_PROBES),
+ .WB_WIDTH(WB_WIDTH)
+) mcu_dut (
+ .wb_clk_i(wb_clk_i),
+ .wb_rst_i(wb_rst_i),
+ .wb_stb_i(wb_stb_i),
+ .wb_cyc_i(wb_cyc_i),
+ .wb_we_i(wb_we_i),
+ .wb_adr_i(wb_adr_i),
+ .wb_dat_i(wb_dat_i),
+ .wbs_ack_o(wbs_ack_o),
+ .wbs_dat_o(wbs_dat_o),
+ .la_data_in(la_data_in),
+ .la_data_out(la_data_out),
+ .la_oenb(la_oenb),
+ .io_in(io_in),
+ .io_out(io_out),
+ .io_oeb(io_oeb)
+);
+
+reg [IN_PINS-1:0] pin_data_in; // value the stimulus places on the input pads
+assign io_in = {{(IO_PADS - IN_PINS - FIRST_PAD){1'b0}}, pin_data_in, {(FIRST_PAD){1'b0}}}; // pin_data_in occupies pads FIRST_PAD .. FIRST_PAD+IN_PINS-1; all other pads read 0
+// pin_data_out is taken from the OUT_PINS pads directly above the input pads.
+wire [OUT_PINS-1:0] pin_data_out = io_out[FIRST_PAD + IN_PINS +: OUT_PINS];
+
+always #5 clk = ~clk;
+
+initial begin
+ $monitor("time %4t rh %1b rs %1b wwei %1b wai %32b pdi %4b pdo %4b",
+ $time, la_data_out[1], la_data_out[2], wb_we_i, wb_adr_i, pin_data_in, pin_data_out);
+ // power up
+ clk = 0;
+ wb_rst_i = 1;
+ wb_stb_i = 0;
+ wb_cyc_i = 0;
+ wb_we_i = 0;
+ wb_adr_i = 0;
+ wb_dat_i = 0;
+ la_data_in = {(LOGIC_PROBES){1'b0}};
+ la_oenb = {(LOGIC_PROBES){1'b1}};
+ pin_data_in = 4'b0000;
+ #10
+ // wishbone reset off, start communications
+ wb_rst_i = 0;
+ wb_stb_i = 1;
+ wb_cyc_i = 1;
+ wb_we_i = 1;
+ // programming mode
+ wb_adr_i = 32'b01_000000000000000000000000000000; // set programming mode
+ wb_dat_i = 32'b00000000000000000000000000000001; // to 1
+ #10
+ // send code for cpu core 0
+ wb_adr_i = 32'b00_00000000000000000000000000_0_000; // address 0:
+ wb_dat_i = 32'b100_000_1_00_0011_100_0000000000001111; // read value from memory cell 15 (joined input)
+ #10
+ wb_adr_i = 32'b00_00000000000000000000000000_0_001; // address 1:
+ wb_dat_i = 32'b011_000_1_11_0011_111_0000000000000001; // write value to memory cell 0, spread 1
+ #10
+ wb_adr_i = 32'b00_00000000000000000000000000_0_010; // address 2:
+ wb_dat_i = 32'b100_000_1_00_0011_011_0000000000000000; // jump to address 0
+ #10
+ // send code for cpu core 1
+ wb_adr_i = 32'b00_00000000000000000000000000_1_000; // address 0:
+ wb_dat_i = 32'b100_000_1_00_0011_100_0000000000000000; // read value from memory cell 0
+ #10
+ wb_adr_i = 32'b00_00000000000000000000000000_1_001; // address 1:
+ wb_dat_i = 32'b011_000_1_11_0011_111_0000000011100010; // write value to memory cell 14, spread 2 (joined output)
+ #10
+ wb_adr_i = 32'b00_00000000000000000000000000_1_010; // address 2:
+ wb_dat_i = 32'b100_000_1_00_0011_011_0000000000000000; // jump to address 0
+ #10
+ // set pin directions
+ wb_adr_i = 32'b01_000000000000000000000000000001; // set pin directions
+ wb_dat_i = 32'b00000000000000000000000011110000; // first 4 pins are inputs, next 4 pins are outputs
+ #10
+ // exit programming mode
+ wb_adr_i = 32'b01_000000000000000000000000000000; // set programming mode
+ wb_dat_i = 32'b00000000000000000000000000000000; // to 0
+ #10
+ // stop wishbone communications
+ wb_we_i = 0;
+ wb_cyc_i = 0;
+ wb_stb_i = 0;
+ // set input pins
+ pin_data_in = 4'b0011;
+ // wait for data to appear on output pins
+ #100
+ // change input pins
+ pin_data_in = 4'b1001;
+ // wait for data to appear on output pins
+ #100
+ // change input pins
+ pin_data_in = 4'b1100;
+ // wait for data to appear on output pins
+ #100
+ $stop;
+end
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/dv/tb/mem_mesh_tb.v b/verilog/dv/tb/mem_mesh_tb.v
new file mode 100644
index 0000000..9b81a5f
--- /dev/null
+++ b/verilog/dv/tb/mem_mesh_tb.v
@@ -0,0 +1,170 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+module mem_mesh_tb();
+
+parameter CORES = 8;
+parameter DEPTH = 256;
+parameter DATA_WIDTH = 16;
+parameter ADDR_WIDTH = 8;
+parameter SPREAD_LAYERS = 3;
+parameter SPREAD_WIDTH = 3;
+parameter USE_IO = 1;
+parameter IO_PORTS = 16;
+parameter IO_FIRST = 5;
+
+reg clk;
+reg rst_n;
+reg we[CORES-1:0];
+reg [ADDR_WIDTH-1:0] waddr[CORES-1:0];
+reg [SPREAD_WIDTH-1:0] wspread[CORES-1:0];
+reg [DATA_WIDTH-1:0] wdata[CORES-1:0];
+reg [ADDR_WIDTH-1:0] raddr[CORES-1:0];
+wire [DATA_WIDTH-1:0] rdata[CORES-1:0];
+
+// io directions are according to the cpu & memory, so they are
+// reversed from the point of view of the testbench / external world
+reg io_dir[IO_PORTS-1:0];
+reg io_receiving[IO_PORTS-1:0];
+wire io_sending[IO_PORTS-1:0];
+reg [DATA_WIDTH-1:0] io_input[IO_PORTS-1:0];
+wire [DATA_WIDTH-1:0] io_output[IO_PORTS-1:0];
+
+wire [CORES-1:0] we_raw;
+wire [CORES*ADDR_WIDTH-1:0] waddr_raw;
+wire [CORES*SPREAD_WIDTH-1:0] wspread_raw;
+wire [CORES*DATA_WIDTH-1:0] wdata_raw;
+wire [CORES*ADDR_WIDTH-1:0] raddr_raw;
+wire [CORES*DATA_WIDTH-1:0] rdata_raw;
+
+wire [IO_PORTS-1:0] io_active_in_raw;
+wire [IO_PORTS-1:0] io_active_out_raw;
+wire [IO_PORTS*DATA_WIDTH-1:0] io_data_in_raw;
+wire [IO_PORTS*DATA_WIDTH-1:0] io_data_out_raw;
+
+generate genvar core;
+for (core=0; core<CORES; core=core+1) begin:g_core
+ assign we_raw[core] = we[core];
+ assign waddr_raw[core*ADDR_WIDTH +: ADDR_WIDTH] = waddr[core];
+ assign wspread_raw[core*SPREAD_WIDTH +: SPREAD_WIDTH] = wspread[core];
+ assign wdata_raw[core*DATA_WIDTH +: DATA_WIDTH] = wdata[core];
+ assign raddr_raw[core*ADDR_WIDTH +: ADDR_WIDTH] = raddr[core];
+ assign rdata[core] = rdata_raw[core*DATA_WIDTH +: DATA_WIDTH];
+end
+endgenerate
+
+generate genvar port;
+for (port=0; port<IO_PORTS; port=port+1) begin:g_port
+ assign io_active_in_raw[port] = io_dir[port] ? 1'b0 : io_receiving[port];
+ assign io_sending[port] = io_dir[port] ? io_active_out_raw[port] : 1'b0;
+ assign io_data_in_raw[port*DATA_WIDTH +: DATA_WIDTH] = io_dir[port] ? {(DATA_WIDTH){1'b0}} : io_input[port];
+ assign io_output[port] = io_dir[port] ? io_data_out_raw[port*DATA_WIDTH +: DATA_WIDTH] : {(DATA_WIDTH){1'b0}};
+end
+endgenerate
+
+mem_mesh #(
+ .CORES(CORES),
+ .DEPTH(DEPTH),
+ .DATA_WIDTH(DATA_WIDTH),
+ .ADDR_WIDTH(ADDR_WIDTH),
+ .SPREAD_LAYERS(SPREAD_LAYERS),
+ .SPREAD_WIDTH(SPREAD_WIDTH),
+ .USE_IO(USE_IO),
+ .IO_PORTS(IO_PORTS),
+ .IO_FIRST(IO_FIRST)
+) mem_mesh_dut (
+ .clk(clk),
+ .rst_n(rst_n),
+ .we(we_raw),
+ .waddr(waddr_raw),
+ .wspread(wspread_raw),
+ .wdata(wdata_raw),
+ .raddr(raddr_raw),
+ .rdata(rdata_raw),
+ .io_active_in(io_active_in_raw),
+ .io_active_out(io_active_out_raw),
+ .io_data_in(io_data_in_raw),
+ .io_data_out(io_data_out_raw)
+);
+
+always #5 clk = ~clk;
+
+integer i;
+
+// for synchronization checking: io_sending / io_output sampled on every rising clock edge, printed by $monitor
+reg io_sending_reg[IO_PORTS-1:0];
+reg [DATA_WIDTH-1:0] io_output_reg[IO_PORTS-1:0];
+integer sync_idx; // loop index private to the clocked block, so it cannot disturb the loops in the initial block that use i
+always @(posedge clk) begin
+ for (sync_idx=0; sync_idx<IO_PORTS; sync_idx=sync_idx+1) begin
+ io_sending_reg[sync_idx] <= io_sending[sync_idx];
+ io_output_reg[sync_idx] <= io_output[sync_idx];
+ end
+end
+
+initial begin
+ raddr[2] = 8; // only the read addresses of cores 2, 3, 6 and 7 are assigned; the others stay x
+ raddr[3] = 8;
+ raddr[6] = 8;
+ raddr[7] = 192;
+ wspread[2] = 0;
+ io_input[3] = 0;
+ $monitor("time=%t mem[2][8]=%d mem[3][8]=%d mem[6][8]=%d mem[7][192]=%d io_dir[3]=%d io_sending[3]=%d io_out[3]=%d",
+ $time, rdata[2], rdata[3], rdata[6], rdata[7], io_dir[3], io_sending_reg[3], io_output_reg[3]);
+ // no core writes at start
+ for (i=0; i<CORES; i=i+1) begin
+ we[i] = 0;
+ end
+ // every io port starts with io_dir = 1
+ for (i=0; i<IO_PORTS; i=i+1) begin
+ io_dir[i] = 1;
+ end
+ // active-low reset pulse from time 10 to time 20
+ clk = 0;
+ rst_n = 1;
+ #10 rst_n = 0;
+ #10 rst_n = 1;
+ // core 2 writes address 8 five times, 20 time units apart, with wspread 0..4 and data 100..500; we[2] stays 1 afterwards
+ #20
+ we[2] = 1;
+ waddr[2] = 8;
+ wspread[2] = 0;
+ wdata[2] = 100;
+
+ #20
+ waddr[2] = 8;
+ wspread[2] = 1;
+ wdata[2] = 200;
+
+ #20
+ waddr[2] = 8;
+ wspread[2] = 2;
+ wdata[2] = 300;
+
+ #20
+ waddr[2] = 8;
+ wspread[2] = 3;
+ wdata[2] = 400;
+
+ #20
+ waddr[2] = 8;
+ wspread[2] = 4;
+ wdata[2] = 500;
+ // flip io port 3 to io_dir = 0, first with io_receiving = 0
+ #20
+ io_dir[3] = 0;
+ io_receiving[3] = 0;
+ // then present the value 1234 on port 3 with io_receiving = 1
+ #20
+ io_receiving[3] = 1;
+ io_input[3] = 1234;
+
+ #20 $stop;
+end
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/dv/tb/pin_compress_tb.v b/verilog/dv/tb/pin_compress_tb.v
new file mode 100644
index 0000000..6cb29ea
--- /dev/null
+++ b/verilog/dv/tb/pin_compress_tb.v
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+module pin_compress_tb ();
+// Single-vector test of pin_compress: applies one data/mask pair, prints the result above the expected word and reports PASS or FAIL.
+parameter WIDTH = 16;
+
+reg [WIDTH-1:0] data;
+reg [WIDTH-1:0] mask;
+wire [WIDTH-1:0] result;
+
+pin_compress #(
+ .WIDTH(WIDTH)
+) pin_compress_dut (
+ .data(data),
+ .mask(mask),
+ .result(result)
+);
+// The expected word equals the data bits at the mask's set positions (0, 2, 6, 8, 11, 14) packed from bit 0 upward.
+initial begin
+ data <= 16'b1001110100110101;
+ mask <= 16'b0100100101000101;
+ #10 $display("%16b", result);
+ $display("%16b", 16'b0000000000011011);
+ if (result !== 16'b0000000000011011) $display("FAIL: pin_compress result differs from expected"); else $display("PASS"); // !== also flags x/z bits
+end
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/dv/tb/pin_decompress_tb.v b/verilog/dv/tb/pin_decompress_tb.v
new file mode 100644
index 0000000..09bb432
--- /dev/null
+++ b/verilog/dv/tb/pin_decompress_tb.v
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+module pin_decompress_tb ();
+// Single-vector test of pin_decompress: applies one data/mask pair, prints the result above the expected word and reports PASS or FAIL.
+parameter WIDTH = 16;
+
+reg [WIDTH-1:0] data;
+reg [WIDTH-1:0] mask;
+wire [WIDTH-1:0] result;
+
+pin_decompress #(
+ .WIDTH(WIDTH)
+) pin_decompress_dut (
+ .data(data),
+ .mask(mask),
+ .result(result)
+);
+// The expected word equals the low data bits (1, 1, 0, 1, 0, 0 from bit 0) placed at the mask's set positions (0, 2, 6, 8, 12, 14).
+initial begin
+ data <= 16'b0000000000001011;
+ mask <= 16'b0101000101000101;
+ #10 $display("%16b", result);
+ $display("%16b", 16'b0000000100000101);
+ if (result !== 16'b0000000100000101) $display("FAIL: pin_decompress result differs from expected"); else $display("PASS"); // !== also flags x/z bits
+end
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/dv/tb/prng_tb.v b/verilog/dv/tb/prng_tb.v
new file mode 100644
index 0000000..d010787
--- /dev/null
+++ b/verilog/dv/tb/prng_tb.v
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+module prng_tb(); // testbench: clocks a small prng instance and traces its internal state and output
+
+parameter STATE_BITS = 4; // forwarded to the dut unchanged
+parameter POLYNOMIAL = 4'b1100; // forwarded to the dut unchanged
+parameter STATE_INIT = 4'b0000; // forwarded to the dut unchanged
+parameter OUTPUT_BITS = 2; // width of the random bus; also forwarded to the dut
+
+reg clk;
+reg rst_n;
+wire [OUTPUT_BITS-1:0] random;
+
+prng #(
+ .STATE_BITS(STATE_BITS),
+ .POLYNOMIAL(POLYNOMIAL),
+ .STATE_INIT(STATE_INIT),
+ .OUTPUT_BITS(OUTPUT_BITS)
+) prng_dut (
+ .clk(clk),
+ .rst_n(rst_n),
+ .entropy(1'b0), // entropy input tied to constant 0 for this test
+ .random(random)
+);
+
+always #5 clk = ~clk; // toggles every 5 time units, i.e. a clock period of 10
+
+initial begin
+ $monitor("%4d %4b %4b %4b %2b", $time, prng_dut.state, prng_dut.g_shift[0].new_state, prng_dut.g_shift[1].new_state, random); // uses hierarchical references into the dut, so it depends on the dut's internal names
+ clk <= 0;
+ rst_n <= 0; // rst_n is driven low for the first 10 time units
+ #10 rst_n <= 1;
+ #100 $stop; // halt 100 time units after rst_n goes high
+end
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/dv/tb/prng_wrap_tb.v b/verilog/dv/tb/prng_wrap_tb.v
new file mode 100644
index 0000000..00a909a
--- /dev/null
+++ b/verilog/dv/tb/prng_wrap_tb.v
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+module prng_wrap_tb(); // testbench: clocks one prng_wrap instance and prints its configuration and output stream
+
+parameter INDEX = 0; // forwarded to the dut unchanged
+parameter OUTPUT_BITS = 128; // width of the random bus; also forwarded to the dut
+
+reg clk;
+reg rst_n;
+wire [OUTPUT_BITS-1:0] random;
+
+prng_wrap #(
+ .INDEX(INDEX),
+ .OUTPUT_BITS(OUTPUT_BITS)
+) prng_wrap_dut (
+ .clk(clk),
+ .rst_n(rst_n),
+ .entropy(1'b0), // entropy input tied to constant 0 for this test
+ .random(random)
+);
+
+always #5 clk = ~clk; // toggles every 5 time units, i.e. a clock period of 10
+
+initial begin
+ clk <= 0;
+ rst_n <= 0; // rst_n is driven low for the first 10 time units
+ #10 rst_n <= 1;
+ $display("%8x", prng_wrap_dut.prng_inst.POLYNOMIAL); // hierarchical reference: prints a parameter of the inner prng instance
+ $display("%8x", prng_wrap_dut.prng_inst.scrambled_init); // hierarchical reference into the inner prng instance
+ $monitor("%4d %128b", $time, random); // monitoring starts at time 10, after the two lines above
+ #200 $stop; // halt 200 time units after rst_n goes high
+end
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/dv/tb/prog_mux_tb.v b/verilog/dv/tb/prog_mux_tb.v
new file mode 100644
index 0000000..115abc8
--- /dev/null
+++ b/verilog/dv/tb/prog_mux_tb.v
@@ -0,0 +1,81 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+module prog_mux_tb(); // testbench: drives a few writes through prog_mux and traces what reaches cores 0 and 1
+
+parameter CORES=4;
+parameter LOG_CORES=2; // width of sel
+parameter PC_WIDTH=4; // width of waddr
+parameter INSTR_WIDTH=8; // width of wdata
+
+reg we;
+reg [LOG_CORES-1:0] sel;
+reg [PC_WIDTH-1:0] waddr;
+reg [INSTR_WIDTH-1:0] wdata;
+wire [CORES-1:0] cwe_raw; // flat dut buses: one slice per core
+wire [CORES*PC_WIDTH-1:0] cwaddr_raw;
+wire [CORES*INSTR_WIDTH-1:0] cwdata_raw;
+
+prog_mux #(
+ .CORES(CORES),
+ .LOG_CORES(LOG_CORES),
+ .PC_WIDTH(PC_WIDTH),
+ .INSTR_WIDTH(INSTR_WIDTH)
+) prog_mux_dut (
+ .we(we),
+ .sel(sel),
+ .waddr(waddr),
+ .wdata(wdata),
+ .cwe(cwe_raw),
+ .cwaddr(cwaddr_raw),
+ .cwdata(cwdata_raw)
+);
+
+wire cwe[CORES-1:0]; // per-core views of the flat buses, for readable $monitor arguments
+wire [PC_WIDTH-1:0] cwaddr[CORES-1:0];
+wire [INSTR_WIDTH-1:0] cwdata[CORES-1:0];
+
+generate genvar core;
+for (core=0; core<CORES; core=core+1) begin:g_core
+ assign cwe[core] = cwe_raw[core];
+ assign cwaddr[core] = cwaddr_raw[core*PC_WIDTH +: PC_WIDTH]; // slice belonging to this core
+ assign cwdata[core] = cwdata_raw[core*INSTR_WIDTH +: INSTR_WIDTH]; // slice belonging to this core
+end
+endgenerate
+
+initial begin
+ $monitor("time=%4t we=%d sel=%d waddr=%d wdata=%d cwe0=%d cwaddr0=%d cwdata0=%d cwe1=%d cwaddr1=%d cwdata1=%d",
+ $time, we, sel, waddr, wdata, cwe[0], cwaddr[0], cwdata[0], cwe[1], cwaddr[1], cwdata[1]); // only cores 0 and 1 are printed, cores 2 and 3 are not observed
+
+ we = 0; // sel, waddr and wdata are still unassigned (x) during this first step
+
+ #10
+ we = 1; // write with core 0 selected
+ sel = 0;
+ waddr = 3;
+ wdata = 11;
+
+ #10
+ we = 0; // write enable dropped, other inputs unchanged
+
+ #10
+ we = 1; // write with core 1 selected
+ sel = 1;
+ waddr = 5;
+ wdata = 25;
+
+ #10
+ sel = 0; // we is still 1 here: back-to-back write with core 0 selected
+ waddr = 0;
+ wdata = 1;
+
+ #10
+ $stop;
+end
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/dv/tb/wb_mux_tb.v b/verilog/dv/tb/wb_mux_tb.v
new file mode 100644
index 0000000..484bc15
--- /dev/null
+++ b/verilog/dv/tb/wb_mux_tb.v
@@ -0,0 +1,117 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+module wb_mux_tb(); // testbench: walks wb_mux through a read and a write in each of the four top-level address regions
+
+parameter LOG_CORES = 3; // width of prog_sel and debug_sel
+parameter PC_WIDTH = 8; // width of prog_waddr
+parameter INSTR_WIDTH = 32; // width of prog_wdata
+parameter DATA_WIDTH = 16; // width of debug_wdata and debug_rdata
+parameter IO_PINS = 16; // width of pads_wdata
+parameter WB_WIDTH = 32; // width of the wb_* address/data buses and entropy_word
+
+reg wb_stb_i; // regs below are stimulus driven by this testbench
+reg wb_cyc_i;
+reg wb_we_i;
+reg [WB_WIDTH-1:0] wb_adr_i;
+reg [WB_WIDTH-1:0] wb_dat_i;
+wire wbs_ack_o; // wires below are only observed here; presumably dut outputs, confirm against wb_mux
+wire [WB_WIDTH-1:0] wbs_dat_o;
+wire prog_we;
+wire [LOG_CORES-1:0] prog_sel;
+wire [PC_WIDTH-1:0] prog_waddr;
+wire [INSTR_WIDTH-1:0] prog_wdata;
+wire pads_we;
+wire pads_waddr; // single-bit address
+wire [IO_PINS-1:0] pads_wdata;
+wire [LOG_CORES-1:0] debug_sel;
+wire [4:0] debug_addr;
+wire debug_we;
+wire [DATA_WIDTH-1:0] debug_wdata;
+reg [DATA_WIDTH-1:0] debug_rdata; // stimulus: value offered to the dut on its debug read port
+wire [WB_WIDTH-1:0] entropy_word;
+
+wb_mux #(
+ .LOG_CORES(LOG_CORES),
+ .PC_WIDTH(PC_WIDTH),
+ .INSTR_WIDTH(INSTR_WIDTH),
+ .DATA_WIDTH(DATA_WIDTH),
+ .IO_PINS(IO_PINS),
+ .WB_WIDTH(WB_WIDTH)
+) wb_mux_dut (
+ .wb_stb_i(wb_stb_i),
+ .wb_cyc_i(wb_cyc_i),
+ .wb_we_i(wb_we_i),
+ .wb_adr_i(wb_adr_i),
+ .wb_dat_i(wb_dat_i),
+ .wbs_ack_o(wbs_ack_o),
+ .wbs_dat_o(wbs_dat_o),
+ .prog_we(prog_we),
+ .prog_sel(prog_sel),
+ .prog_waddr(prog_waddr),
+ .prog_wdata(prog_wdata),
+ .pads_we(pads_we),
+ .pads_waddr(pads_waddr),
+ .pads_wdata(pads_wdata),
+ .debug_sel(debug_sel),
+ .debug_addr(debug_addr),
+ .debug_we(debug_we),
+ .debug_wdata(debug_wdata),
+ .debug_rdata(debug_rdata),
+ .entropy_word(entropy_word)
+);
+
+initial begin
+ $monitor("time %4t / wa %1b wdo %32b / pwe %1b ps %3b pwa %8b pwd %32b / awe %1b aa %1b awd %16b / ds %3b da %5b dwe %1b dwd %16b / ew %32b",
+ $time, wbs_ack_o, wbs_dat_o, prog_we, prog_sel, prog_waddr, prog_wdata, pads_we, pads_waddr, pads_wdata,
+ debug_sel, debug_addr, debug_we, debug_wdata, entropy_word); // label key: wa/wdo = wbs ack/data, p* = prog, a* = pads, d* = debug, ew = entropy_word
+ // before cycle
+ wb_stb_i = 0;
+ wb_cyc_i = 0;
+ wb_we_i = 0;
+ wb_adr_i = 0;
+ wb_dat_i = 32'b11111111111111111111111111111111; // write data stays all ones for the whole run
+ debug_rdata = 16'b1111000010101010; // constant pattern offered for the debug read below
+ #10
+ // prog read (no effect)
+ wb_stb_i = 1; // stb and cyc stay asserted until the "after cycle" step
+ wb_cyc_i = 1;
+ wb_adr_i = 32'b00_0000000000000000000_101_11011011; // literal grouped as 2 + 19 + 3 + 8 bits; the 3- and 8-bit groups match the prog_sel and prog_waddr widths
+ #10
+ // prog write
+ wb_we_i = 1;
+ #10
+ // pads read (no effect)
+ wb_we_i = 0;
+ wb_adr_i = 32'b01_000000000000000000000000000001; // top two bits 01, lowest bit set
+ #10
+ // pads write
+ wb_we_i = 1;
+ #10
+ // debug read
+ wb_we_i = 0;
+ wb_adr_i = 32'b10_0000000000000000000000_010_01010; // literal grouped as 2 + 22 + 3 + 5 bits; the 3- and 5-bit groups match the debug_sel and debug_addr widths
+ #10
+ // debug write
+ wb_we_i = 1;
+ #10
+ // entropy read (no effect)
+ wb_we_i = 0;
+ wb_adr_i = 32'b11_000000000000000000000000000000; // top two bits 11, all other bits zero
+ #10
+ // entropy write
+ wb_we_i = 1;
+ #10
+ // after cycle
+ wb_stb_i = 0;
+ wb_cyc_i = 0;
+ #10
+ $stop;
+end
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/rtl/alu.v b/verilog/rtl/alu.v
new file mode 100644
index 0000000..0d7c26c
--- /dev/null
+++ b/verilog/rtl/alu.v
@@ -0,0 +1,154 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+/*
+Fully combinatorial arithmetic logic unit
+
+Opcode matrix:
+0000 and in1 & in2 out set to result, carry_out set to |result
+0001 or in1 | in2 out set to result, carry_out set to &result
+0010 xor in1 ^ in2 out set to result, carry_out set to ^result
+0011 mux carry ? in2 : in1 out set to result, carry_out set to highest bit of result
+0100 nand ~(in1 & in2) out set to result, carry_out set to &result
+0101 nor ~(in1 | in2) out set to result, carry_out set to |result
+0110 nxor ~(in1 ^ in2) out set to result, carry_out set to ~^~result
+0111 nmux ~(carry ? in2 : in1) out set to result, carry_out set to highest bit of result
+1000 rcl in1 << in2 carry shifted in, carry_out shifted out
+1001 rcr in1 >> in2 carry shifted in, carry_out shifted out
+1010 add in1 + in2 + carry {carry_out, out} set to result
+1011 sub in1 - in2 - carry {carry_out, out} set to result
+1100 mul in1 * in2 out set to low W bits of result, carry_out set if high W bits are nonzero
+1101 mulh in1 * in2 out set to high W bits of result, carry_out set if high W bits are nonzero
+1110 muld in1 * {1, in2} {carry_out, out} set to high W+1 bits of result
+1111 log clog2(in1 + carry) out set to result, carry_out set if in1 + carry is a power of 2
+
+There is no division opcode, but `muld` was included for the "division by invariant multiplication" algorithm.
+Division by a constant can be compiled to a `muld` followed by an `rcr`.
+*/
+
+module alu #(parameter DATA_WIDTH=16) (
+ input [3:0] opcode, // selects which of the 16 precomputed results is driven onto out/carry_out
+ input [DATA_WIDTH-1:0] in1,
+ input [DATA_WIDTH-1:0] in2,
+ input carry, // carry input; read by mux, nmux, rcl, rcr, add, sub and log only
+ output [DATA_WIDTH-1:0] out,
+ output carry_out
+);
+
+ wire [DATA_WIDTH-1:0] op_out[15:0]; // result of every operation, indexed by opcode; all are computed in parallel
+ wire op_carry[15:0]; // carry output of every operation, indexed by opcode
+
+ wire [DATA_WIDTH-1:0] and_out = in1 & in2;
+ wire and_carry = |and_out; // set when the result is nonzero
+ assign op_out[0] = and_out;
+ assign op_carry[0] = and_carry;
+
+ wire [DATA_WIDTH-1:0] or_out = in1 | in2;
+ wire or_carry = &or_out; // set when the result is all ones
+ assign op_out[1] = or_out;
+ assign op_carry[1] = or_carry;
+
+ wire [DATA_WIDTH-1:0] xor_out = in1 ^ in2;
+ wire xor_carry = ^xor_out; // parity of the result
+ assign op_out[2] = xor_out;
+ assign op_carry[2] = xor_carry;
+
+ wire [DATA_WIDTH-1:0] mux_out = carry ? in2 : in1;
+ wire mux_carry = mux_out[DATA_WIDTH-1]; // top bit of the selected input
+ assign op_out[3] = mux_out;
+ assign op_carry[3] = mux_carry;
+
+ wire [DATA_WIDTH-1:0] nand_out = ~and_out; // opcodes 4..7 are the bitwise inverses of opcodes 0..3, with inverted carry
+ wire nand_carry = ~and_carry;
+ assign op_out[4] = nand_out;
+ assign op_carry[4] = nand_carry;
+
+ wire [DATA_WIDTH-1:0] nor_out = ~or_out;
+ wire nor_carry = ~or_carry;
+ assign op_out[5] = nor_out;
+ assign op_carry[5] = nor_carry;
+
+ wire [DATA_WIDTH-1:0] nxor_out = ~xor_out;
+ wire nxor_carry = ~xor_carry;
+ assign op_out[6] = nxor_out;
+ assign op_carry[6] = nxor_carry;
+
+ wire [DATA_WIDTH-1:0] nmux_out = ~mux_out;
+ wire nmux_carry = ~mux_carry;
+ assign op_out[7] = nmux_out;
+ assign op_carry[7] = nmux_carry;
+
+ wire [DATA_WIDTH-1:0] rcl_out;
+ wire rcl_carry, rcl_ignore;
+ assign {rcl_carry, rcl_out, rcl_ignore} = {1'b0, in1, carry} << in2; // carry sits just below in1 and is shifted in first; the bit just above in1 after the shift becomes carry_out (0 for a shift by 0)
+ assign op_out[8] = rcl_out;
+ assign op_carry[8] = rcl_carry;
+
+ wire [DATA_WIDTH-1:0] rcr_out;
+ wire rcr_carry, rcr_ignore;
+ assign {rcr_ignore, rcr_out, rcr_carry} = {carry, in1, 1'b0} >> in2; // mirror image of rcl: carry sits just above in1, the bit just below in1 after the shift becomes carry_out
+ assign op_out[9] = rcr_out;
+ assign op_carry[9] = rcr_carry;
+
+ wire [DATA_WIDTH-1:0] add_out;
+ wire add_carry;
+ assign {add_carry, add_out} = in1 + in2 + carry; // the DATA_WIDTH+1 bit target widens the sum, so the extra top bit is captured in add_carry
+ assign op_out[10] = add_out;
+ assign op_carry[10] = add_carry;
+
+ wire [DATA_WIDTH-1:0] sub_out;
+ wire sub_carry;
+ assign {sub_carry, sub_out} = in1 - in2 - carry; // carry acts as borrow-in; sub_carry is the top bit of the widened difference
+ assign op_out[11] = sub_out;
+ assign op_carry[11] = sub_carry;
+
+ wire [DATA_WIDTH-1:0] mulh_out;
+ wire [DATA_WIDTH-1:0] mul_out;
+ assign {mulh_out, mul_out} = in1 * in2; // one double-width product shared by mul (low half) and mulh (high half)
+ wire mul_carry = |mulh_out; // set when the product does not fit in DATA_WIDTH bits
+ wire mulh_carry = mul_carry;
+ assign op_out[12] = mul_out;
+ assign op_carry[12] = mul_carry;
+ assign op_out[13] = mulh_out;
+ assign op_carry[13] = mulh_carry;
+
+ wire [DATA_WIDTH-1:0] muld_out;
+ wire [DATA_WIDTH-1:0] muld_ignore;
+ wire muld_carry;
+ assign {muld_carry, muld_out, muld_ignore} = in1 * {1'b1, in2}; // multiplier is in2 with an extra 1 bit on top; the low DATA_WIDTH bits of the product are discarded
+ assign op_out[14] = muld_out;
+ assign op_carry[14] = muld_carry;
+
+ wire [DATA_WIDTH-1:0] in1c = in1 + carry; // x = in1 + carry, truncated to DATA_WIDTH bits
+ wire [DATA_WIDTH-1:0] in1d = in1 - (!carry); // x - 1
+ wire [DATA_WIDTH-1:0] log_bits; // index of the highest set bit of in1d, found by the binary search below
+ localparam LOG_WIDTH = $clog2(DATA_WIDTH);
+ assign log_bits[DATA_WIDTH-1:LOG_WIDTH] = 0; // only the low LOG_WIDTH bits are computed
+ generate genvar i;
+ for (i=LOG_WIDTH-1; i>=0; i=i-1) begin:g_bit
+ wire [(1<<(i+1))-1:0] subseq; // current search window; halves in width at every step
+ if (i == LOG_WIDTH-1) begin:i_first
+ assign subseq = in1d; // first step searches the whole value
+ end else begin:i_nfirst
+ wire [i+1:0] index = {log_bits[i+1], {(i+1){1'b0}}}; // offset of the upper or lower half of the previous window, chosen by the bit decided there
+ assign subseq = g_bit[i+1].subseq[index +: 1<<(i+1)];
+ end
+ assign log_bits[i] = |subseq[1<<i +: 1<<i]; // 1 when the upper half of the window has a set bit
+ end
+ endgenerate
+ wire in1nz = in1c || carry; // x is nonzero; the carry term covers the case where in1 + carry wrapped to 0
+ wire in1no = |in1d; // x - 1 is nonzero
+ wire [DATA_WIDTH-1:0] log_out = in1nz ? (log_bits + in1no) : -1; // highest-set-bit index of x-1, plus 1 unless x-1 is 0; all ones when x is 0
+ wire log_carry = in1nz && !(in1c & in1d); // x nonzero and x & (x-1) == 0
+ assign op_out[15] = log_out;
+ assign op_carry[15] = log_carry;
+
+ assign out = op_out[opcode]; // final 16-way selection
+ assign carry_out = op_carry[opcode];
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/rtl/cpu_core.v b/verilog/rtl/cpu_core.v
new file mode 100644
index 0000000..912b50d
--- /dev/null
+++ b/verilog/rtl/cpu_core.v
@@ -0,0 +1,316 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+/*
+Central processing unit (single core)
+
+Has two general-purpose registers and a carry flag and executes an instruction on every clock cycle.
+Fetches instructions via the progctr (out) and opcode (in) ports. Each opcode instructs the cpu to
+take two values from registers, memory or other sources, feed them through the ALU and put the
+results in a register or memory cell or use it as a jump target.
+
+Opcode structure assumes INSTR_WIDTH=32. Changing it requires substantial edits to the code below.
+
+Opcodes have 32 bits and use the following format:
+AAA BBB C DD EEEE FFF GGGGGGGGGGGGGGGG
+A = source for ALU input 1
+B = source for ALU input 2
+C = reset carry flag used as ALU input
+D = extra options, see below
+E = ALU opcode
+F = target for ALU result
+G = immediate value, can be used as a source
+
+Possible values for sources A & B:
+000 use register 1
+001 use register 2
+010 use program counter
+011 read value from memory address previously specified
+100 use immediate value
+101 use high (A) or low (B) 8 bits of immediate value
+110 use timer (A) or prng (B)
+111 use cpu number (A) or the constant 1 (B)
+
+Possible values for target F:
+000 ignore
+001 set register 1
+010 set register 2
+011 set program counter (jump)
+100 set memory read address
+101 set memory write address
+110 set spread value for memory write
+111 write value to memory address previously specified
+
+Possible values for ALU opcode E and how they use/set the carry flag are described in the
+ALU source header.
+
+Extra options in D were chosen to make classic Random Access Machine operations more
+succinct. They are:
+00 business as usual
+01 set carry to highest bit of input 1 (specified as source A)
+ then replace input 1 with the immediate value;
+ also toggle this carry flag if C was set (and don't clear it, of course)
+10 read value from memory and store it in register 1
+ (if the instruction uses register 1 as the target, store it in register 2 instead)
+11 set memory write address/spread/data based on the immediate value
+ (if write data is set in this operation, it also triggers a memory write)
+ if F==101 (address set from alu out): D ssss ddddddddddd = immediate
+ if F==110 (spread set from alu out): D aaaaaaaa ddddddd = immediate
+ otherwise: A aaaaaaaaaaa ssss = immediate
+ if the D or A bit is present, use register 1 for data/address instead
+ and use the rest of the immediate value for the other part (aaa/sss/ddd)
+
+Example opcodes to implement Random Access Machine instructions:
+
+* M[i] = 0 // set memory slot i to zero
+ 000 000 1 11 0010 111 0iiiiiiiiiii0000
+
+* M[i] = M[i] + 1 // increment value in memory slot i
+ 100 000 1 00 0011 100 iiiiiiiiiiiiiiii
+ 011 111 1 11 1010 111 0iiiiiiiiiii0000
+
+* M[i] = M[i] - 1 // decrement value in memory slot i
+ 100 000 1 00 0011 100 iiiiiiiiiiiiiiii
+ 011 111 1 11 1011 111 0iiiiiiiiiii0000
+
+* M[i] = M[i] + M[j] // add value in memory slot j to memory slot i
+ 100 000 1 00 0011 100 jjjjjjjjjjjjjjjj
+ 100 000 1 10 0011 100 iiiiiiiiiiiiiiii
+ 011 000 1 11 1010 111 0iiiiiiiiiii0000
+
+* M[i] = M[i] - M[j] // subtract value in memory slot j from memory slot i
+ 100 000 1 00 0011 100 jjjjjjjjjjjjjjjj
+ 100 000 1 10 0011 100 iiiiiiiiiiiiiiii
+ 011 000 1 11 1011 111 0iiiiiiiiiii0000
+
+* M[M[i]] = M[j] // set memory pointed to by slot i to value in slot j
+ 100 000 1 00 0011 100 iiiiiiiiiiiiiiii
+ 100 000 1 10 0011 100 jjjjjjjjjjjjjjjj
+ 011 000 1 11 0011 111 1000000000000000
+
+* M[i] = M[M[j]] // set value in slot i to memory pointed to by slot j
+ 100 000 1 00 0011 100 jjjjjjjjjjjjjjjj
+ 011 000 1 00 0011 100 0000000000000000
+ 011 000 1 11 0011 111 0iiiiiiiiiii0000
+
+* if M[i] < 0 goto j // conditional jump
+ 100 000 1 00 0011 100 iiiiiiiiiiiiiiii
+ 011 010 1 01 0011 011 jjjjjjjjjjjjjjjj
+
+Parameters:
+DATA_WIDTH = processor word size
+PC_WIDTH = size of program counter, should be <= DATA_WIDTH
+ADDR_WIDTH = size of mem_mesh addresses, should be <= DATA_WIDTH
+SPREAD_WIDTH = size of mem_mesh spread value
+INSTR_WIDTH = combined size of opcode & immediate, should be kept at 32
+CPU_NUM = id number to differentiate cpu cores, can be queried by code running on the processor
+*/
+
+module cpu_core #(parameter DATA_WIDTH=16, PC_WIDTH=8, ADDR_WIDTH=8, SPREAD_WIDTH=3, INSTR_WIDTH=32, CPU_NUM=0) (
+ input clk, // clock signal
+ input rst_n, // reset, active low
+ input [INSTR_WIDTH-1:0] opcode, // opcode to be executed & immediate args
+ input [DATA_WIDTH-1:0] mem_rdata, // connected to 'rdata' of memory module
+ input [DATA_WIDTH-1:0] prng_in, // random number from prng
+ input [1:0] debug_mode, // debug: 00 = no change, 01 = single step, 10 = run, 11 = stop
+ input [3:0] debug_sel, // debug: cpu status register to query or modify
+ input debug_we, // debug: modify selected status register
+ input [DATA_WIDTH-1:0] debug_wdata, // debug: new value of selected status register
+ output [PC_WIDTH-1:0] progctr, // program counter
+ output mem_we, // +-
+ output [ADDR_WIDTH-1:0] mem_waddr, // | connected to
+ output [SPREAD_WIDTH-1:0] mem_wspread, // | corresponding ports
+ output [DATA_WIDTH-1:0] mem_wdata, // | of memory module
+ output [ADDR_WIDTH-1:0] mem_raddr, // +-
+ output debug_stopped, // debug: read back whether core is stopped
+ output [DATA_WIDTH-1:0] debug_rdata // debug: current value of selected status register
+);
+
+reg [DATA_WIDTH-1:0] reg1; // general-purpose registers
+reg [DATA_WIDTH-1:0] reg2;
+reg carry; // carry flag
+reg [DATA_WIDTH-1:0] pc; // register for program counter
+reg [DATA_WIDTH-1:0] timer; // incremented on every executed instruction, cleared on reset
+reg [ADDR_WIDTH-1:0] raddr; // next read address
+reg we; // write to memory on next cycle
+reg [ADDR_WIDTH-1:0] waddr; // next write address
+reg [SPREAD_WIDTH-1:0] wspread; // next write spread
+reg [DATA_WIDTH-1:0] wdata; // next write data
+reg stopped; // cpu core is stopped
+
+assign progctr = pc; // pc is DATA_WIDTH wide; only its low PC_WIDTH bits leave the core
+assign mem_we = we;
+assign mem_waddr = waddr;
+assign mem_wspread = wspread;
+assign mem_wdata = wdata;
+assign mem_raddr = raddr;
+
+// opcode subdivision
+wire [2:0] op_in1; // input 1 source
+wire [2:0] op_in2; // input 2 source
+wire op_rst_carry; // reset carry flag
+wire [1:0] op_extra; // extra steps before alu processing
+wire [3:0] op_alu; // send this opcode (and in1, in2, carry) to the alu
+wire [2:0] op_target; // target for alu result
+wire [15:0] op_immed; // hardcoded value(s) to use as an input source
+assign {op_in1, op_in2, op_rst_carry, op_extra, op_alu, op_target, op_immed} = opcode;
+
+wire op_extra_carry = op_extra == 1; // set carry based on in1, replace in1 with immediate
+wire op_extra_rdata = op_extra == 2; // copy rdata to reg1 (or reg2 if reg1 is the target)
+wire op_extra_waddr = op_extra == 3; // fill waddr & wspread from immediate
+
+wire [DATA_WIDTH-1:0] next_pc = pc + 1;
+
+wire [DATA_WIDTH-1:0] sources1[7:0]; // candidate values for alu input 1, indexed by op_in1
+assign sources1[0] = reg1;
+assign sources1[1] = reg2;
+assign sources1[2] = next_pc; // "program counter" as a source is the address of the following instruction
+assign sources1[3] = mem_rdata;
+assign sources1[4] = op_immed;
+assign sources1[5] = op_immed[15:8];
+assign sources1[6] = timer;
+assign sources1[7] = CPU_NUM;
+
+wire [DATA_WIDTH-1:0] sources2[7:0]; // candidate values for alu input 2, indexed by op_in2
+assign sources2[0] = reg1;
+assign sources2[1] = reg2;
+assign sources2[2] = next_pc;
+assign sources2[3] = mem_rdata;
+assign sources2[4] = op_immed;
+assign sources2[5] = op_immed[7:0];
+assign sources2[6] = prng_in;
+assign sources2[7] = 1;
+
+wire [DATA_WIDTH-1:0] in1_orig = sources1[op_in1]; // data to use as alu input 1, unless overridden by op_extra_carry
+wire in1_oh = in1_orig[DATA_WIDTH-1]; // highest bit of in1_orig
+wire [DATA_WIDTH-1:0] in1 = op_extra_carry ? op_immed : in1_orig; // data to use as alu input 1
+wire [DATA_WIDTH-1:0] in2 = sources2[op_in2]; // data to use as alu input 2
+wire carry_def = op_rst_carry ? 0 : carry; // carry to use as alu input, unless overridden by op_extra_carry
+wire carry_ovr = op_rst_carry ? ~in1_oh : in1_oh; // override value if op_extra_carry is set
+wire alu_cin = op_extra_carry ? carry_ovr : carry_def; // consolidated carry input for alu
+
+wire [DATA_WIDTH-1:0] alu_out; // data output from alu
+wire alu_cout; // carry output from alu
+
+alu #(
+ .DATA_WIDTH(DATA_WIDTH)
+) alu_dut (
+ .opcode(op_alu),
+ .in1(in1),
+ .in2(in2),
+ .carry(alu_cin),
+ .out(alu_out),
+ .carry_out(alu_cout)
+);
+
+wire op_target_reg1 = op_target == 1; // one-hot decode of op_target (0 = result ignored)
+wire op_target_reg2 = op_target == 2;
+wire op_target_pc = op_target == 3;
+wire op_target_raddr = op_target == 4;
+wire op_target_waddr = op_target == 5;
+wire op_target_wspread = op_target == 6;
+wire op_target_wdata = op_target == 7;
+
+// extract values from immediate to prepare for op_extra_waddr case
+wire immed_ovr = op_immed[15]; // top bit of the immediate: take the data/address part from reg1 instead
+wire [DATA_WIDTH-1:0] s_hi4 = immed_ovr ? op_immed[14:0] : op_immed[14:11];
+wire [DATA_WIDTH-1:0] d_lo11 = immed_ovr ? reg1 : op_immed[10:0];
+wire [DATA_WIDTH-1:0] a_hi8 = immed_ovr ? op_immed[14:0] : op_immed[14:7];
+wire [DATA_WIDTH-1:0] d_lo7 = immed_ovr ? reg1 : op_immed[6:0];
+wire [DATA_WIDTH-1:0] a_hi11 = immed_ovr ? reg1 : op_immed[14:4];
+wire [DATA_WIDTH-1:0] s_lo4 = immed_ovr ? op_immed[14:0] : op_immed[3:0];
+
+// update target with alu output
+// if op_extra_rdata is set, also write mem_rdata to reg1 (if target is reg1, use reg2 instead)
+// if op_extra_waddr is set, also fill waddr & wspread with immediate (if target is waddr/wspread, replace with wdata)
+wire [DATA_WIDTH-1:0] reg1_mod = op_target_reg1 ? alu_out : (op_extra_rdata ? mem_rdata : reg1);
+wire [DATA_WIDTH-1:0] reg2_mod = op_target_reg2 ? alu_out : ((op_extra_rdata && op_target_reg1) ? mem_rdata : reg2);
+wire [DATA_WIDTH-1:0] pc_mod = op_target_pc ? alu_out : next_pc;
+wire [DATA_WIDTH-1:0] raddr_mod = op_target_raddr ? alu_out : raddr;
+wire [DATA_WIDTH-1:0] waddr_mod = op_target_waddr ? alu_out :
+ (op_extra_waddr ? (op_target_wspread ? a_hi8 : a_hi11) : waddr);
+wire [DATA_WIDTH-1:0] wspread_mod = op_target_wspread ? alu_out :
+ (op_extra_waddr ? (op_target_waddr ? s_hi4 : s_lo4) : wspread);
+wire [DATA_WIDTH-1:0] wdata_mod = op_target_wdata ? alu_out :
+ (op_extra_waddr ? (op_target_wspread ? d_lo7 : (op_target_waddr ? d_lo11 : wdata)) : wdata);
+wire we_mod = op_target_wdata || (op_extra_waddr && (op_target_waddr || op_target_wspread)); // a write is triggered whenever wdata gets a new value
+
+// debug interface
+wire [DATA_WIDTH-1:0] debug_reg[15:0]; // read-back view of the core state, indexed by debug_sel
+assign debug_reg[0] = pc;
+assign debug_reg[1] = opcode[31:16];
+assign debug_reg[2] = opcode[15:0];
+assign debug_reg[3] = reg1;
+assign debug_reg[4] = reg2;
+assign debug_reg[5] = carry;
+assign debug_reg[6] = alu_out;
+assign debug_reg[7] = alu_cout;
+assign debug_reg[8] = timer;
+assign debug_reg[9] = prng_in;
+assign debug_reg[10] = raddr;
+assign debug_reg[11] = mem_rdata;
+assign debug_reg[12] = we;
+assign debug_reg[13] = waddr;
+assign debug_reg[14] = wspread;
+assign debug_reg[15] = wdata;
+assign debug_rdata = debug_reg[debug_sel];
+assign debug_stopped = stopped;
+wire stopped_mod = debug_mode[1] ? debug_mode[0] : stopped; // next stopped state: run (10) clears it, stop (11) sets it, 00/01 keep it; declared explicitly because default_nettype is none
+
+// sequential logic
+always @ (posedge clk) begin
+ if (!rst_n) begin
+ reg1 <= 0;
+ reg2 <= 0;
+ carry <= 0;
+ pc <= 0;
+ timer <= 0;
+ raddr <= 0;
+ we <= 0;
+ waddr <= 0;
+ wspread <= 0;
+ wdata <= 0;
+ stopped <= 0;
+ end else begin
+ stopped <= stopped_mod; // latch run/stop requests on every cycle, so a stop request is remembered even though no instruction executes on that cycle
+ if (debug_we) begin
+ // don't run instructions on cycles with debug writes
+ case (debug_sel)
+ // wires can't be changed, only regs
+ 0: pc <= debug_wdata;
+ // opcode high & low skipped
+ 3: reg1 <= debug_wdata;
+ 4: reg2 <= debug_wdata;
+ 5: carry <= debug_wdata;
+ // alu_out & alu_cout skipped
+ 8: timer <= debug_wdata;
+ // prng_in skipped
+ 10: raddr <= debug_wdata;
+ // mem_rdata skipped
+ 12: we <= debug_wdata;
+ 13: waddr <= debug_wdata;
+ 14: wspread <= debug_wdata;
+ 15: wdata <= debug_wdata;
+ endcase
+ end else if (!stopped_mod || debug_mode == 2'b01) begin
+ // running or single stepping
+ reg1 <= reg1_mod;
+ reg2 <= reg2_mod;
+ carry <= alu_cout;
+ pc <= pc_mod;
+ timer <= timer + 1;
+ raddr <= raddr_mod;
+ we <= we_mod;
+ waddr <= waddr_mod;
+ wspread <= wspread_mod;
+ wdata <= wdata_mod;
+ end
+ end
+end
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/rtl/debug_mux.v b/verilog/rtl/debug_mux.v
new file mode 100644
index 0000000..68717cf
--- /dev/null
+++ b/verilog/rtl/debug_mux.v
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+/*
+Fully combinatorial debugging multiplexer
+
+Sends messages to cpu cores to run, stop or single step.
+Queries or modifies registers and status flags.
+*/
+
+module debug_mux #(parameter CORES=8, LOG_CORES=3, DATA_WIDTH=16) (
+ input [LOG_CORES-1:0] sel, // controller interface
+ input [4:0] addr, // 0xxxx affects status register xxxx, 10000 affects running/stopped state
+ input we,
+ input [DATA_WIDTH-1:0] wdata,
+ output [DATA_WIDTH-1:0] rdata,
+ input [CORES-1:0] reg_stopped, // interface towards cpu cores
+ input [CORES*DATA_WIDTH-1:0] reg_rdata, // flat buses below hold one slice per core, core 0 in the lowest bits
+ output [CORES*2-1:0] cpu_mode,
+ output [CORES*4-1:0] reg_sel,
+ output [CORES-1:0] reg_we,
+ output [CORES*DATA_WIDTH-1:0] reg_wdata
+);
+
+wire reg_stopped_i[CORES-1:0]; // per-core array views of the flat ports above
+wire [DATA_WIDTH-1:0] reg_rdata_i[CORES-1:0];
+wire [1:0] cpu_mode_i[CORES-1:0];
+wire [3:0] reg_sel_i[CORES-1:0];
+wire reg_we_i[CORES-1:0];
+wire [DATA_WIDTH-1:0] reg_wdata_i[CORES-1:0];
+
+wire cc_mode; // addr[4]: access targets the run/stop state instead of a status register
+wire [3:0] cc_sel; // addr[3:0]: status register number
+assign {cc_mode, cc_sel} = addr;
+assign rdata = cc_mode ? reg_stopped_i[sel] : reg_rdata_i[sel]; // read path: stopped flag (widened to DATA_WIDTH) or register data of the selected core
+
+generate genvar core;
+for(core=0; core<CORES; core=core+1) begin:g_core
+ assign reg_stopped_i[core] = reg_stopped[core]; // unpack inputs / pack outputs for this core
+ assign reg_rdata_i[core] = reg_rdata[core*DATA_WIDTH +: DATA_WIDTH];
+ assign cpu_mode[core*2 +: 2] = cpu_mode_i[core];
+ assign reg_sel[core*4 +: 4] = reg_sel_i[core];
+ assign reg_we[core] = reg_we_i[core];
+ assign reg_wdata[core*DATA_WIDTH +: DATA_WIDTH] = reg_wdata_i[core];
+
+ wire cur = sel == core; // this core is the one addressed by the controller
+ assign cpu_mode_i[core] = (cur && we && cc_mode) ? wdata : 2'b00; // mode command is the low 2 bits of wdata, present only while the write is asserted; all other cores see 00
+ assign reg_sel_i[core] = (cur && !cc_mode) ? cc_sel : 4'b0000; // register select is forwarded for reads as well as writes
+ assign reg_we_i[core] = cur && we && !cc_mode;
+ assign reg_wdata_i[core] = (cur && we && !cc_mode) ? wdata : 0;
+end
+endgenerate
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/rtl/entropy_pool.v b/verilog/rtl/entropy_pool.v
new file mode 100644
index 0000000..c9e984c
--- /dev/null
+++ b/verilog/rtl/entropy_pool.v
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+/*
+Simple entropy pool, shifting a single bit into prng's in each clock cycle
+*/
+
+module entropy_pool #(parameter WIDTH=16) (
+ input clk,
+ input rst_n, // synchronous reset, checked as !rst_n below
+ input[WIDTH-1:0] e_word, // word xor-ed into the pool on every clock
+ output e_bit // lowest bit of the current pool contents
+);
+
+reg[WIDTH-1:0] e_pool; // pool register
+wire[WIDTH-1:0] e_pool_mod; // next pool value
+assign {e_pool_mod, e_bit} = {1'b0, e_pool} ^ {e_word, 1'b0}; // e_bit = e_pool[0]; next pool = (e_pool >> 1) ^ e_word
+
+always @(posedge clk) begin
+ if(!rst_n)
+ e_pool <= 0; // pool is cleared while rst_n is low
+ else
+ e_pool <= e_pool_mod;
+end
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/rtl/instr_mem.v b/verilog/rtl/instr_mem.v
new file mode 100644
index 0000000..61638a9
--- /dev/null
+++ b/verilog/rtl/instr_mem.v
@@ -0,0 +1,38 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+/*
+Instruction memory
+*/
+
+module instr_mem #(parameter PC_WIDTH=8, INSTR_WIDTH=32, DEPTH=128) (
+ input clk,
+ input rst_n, // synchronous reset, checked as !rst_n below
+ input [PC_WIDTH-1:0] raddr, // read address
+ output [INSTR_WIDTH-1:0] rdata, // word at raddr, combinational
+ input we, // write enable
+ input [PC_WIDTH-1:0] waddr, // write address
+ input [INSTR_WIDTH-1:0] wdata // word stored at waddr on the clock edge when we is set
+);
+
+reg [INSTR_WIDTH-1:0] mem[DEPTH-1:0]; // NOTE(review): with the defaults, addresses are 8 bits (256 values) but only DEPTH=128 words exist
+
+assign rdata = mem[raddr]; // read is not clocked: rdata follows raddr and the memory contents directly
+
+integer i;
+always @ (posedge clk) begin
+ if (!rst_n) begin
+ for (i=0; i<DEPTH; i=i+1) begin
+ mem[i] <= {(INSTR_WIDTH){1'b0}}; // reset clears every word
+ end
+ end else begin
+ if (we) mem[waddr] <= wdata;
+ end
+end
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/rtl/io_filter.v b/verilog/rtl/io_filter.v
new file mode 100644
index 0000000..91d670c
--- /dev/null
+++ b/verilog/rtl/io_filter.v
@@ -0,0 +1,104 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+/*
+IO filter
+
+Interfaces the io pins of the chip to the io ports of mem_mesh.
+
+An io port is created for each individual pin where the lowest bit sent on the port is forwarded
+to the pin and a bit coming from the pin is stretched to the full port width.
+
+Two additional io ports are created by joining together all input pins and all output pins respectively,
+right-aligned and zero-padded.
+
+Pins send and receive continuous streams of bits while io ports only fire on changes.
+Writing ports corresponding to individual pins override bits of the joined output port.
+
+We assume IO_PINS <= DATA_WIDTH. Alternatively we could modify the code to use more than one joined
+port per direction.
+*/
+
+module io_filter #(parameter IO_PINS=16, DATA_WIDTH=16) (
+ input clk,
+ input rst_n,
+ input [IO_PINS-1:0] pin_dir, // 0=input, 1=output
+ input [IO_PINS-1:0] pin_data_in, // input for both mem_mesh & io_filter
+ output [IO_PINS-1:0] pin_data_out, // output for both mem_mesh & io_filter
+ output [IO_PINS+2-1:0] port_active_in, // input for mem_mesh, output for io_filter
+ input [IO_PINS+2-1:0] port_active_out, // output for mem_mesh, input for io_filter
+ output [(IO_PINS+2)*DATA_WIDTH-1:0] port_data_in, // one DATA_WIDTH slice per port, port 0 in the lowest bits
+ input [(IO_PINS+2)*DATA_WIDTH-1:0] port_data_out // same layout as port_data_in
+);
+
+reg [IO_PINS-1:0] saved_in; // pin_data_in as sampled on the previous clock edge
+reg [IO_PINS-1:0] saved_out; // value currently held on the output pins
+
+// input
+wire [IO_PINS-1:0] input_indiv = pin_data_in; // select input pins
+wire [IO_PINS-1:0] input_indiv_active = pin_data_in ^ saved_in; // a pin is active if it changed from the last state
+wire input_joined_active = |input_indiv_active; // update the joined port if any of the pins changed
+wire [IO_PINS-1:0] input_joined;
+pin_compress #( // compress input bits together
+ .WIDTH(IO_PINS)
+) comp (
+ .data(input_indiv),
+ .mask(~pin_dir), // mask selects the pins configured as inputs
+ .result(input_joined)
+);
+
+// input
+assign port_active_in[IO_PINS +: 2] = {input_joined_active, 1'b0}; // assign the joined ports & their active states
+assign port_data_in[IO_PINS*DATA_WIDTH +: 2*DATA_WIDTH] = {input_joined, {(DATA_WIDTH){1'b0}}}; // port IO_PINS always reads as zero and is never active; port IO_PINS+1 carries the joined inputs
+// output
+wire [IO_PINS-1:0] output_indiv;
+wire [IO_PINS-1:0] output_indiv_active;
+generate genvar pin;
+ for (pin=0; pin<IO_PINS; pin=pin+1) begin:g_pin
+ // input
+ assign port_active_in[pin] = input_indiv_active[pin]; // assign the individual ports & their active states
+ assign port_data_in[pin*DATA_WIDTH +: DATA_WIDTH] = {(DATA_WIDTH){input_indiv[pin]}}; // pin value replicated across the whole port word
+ // output
+ assign pin_data_out[pin] = saved_out[pin]; // output pins keep their state between writes
+ assign output_indiv_active[pin] = port_active_out[pin]; // get pins & their active states from the individual output ports
+ assign output_indiv[pin] = port_data_out[pin*DATA_WIDTH]; // only the lowest bit of the port word reaches the pin
+ end
+endgenerate
+
+// output
+wire [IO_PINS-1:0] output_joined = port_data_out[IO_PINS*DATA_WIDTH +: DATA_WIDTH]; // get pins & their active state from the joined output port
+wire output_joined_active = port_active_out[IO_PINS]; // joined output uses port IO_PINS; the output side of port IO_PINS+1 is not read
+wire [IO_PINS-1:0] output_decomp;
+pin_decompress #( // decompress output pins to their respective bit positions
+ .WIDTH(IO_PINS)
+) decomp (
+ .data(output_joined),
+ .mask(pin_dir), // mask selects the pins configured as outputs
+ .result(output_decomp)
+);
+
+// consolidate pins set through joined & individual ports (individual ports have priority)
+wire [IO_PINS-1:0] output_mixed = (output_indiv_active & output_indiv) | (~output_indiv_active & output_decomp);
+wire [IO_PINS-1:0] output_mixed_active = output_indiv_active | {(IO_PINS){output_joined_active}}; // a joined write marks every pin as active
+
+integer i;
+always @(posedge clk) begin
+ if (!rst_n) begin
+ saved_in <= 0;
+ saved_out <= 0;
+ end else begin
+ for (i=0; i<IO_PINS; i=i+1) begin
+ // active outputs change the saved state in order to keep being sent
+ if (output_mixed_active[i]) saved_out[i] <= output_mixed[i];
+ // inputs are only active for a single cycle while they differ from their saved state
+ saved_in[i] <= input_indiv[i];
+ end
+ end
+end
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/rtl/io_filter_rev.v b/verilog/rtl/io_filter_rev.v
new file mode 100644
index 0000000..67a0fc4
--- /dev/null
+++ b/verilog/rtl/io_filter_rev.v
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+/*
+IO filter with reversed pin order
+*/
+
+module io_filter_rev #(parameter IO_PINS=16, DATA_WIDTH=16) (
+    input clk,
+    input rst_n,
+    input [IO_PINS-1:0] pin_dir,                        // per-pin direction: 0=input, 1=output
+    input [IO_PINS-1:0] pin_data_in,                    // pin values, mirrored before reaching io_filter
+    output [IO_PINS-1:0] pin_data_out,                  // pin values from io_filter, mirrored back
+    output [IO_PINS+2-1:0] port_active_in,              // connected unchanged to the wrapped io_filter
+    input [IO_PINS+2-1:0] port_active_out,              // connected unchanged to the wrapped io_filter
+    output [(IO_PINS+2)*DATA_WIDTH-1:0] port_data_in,
+    input [(IO_PINS+2)*DATA_WIDTH-1:0] port_data_out
+);
+// mirrored views of the pin buses: bit k on this side is bit IO_PINS-1-k on the module ports
+wire [IO_PINS-1:0] dir_mirrored;
+wire [IO_PINS-1:0] din_mirrored;
+wire [IO_PINS-1:0] dout_mirrored;
+generate genvar k;
+    for (k=0; k<IO_PINS; k=k+1) begin:g_pin
+        localparam MIRROR = IO_PINS-1-k;
+        assign pin_data_out[k] = dout_mirrored[MIRROR];
+        assign din_mirrored[k] = pin_data_in[MIRROR];
+        assign dir_mirrored[k] = pin_dir[MIRROR];
+    end
+endgenerate
+
+// the wrapped filter only sees the mirrored pin buses; the port buses pass through as they are
+io_filter #(
+    .DATA_WIDTH(DATA_WIDTH),
+    .IO_PINS(IO_PINS)
+) io_filter_inst (
+    .rst_n(rst_n),
+    .clk(clk),
+    .port_data_out(port_data_out),
+    .port_data_in(port_data_in),
+    .port_active_out(port_active_out),
+    .port_active_in(port_active_in),
+    .pin_data_out(dout_mirrored),
+    .pin_data_in(din_mirrored),
+    .pin_dir(dir_mirrored)
+);
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/rtl/io_pads.v b/verilog/rtl/io_pads.v
new file mode 100644
index 0000000..4089b74
--- /dev/null
+++ b/verilog/rtl/io_pads.v
@@ -0,0 +1,102 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+/*
+Connection to Caravel IO pads & logic analyzer
+
+IO_PINS = logical pins accessible for the program running on the cpu cores
+IO_PADS = pads made available by Caravel for user projects (maps to MPRJ_IO_PADS)
+LOGIC_PROBES = logic analyzer probes
+FIRST_PAD = map pin 0 to pad FIRST_PAD, pin 1 to pad FIRST_PAD+1 etc.
+*/
+
+module io_pads #(parameter IO_PINS=16, IO_PADS=38, LOGIC_PROBES=128, FIRST_PAD=12) (
+    // Caravel interface
+    input wb_clk_i,
+    input wb_rst_i,                          // active high (inverted to form the *_n resets)
+    input [LOGIC_PROBES-1:0] la_data_in,
+    output [LOGIC_PROBES-1:0] la_data_out,
+    input [LOGIC_PROBES-1:0] la_oenb,        // per probe: 1 = keep the internal value, 0 = take it from la_data_in
+    input [IO_PADS-1:0] io_in,
+    output [IO_PADS-1:0] io_out,
+    output [IO_PADS-1:0] io_oeb,             // low only for pads that carry a pin configured as output
+    // MCU interface
+    output clk,
+    output rst_hard_n,
+    output rst_soft_n,                       // like rst_hard_n, but also held low while programming is set
+    output rst_prng_n,
+    // IO filter interface
+    output [IO_PINS-1:0] pin_dir,            // 0=input, 1=output
+    output [IO_PINS-1:0] pin_data_in,
+    input [IO_PINS-1:0] pin_data_out,
+    // Wishbone multiplexer interface
+    input cfg_we,
+    input cfg_addr,                          // 0 = programming flag, 1 = pin directions
+    input [IO_PINS-1:0] cfg_wdata
+);
+
+reg programming;              // set through cfg_addr 0; forces rst_soft_n low
+reg [IO_PINS-1:0] saved_dir;  // pin directions set through cfg_addr 1
+
+// allow logic analyzer probes to override clock & reset signals
+assign clk = la_oenb[0] ? wb_clk_i : la_data_in[0];
+assign rst_hard_n = la_oenb[1] ? !wb_rst_i : la_data_in[1];
+assign rst_soft_n = la_oenb[2] ? (!wb_rst_i & !programming) : la_data_in[2];
+assign rst_prng_n = la_oenb[3] ? !wb_rst_i : la_data_in[3];
+
+localparam LA_DIR = 4; // index of logic analyzer probes for pin directions
+localparam LA_PIN = LA_DIR + IO_PINS; // index of logic analyzer probes for pin values
+localparam LA_PAD = LA_PIN + IO_PINS; // index of logic analyzer probes for pad values
+localparam LA_END = LA_PAD + IO_PADS; // index of first unused logic analyzer probe
+localparam LA_REM = LOGIC_PROBES - LA_END; // unused logic analyzer probes
+
+localparam PAD_REM = IO_PADS - IO_PINS - FIRST_PAD; // unused pads remaining after the last io pin
+
+// while soft reset is asserted (which includes programming mode) all pins are inputs,
+// otherwise they follow saved_dir; the logic analyzer can override everything
+assign pin_dir = (la_oenb[LA_DIR +: IO_PINS] & (rst_soft_n ? saved_dir : {(IO_PINS){1'b0}})) |
+                 (~la_oenb[LA_DIR +: IO_PINS] & la_data_in[LA_DIR +: IO_PINS]);
+
+// pin values are read from corresponding pads as long as the pin direction is set to input
+assign pin_data_in = (la_oenb[LA_PIN +: IO_PINS] & ~pin_dir & io_in[FIRST_PAD +: IO_PINS]) |
+                     (~la_oenb[LA_PIN +: IO_PINS] & la_data_in[LA_PIN +: IO_PINS]);
+
+// configure pad directions according to pin directions, pads not matched to pins are marked as inputs
+assign io_oeb = (la_oenb[LA_PAD +: IO_PADS] & {{(PAD_REM){1'b1}}, ~pin_dir, {(FIRST_PAD){1'b1}}}) |
+                (~la_oenb[LA_PAD +: IO_PADS] & {(IO_PADS){1'b0}}); // a pad overridden by the la always becomes an output
+
+// pin values are written to corresponding pads, zeroes are written to unassigned pads (they are inputs anyway)
+assign io_out = (la_oenb[LA_PAD +: IO_PADS] & {{(PAD_REM){1'b0}}, pin_dir & pin_data_out, {(FIRST_PAD){1'b0}}}) |
+                (~la_oenb[LA_PAD +: IO_PADS] & la_data_in[LA_PAD +: IO_PADS]);
+
+// logic analyzer probes can also read back the same signals and values
+assign la_data_out[0] = clk;
+assign la_data_out[1] = rst_hard_n;
+assign la_data_out[2] = rst_soft_n;
+assign la_data_out[3] = rst_prng_n;
+assign la_data_out[LA_DIR +: IO_PINS] = pin_dir;
+assign la_data_out[LA_PIN +: IO_PINS] = pin_data_out;
+assign la_data_out[LA_PAD +: IO_PADS] = io_in;
+assign la_data_out[LA_END +: LA_REM] = {(LA_REM){1'b0}};
+
+// change programming mode & pin directions from the wishbone multiplexer (synchronous reset by rst_hard_n)
+always @(posedge clk) begin
+    if (!rst_hard_n) begin
+        programming <= 1'b0;
+        saved_dir <= {(IO_PINS){1'b0}};
+    end else begin
+        if (cfg_we) begin
+            case (cfg_addr)
+                1'b0: programming <= cfg_wdata[0]; // only bit 0 is stored; select it explicitly instead of truncating
+                1'b1: saved_dir <= cfg_wdata;
+            endcase
+        end
+    end
+end
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/rtl/mcu.v b/verilog/rtl/mcu.v
new file mode 100644
index 0000000..f488b00
--- /dev/null
+++ b/verilog/rtl/mcu.v
@@ -0,0 +1,402 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+/*
+Microcontroller unit
+
+Combines the cpu cores with their corresponding instruction memories and prng's,
+the memory mesh, io filter and programming multiplexer into a single package
+
+ |||||| ||| |||
+ +--------------+ +-----------+
+ | |=======================================| pads & la |
+ | | +--------------+ +-----------+
+ | wb mux |===================| entropy pool |=+ |||
+ | | +-----------+ +--------------+ | +-----------+
+ | |===| debug mux | | | io filter |
+ +--------------+ +-----------+ +------+ | +-----------+
+ |||| ||| +----------+ +=| prng |=+ |||
+ +------+ +-----------+ ||+=| cpu core |=+ +------+ | +-----------+
+ | |==| instr mem |=====| w/alu |===============| |
+ | | +-----------+ || +----------+ +------+ | | |
+ | | || +----------+ +=| prng |=+ | |
+ | prog | +-----------+ |+==| cpu core |=+ +------+ | | |
+ | mux |==| instr mem |=====| w/alu |===============| mem mesh |
+ | | +-----------+ | +----------+ +------+ | | |
+ | | | +----------+ +=| prng |=+ | |
+ | | +-----------+ +===| cpu core |=+ +------+ | |
+ | |==| instr mem |=====| w/alu |===============| |
+ +------+ +-----------+ +----------+ +-----------+
+
+*/
+
+module mcu #(parameter
+ CORES = 8, // number of cpu cores
+ LOG_CORES = 3, // clog2(CORES)
+ MEM_DEPTH = 256, // number of memory mesh cells per cpu core
+ DATA_WIDTH = 16, // machine word size
+ PC_WIDTH = 8, // program counter size, should be at least clog2(INSTR_DEPTH)+clog2(CORES)
+ ADDR_WIDTH = 8, // memory mesh address width, should be at least clog2(MEM_DEPTH)
+ INSTR_WIDTH = 32, // opcode width including args, should be fixed at 32 or opcode handling needs to be changed
+ INSTR_DEPTH = 32, // minimum number of instructions in program memory (some cores will have a multiple of it)
+ IO_PINS = 16, // number of io pins usable by code on cpu cores
+ IO_PADS = 38, // number of caravel io pads
+ FIRST_PAD = 12, // map io pin 0 to caravel io pad FIRST_PAD
+ LOGIC_PROBES = 128, // number of caravel logic analyzer probes
+ WB_WIDTH = 32 // wishbone bus width, fixed to 32
+)(
+ input wb_clk_i, // wishbone clock
+ input wb_rst_i, // wb reset, active high
+ input wb_stb_i, // wb strobe
+ input wb_cyc_i, // wb cycle
+ input wb_we_i, // wb write enable
+ input [WB_WIDTH-1:0] wb_adr_i, // wb address
+ input [WB_WIDTH-1:0] wb_dat_i, // wb input data
+ output wbs_ack_o, // wb acknowledge
+ output [WB_WIDTH-1:0] wbs_dat_o, // wb output data
+ input [LOGIC_PROBES-1:0] la_data_in, // logic analyzer probes input
+ output [LOGIC_PROBES-1:0] la_data_out, // la probes output
+ input [LOGIC_PROBES-1:0] la_oenb, // la probes direction, 0=input (write by la), 1=output (read by la)
+ input [IO_PADS-1:0] io_in, // io pads input
+ output [IO_PADS-1:0] io_out, // io pads output
+ output [IO_PADS-1:0] io_oeb // io pads direction, 0=output (write by mcu), 1=input (read by mcu)
+);
+
+localparam SPREAD_LAYERS = LOG_CORES; // mem_mesh expects clog2(CORES) spread layers
+localparam SPREAD_WIDTH = $clog2(2 + SPREAD_LAYERS); // enough bits to encode spread values 0..SPREAD_LAYERS+1
+localparam MEM_IO_PORTS = 2 + IO_PINS; // matches the IO_PINS+2 wide port buses of the io filter
+localparam MEM_IO_FIRST = MEM_DEPTH - MEM_IO_PORTS; // io ports are mapped to the last MEM_IO_PORTS cells
+
+// clock and reset signals, set by io_pads using wb_clk_i, wb_rst_i and logic probes
+wire clk;
+wire rst_hard_n;
+wire rst_soft_n;
+wire rst_prng_n;
+
+// between io pads and io filter
+wire [IO_PINS-1:0] pin_dir; // pads > iof
+wire [IO_PINS-1:0] pin_data_in; // pads > iof
+wire [IO_PINS-1:0] pin_data_out; // pads < iof
+
+// between cpu core and corresponding instruction memory
+wire [INSTR_WIDTH-1:0] opcode[CORES-1:0]; // cpu < im
+wire [PC_WIDTH-1:0] progctr[CORES-1:0]; // cpu > im
+
+// between cpu core and memory mesh (unpacked versions for cpu cores)
+wire [DATA_WIDTH-1:0] mem_rdata[CORES-1:0]; // cpu < mesh
+wire mem_we[CORES-1:0]; // cpu > mesh
+wire [ADDR_WIDTH-1:0] mem_waddr[CORES-1:0]; // cpu > mesh
+wire [SPREAD_WIDTH-1:0] mem_wspread[CORES-1:0]; // cpu > mesh
+wire [DATA_WIDTH-1:0] mem_wdata[CORES-1:0]; // cpu > mesh
+wire [ADDR_WIDTH-1:0] mem_raddr[CORES-1:0]; // cpu > mesh
+
+// between cpu core and memory mesh (packed versions for memory mesh)
+wire [CORES*DATA_WIDTH-1:0] mem_rdata_raw; // cpu < mesh
+wire [CORES-1:0] mem_we_raw; // cpu > mesh
+wire [CORES*ADDR_WIDTH-1:0] mem_waddr_raw; // cpu > mesh
+wire [CORES*SPREAD_WIDTH-1:0] mem_wspread_raw; // cpu > mesh
+wire [CORES*DATA_WIDTH-1:0] mem_wdata_raw; // cpu > mesh
+wire [CORES*ADDR_WIDTH-1:0] mem_raddr_raw; // cpu > mesh
+
+// between cpu core and corresponding prng
+wire [DATA_WIDTH-1:0] prng_random[CORES-1:0]; // cpu < prng
+
+// between instruction memory and programming multiplexer (unpacked versions for instruction memory)
+wire im_we[CORES-1:0]; // im < pmux
+wire [PC_WIDTH-1:0] im_waddr[CORES-1:0]; // im < pmux
+wire [INSTR_WIDTH-1:0] im_wdata[CORES-1:0]; // im < pmux
+
+// between instruction memory and programming multiplexer (packed versions for programming multiplexer)
+wire [CORES-1:0] im_we_raw; // im < pmux
+wire [CORES*PC_WIDTH-1:0] im_waddr_raw; // im < pmux
+wire [CORES*INSTR_WIDTH-1:0] im_wdata_raw; // im < pmux
+
+// between memory mesh and io filter
+wire [MEM_IO_PORTS-1:0] mem_io_active_in; // mesh < iof
+wire [MEM_IO_PORTS-1:0] mem_io_active_out; // mesh > iof
+wire [MEM_IO_PORTS*DATA_WIDTH-1:0] mem_io_data_in; // mesh < iof
+wire [MEM_IO_PORTS*DATA_WIDTH-1:0] mem_io_data_out; // mesh > iof
+
+// between debugging multiplexer and cpu core (unpacked versions for cpu core)
+wire [1:0] debug_cpu_mode[CORES-1:0]; // dmux > cpu
+wire [3:0] debug_reg_sel[CORES-1:0]; // dmux > cpu
+wire debug_reg_we[CORES-1:0]; // dmux > cpu
+wire [DATA_WIDTH-1:0] debug_reg_wdata[CORES-1:0]; // dmux > cpu
+wire debug_reg_stopped[CORES-1:0]; // dmux < cpu
+wire [DATA_WIDTH-1:0] debug_reg_rdata[CORES-1:0]; // dmux < cpu
+
+// between debugging multiplexer and cpu core (packed versions for debugging multiplexer)
+wire [CORES*2-1:0] debug_cpu_mode_raw; // dmux > cpu
+wire [CORES*4-1:0] debug_reg_sel_raw; // dmux > cpu
+wire [CORES-1:0] debug_reg_we_raw; // dmux > cpu
+wire [CORES*DATA_WIDTH-1:0] debug_reg_wdata_raw; // dmux > cpu
+wire [CORES-1:0] debug_reg_stopped_raw; // dmux < cpu
+wire [CORES*DATA_WIDTH-1:0] debug_reg_rdata_raw; // dmux < cpu
+
+// between wishbone multiplexer and programming multiplexer
+wire prog_we; // wbmux > pmux
+wire [LOG_CORES-1:0] prog_sel; // wbmux > pmux
+wire [PC_WIDTH-1:0] prog_waddr; // wbmux > pmux
+wire [INSTR_WIDTH-1:0] prog_wdata; // wbmux > pmux
+
+// between wishbone multiplexer and io pads
+wire pads_we; // wbmux > pads
+wire pads_waddr; // wbmux > pads
+wire [IO_PINS-1:0] pads_wdata; // wbmux > pads
+
+// between wishbone multiplexer and debugging multiplexer
+wire [LOG_CORES-1:0] debug_sel; // wbmux > dmux
+wire [4:0] debug_addr; // wbmux > dmux
+wire debug_we; // wbmux > dmux
+wire [DATA_WIDTH-1:0] debug_wdata; // wbmux > dmux
+wire [DATA_WIDTH-1:0] debug_rdata; // wbmux < dmux
+
+// between wishbone multiplexer and entropy pool
+wire [WB_WIDTH-1:0] entropy_word; // wbmux > ep
+
+// between entropy pool and prng's
+wire entropy_bit; // ep > prng
+
+// repeat for each cpu core
+generate genvar core;
+for(core=0; core<CORES; core=core+1) begin:g_core
+
+ // add the cpu core itself
+ cpu_core #(
+ .DATA_WIDTH(DATA_WIDTH),
+ .PC_WIDTH(PC_WIDTH),
+ .ADDR_WIDTH(ADDR_WIDTH),
+ .SPREAD_WIDTH(SPREAD_WIDTH),
+ .INSTR_WIDTH(INSTR_WIDTH),
+ .CPU_NUM(core)
+ ) cpu_core_inst (
+ .clk(clk),
+ .rst_n(rst_soft_n), // soft reset: io_pads also asserts it while in programming mode
+ .opcode(opcode[core]),
+ .mem_rdata(mem_rdata[core]),
+ .prng_in(prng_random[core]),
+ .debug_mode(debug_cpu_mode[core]),
+ .debug_sel(debug_reg_sel[core]),
+ .debug_we(debug_reg_we[core]),
+ .debug_wdata(debug_reg_wdata[core]),
+ .progctr(progctr[core]),
+ .mem_we(mem_we[core]),
+ .mem_waddr(mem_waddr[core]),
+ .mem_wspread(mem_wspread[core]),
+ .mem_wdata(mem_wdata[core]),
+ .mem_raddr(mem_raddr[core]),
+ .debug_stopped(debug_reg_stopped[core]),
+ .debug_rdata(debug_reg_rdata[core])
+ );
+
+ // add corresponding instruction memory
+ localparam CORES_RNDUP = 1 << LOG_CORES;
+ localparam DEPTH_MULT = (core + CORES_RNDUP) & ~(core + CORES_RNDUP-1);
+ // e.g. for 8 cores, depths are multiplied by 8, 1, 2, 1, 4, 1, 2, 1
+ // so that we have a few cores that accept longer programs but the total
+ // memory required is still kept reasonably low
+ instr_mem #(
+ .PC_WIDTH(PC_WIDTH),
+ .INSTR_WIDTH(INSTR_WIDTH),
+ .DEPTH(INSTR_DEPTH * DEPTH_MULT)
+ ) instr_mem_inst (
+ .clk(clk),
+ .rst_n(rst_hard_n), // hard reset only: this net is not asserted by programming mode
+ .raddr(progctr[core]),
+ .rdata(opcode[core]),
+ .we(im_we[core]),
+ .waddr(im_waddr[core]),
+ .wdata(im_wdata[core])
+ );
+
+ // add its own pseudorandom number generator
+ prng_wrap #(
+ .INDEX(core),
+ .OUTPUT_BITS(DATA_WIDTH)
+ ) prng_inst (
+ .clk(clk),
+ .rst_n(rst_prng_n), // separate reset net, individually overridable through logic probe 3 in io_pads
+ .entropy(entropy_bit),
+ .random(prng_random[core])
+ );
+
+ // convert memory mesh inputs: unpacked to packed
+ assign mem_we_raw[core] = mem_we[core];
+ assign mem_waddr_raw[core*ADDR_WIDTH +: ADDR_WIDTH] = mem_waddr[core];
+ assign mem_wspread_raw[core*SPREAD_WIDTH +: SPREAD_WIDTH] = mem_wspread[core];
+ assign mem_wdata_raw[core*DATA_WIDTH +: DATA_WIDTH] = mem_wdata[core];
+ assign mem_raddr_raw[core*ADDR_WIDTH +: ADDR_WIDTH] = mem_raddr[core];
+
+ // convert memory mesh outputs: packed to unpacked
+ assign mem_rdata[core] = mem_rdata_raw[core*DATA_WIDTH +: DATA_WIDTH];
+
+ // convert programming multiplexer outputs: packed to unpacked
+ assign im_we[core] = im_we_raw[core];
+ assign im_waddr[core] = im_waddr_raw[core*PC_WIDTH +: PC_WIDTH];
+ assign im_wdata[core] = im_wdata_raw[core*INSTR_WIDTH +: INSTR_WIDTH];
+
+ // convert debugging multiplexer inputs: unpacked to packed
+ assign debug_reg_stopped_raw[core] = debug_reg_stopped[core];
+ assign debug_reg_rdata_raw[core*DATA_WIDTH +: DATA_WIDTH] = debug_reg_rdata[core];
+
+ // convert debugging multiplexer outputs: packed to unpacked
+ assign debug_cpu_mode[core] = debug_cpu_mode_raw[core*2 +: 2];
+ assign debug_reg_sel[core] = debug_reg_sel_raw[core*4 +: 4];
+ assign debug_reg_we[core] = debug_reg_we_raw[core];
+ assign debug_reg_wdata[core] = debug_reg_wdata_raw[core*DATA_WIDTH +: DATA_WIDTH];
+
+end
+endgenerate
+
+// add the memory mesh, with a packed bus towards the cpu cores
+mem_mesh #(
+ .CORES(CORES),
+ .DEPTH(MEM_DEPTH),
+ .DATA_WIDTH(DATA_WIDTH),
+ .ADDR_WIDTH(ADDR_WIDTH),
+ .SPREAD_LAYERS(SPREAD_LAYERS),
+ .SPREAD_WIDTH(SPREAD_WIDTH),
+ .USE_IO(1), // enable the io port mapping inside the mesh
+ .IO_PORTS(MEM_IO_PORTS),
+ .IO_FIRST(MEM_IO_FIRST)
+) mem_mesh_inst (
+ .clk(clk),
+ .rst_n(rst_soft_n),
+ .we(mem_we_raw),
+ .waddr(mem_waddr_raw),
+ .wspread(mem_wspread_raw),
+ .wdata(mem_wdata_raw),
+ .raddr(mem_raddr_raw),
+ .rdata(mem_rdata_raw),
+ .io_active_in(mem_io_active_in),
+ .io_active_out(mem_io_active_out),
+ .io_data_in(mem_io_data_in),
+ .io_data_out(mem_io_data_out)
+);
+
+// add the io filter connected to the memory mesh (through the wrapper that reverses the pin order)
+io_filter_rev #(
+ .IO_PINS(IO_PINS),
+ .DATA_WIDTH(DATA_WIDTH)
+) io_filter_inst (
+ .clk(clk),
+ .rst_n(rst_soft_n),
+ .pin_dir(pin_dir),
+ .pin_data_in(pin_data_in),
+ .pin_data_out(pin_data_out),
+ .port_active_in(mem_io_active_in),
+ .port_active_out(mem_io_active_out),
+ .port_data_in(mem_io_data_in),
+ .port_data_out(mem_io_data_out)
+);
+
+// add the programming multiplexer, with a packed bus towards instruction memories
+prog_mux #(
+ .CORES(CORES),
+ .LOG_CORES(LOG_CORES),
+ .PC_WIDTH(PC_WIDTH),
+ .INSTR_WIDTH(INSTR_WIDTH)
+) prog_mux_inst (
+ .we(prog_we),
+ .sel(prog_sel),
+ .waddr(prog_waddr),
+ .wdata(prog_wdata),
+ .cwe(im_we_raw),
+ .cwaddr(im_waddr_raw),
+ .cwdata(im_wdata_raw)
+);
+
+// add the debugging multiplexer, with a packed bus towards cpu cores
+debug_mux #(
+ .CORES(CORES),
+ .LOG_CORES(LOG_CORES),
+ .DATA_WIDTH(DATA_WIDTH)
+) debug_mux_inst (
+ .sel(debug_sel),
+ .addr(debug_addr),
+ .we(debug_we),
+ .wdata(debug_wdata),
+ .rdata(debug_rdata),
+ .reg_stopped(debug_reg_stopped_raw),
+ .reg_rdata(debug_reg_rdata_raw),
+ .cpu_mode(debug_cpu_mode_raw),
+ .reg_sel(debug_reg_sel_raw),
+ .reg_we(debug_reg_we_raw),
+ .reg_wdata(debug_reg_wdata_raw)
+);
+
+// add the entropy pool
+entropy_pool #(
+ .WIDTH(WB_WIDTH)
+) entropy_pool_inst (
+ .clk(clk),
+ .rst_n(rst_prng_n),
+ .e_word(entropy_word),
+ .e_bit(entropy_bit)
+);
+
+// add the wishbone multiplexer
+wb_mux #(
+ .LOG_CORES(LOG_CORES),
+ .PC_WIDTH(PC_WIDTH),
+ .INSTR_WIDTH(INSTR_WIDTH),
+ .DATA_WIDTH(DATA_WIDTH),
+ .IO_PINS(IO_PINS),
+ .WB_WIDTH(WB_WIDTH)
+) wb_mux_inst (
+ .wb_stb_i(wb_stb_i),
+ .wb_cyc_i(wb_cyc_i),
+ .wb_we_i(wb_we_i),
+ .wb_adr_i(wb_adr_i),
+ .wb_dat_i(wb_dat_i),
+ .wbs_ack_o(wbs_ack_o),
+ .wbs_dat_o(wbs_dat_o),
+ .prog_we(prog_we),
+ .prog_sel(prog_sel),
+ .prog_waddr(prog_waddr),
+ .prog_wdata(prog_wdata),
+ .pads_we(pads_we),
+ .pads_waddr(pads_waddr),
+ .pads_wdata(pads_wdata),
+ .debug_sel(debug_sel),
+ .debug_addr(debug_addr),
+ .debug_we(debug_we),
+ .debug_wdata(debug_wdata),
+ .debug_rdata(debug_rdata),
+ .entropy_word(entropy_word)
+);
+
+// add the io pads & logic analyzer probes
+// (this includes some reset & clock logic as well)
+io_pads #(
+ .IO_PINS(IO_PINS),
+ .IO_PADS(IO_PADS),
+ .LOGIC_PROBES(LOGIC_PROBES),
+ .FIRST_PAD(FIRST_PAD)
+) io_pads_inst (
+ .wb_clk_i(wb_clk_i),
+ .wb_rst_i(wb_rst_i),
+ .la_data_in(la_data_in),
+ .la_data_out(la_data_out),
+ .la_oenb(la_oenb),
+ .io_in(io_in),
+ .io_out(io_out),
+ .io_oeb(io_oeb),
+ .clk(clk),
+ .rst_hard_n(rst_hard_n),
+ .rst_soft_n(rst_soft_n),
+ .rst_prng_n(rst_prng_n),
+ .pin_dir(pin_dir),
+ .pin_data_in(pin_data_in),
+ .pin_data_out(pin_data_out),
+ .cfg_we(pads_we),
+ .cfg_addr(pads_waddr),
+ .cfg_wdata(pads_wdata)
+);
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/rtl/mem_mesh.v b/verilog/rtl/mem_mesh.v
new file mode 100644
index 0000000..0afa424
--- /dev/null
+++ b/verilog/rtl/mem_mesh.v
@@ -0,0 +1,166 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+/*
+Generates a DFF RAM block for each core with a tree-like interconnect mesh between them
+
+Parameters:
+CORES = number of cpu cores, also specifies the number of ram blocks
+DEPTH = number of words per ram block
+DATA_WIDTH = word size, number of bits per memory cell
+ADDR_WIDTH = address bus width, should be clog2(DEPTH)
+SPREAD_LAYERS = number of spread layers, should be clog2(CORES)
+SPREAD_WIDTH = spread bus width, should be clog2(2+SPREAD_LAYERS)
+IO_PORTS = number of io ports, should be <= DEPTH
+IO_FIRST = memory cell mapped to the first io port, should be <= DEPTH - IO_PORTS
+
+A value of wspread > 0 on write operations specifies that the same address should also be written in some
+other memory blocks. In particular, blocks whose number only differ in the lowest wspread bits are affected.
+If several simultaneous write operations affect the same memory cell, writes with higher wspread have
+priority. For writes having equal wspread the core with the lowest number wins.
+
+If addresses in the range IO_FIRST <= addr < IO_FIRST + IO_PORTS are written with wspread > SPREAD_LAYERS, wdata is also sent to the matching io port.
+Incoming data on the io bus is written to the respective cells with maximal spread (affecting all cores).
+*/
+
+module mem_mesh #(parameter CORES=8, DEPTH=256, DATA_WIDTH=16, ADDR_WIDTH=8, SPREAD_LAYERS=3, SPREAD_WIDTH=3, USE_IO=1, IO_PORTS=16, IO_FIRST=240) (
+ input clk, // clock signal
+ input rst_n, // reset, active low
+ input [CORES-1:0] we, // write enable
+ input [CORES*ADDR_WIDTH-1:0] waddr, // write address
+ input [CORES*SPREAD_WIDTH-1:0] wspread, // write spread
+ input [CORES*DATA_WIDTH-1:0] wdata, // write data
+ input [CORES*ADDR_WIDTH-1:0] raddr, // read address
+ output [CORES*DATA_WIDTH-1:0] rdata, // read data
+ input [IO_PORTS-1:0] io_active_in, // is receiving data on io bus
+ output [IO_PORTS-1:0] io_active_out, // is sending data on io bus
+ input [IO_PORTS*DATA_WIDTH-1:0] io_data_in, // io bus input
+ output [IO_PORTS*DATA_WIDTH-1:0] io_data_out // io bus output
+);
+
+reg [DATA_WIDTH-1:0] mem[CORES-1:0][DEPTH-1:0]; // memory cells
+wire presel[CORES-1:0][DEPTH-1:0]; // is address selected before spreading
+wire uspread[CORES-1:0][SPREAD_LAYERS+1-1:0]; // is spreading to layer
+wire postsel[CORES-1:0][DEPTH-1:0]; // is address selected after spreading
+wire [DATA_WIDTH-1:0] postdata[CORES-1:0][DEPTH-1:0]; // data to be written after spreading
+
+generate genvar core, addr, layer, group, spl;
+
+// convert spread to unary
+for (core=0; core<CORES; core=core+1) begin:g_core
+ for(layer=0; layer<=SPREAD_LAYERS; layer=layer+1) begin:g_layer
+ assign uspread[core][layer] = we[core] & wspread[core*SPREAD_WIDTH +: SPREAD_WIDTH] > layer; // parsed as we & (wspread > layer)
+ end
+end
+
+// everything below is generated once per memory address
+for (addr=0; addr<DEPTH; addr=addr+1) begin:g_cell
+
+ // convert write address to one-hot encoding
+ for (core=0; core<CORES; core=core+1) begin:g_core_m
+ assign presel[core][addr] = we[core] & (waddr[core*ADDR_WIDTH +: ADDR_WIDTH] == addr);
+ end
+
+ // calculate spreading from individual cores to groups of cores
+ for (layer=0; layer<=SPREAD_LAYERS; layer=layer+1) begin:spread
+ localparam GROUPS = CORES >> layer; // each layer halves the number of groups
+ wire gsel[GROUPS-1:0]; // a write to this address has reached the group
+ wire [DATA_WIDTH-1:0] gdata[GROUPS-1:0]; // data of the write chosen for the group
+ wire gspread[GROUPS-1:0][SPREAD_LAYERS+1-layer-1:0]; // remaining unary spread of that write, index 0 = goes one layer further
+ if (layer == 0) begin:i_layerz
+ for (group=0; group<GROUPS; group=group+1) begin:g_group
+ assign gsel[group] = presel[group][addr];
+ assign gdata[group] = {(DATA_WIDTH){we[group]}} & wdata[group*DATA_WIDTH +: DATA_WIDTH]; // zero when the core is not writing
+ for (spl=0; spl<=SPREAD_LAYERS; spl=spl+1) begin:cspread
+ assign gspread[group][spl] = uspread[group][spl];
+ end
+ end
+ end else begin:i_layernz
+ for (group=0; group<GROUPS; group=group+1) begin:g_group
+ wire gs1 = spread[layer-1].gsel[group*2] & spread[layer-1].gspread[group*2][0]; // child propagates only if selected and still spreading
+ wire gs2 = spread[layer-1].gsel[group*2+1] & spread[layer-1].gspread[group*2+1][0];
+ wire [DATA_WIDTH-1:0] gd1 = spread[layer-1].gdata[group*2];
+ wire [DATA_WIDTH-1:0] gd2 = spread[layer-1].gdata[group*2+1];
+ assign gsel[group] = gs1 | gs2;
+ assign gdata[group] = gs1 ? gd1 : gd2; // the lower-numbered child wins when both propagate
+ for (spl=0; spl<=SPREAD_LAYERS-layer; spl=spl+1) begin:g_spread
+ wire gsp1 = spread[layer-1].gspread[group*2][spl+1]; // unary spread moves down one index per layer
+ wire gsp2 = spread[layer-1].gspread[group*2+1][spl+1];
+ assign gspread[group][spl] = gs1 ? gsp1 : gsp2;
+ end
+ end
+ end
+ end
+
+ // mix in io logic at the highest spreading level
+ wire gs_i; // select fed into the top of the collect tree
+ wire [DATA_WIDTH-1:0] gd_i; // data fed into the top of the collect tree
+ if (USE_IO && IO_FIRST <= addr && addr < IO_FIRST + IO_PORTS) begin:i_io
+ localparam io = addr - IO_FIRST; // io port index mapped to this cell
+ wire gs_o = spread[SPREAD_LAYERS].gsel[0] & spread[SPREAD_LAYERS].gspread[0][0]; // write reached the top layer and still spreads further
+ wire [DATA_WIDTH-1:0] gd_o = {(DATA_WIDTH){gs_o}} & spread[SPREAD_LAYERS].gdata[0]; // io data is zero while not sending
+ assign io_active_out[io] = gs_o;
+ assign io_data_out[io*DATA_WIDTH +: DATA_WIDTH] = gd_o;
+ assign gs_i = io_active_in[io] ? 1'b1 : spread[SPREAD_LAYERS].gsel[0]; // incoming io data takes precedence over core writes
+ assign gd_i = io_active_in[io] ? io_data_in[io*DATA_WIDTH +: DATA_WIDTH] : spread[SPREAD_LAYERS].gdata[0];
+ end else begin:i_nio
+ assign gs_i = spread[SPREAD_LAYERS].gsel[0];
+ assign gd_i = spread[SPREAD_LAYERS].gdata[0];
+ end
+
+ // calculate spreading back from groups of cores to individual cores
+ for (layer=SPREAD_LAYERS; layer>=0; layer=layer-1) begin:collect // NOTE(review): counts down past 0 to terminate; confirm the target tools accept this for a genvar
+ localparam GROUPS = CORES >> layer;
+ wire pgsel[GROUPS-1:0]; // group receives a write after spreading
+ wire [DATA_WIDTH-1:0] pgdata[GROUPS-1:0]; // data the group receives
+ if (layer == SPREAD_LAYERS) begin:i_layerl
+ assign pgsel[0] = gs_i;
+ assign pgdata[0] = gd_i;
+ for (group=1; group<GROUPS; group=group+1) begin:g_group
+ assign pgsel[group] = spread[layer].gsel[group];
+ assign pgdata[group] = spread[layer].gdata[group];
+ end
+ end else begin:i_layernl
+ for (group=0; group<GROUPS; group=group+1) begin:g_group
+ wire gs = spread[layer].gsel[group];
+ wire [DATA_WIDTH-1:0] gd = spread[layer].gdata[group];
+ wire cgs = collect[layer+1].pgsel[group/2]; // select coming down from the enclosing group
+ wire [DATA_WIDTH-1:0] cgd = collect[layer+1].pgdata[group/2];
+ assign pgsel[group] = cgs | gs;
+ assign pgdata[group] = cgs ? cgd : gd; // a write arriving from the enclosing group overrides the local one
+ end
+ end
+ end
+ for (core=0; core<CORES; core=core+1) begin:g_core_c
+ assign postsel[core][addr] = collect[0].pgsel[core];
+ assign postdata[core][addr] = collect[0].pgdata[core];
+ end
+
+ // sequential write logic
+ for (core=0; core<CORES; core=core+1) begin:g_core_w
+ always @(posedge clk) begin
+ if (!rst_n) begin
+ mem[core][addr] <= 0; // synchronous reset clears the cell
+ end else begin
+ if (postsel[core][addr]) begin
+ mem[core][addr] <= postdata[core][addr];
+ end
+ end
+ end
+ end
+
+end
+
+// read logic
+for (core=0; core<CORES; core=core+1) begin:g_core_r
+ wire [ADDR_WIDTH-1:0] craddr = raddr[core*ADDR_WIDTH +: ADDR_WIDTH];
+ assign rdata[core*DATA_WIDTH +: DATA_WIDTH] = mem[core][craddr]; // combinational read from the core's own block
+end
+
+endgenerate
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/rtl/pin_compress.v b/verilog/rtl/pin_compress.v
new file mode 100644
index 0000000..b508527
--- /dev/null
+++ b/verilog/rtl/pin_compress.v
@@ -0,0 +1,38 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+/*
+Fully combinatorial circuit shifting input bits from the mask bit positions
+
+E.g.
+data = 1001110100110101
+mask = 0100100101000101
+ 0 1 1 0 1 1
+result = 0000000000011011
+*/
+
+module pin_compress #(parameter WIDTH=16) (
+    input [WIDTH-1:0] data,
+    input [WIDTH-1:0] mask,
+    output [WIDTH-1:0] result
+);
+// stage k examines bit WIDTH-1-k, so the selected bits are gathered from the MSB downwards
+generate genvar k;
+    for (k=0; k<WIDTH; k=k+1) begin:comp
+        wire [WIDTH-1:0] sd; // bits gathered up to and including this stage, right-aligned
+        if (k != 0) begin:i_nfirst
+            wire [WIDTH-1:0] sdp = comp[k-1].sd;
+            assign sd = mask[WIDTH-1-k] ? {sdp[WIDTH-2:0], data[WIDTH-1-k]} : sdp;
+        end else begin:i_first
+            assign sd = {{(WIDTH-1){1'b0}}, mask[WIDTH-1] & data[WIDTH-1]};
+        end
+    end
+    assign result = comp[WIDTH-1].sd;
+endgenerate
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/rtl/pin_decompress.v b/verilog/rtl/pin_decompress.v
new file mode 100644
index 0000000..bea24eb
--- /dev/null
+++ b/verilog/rtl/pin_decompress.v
@@ -0,0 +1,38 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+/*
+Fully combinatorial circuit shifting input bits to the mask bit positions
+
+E.g.
+data = 0000000000001011
+mask = 0101000101000101
+ 0 0 1 0 1 1
+result = 0000000100000101
+*/
+
+module pin_decompress #(parameter WIDTH=16) (
+    input [WIDTH-1:0] data,
+    input [WIDTH-1:0] mask,
+    output [WIDTH-1:0] result
+);
+// stage k produces result bit k; data is consumed LSB first, one bit per set mask bit
+generate genvar k;
+    for (k=0; k<WIDTH; k=k+1) begin:decomp
+        wire [WIDTH-1:0] sd; // data bits not yet consumed, the next one to place sits at bit 0
+        if (k != 0) begin:i_nfirst
+            wire [WIDTH-1:0] sdp = decomp[k-1].sd;
+            assign sd = mask[k-1] ? {1'b0, sdp[WIDTH-1:1]} : sdp;
+        end else begin:i_first
+            assign sd = data;
+        end
+        assign result[k] = sd[0] & mask[k];
+    end
+endgenerate
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/rtl/prng.v b/verilog/rtl/prng.v
new file mode 100644
index 0000000..30d647a
--- /dev/null
+++ b/verilog/rtl/prng.v
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+/*
+Pseudorandom number generator using a Fibonacci-style XNOR linear feedback shift register
+
+STATE_BITS = number of bits for prng state
+POLYNOMIAL = bit mask used for feedback, should be chosen so that the prng repeats itself after 2^STATE_BITS-1 cycles
+STATE_INIT = used to seed the prng on reset
+OUTPUT_BITS = number of bits shifted out every clock cycle
+*/
+
+module prng #(parameter STATE_BITS = 4, POLYNOMIAL = 4'b1001, STATE_INIT = 4'b0000, OUTPUT_BITS = 2) (
+    input clk,
+    input rst_n,
+    input entropy, // external bit folded into the first feedback of every clock cycle
+    output [OUTPUT_BITS-1:0] random
+);
+
+localparam SCRAMBLE_CYCLES = STATE_BITS;
+reg [STATE_BITS-1:0] state;
+
+generate genvar n;
+
+// seed scrambling: a chain of SCRAMBLE_CYCLES register steps starting from STATE_INIT, without
+// entropy input; it depends on parameters only and yields the value loaded on reset
+for (n=0; n<SCRAMBLE_CYCLES; n=n+1) begin:g_scramble
+    wire [STATE_BITS-1:0] prev_state;
+    if (n != 0) begin:i_nfirst
+        assign prev_state = g_scramble[n-1].new_state;
+    end else begin:i_first
+        assign prev_state = STATE_INIT;
+    end
+    wire feedback = ^(POLYNOMIAL & prev_state);
+    wire [STATE_BITS-1:0] new_state = {prev_state[STATE_BITS-2:0], ~feedback};
+end
+wire [STATE_BITS-1:0] scrambled_init = g_scramble[SCRAMBLE_CYCLES-1].new_state;
+
+// per-cycle chain: OUTPUT_BITS register steps unrolled combinationally from the current state;
+// every step shifts in the inverted feedback and hands its top state bit to the output,
+// the first step filling the most significant bit of random
+for (n=0; n<OUTPUT_BITS; n=n+1) begin:g_shift
+    wire [STATE_BITS-1:0] prev_state;
+    wire feedback;
+    if (n != 0) begin:i_nfirst
+        assign prev_state = g_shift[n-1].new_state;
+        assign feedback = ^(POLYNOMIAL & prev_state);
+    end else begin:i_first
+        assign prev_state = state;
+        assign feedback = entropy ^ (^(POLYNOMIAL & prev_state));
+    end
+    wire [STATE_BITS-1:0] new_state = {prev_state[STATE_BITS-2:0], ~feedback};
+    assign random[OUTPUT_BITS-1-n] = prev_state[STATE_BITS-1];
+end
+wire [STATE_BITS-1:0] final_state = g_shift[OUTPUT_BITS-1].new_state;
+
+endgenerate
+
+always @(posedge clk) begin
+    if (!rst_n)
+        state <= scrambled_init; // synchronous reset to the pre-scrambled seed
+    else
+        state <= final_state;    // advance by OUTPUT_BITS steps per clock
+end
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/rtl/prng_wrap.v b/verilog/rtl/prng_wrap.v
new file mode 100644
index 0000000..9a635d3
--- /dev/null
+++ b/verilog/rtl/prng_wrap.v
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+/*
+Wrapper for prng with known good polynomials (having a cycle length of 2^32-1 and a minimal bit count)
+
+Different choices of 0 <= INDEX < 256 generate independent prng's. For even more, the table below should be extended.
+*/
+
+module prng_wrap #(parameter INDEX = 0, OUTPUT_BITS = 16) ( // INDEX picks polynomial and seed; OUTPUT_BITS is forwarded to prng
+ input clk,
+ input rst_n,
+ input entropy,
+ output [OUTPUT_BITS-1:0] random
+);
+
+localparam STATE_BITS = 32; // width of each table entry and of the prng state
+localparam POLY_ARRAY_LEN = 256; // number of entries in POLY_ARRAY
+localparam POLY_ARRAY = { // one wide constant; the first listed entry occupies the topmost STATE_BITS bits
+ 32'h80000062, 32'h80000092, 32'h80000106, 32'h80000114, 32'h80000412, 32'h80000414, 32'h80000806, 32'h80000850,
+ 32'h8000100C, 32'h80001050, 32'h80001C00, 32'h80002021, 32'h80002204, 32'h80002810, 32'h80004050, 32'h80004201,
+ 32'h80008006, 32'h80008042, 32'h80008102, 32'h80008401, 32'h80008500, 32'h80009004, 32'h80010006, 32'h80010048,
+ 32'h80010240, 32'h80014004, 32'h80014800, 32'h80020030, 32'h80020102, 32'h80020402, 32'h80022010, 32'h80022100,
+ 32'h80030010, 32'h80040022, 32'h80040280, 32'h80042020, 32'h80043000, 32'h80050008, 32'h80060040, 32'h80061000,
+ 32'h80080012, 32'h80080120, 32'h80094000, 32'h800A0010, 32'h80100048, 32'h80100820, 32'h801C0000, 32'h80200003,
+ 32'h80200060, 32'h80200101, 32'h80202001, 32'h80210001, 32'h80400021, 32'h80401020, 32'h80420010, 32'h80422000,
+ 32'h80508000, 32'h80800012, 32'h80801002, 32'h80810004, 32'h80840001, 32'h80900002, 32'h80A01000, 32'h81000021,
+ 32'h81000050, 32'h810000C0, 32'h81000220, 32'h81001020, 32'h81003000, 32'h81004040, 32'h81010020, 32'h81010040,
+ 32'h81204000, 32'h81400001, 32'h81400008, 32'h81800040, 32'h82000014, 32'h82000024, 32'h82000044, 32'h82000048,
+ 32'h82000108, 32'h82000110, 32'h82000410, 32'h82004040, 32'h82010002, 32'h82021000, 32'h82040040, 32'h82040100,
+ 32'h82080400, 32'h82200040, 32'h82400800, 32'h82800010, 32'h83000200, 32'h84000050, 32'h840000A0, 32'h84000401,
+ 32'h84002100, 32'h84002800, 32'h84006000, 32'h84022000, 32'h840A0000, 32'h84100002, 32'h84100020, 32'h84400020,
+ 32'h85000010, 32'h85000040, 32'h85010000, 32'h85040000, 32'h85080000, 32'h86000004, 32'h86002000, 32'h88000102,
+ 32'h88000140, 32'h88001002, 32'h88005000, 32'h88020001, 32'h88400020, 32'h89000002, 32'h89000020, 32'h89000400,
+ 32'h89004000, 32'h8A000004, 32'h8C000001, 32'h90000028, 32'h90000030, 32'h90004002, 32'h90004080, 32'h90014000,
+ 32'h90048000, 32'h90220000, 32'h90800002, 32'h91000020, 32'h92000020, 32'h94000020, 32'h94100000, 32'h94400000,
+ 32'h98040000, 32'hA0000048, 32'hA0000084, 32'hA0000410, 32'hA0000480, 32'hA0004020, 32'hA0008001, 32'hA0010004,
+ 32'hA0040008, 32'hA0040080, 32'hA0102000, 32'hA0400008, 32'hA0402000, 32'hA0408000, 32'hA1008000, 32'hA2001000,
+ 32'hA3000000, 32'hA4000080, 32'hA4000800, 32'hA4100000, 32'hA4800000, 32'hB0004000, 32'hB0008000, 32'hB0080000,
+ 32'hB0400000, 32'hC0000005, 32'hC0000018, 32'hC0000140, 32'hC0001080, 32'hC0002008, 32'hC0004200, 32'hC0008002,
+ 32'hC0020200, 32'hC0100010, 32'hC0108000, 32'hC0210000, 32'hC0400200, 32'hC2000040, 32'hC2000100, 32'hC2020000,
+ 32'hD0000001, 32'hE0000200, 32'h80000057, 32'h8000007A, 32'h800000B9, 32'h800000BA, 32'h8000012D, 32'h8000014E,
+ 32'h8000016C, 32'h800001A6, 32'h8000020F, 32'h800002CC, 32'h80000349, 32'h80000370, 32'h80000392, 32'h80000398,
+ 32'h80000417, 32'h80000465, 32'h8000046A, 32'h80000478, 32'h800004D4, 32'h8000050B, 32'h80000526, 32'h8000054C,
+ 32'h800005C1, 32'h8000060D, 32'h8000060E, 32'h80000629, 32'h80000638, 32'h80000662, 32'h800006B0, 32'h80000748,
+ 32'h8000088D, 32'h800008E1, 32'h80000923, 32'h80000931, 32'h80000934, 32'h80000958, 32'h80000A25, 32'h80000A26,
+ 32'h80000A54, 32'h80000A92, 32'h80000AC4, 32'h80000B28, 32'h80000B84, 32'h80000C34, 32'h80000C43, 32'h80000CA2,
+ 32'h80000D22, 32'h80000D28, 32'h80000E24, 32'h8000100F, 32'h80001027, 32'h80001035, 32'h80001047, 32'h80001071,
+ 32'h80001078, 32'h8000108E, 32'h800010C9, 32'h80001126, 32'h80001164, 32'h80001231, 32'h8000140E, 32'h80001485,
+ 32'h80001491, 32'h80001560, 32'h80001614, 32'h80001624, 32'h80001684, 32'h80001702, 32'h80001813, 32'h80001851,
+ 32'h80001870, 32'h800018C1, 32'h80001928, 32'h80001A06, 32'h80001A12, 32'h80001C50, 32'h80001C88, 32'h80002053
+};
+// INDEX is reduced modulo POLY_ARRAY_LEN and counted from the top, so INDEX 0 selects the first listed entry
+prng #(
+ .STATE_BITS(STATE_BITS),
+ .POLYNOMIAL(POLY_ARRAY[(POLY_ARRAY_LEN-1-(INDEX % POLY_ARRAY_LEN))*STATE_BITS +: STATE_BITS]),
+ .STATE_INIT(INDEX), // the index itself is handed to prng as its STATE_INIT
+ .OUTPUT_BITS(OUTPUT_BITS)
+) prng_inst (
+ .clk(clk),
+ .rst_n(rst_n),
+ .entropy(entropy),
+ .random(random)
+);
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/rtl/prog_mux.v b/verilog/rtl/prog_mux.v
new file mode 100644
index 0000000..b358bca
--- /dev/null
+++ b/verilog/rtl/prog_mux.v
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+/*
+Fully combinatorial programming multiplexer
+*/
+
+module prog_mux #(parameter CORES=8, LOG_CORES=3, PC_WIDTH=8, INSTR_WIDTH=32) (
+    input we,
+    input [LOG_CORES-1:0] sel,
+    input [PC_WIDTH-1:0] waddr,
+    input [INSTR_WIDTH-1:0] wdata,
+    output [CORES-1:0] cwe,
+    output [CORES*PC_WIDTH-1:0] cwaddr,
+    output [CORES*INSTR_WIDTH-1:0] cwdata
+);
+// slice idx of every output bus belongs to core idx; slices of unselected cores are masked to zero
+generate genvar idx;
+for (idx=0; idx<CORES; idx=idx+1) begin:g_core
+    wire hit = (sel == idx) && we;
+    assign cwdata[idx*INSTR_WIDTH +: INSTR_WIDTH] = wdata & {(INSTR_WIDTH){hit}};
+    assign cwaddr[idx*PC_WIDTH +: PC_WIDTH] = waddr & {(PC_WIDTH){hit}};
+    assign cwe[idx] = hit;
+end
+endgenerate
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/rtl/uprj_netlists.v b/verilog/rtl/uprj_netlists.v
index 3537de8..b7ceb05 100644
--- a/verilog/rtl/uprj_netlists.v
+++ b/verilog/rtl/uprj_netlists.v
@@ -21,8 +21,11 @@
// Assume default net type to be wire because GL netlists don't have the wire definitions
`default_nettype wire
`include "gl/user_project_wrapper.v"
- `include "gl/user_proj_example.v"
+ `include "gl/user_project.v"
`else
`include "user_project_wrapper.v"
- `include "user_proj_example.v"
-`endif
\ No newline at end of file
+ `include "user_project.v"
+`endif
+
+`default_nettype wire
+
diff --git a/verilog/rtl/user_proj_example.v b/verilog/rtl/user_proj_example.v
deleted file mode 100644
index 26081e9..0000000
--- a/verilog/rtl/user_proj_example.v
+++ /dev/null
@@ -1,165 +0,0 @@
-// SPDX-FileCopyrightText: 2020 Efabless Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-// SPDX-License-Identifier: Apache-2.0
-
-`default_nettype none
-/*
- *-------------------------------------------------------------
- *
- * user_proj_example
- *
- * This is an example of a (trivially simple) user project,
- * showing how the user project can connect to the logic
- * analyzer, the wishbone bus, and the I/O pads.
- *
- * This project generates an integer count, which is output
- * on the user area GPIO pads (digital output only). The
- * wishbone connection allows the project to be controlled
- * (start and stop) from the management SoC program.
- *
- * See the testbenches in directory "mprj_counter" for the
- * example programs that drive this user project. The three
- * testbenches are "io_ports", "la_test1", and "la_test2".
- *
- *-------------------------------------------------------------
- */
-
-module user_proj_example #(
- parameter BITS = 32
-)(
-`ifdef USE_POWER_PINS
- inout vccd1, // User area 1 1.8V supply
- inout vssd1, // User area 1 digital ground
-`endif
-
- // Wishbone Slave ports (WB MI A)
- input wb_clk_i,
- input wb_rst_i,
- input wbs_stb_i,
- input wbs_cyc_i,
- input wbs_we_i,
- input [3:0] wbs_sel_i,
- input [31:0] wbs_dat_i,
- input [31:0] wbs_adr_i,
- output wbs_ack_o,
- output [31:0] wbs_dat_o,
-
- // Logic Analyzer Signals
- input [127:0] la_data_in,
- output [127:0] la_data_out,
- input [127:0] la_oenb,
-
- // IOs
- input [`MPRJ_IO_PADS-1:0] io_in,
- output [`MPRJ_IO_PADS-1:0] io_out,
- output [`MPRJ_IO_PADS-1:0] io_oeb,
-
- // IRQ
- output [2:0] irq
-);
- wire clk;
- wire rst;
-
- wire [`MPRJ_IO_PADS-1:0] io_in;
- wire [`MPRJ_IO_PADS-1:0] io_out;
- wire [`MPRJ_IO_PADS-1:0] io_oeb;
-
- wire [31:0] rdata;
- wire [31:0] wdata;
- wire [BITS-1:0] count;
-
- wire valid;
- wire [3:0] wstrb;
- wire [31:0] la_write;
-
- // WB MI A
- assign valid = wbs_cyc_i && wbs_stb_i;
- assign wstrb = wbs_sel_i & {4{wbs_we_i}};
- assign wbs_dat_o = rdata;
- assign wdata = wbs_dat_i;
-
- // IO
- assign io_out = count;
- assign io_oeb = {(`MPRJ_IO_PADS-1){rst}};
-
- // IRQ
- assign irq = 3'b000; // Unused
-
- // LA
- assign la_data_out = {{(127-BITS){1'b0}}, count};
- // Assuming LA probes [63:32] are for controlling the count register
- assign la_write = ~la_oenb[63:32] & ~{BITS{valid}};
- // Assuming LA probes [65:64] are for controlling the count clk & reset
- assign clk = (~la_oenb[64]) ? la_data_in[64]: wb_clk_i;
- assign rst = (~la_oenb[65]) ? la_data_in[65]: wb_rst_i;
-
- counter #(
- .BITS(BITS)
- ) counter(
- .clk(clk),
- .reset(rst),
- .ready(wbs_ack_o),
- .valid(valid),
- .rdata(rdata),
- .wdata(wbs_dat_i),
- .wstrb(wstrb),
- .la_write(la_write),
- .la_input(la_data_in[63:32]),
- .count(count)
- );
-
-endmodule
-
-module counter #(
- parameter BITS = 32
-)(
- input clk,
- input reset,
- input valid,
- input [3:0] wstrb,
- input [BITS-1:0] wdata,
- input [BITS-1:0] la_write,
- input [BITS-1:0] la_input,
- output ready,
- output [BITS-1:0] rdata,
- output [BITS-1:0] count
-);
- reg ready;
- reg [BITS-1:0] count;
- reg [BITS-1:0] rdata;
-
- always @(posedge clk) begin
- if (reset) begin
- count <= 0;
- ready <= 0;
- end else begin
- ready <= 1'b0;
- if (~|la_write) begin
- count <= count + 1;
- end
- if (valid && !ready) begin
- ready <= 1'b1;
- rdata <= count;
- if (wstrb[0]) count[7:0] <= wdata[7:0];
- if (wstrb[1]) count[15:8] <= wdata[15:8];
- if (wstrb[2]) count[23:16] <= wdata[23:16];
- if (wstrb[3]) count[31:24] <= wdata[31:24];
- end else if (|la_write) begin
- count <= la_write & la_input;
- end
- end
- end
-
-endmodule
-`default_nettype wire
diff --git a/verilog/rtl/user_project.v b/verilog/rtl/user_project.v
new file mode 100644
index 0000000..3b1b0ed
--- /dev/null
+++ b/verilog/rtl/user_project.v
@@ -0,0 +1,75 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+module user_project (
+`ifdef USE_POWER_PINS
+    inout vccd1, // User area 1 1.8V supply
+    inout vssd1, // User area 1 digital ground
+`endif
+
+    // Wishbone Slave ports (WB MI A)
+    input wb_clk_i,
+    input wb_rst_i,
+    input wbs_stb_i,
+    input wbs_cyc_i,
+    input wbs_we_i,
+    input [3:0] wbs_sel_i, // not connected to anything inside this module
+    input [31:0] wbs_dat_i,
+    input [31:0] wbs_adr_i,
+    output wbs_ack_o,
+    output [31:0] wbs_dat_o,
+
+    // Logic Analyzer Signals
+    input [127:0] la_data_in,
+    output [127:0] la_data_out,
+    input [127:0] la_oenb,
+
+    // IOs
+    input [`MPRJ_IO_PADS-1:0] io_in,
+    output [`MPRJ_IO_PADS-1:0] io_out,
+    output [`MPRJ_IO_PADS-1:0] io_oeb,
+
+    // IRQ
+    output [2:0] irq
+);
+// Single mcu instance; every bus of this module except wbs_sel_i and irq is passed straight through.
+mcu #(
+    .CORES(4),
+    .LOG_CORES(2),
+    .MEM_DEPTH(32),
+    .DATA_WIDTH(16),
+    .PC_WIDTH(6),
+    .ADDR_WIDTH(5),
+    .INSTR_WIDTH(32),
+    .INSTR_DEPTH(16),
+    .IO_PINS(16),
+    .IO_PADS(`MPRJ_IO_PADS),
+    .FIRST_PAD(12),
+    .LOGIC_PROBES(128),
+    .WB_WIDTH(32)
+) mcu_inst (
+    .wb_clk_i(wb_clk_i),
+    .wb_rst_i(wb_rst_i),
+    .wb_stb_i(wbs_stb_i), // mcu-side port names are kept as in the original instantiation;
+    .wb_cyc_i(wbs_cyc_i), // only the connected nets change: this module declares wbs_* inputs,
+    .wb_we_i(wbs_we_i),   // and the previously referenced wb_stb_i, wb_cyc_i, wb_we_i, wb_adr_i
+    .wb_adr_i(wbs_adr_i), // and wb_dat_i do not exist here (an error under `default_nettype none)
+    .wb_dat_i(wbs_dat_i),
+    .wbs_ack_o(wbs_ack_o),
+    .wbs_dat_o(wbs_dat_o),
+    .la_data_in(la_data_in),
+    .la_data_out(la_data_out),
+    .la_oenb(la_oenb),
+    .io_in(io_in),
+    .io_out(io_out),
+    .io_oeb(io_oeb)
+);
+
+assign irq = 3'b000; // unused
+
+endmodule
+
+`default_nettype wire
+
diff --git a/verilog/rtl/user_project_wrapper.v b/verilog/rtl/user_project_wrapper.v
index 5ee1cee..5057915 100644
--- a/verilog/rtl/user_project_wrapper.v
+++ b/verilog/rtl/user_project_wrapper.v
@@ -29,9 +29,7 @@
*-------------------------------------------------------------
*/
-module user_project_wrapper #(
- parameter BITS = 32
-) (
+module user_project_wrapper (
`ifdef USE_POWER_PINS
inout vdda1, // User area 1 3.3V supply
inout vdda2, // User area 2 3.3V supply
@@ -82,7 +80,7 @@
/* User project is instantiated here */
/*--------------------------------------*/
-user_proj_example mprj (
+user_project mprj (
`ifdef USE_POWER_PINS
.vccd1(vccd1), // User area 1 1.8V power
.vssd1(vssd1), // User area 1 digital ground
@@ -121,3 +119,4 @@
endmodule // user_project_wrapper
`default_nettype wire
+
diff --git a/verilog/rtl/wb_mux.v b/verilog/rtl/wb_mux.v
new file mode 100644
index 0000000..e24244e
--- /dev/null
+++ b/verilog/rtl/wb_mux.v
@@ -0,0 +1,99 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2021 Tamas Hubai
+
+`default_nettype none
+
+/*
+Wishbone multiplexer to process messages from Caravel
+
+We use wishbone in classic mode with the simplest possible interface:
+- all operations complete in a single cycle
+- input is valid if STB_I && CYC_I is asserted
+- for valid inputs, ACK_O is held asserted
+- if WE_I is asserted, a write operation is performed using ADR_I and DAT_I
+- if WE_I is negated, a read operation is performed using ADR_I with the result in DAT_O
+- all other ports are unused
+
+The wishbone bus width (WB_WIDTH below) is fixed to 32 by the platform and our code
+assumes that all other widths fit into it.
+
+This module (like other muxes in this project) is fully combinatorial.
+Registered logic happens in connected cpu cores, instruction memories and the entropy pool.
+Therefore CLK_I and RST_I are not directly used here. However, they are used in the
+parent module as the main clock and reset signals and thus affect the modules
+connected to the other interfaces.
+*/
+
+module wb_mux #(parameter
+    LOG_CORES=3,
+    PC_WIDTH=8,
+    INSTR_WIDTH=32,
+    DATA_WIDTH=16,
+    IO_PINS=16,
+    WB_WIDTH=32
+) (
+    // wishbone interface
+    //input wb_clk_i, // wb clock
+    //input wb_rst_i, // wb reset, active high
+    input wb_stb_i, // wb strobe signal
+    input wb_cyc_i, // wb cycle signal, sending on the bus requires wb_stb_i && wb_cyc_i
+    input wb_we_i, // wb write enable signal, 0=read 1=write
+    input [WB_WIDTH-1:0] wb_adr_i, // wb address; the top two bits select the target interface
+    input [WB_WIDTH-1:0] wb_dat_i, // wb input data
+    output wbs_ack_o, // wb acknowledge
+    output [WB_WIDTH-1:0] wbs_dat_o, // wb output data
+    // programmer interface
+    output prog_we,
+    output [LOG_CORES-1:0] prog_sel,
+    output [PC_WIDTH-1:0] prog_waddr,
+    output [INSTR_WIDTH-1:0] prog_wdata,
+    // pads & soft reset interface
+    output pads_we,
+    output pads_waddr,
+    output [IO_PINS-1:0] pads_wdata,
+    // debugger interface
+    output [LOG_CORES-1:0] debug_sel,
+    output [4:0] debug_addr,
+    output debug_we,
+    output [DATA_WIDTH-1:0] debug_wdata,
+    input [DATA_WIDTH-1:0] debug_rdata,
+    // entropy pool interface
+    output [WB_WIDTH-1:0] entropy_word
+);
+
+// minimal wishbone logic: acknowledge combinationally whenever the access is valid
+wire valid = wb_stb_i && wb_cyc_i;
+assign wbs_ack_o = valid;
+
+// interface selection (named iface_sel because "interface" is a reserved word in SystemVerilog)
+wire [1:0] iface_sel = wb_adr_i[WB_WIDTH-2 +: 2];
+wire if_prog = valid && iface_sel == 2'b00;
+wire if_pads = valid && iface_sel == 2'b01;
+wire if_debug = valid && iface_sel == 2'b10;
+wire if_entropy = valid && iface_sel == 2'b11;
+
+// programmer interface (write only): low address bits give {core select, instruction address}
+assign prog_we = if_prog && wb_we_i;
+assign {prog_sel, prog_waddr} = prog_we ? wb_adr_i[WB_WIDTH-3:0] : 0;
+assign prog_wdata = prog_we ? wb_dat_i : 0;
+
+// pads interface (write only): pads_waddr is one bit wide, so it receives address bit 0
+assign pads_we = if_pads && wb_we_i;
+assign pads_waddr = pads_we ? wb_adr_i[WB_WIDTH-3:0] : 0;
+assign pads_wdata = pads_we ? wb_dat_i : 0;
+
+// debugger interface, input: low address bits give {core select, 5-bit debug address}
+assign {debug_sel, debug_addr} = if_debug ? wb_adr_i[WB_WIDTH-3:0] : 0;
+assign debug_we = if_debug && wb_we_i;
+assign debug_wdata = debug_we ? wb_dat_i : 0;
+
+// debugger interface, output: the only readable source; every other access reads back zero
+assign wbs_dat_o = (if_debug && !wb_we_i) ? debug_rdata : 0;
+
+// entropy pool interface: forwards the written word, zero when no entropy write is in progress
+assign entropy_word = (if_entropy && wb_we_i) ? wb_dat_i : 0;
+
+endmodule
+
+`default_nettype wire
+