Add verilog sources
diff --git a/verilog/rtl/actfn.v b/verilog/rtl/actfn.v
new file mode 100644
index 0000000..f7e5214
--- /dev/null
+++ b/verilog/rtl/actfn.v
@@ -0,0 +1,223 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2022 Tamas Hubai
+
+`default_nettype none
+
+// activation functions used in the neural network
+
+
+// leaky ReLU: res = x for x >= 0 and x scaled by 2^-LEAK_SHIFT for x < 0
+// (combinational; x and res are NUM_WIDTH-bit two's complement fixed-point numbers)
+module leaky_relu_comb (
+    input [`NUM_WIDTH-1:0] x,
+    output [`NUM_WIDTH-1:0] res
+);
+// signed view of x so that >>> fills the vacated upper bits with the sign bit
+wire signed [`NUM_WIDTH-1:0] x_signed = x;
+wire [`NUM_WIDTH-1:0] x_leaked = x_signed >>> `LEAK_SHIFT;
+assign res = x[`NUM_WIDTH-1] ? x_leaked : x;
+endmodule
+
+
+// derivative of leaky ReLU: res = 1 for x >= 0 and 2^-LEAK_SHIFT for x < 0
+// (combinational; only the sign bit of x is inspected)
+module leaky_relu_diff_comb (
+    input [`NUM_WIDTH-1:0] x,
+    output [`NUM_WIDTH-1:0] res
+);
+
+// 1.0 in fixed point: a single one directly above the FRAC_WIDTH fractional bits
+wire [`NUM_WIDTH-1:0] slope_pos = {{(`INT_WIDTH-1){1'b0}}, 1'b1, {(`FRAC_WIDTH){1'b0}}};
+// 2^-LEAK_SHIFT: the same one moved LEAK_SHIFT positions to the right
+wire [`NUM_WIDTH-1:0] slope_neg = {{(`INT_WIDTH+`LEAK_SHIFT-1){1'b0}}, 1'b1, {(`FRAC_WIDTH-`LEAK_SHIFT){1'b0}}};
+
+assign res = x[`NUM_WIDTH-1] ? slope_neg : slope_pos;
+endmodule
+
+
+// very rough approximation of 2^x, used in softmax
+// (only the integer part of x is used: res is a single set bit, all non-sign bits when saturated, or zero when no bit position matches)
+module approx_exp_comb (
+    input [`NUM_WIDTH-1:0] x,
+    output [`NUM_WIDTH-1:0] res
+);
+// saturate when x is non-negative and its integer part exceeds INT_WIDTH-2, the largest exponent that still has a result bit below the sign bit
+wire saturated = ~x[`NUM_WIDTH-1] & (x[`NUM_WIDTH-1:`FRAC_WIDTH] > `INT_WIDTH - 2);
+// the sign bit of res is always clear; bit g has weight 2^(g-FRAC_WIDTH), so it is set when the integer part of x equals g-FRAC_WIDTH
+assign res[`NUM_WIDTH-1] = 1'b0;
+generate genvar g;
+for (g=0; g<`NUM_WIDTH-1; g=g+1) begin:g_exp
+    assign res[g] = saturated | (x[`NUM_WIDTH-1:`FRAC_WIDTH] == g - `FRAC_WIDTH);
+end
+endgenerate
+
+endmodule
+
+
+// piecewise linear approximation of 1/x, used in softmax
+// (x is normalised to a mantissa in [1, 2) via its leading set bit, the mantissa is inverted with one of four line segments, and the result is shifted back)
+module approx_inv_comb (
+    input [`NUM_WIDTH-1:0] x,   // assuming x > 0 (for x == 0 no leading bit is found and res is 0)
+    output [`NUM_WIDTH-1:0] res
+);
+// bnd[g]: some bit of x at position g or above is set; msb[g]: one-hot marker of the leading set bit; m: mantissa bits below the leading one
+wire [`NUM_WIDTH:0] bnd;
+wire [`NUM_WIDTH-1:0] msb;
+wire [`FRAC_WIDTH-1:0] m;
+
+assign bnd[`NUM_WIDTH] = 0;
+
+generate genvar g;
+
+for (g=`NUM_WIDTH-1; g>=0; g=g-1) begin:g_msb
+    assign bnd[g] = bnd[g+1] | x[g];
+    assign msb[g] = bnd[g] & ~bnd[g+1];
+end
+for (g=0; g<`NUM_WIDTH; g=g+1) begin:g_mant
+    wire [`FRAC_WIDTH-1:0] mc = msb[g] ? ({x, {(`FRAC_WIDTH){1'b0}}} >> g) : {(`FRAC_WIDTH){1'b0}};   // candidate mantissa if the leading bit is at g, else zero
+    wire [`FRAC_WIDTH-1:0] ms;   // OR of the candidates for positions 0..g
+    if (g==0) begin:i_mantz
+        assign ms = mc;
+    end else begin:i_mantnz
+        assign ms = g_mant[g-1].ms | mc;
+    end
+end
+assign m = g_mant[`NUM_WIDTH-1].ms;
+
+// m contains the input bit-shifted to within [1, 2), with its integer part (i.e. 1) removed
+// for 1 <= x < 1.25 we use 1/x ~= 115/64 - 51/64 x
+wire [`FRAC_WIDTH:0] minv_a = {7'd115, {(`FRAC_WIDTH-6){1'b0}}} - (({{(`FRAC_WIDTH){1'b0}}, 7'd51} * {7'd1, m}) >> 6);
+// for 1.25 <= x < 1.5 we use 1/x ~= 95/64 - 35/64 x
+wire [`FRAC_WIDTH:0] minv_b = {7'd95, {(`FRAC_WIDTH-6){1'b0}}} - (({{(`FRAC_WIDTH){1'b0}}, 7'd35} * {7'd1, m}) >> 6);
+// for 1.5 <= x < 1.75 we use 1/x ~= 157/128 - 3/8 x
+wire [`FRAC_WIDTH:0] minv_c = {8'd157, {(`FRAC_WIDTH-7){1'b0}}} - (({{(`FRAC_WIDTH){1'b0}}, 3'd3} * {3'd1, m}) >> 3);
+// for 1.75 <= x < 2 we use 1/x ~= 17/16 - 9/32 x
+wire [`FRAC_WIDTH:0] minv_d = {5'd17, {(`FRAC_WIDTH-4){1'b0}}} - (({{(`FRAC_WIDTH){1'b0}}, 5'd9} * {5'd1, m}) >> 5);
+wire [`FRAC_WIDTH:0] minv = m[`FRAC_WIDTH-1] ? (m[`FRAC_WIDTH-2] ? minv_d : minv_c) : (m[`FRAC_WIDTH-2] ? minv_b : minv_a);   // segment chosen by the top two mantissa bits
+// undo the normalisation: shift the mantissa reciprocal according to the leading bit position and OR the one-hot candidates together
+for (g=0; g<`NUM_WIDTH; g=g+1) begin:g_mrec
+    wire [`NUM_WIDTH-1:0] mrc = msb[g] ? ({{(`INT_WIDTH){1'b0}}, minv, {(`FRAC_WIDTH){1'b0}}} >> g) : {(`NUM_WIDTH){1'b0}};    
+    wire [`NUM_WIDTH-1:0] mrs;
+    if (g==0) begin:i_mrecz
+        assign mrs = mrc;
+    end else begin:i_mrecnz
+        assign mrs = g_mrec[g-1].mrs | mrc;
+    end
+end
+assign res = g_mrec[`NUM_WIDTH-1].mrs;
+
+endgenerate
+
+endmodule
+
+
+// softmax using approximated 2^x and 1/x
+// res[i] = 2^(x[i]-max(x)) / sum over j of 2^(x[j]-max(x)), built from approx_exp_comb, approx_inv_comb and mul_sat_comb
+module approx_softmax_comb (
+    input [`OUTPUT_SIZE*`NUM_WIDTH-1:0] x_pk,
+    output [`OUTPUT_SIZE*`NUM_WIDTH-1:0] res_pk
+);
+// element views of the packed input and output buses
+wire [`NUM_WIDTH-1:0] x[`OUTPUT_SIZE-1:0];
+wire [`NUM_WIDTH-1:0] res[`OUTPUT_SIZE-1:0];
+
+`UNPACK_ARRAY(`NUM_WIDTH, `OUTPUT_SIZE, x, x_pk)
+`PACK_ARRAY(`NUM_WIDTH, `OUTPUT_SIZE, res, res_pk)
+// maximum of the inputs; its position (argmax) is not needed here
+wire [`NUM_WIDTH-1:0] xmax;
+wire [`INDEX_WIDTH-1:0] _ignore;
+
+max_comb i_max (
+    .x_pk,
+    .res_val(xmax),
+    .res_pos(_ignore)
+);
+// dexp[i] = approximate 2^(x[i]-xmax); esum = sum of all dexp, accumulated with plain (non-saturating) addition
+wire [`NUM_WIDTH-1:0] dexp[`OUTPUT_SIZE-1:0];
+wire [`NUM_WIDTH-1:0] esum;
+
+generate genvar g;
+
+for (g=0; g<`OUTPUT_SIZE; g=g+1) begin:g_expsum
+    wire [`NUM_WIDTH-1:0] diff;   // x[g] - xmax (saturating subtraction)
+    sub_sat_comb i_sub (
+        .a(x[g]),
+        .b(xmax),
+        .res(diff)
+    );
+    approx_exp_comb i_exp (
+        .x(diff),
+        .res(dexp[g])
+    );
+    wire [`NUM_WIDTH-1:0] psum;   // partial sum dexp[0] + ... + dexp[g]
+    if (g==0) begin
+        assign psum = dexp[g];
+    end else begin
+        assign psum = g_expsum[g-1].psum + dexp[g];
+    end
+end
+assign esum = g_expsum[`OUTPUT_SIZE-1].psum;
+
+endgenerate
+// isum = approximate 1/esum
+wire [`NUM_WIDTH-1:0] isum;
+approx_inv_comb i_inv (
+    .x(esum),
+    .res(isum)
+);
+// normalise: res[i] = dexp[i] * isum
+generate
+
+for (g=0; g<`OUTPUT_SIZE; g=g+1) begin:g_div
+    mul_sat_comb i_mul (
+        .a(dexp[g]),
+        .b(isum),
+        .res(res[g])
+    );
+end
+
+endgenerate
+
+endmodule
+
+
+// derivative of softmax
+// res[i] = s[i] - s[i]^2 where s = approx_softmax_comb(x); product and difference use the saturating helpers
+module approx_softmax_diff_comb (
+    input [`OUTPUT_SIZE*`NUM_WIDTH-1:0] x_pk,
+    output [`OUTPUT_SIZE*`NUM_WIDTH-1:0] res_pk
+);
+
+// packed softmax result plus element views of it and of the output
+wire [`OUTPUT_SIZE*`NUM_WIDTH-1:0] sm_pk;
+wire [`NUM_WIDTH-1:0] sm[`OUTPUT_SIZE-1:0];
+wire [`NUM_WIDTH-1:0] res[`OUTPUT_SIZE-1:0];
+
+`UNPACK_ARRAY(`NUM_WIDTH, `OUTPUT_SIZE, sm, sm_pk)
+`PACK_ARRAY(`NUM_WIDTH, `OUTPUT_SIZE, res, res_pk)
+
+approx_softmax_comb i_sm (
+    .res_pk(sm_pk),
+    .x_pk(x_pk)
+);
+
+generate genvar g;
+
+for (g=0; g<`OUTPUT_SIZE; g=g+1) begin
+    wire [`NUM_WIDTH-1:0] sm_sq;
+    // res[g] = sm[g] - sm_sq
+    sub_sat_comb i_sub (
+        .res(res[g]),
+        .a(sm[g]),
+        .b(sm_sq)
+    );
+    // sm_sq = sm[g] * sm[g]
+    mul_sat_comb i_mul (
+        .res(sm_sq),
+        .a(sm[g]),
+        .b(sm[g])
+    );
+end
+endgenerate
+endmodule
+
diff --git a/verilog/rtl/actfn_tb.v b/verilog/rtl/actfn_tb.v
new file mode 100644
index 0000000..ba2c805
--- /dev/null
+++ b/verilog/rtl/actfn_tb.v
@@ -0,0 +1,253 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2022 Tamas Hubai
+
+`default_nettype none
+
+// testbenches for actfn.v
+
+
+// leaky ReLU
+// prints x and res for a positive input, a small negative input and a negative multiple of 2^LEAK_SHIFT
+module leaky_relu_comb_tb ();
+
+reg [`NUM_WIDTH-1:0] x;
+wire [`NUM_WIDTH-1:0] res;
+
+leaky_relu_comb dut (
+    .x,
+    .res
+);
+
+initial begin
+    $monitor("time %4t x %64b res %64b", $time, x, res);
+    x <= 3;
+    #10
+    x <= -3;
+    #10
+    x <= -3 << `LEAK_SHIFT;
+    #10 $finish;    // wait one more step so the last stimulus is applied and printed before the simulation ends
+end
+
+endmodule
+
+
+// derivative of leaky ReLU
+// prints x and res for a positive input, a small negative input and a negative multiple of 2^LEAK_SHIFT
+module leaky_relu_diff_comb_tb ();
+
+reg [`NUM_WIDTH-1:0] x;
+wire [`NUM_WIDTH-1:0] res;
+
+leaky_relu_diff_comb dut (
+    .x,
+    .res
+);
+
+initial begin
+    $monitor("time %4t x %64b res %64b", $time, x, res);
+    x <= 3;
+    #10
+    x <= -3;
+    #10
+    x <= -3 << `LEAK_SHIFT;
+    #10 $finish;    // wait one more step so the last stimulus is applied and printed before the simulation ends
+end
+
+endmodule
+
+
+// very rough approximation of 2^x, used in softmax
+// prints x and res for a tiny fraction, +3, -3 and the exponents around both ends of the representable range
+module approx_exp_comb_tb ();
+
+reg [`NUM_WIDTH-1:0] x;
+wire [`NUM_WIDTH-1:0] res;
+
+approx_exp_comb dut (
+    .x,
+    .res
+);
+
+initial begin
+    $monitor("time %4t x %64b res %64b", $time, x, res);
+    x <= 3;
+    #10
+    x <= 3 << `FRAC_WIDTH;
+    #10
+    x <= -3 << `FRAC_WIDTH;
+    #10
+    x <= (`INT_WIDTH-2) << `FRAC_WIDTH;     // largest exponent that does not saturate
+    #10
+    x <= (`INT_WIDTH-1) << `FRAC_WIDTH;     // first saturating exponent
+    #10
+    x <= (-`FRAC_WIDTH) << `FRAC_WIDTH;     // smallest exponent with a result bit
+    #10
+    x <= (-`FRAC_WIDTH-1) << `FRAC_WIDTH;   // one below: no result bit matches
+    #10 $finish;    // wait one more step so the last stimulus is applied and printed before the simulation ends
+end
+
+endmodule
+
+
+// piecewise linear approximation of 1/x, used in softmax
+// prints x, res and the internal signals dut.m / dut.minv (the %24b / %25b widths correspond to FRAC_WIDTH = 24) for powers of two and the integers 8..15
+module approx_inv_comb_tb ();
+
+reg [`NUM_WIDTH-1:0] x;
+wire [`NUM_WIDTH-1:0] res;
+
+approx_inv_comb dut (
+    .x,
+    .res
+);
+
+initial begin
+    $monitor("time %4t x %64b res %64b m 1%24b minv %25b", $time, x, res, dut.m, dut.minv);
+    x <= 1;                         // smallest positive value
+    #10
+    x <= 1 << `FRAC_WIDTH;          // 1.0
+    #10
+    x <= 1 << (`FRAC_WIDTH + 2);    // 4.0
+    #10
+    x <= 1 << (`FRAC_WIDTH - 3);    // 0.125
+    #10
+    x <= 4'b1000 << `FRAC_WIDTH;    // 8..15: walks the mantissa through 1.000 .. 1.875
+    #10
+    x <= 4'b1001 << `FRAC_WIDTH;
+    #10
+    x <= 4'b1010 << `FRAC_WIDTH;
+    #10
+    x <= 4'b1011 << `FRAC_WIDTH;
+    #10
+    x <= 4'b1100 << `FRAC_WIDTH;
+    #10
+    x <= 4'b1101 << `FRAC_WIDTH;
+    #10
+    x <= 4'b1110 << `FRAC_WIDTH;
+    #10
+    x <= 4'b1111 << `FRAC_WIDTH;
+    #10
+    $finish;
+end
+
+endmodule
+
+
+// softmax using approximated 2^x and 1/x
+// prints the low 32 bits of the first three inputs and outputs (NOTE(review): indices 1 and 2 assume OUTPUT_SIZE >= 3 - confirm the configuration used)
+module approx_softmax_comb_tb ();
+
+reg [`NUM_WIDTH-1:0] x[`OUTPUT_SIZE-1:0];
+wire [`NUM_WIDTH-1:0] res[`OUTPUT_SIZE-1:0];
+
+wire [`OUTPUT_SIZE*`NUM_WIDTH-1:0] x_pk;
+`PACK_ARRAY(`NUM_WIDTH, `OUTPUT_SIZE, x, x_pk)
+
+wire [`OUTPUT_SIZE*`NUM_WIDTH-1:0] res_pk;
+`UNPACK_ARRAY(`NUM_WIDTH, `OUTPUT_SIZE, res, res_pk)
+
+approx_softmax_comb dut (
+    .x_pk,
+    .res_pk
+);
+
+wire [`NUM_WIDTH-1:0] res0 = res[0];    // workaround for segfault in vvp
+wire [`NUM_WIDTH-1:0] res1 = res[1];
+wire [`NUM_WIDTH-1:0] res2 = res[2];
+
+reg [`INDEX_WIDTH-1:0] i;
+wire [31:0] hint = 32'b1 << `FRAC_WIDTH;   // bit pattern of 1.0, printed as a column guide
+
+initial begin
+    $display("TIME vvvv X[0] %32b X[1] %32b X[2] %32b RES[0] %32b RES[1] %32b RES[2] %32b", hint, hint, hint, hint, hint, hint);
+    $monitor("time %4t x[0] %32b x[1] %32b x[2] %32b res[0] %32b res[1] %32b res[2] %32b", $time, x[0][31:0], x[1][31:0], x[2][31:0], res0[31:0], res1[31:0], res2[31:0]);
+    for(i=0; i<`OUTPUT_SIZE; i=i+1) begin
+        x[i] <= 0;
+    end
+    x[0] <= 0;
+    x[1] <= 0;
+    x[2] <= 0;
+    #10
+    x[0] <= 1 << (`FRAC_WIDTH-4);
+    x[1] <= 2 << (`FRAC_WIDTH-4);
+    x[2] <= 3 << (`FRAC_WIDTH-4);
+    #10
+    x[0] <= 1 << `FRAC_WIDTH;
+    #10
+    x[1] <= 1 << `FRAC_WIDTH;
+    #10
+    x[2] <= 2 << `FRAC_WIDTH;
+    #10
+    x[1] <= 2 << `FRAC_WIDTH;
+    #10
+    x[0] <= 2 << `FRAC_WIDTH;
+    #10
+    x[0] <= 1 << (`FRAC_WIDTH+4);
+    #10
+    x[1] <= 2 << (`FRAC_WIDTH+4);
+    #10
+    x[2] <= 3 << (`FRAC_WIDTH+4);
+    #10 $finish;    // wait one more step so the last stimulus is applied and printed before the simulation ends
+end
+
+endmodule
+
+
+// derivative of softmax
+// prints the low 32 bits of the first three inputs and outputs (NOTE(review): indices 1 and 2 assume OUTPUT_SIZE >= 3 - confirm the configuration used)
+module approx_softmax_diff_comb_tb ();
+
+reg [`NUM_WIDTH-1:0] x[`OUTPUT_SIZE-1:0];
+wire [`NUM_WIDTH-1:0] res[`OUTPUT_SIZE-1:0];
+
+wire [`OUTPUT_SIZE*`NUM_WIDTH-1:0] x_pk;
+`PACK_ARRAY(`NUM_WIDTH, `OUTPUT_SIZE, x, x_pk)
+wire [`OUTPUT_SIZE*`NUM_WIDTH-1:0] res_pk;
+`UNPACK_ARRAY(`NUM_WIDTH, `OUTPUT_SIZE, res, res_pk)
+
+approx_softmax_diff_comb dut (
+    .x_pk,
+    .res_pk
+);
+
+wire [`NUM_WIDTH-1:0] res0 = res[0];    // workaround for segfault in vvp
+wire [`NUM_WIDTH-1:0] res1 = res[1];
+wire [`NUM_WIDTH-1:0] res2 = res[2];
+
+reg [`INDEX_WIDTH-1:0] i;
+wire [31:0] hint = 32'b1 << `FRAC_WIDTH;   // bit pattern of 1.0, printed as a column guide
+
+initial begin
+    $display("TIME vvvv X[0] %32b X[1] %32b X[2] %32b RES[0] %32b RES[1] %32b RES[2] %32b", hint, hint, hint, hint, hint, hint);
+    $monitor("time %4t x[0] %32b x[1] %32b x[2] %32b res[0] %32b res[1] %32b res[2] %32b", $time, x[0][31:0], x[1][31:0], x[2][31:0], res0[31:0], res1[31:0], res2[31:0]);
+    for(i=0; i<`OUTPUT_SIZE; i=i+1) begin
+        x[i] <= 0;
+    end
+    x[0] <= 0;
+    x[1] <= 0;
+    x[2] <= 0;
+    #10
+    x[0] <= 1 << (`FRAC_WIDTH-4);
+    x[1] <= 2 << (`FRAC_WIDTH-4);
+    x[2] <= 3 << (`FRAC_WIDTH-4);
+    #10
+    x[0] <= 1 << `FRAC_WIDTH;
+    #10
+    x[1] <= 1 << `FRAC_WIDTH;
+    #10
+    x[2] <= 2 << `FRAC_WIDTH;
+    #10
+    x[1] <= 2 << `FRAC_WIDTH;
+    #10
+    x[0] <= 2 << `FRAC_WIDTH;
+    #10
+    x[0] <= 1 << (`FRAC_WIDTH+4);
+    #10
+    x[1] <= 2 << (`FRAC_WIDTH+4);
+    #10
+    x[2] <= 3 << (`FRAC_WIDTH+4);
+    #10 $finish;    // wait one more step so the last stimulus is applied and printed before the simulation ends
+end
+
+endmodule
+
diff --git a/verilog/rtl/config.v b/verilog/rtl/config.v
new file mode 100644
index 0000000..e2c80d1
--- /dev/null
+++ b/verilog/rtl/config.v
@@ -0,0 +1,36 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2022 Tamas Hubai
+
+`default_nettype none
+
+// numbers are represented as fixed-point fractions
+// with an integral part of INT_WIDTH bits
+// and a fractional part of FRAC_WIDTH bits
+// (64 bits are overkill here, but simulations were run on a 64-bit platform)
+`define INT_WIDTH 40
+`define FRAC_WIDTH 24
+`define NUM_WIDTH (`INT_WIDTH + `FRAC_WIDTH)
+
+// multiplication is the main bottleneck in the circuit complexity
+// so we reduce integer & fractional widths for multiplications
+// (this didn't significantly affect learning speed in our tests)
+
+`define MUL_INT_WIDTH 6
+`define PRE_MUL_FRAC_WIDTH 12
+`define POST_MUL_FRAC_WIDTH 16
+
+// number of neurons in input, hidden 1, hidden 2 & output layers
+`define INPUT_SIZE 1
+`define HIDDEN1_SIZE 1
+`define HIDDEN2_SIZE 1
+`define OUTPUT_SIZE 1
+
+// bits required to describe the sizes above
+`define INDEX_WIDTH 10
+
+// power of 1/2 used as the slope of leaky ReLU's negative part
+`define LEAK_SHIFT 7
+
+// power of 1/2 used as the learning rate
+`define LEARN_SHIFT 7
+
diff --git a/verilog/rtl/interface.v b/verilog/rtl/interface.v
new file mode 100644
index 0000000..4b4abab
--- /dev/null
+++ b/verilog/rtl/interface.v
@@ -0,0 +1,140 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2022 Tamas Hubai
+
+`default_nettype none
+
+// wrapper for neural_network that allows
+// - setting initial weights, inputs & ground truth
+// - retrieving outputs
+// - starting forward or backward propagation
+// - detecting when forward or backward propagation finishes
+// by using memory i/o within a single virtual address space
+// (addr[23:20] selects the region, addr[19:10] and addr[9:0] are indices; reads are registered into data_out on the clock edge)
+module neural_interface (
+    input clk,
+    input [23:0] addr,                      // region select + two 10-bit indices
+    input [`NUM_WIDTH-1:0] data_in,         // write data, used when we is high
+    input we,                               // write enable
+    output reg [`NUM_WIDTH-1:0] data_out    // read data, zero for unmapped addresses
+);
+// connections to the neural_network instance; the regs are driven by the always block below
+reg fp;
+wire fp_out;
+reg [`NUM_WIDTH-1:0] a0[`INPUT_SIZE-1:0];
+wire [`NUM_WIDTH-1:0] a3[`OUTPUT_SIZE-1:0];
+reg bp;
+wire bp_out;
+reg [`NUM_WIDTH-1:0] g3[`OUTPUT_SIZE-1:0];
+reg wu;
+reg [1:0] w_layer;
+reg [`INDEX_WIDTH-1:0] w_i;
+reg [`INDEX_WIDTH-1:0] w_j;
+reg [`NUM_WIDTH-1:0] w_in;
+wire [`NUM_WIDTH-1:0] w_out;
+// packed bus versions of the arrays above
+wire [`INPUT_SIZE*`NUM_WIDTH-1:0] a0_pk;
+`PACK_ARRAY(`NUM_WIDTH, `INPUT_SIZE, a0, a0_pk)
+wire [`OUTPUT_SIZE*`NUM_WIDTH-1:0] a3_pk;
+`UNPACK_ARRAY(`NUM_WIDTH, `OUTPUT_SIZE, a3, a3_pk)
+wire [`OUTPUT_SIZE*`NUM_WIDTH-1:0] g3_pk;
+`PACK_ARRAY(`NUM_WIDTH, `OUTPUT_SIZE, g3, g3_pk)
+
+neural_network i_nn (
+    .clk,
+    .fp,
+    .fp_out,
+    .a0_pk,
+    .a3_pk,
+    .bp,
+    .bp_out,
+    .g3_pk,
+    .wu,
+    .w_layer,
+    .w_i,
+    .w_j,
+    .w_in,
+    .w_out
+);
+// index of the largest network output, readable at region 7 / index 2
+wire [`NUM_WIDTH-1:0] _ignore_a3;
+wire [`INDEX_WIDTH-1:0] argmax_a3;
+max_comb i_max_a3 (
+    .x_pk(a3_pk),
+    .res_val(_ignore_a3),
+    .res_pos(argmax_a3)
+);
+// index of the largest ground truth entry, readable at region 7 / index 3
+wire [`NUM_WIDTH-1:0] _ignore_g3;
+wire [`INDEX_WIDTH-1:0] argmax_g3;
+max_comb i_max_g3 (
+    .x_pk(g3_pk),
+    .res_val(_ignore_g3),
+    .res_pos(argmax_g3)
+);
+// sticky "finished" flags: once fp_out / bp_out has been seen it stays set until zero is written to the matching status address
+reg fp_out_hold;
+reg bp_out_hold;
+wire fp_out_hn = fp_out_hold | fp_out;
+wire bp_out_hn = bp_out_hold | bp_out;
+// address decoding
+wire [3:0] sel = addr[23:20];
+wire [9:0] index_h = addr[19:10];
+wire [9:0] index_l = addr[9:0];
+
+always @(posedge clk) begin
+    fp <= 0;                                // fp, bp and wu are one-cycle pulses
+    bp <= 0;
+    wu <= 0;
+    fp_out_hold <= fp_out_hn;               // keep the finished flags sticky (may be overridden below)
+    bp_out_hold <= bp_out_hn;
+    data_out <= {(`NUM_WIDTH){1'b0}};       // default read value
+    if (sel[3:2]==0) begin                  // regions 0-3: weight access, region number = w_layer
+        w_layer <= sel[1:0];
+        w_i <= index_h;
+        w_j <= index_l;
+        data_out <= w_out;                  // NOTE(review): w_out presumably follows the previously latched w_layer/w_i/w_j, so a read needs the address held for an extra cycle - confirm
+        if (we) begin
+            w_in <= data_in;
+            wu <= 1;
+        end
+    end else if (sel==4) begin              // region 4: network inputs a0 (read / write)
+        data_out <= a0[index_l];
+        if (we) begin
+            a0[index_l] <= data_in;
+        end 
+    end else if (sel==5) begin              // region 5: network outputs a3 (read only)
+        data_out <= a3[index_l];
+    end else if (sel==6) begin              // region 6: ground truth g3 (read / write)
+        data_out <= g3[index_l];
+        if (we) begin
+            g3[index_l] <= data_in;
+        end
+    end else if (sel==7) begin              // region 7: control / status
+        if (index_l == 0) begin             // forward propagation: reads all ones once finished
+            data_out <= {(`NUM_WIDTH){fp_out_hn}};
+            if (we) begin
+                if (|data_in) begin
+                    fp <= 1;                // nonzero write starts forward propagation
+                end else begin
+                    fp_out_hold <= 0;       // zero write clears the finished flag
+                end
+            end
+        end else if (index_l == 1) begin    // backward propagation: same protocol as above
+            data_out <= {(`NUM_WIDTH){bp_out_hn}};
+            if (we) begin
+                if (|data_in) begin
+                    bp <= 1;
+                end else begin
+                    bp_out_hold <= 0;
+                end
+            end
+        end else if (index_l == 2) begin    // argmax of the outputs, placed in the integer part
+            data_out <= argmax_a3 << `FRAC_WIDTH;
+        end else if (index_l == 3) begin    // argmax of the ground truth, placed in the integer part
+            data_out <= argmax_g3 << `FRAC_WIDTH;
+        end
+    end
+end
+
+endmodule
+
diff --git a/verilog/rtl/interface_tb.v b/verilog/rtl/interface_tb.v
new file mode 100644
index 0000000..db60498
--- /dev/null
+++ b/verilog/rtl/interface_tb.v
@@ -0,0 +1,135 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2022 Tamas Hubai
+
+`default_nettype none
+
+// testbench for the single neural_interface module in interface.v
+// (drives the memory-mapped interface with a manually toggled clock and prints every change of the bus signals)
+module neural_interface_tb ();
+
+reg clk;
+reg [23:0] addr;
+reg [`NUM_WIDTH-1:0] data_in;
+reg we;
+wire [`NUM_WIDTH-1:0] data_out;
+
+neural_interface dut (
+    .clk,
+    .addr,
+    .data_in,
+    .we,
+    .data_out
+);
+// initial state, set through hierarchical references into the design
+generate genvar i; genvar j;
+
+for (i=0; i<`INPUT_SIZE; i=i+1) begin
+    for (j=0; j<`HIDDEN1_SIZE; j=j+1) begin
+        initial begin
+            dut.i_nn.g_syn01_o[i].g_syn01_i[j].i_syn01.w <= 0;   // presumably the synapse weight register - zeroed for every synapse
+        end
+    end
+end
+for (i=0; i<`HIDDEN1_SIZE; i=i+1) begin
+    for (j=0; j<`HIDDEN2_SIZE; j=j+1) begin
+        initial begin
+            dut.i_nn.g_syn12_o[i].g_syn12_i[j].i_syn12.w <= 0;
+        end
+    end
+end
+for (i=0; i<`HIDDEN2_SIZE; i=i+1) begin
+    for (j=0; j<`OUTPUT_SIZE; j=j+1) begin
+        initial begin
+            dut.i_nn.g_syn23_o[i].g_syn23_i[j].i_syn23.w <= 0;
+        end
+    end
+end
+for (i=0; i<`INPUT_SIZE; i=i+1) begin
+    initial begin
+        dut.a0[i] <= 1;                       // every network input starts at raw value 1
+    end
+end
+for (i=0; i<`OUTPUT_SIZE; i=i+1) begin
+    initial begin
+        dut.i_nn.g_layer3[i].i_neu_3.a <= 2;  // internal register of each output neuron starts at raw value 2
+        dut.g3[i] <= 3;                       // every ground truth entry starts at raw value 3
+    end
+end
+
+endgenerate
+
+initial begin
+    clk <= 0;
+    we <= 0;
+    $monitor("time %4t addr %24b data_in %64b we %1b data_out %64b", $time, addr, data_in, we, data_out);
+    #5 clk<=1; #5 clk<=0;
+    addr <= 24'b0010_0000000111_0000000011;     // region 2 (weights of layer 2), w_i = 7, w_j = 3
+    #5 clk<=1; #5 clk<=0;
+    data_in <= 15;                              // write 15 to that weight
+    we <= 1;
+    #5 clk<=1; #5 clk<=0;
+    we <= 0;
+    #5 clk<=1; #5 clk<=0;
+    #5 clk<=1; #5 clk<=0;
+    addr <= 24'b0100_0000000000_0000000101;     // region 4 (inputs), index 5 - NOTE(review): outside the arrays when the sizes in config.v are 1
+    data_in <= 33;
+    we <= 1;
+    #5 clk<=1; #5 clk<=0;
+    we <= 0;
+    #5 clk<=1; #5 clk<=0;
+    addr <= 24'b0101_0000000000_0000000101;     // region 5 (outputs): the interface has no write path here
+    data_in <= 17;
+    we <= 1;
+    #5 clk<=1; #5 clk<=0;
+    we <= 0;
+    #5 clk<=1; #5 clk<=0;
+    addr <= 24'b0110_0000000000_0000000101;     // region 6 (ground truth), index 5
+    data_in <= 9;
+    we <= 1;
+    #5 clk<=1; #5 clk<=0;
+    we <= 0;
+    #5 clk<=1; #5 clk<=0;
+    dut.fp_out_hold <= 0;                       // the flag has no reset, so it is forced to a known value
+    addr <= 24'b0111_0000000000_0000000000;     // region 7, index 0: forward propagation control / status
+    data_in <= 1;                               // nonzero write starts forward propagation
+    we <= 1;
+    #5 clk<=1; #5 clk<=0;
+    we <= 0;
+    #5 clk<=1; #5 clk<=0;
+    #5 clk<=1; #5 clk<=0;
+    #5 clk<=1; #5 clk<=0;
+    #5 clk<=1; #5 clk<=0;
+    #5 clk<=1; #5 clk<=0;
+    #5 clk<=1; #5 clk<=0;
+    data_in <= 0;                               // zero write clears the finished flag
+    we <= 1;
+    #5 clk<=1; #5 clk<=0;
+    we <= 0;
+    #5 clk<=1; #5 clk<=0;
+    dut.bp_out_hold <= 0;                       // same procedure for backward propagation
+    addr <= 24'b0111_0000000000_0000000001;     // region 7, index 1: backward propagation control / status
+    data_in <= 1;
+    we <= 1;
+    #5 clk<=1; #5 clk<=0;
+    we <= 0;
+    #5 clk<=1; #5 clk<=0;
+    #5 clk<=1; #5 clk<=0;
+    #5 clk<=1; #5 clk<=0;
+    #5 clk<=1; #5 clk<=0;
+    #5 clk<=1; #5 clk<=0;
+    #5 clk<=1; #5 clk<=0;
+    data_in <= 0;
+    we <= 1;
+    #5 clk<=1; #5 clk<=0;
+    we <= 0;
+    #5 clk<=1; #5 clk<=0;
+    addr <= 24'b0111_0000000000_0000000010;     // region 7, index 2: argmax of the outputs
+    #5 clk<=1; #5 clk<=0;
+    addr <= 24'b0111_0000000000_0000000011;     // region 7, index 3: argmax of the ground truth
+    #5 clk<=1; #5 clk<=0;
+    #5 clk<=1; #5 clk<=0;
+    $finish;
+end
+
+endmodule
+
diff --git a/verilog/rtl/macros.v b/verilog/rtl/macros.v
new file mode 100644
index 0000000..26d3132
--- /dev/null
+++ b/verilog/rtl/macros.v
@@ -0,0 +1,21 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2022 Tamas Hubai
+
+`default_nettype none
+
+// macros for passing bus arrays to modules: PACK_ARRAY(WIDTH,LEN,SRC,DEST) flattens the array SRC (LEN words of WIDTH bits) into the bus DEST, UNPACK_ARRAY(WIDTH,LEN,DEST,SRC) splits the bus SRC into the array DEST; each use declares a genvar named pa_<SRC> or ua_<DEST>
+`define PACK_ARRAY_INTERNAL(WIDTH,LEN,SRC,DEST,VAR) \
+    generate genvar VAR; \
+    for (VAR=0; VAR<(LEN); VAR=VAR+1) begin \
+        assign DEST[((WIDTH)*VAR+((WIDTH)-1)):((WIDTH)*VAR)] = SRC[VAR][((WIDTH)-1):0]; \
+    end \
+    endgenerate
+`define PACK_ARRAY(WIDTH,LEN,SRC,DEST) `PACK_ARRAY_INTERNAL(WIDTH,LEN,SRC,DEST,pa_``SRC)
+`define UNPACK_ARRAY_INTERNAL(WIDTH,LEN,DEST,SRC,VAR) \
+    generate genvar VAR; \
+    for (VAR=0; VAR<(LEN); VAR=VAR+1) begin \
+        assign DEST[VAR][((WIDTH)-1):0] = SRC[((WIDTH)*VAR+((WIDTH)-1)):((WIDTH)*VAR)]; \
+    end \
+    endgenerate
+`define UNPACK_ARRAY(WIDTH,LEN,DEST,SRC) `UNPACK_ARRAY_INTERNAL(WIDTH,LEN,DEST,SRC,ua_``DEST)
+
diff --git a/verilog/rtl/math.v b/verilog/rtl/math.v
new file mode 100644
index 0000000..a34db07
--- /dev/null
+++ b/verilog/rtl/math.v
@@ -0,0 +1,113 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2022 Tamas Hubai
+
+`default_nettype none
+
+// basic arithmetics with saturation
+
+
+// addition
+// res = a + b, clamped to the largest / smallest representable value on signed overflow
+
+module add_sat_comb (
+    input [`NUM_WIDTH-1:0] a,
+    input [`NUM_WIDTH-1:0] b,
+    output [`NUM_WIDTH-1:0] res
+);
+
+// exact sum with one extra bit: both operands are sign-extended by one position
+wire [`NUM_WIDTH:0] wide = {a[`NUM_WIDTH-1], a} + {b[`NUM_WIDTH-1], b};
+wire sig_sum = wide[`NUM_WIDTH];
+// overflow iff the extra sign bit disagrees with the sign bit of the truncated sum
+wire saturated = sig_sum ^ wide[`NUM_WIDTH-1];
+// clamp value: the true sign followed by its complement in every lower position
+wire [`NUM_WIDTH-1:0] limit = {sig_sum, {(`NUM_WIDTH-1){~sig_sum}}};
+assign res = saturated ? limit : wide[`NUM_WIDTH-1:0];
+endmodule
+
+
+// subtraction
+// res = a - b, clamped to the largest / smallest representable value on signed overflow
+
+module sub_sat_comb (
+    input [`NUM_WIDTH-1:0] a,
+    input [`NUM_WIDTH-1:0] b,
+    output [`NUM_WIDTH-1:0] res
+);
+
+// exact difference with one extra bit: both operands are sign-extended by one position
+wire [`NUM_WIDTH:0] wide = {a[`NUM_WIDTH-1], a} - {b[`NUM_WIDTH-1], b};
+wire sig_sum = wide[`NUM_WIDTH];
+// overflow iff the extra sign bit disagrees with the sign bit of the truncated difference
+wire saturated = sig_sum ^ wide[`NUM_WIDTH-1];
+// clamp value: the true sign followed by its complement in every lower position
+wire [`NUM_WIDTH-1:0] limit = {sig_sum, {(`NUM_WIDTH-1){~sig_sum}}};
+assign res = saturated ? limit : wide[`NUM_WIDTH-1:0];
+endmodule
+
+
+// multiplication
+// res = a * b
+// (operands are first clamped and truncated to MUL_INT_WIDTH integer and PRE_MUL_FRAC_WIDTH fractional bits; the result keeps POST_MUL_FRAC_WIDTH fractional bits)
+module mul_sat_comb (
+    input [`NUM_WIDTH-1:0] a,
+    input [`NUM_WIDTH-1:0] b,
+    output [`NUM_WIDTH-1:0] res
+);
+// sat_x: the bits of x from position MUL_INT_WIDTH+FRAC_WIDTH-1 upwards are not all equal, i.e. x does not fit the reduced range
+wire sig_a = a[`NUM_WIDTH-1];
+wire sig_b = b[`NUM_WIDTH-1];
+wire sat_a = |a[`NUM_WIDTH-1:`MUL_INT_WIDTH+`FRAC_WIDTH-1] & ~&a[`NUM_WIDTH-1:`MUL_INT_WIDTH+`FRAC_WIDTH-1];
+wire sat_b = |b[`NUM_WIDTH-1:`MUL_INT_WIDTH+`FRAC_WIDTH-1] & ~&b[`NUM_WIDTH-1:`MUL_INT_WIDTH+`FRAC_WIDTH-1];
+wire [`MUL_INT_WIDTH+`PRE_MUL_FRAC_WIDTH-1:0] short_a = sat_a ? {sig_a, {(`MUL_INT_WIDTH+`PRE_MUL_FRAC_WIDTH-1){~sig_a}}} : a[`MUL_INT_WIDTH+`FRAC_WIDTH-1:`FRAC_WIDTH-`PRE_MUL_FRAC_WIDTH];   // clamped or truncated a
+wire [`MUL_INT_WIDTH+`PRE_MUL_FRAC_WIDTH-1:0] short_b = sat_b ? {sig_b, {(`MUL_INT_WIDTH+`PRE_MUL_FRAC_WIDTH-1){~sig_b}}} : b[`MUL_INT_WIDTH+`FRAC_WIDTH-1:`FRAC_WIDTH-`PRE_MUL_FRAC_WIDTH];   // clamped or truncated b
+wire sig_mul;                                                   // top bit of the product
+wire [`MUL_INT_WIDTH-1:0] mul_hi;                               // bits that must equal sig_mul unless the product overflows
+wire [`MUL_INT_WIDTH+`POST_MUL_FRAC_WIDTH-2:0] mul_md;          // bits kept in the result
+wire [2*`PRE_MUL_FRAC_WIDTH-`POST_MUL_FRAC_WIDTH-1:0] mul_lo;   // low fraction bits that are dropped
+assign {sig_mul, mul_hi, mul_md, mul_lo} = {{(`MUL_INT_WIDTH+`PRE_MUL_FRAC_WIDTH){sig_a}}, short_a} * {{(`MUL_INT_WIDTH+`PRE_MUL_FRAC_WIDTH){sig_b}}, short_b};
+wire saturated = |{sig_mul, mul_hi} & ~&{sig_mul, mul_hi};      // overflow: sign and mul_hi bits are not all equal
+assign res = saturated ? {sig_mul, {(`NUM_WIDTH-1){~sig_mul}}} : {{(`INT_WIDTH-`MUL_INT_WIDTH+1){sig_mul}}, mul_md, {(`FRAC_WIDTH-`POST_MUL_FRAC_WIDTH){1'b0}}};
+
+endmodule
+
+
+// maximum & argmax
+// res_val = max(x)
+// res_pos = argmax(x)
+// (signed comparison; when several elements share the maximum, res_pos is the highest such index)
+module max_comb (
+    input [`OUTPUT_SIZE*`NUM_WIDTH-1:0] x_pk,
+    output [`NUM_WIDTH-1:0] res_val,
+    output [`INDEX_WIDTH-1:0] res_pos
+);
+
+wire [`NUM_WIDTH-1:0] x[`OUTPUT_SIZE-1:0];
+`UNPACK_ARRAY(`NUM_WIDTH, `OUTPUT_SIZE, x, x_pk)
+// element g is a maximum when it is >= every element (itself included); all maxima carry the same value, so OR-ing them yields max(x)
+generate genvar g; genvar h;
+
+for (g=0; g<`OUTPUT_SIZE; g=g+1) begin:g_max_o
+    wire [`OUTPUT_SIZE-1:0] is_greater;   // bit h: x[g] >= x[h]
+    for (h=0; h<`OUTPUT_SIZE; h=h+1) begin:g_max_i
+        assign is_greater[h] = $signed(x[g]) >= $signed(x[h]);
+    end
+    wire is_max = &is_greater;
+    wire [`NUM_WIDTH-1:0] cur_val = is_max ? x[g] : {(`NUM_WIDTH){1'b0}};
+    wire [`NUM_WIDTH-1:0] max_val;        // OR of cur_val over elements 0..g
+    wire [`INDEX_WIDTH-1:0] pos;          // highest index <= g that is a maximum (0 if none so far)
+    if (g==0) begin
+        assign max_val = cur_val;
+        assign pos = 0;
+    end else begin
+        assign max_val = g_max_o[g-1].max_val | cur_val;
+        assign pos = is_max ? g : g_max_o[g-1].pos;
+    end
+end
+assign res_val = g_max_o[`OUTPUT_SIZE-1].max_val;
+assign res_pos = g_max_o[`OUTPUT_SIZE-1].pos;
+
+endgenerate
+
+endmodule
+
diff --git a/verilog/rtl/math_tb.v b/verilog/rtl/math_tb.v
new file mode 100644
index 0000000..242274a
--- /dev/null
+++ b/verilog/rtl/math_tb.v
@@ -0,0 +1,168 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2022 Tamas Hubai
+
+`default_nettype none
+
+// testbenches for math.v
+
+
+// addition
+// prints a, b and res for small operands and for operands with flipped sign bits that force positive / negative overflow
+module add_sat_comb_tb ();
+
+reg [`NUM_WIDTH-1:0] a;
+reg [`NUM_WIDTH-1:0] b;
+wire [`NUM_WIDTH-1:0] res;
+
+add_sat_comb dut (
+    .a,
+    .b,
+    .res
+);
+
+initial begin
+    $monitor("time %4t a %64b b %64b res %64b", $time, a, b, res);
+    a <= 1;
+    b <= 2;
+    #10
+    a <= -1;
+    b <= -2;
+    #10
+    a[`NUM_WIDTH-1] <= 0;   // clearing the sign bits turns both operands into huge positive values
+    b[`NUM_WIDTH-1] <= 0;
+    #10
+    b = 2;
+    b[`NUM_WIDTH-1] <= 1;   // most negative value plus 2
+    #10
+    a = 1;
+    a[`NUM_WIDTH-1] <= 1;   // most negative value plus 1
+    #10 $finish;    // wait one more step so the last stimulus is applied and printed before the simulation ends
+end
+
+endmodule
+
+
+// subtraction
+// prints a, b and res for small operands and for operands with flipped sign bits that exercise the overflow handling
+module sub_sat_comb_tb ();
+
+reg [`NUM_WIDTH-1:0] a;
+reg [`NUM_WIDTH-1:0] b;
+wire [`NUM_WIDTH-1:0] res;
+
+sub_sat_comb dut (
+    .a,
+    .b,
+    .res
+);
+
+initial begin
+    $monitor("time %4t a %64b b %64b res %64b", $time, a, b, res);
+    a <= 1;
+    b <= 2;
+    #10
+    a <= -1;
+    b <= -2;
+    #10
+    a[`NUM_WIDTH-1] <= 0;   // clearing the sign bits turns both operands into huge positive values
+    b[`NUM_WIDTH-1] <= 0;
+    #10
+    b = 2;
+    b[`NUM_WIDTH-1] <= 1;   // most negative value plus 2
+    #10
+    a = 1;
+    a[`NUM_WIDTH-1] <= 1;   // most negative value plus 1
+    #10 $finish;    // wait one more step so the last stimulus is applied and printed before the simulation ends
+end
+
+endmodule
+
+
+// multiplication
+// prints a, b and res for 1*2, (-1)*(-2) and for operands with flipped sign bits that exceed the reduced multiplier range
+module mul_sat_comb_tb ();
+
+reg [`NUM_WIDTH-1:0] a;
+reg [`NUM_WIDTH-1:0] b;
+wire [`NUM_WIDTH-1:0] res;
+
+mul_sat_comb dut (
+    .a,
+    .b,
+    .res
+);
+
+initial begin
+    $monitor("time %4t a %64b b %64b res %64b", $time, a, b, res);
+    a <= 1 << `FRAC_WIDTH;
+    b <= 2 << `FRAC_WIDTH;
+    #10
+    a <= -1 << `FRAC_WIDTH;
+    b <= -2 << `FRAC_WIDTH;
+    #10
+    a[`NUM_WIDTH-1] <= 0;   // clearing the sign bits turns both operands into huge positive values
+    b[`NUM_WIDTH-1] <= 0;
+    #10
+    b <= 2 << `FRAC_WIDTH;
+    b[`NUM_WIDTH-1] <= 1;   // the later nonblocking assignment to the sign bit wins: huge negative value
+    #10
+    a <= 1 << `FRAC_WIDTH;
+    a[`NUM_WIDTH-1] <= 1;
+    #10 $finish;    // wait one more step so the last stimulus is applied and printed before the simulation ends
+end
+
+endmodule
+
+
+// maximum & argmax
+// prints the low 32 bits of the first three inputs and of res_val plus res_pos (NOTE(review): indices 1 and 2 assume OUTPUT_SIZE >= 3 - confirm the configuration used)
+module max_comb_tb ();
+
+reg [`NUM_WIDTH-1:0] x[`OUTPUT_SIZE-1:0];
+wire [`NUM_WIDTH-1:0] res_val;
+wire [`INDEX_WIDTH-1:0] res_pos;
+
+wire [`OUTPUT_SIZE*`NUM_WIDTH-1:0] x_pk;
+`PACK_ARRAY(`NUM_WIDTH, `OUTPUT_SIZE, x, x_pk)
+
+max_comb dut (
+    .x_pk,
+    .res_val,
+    .res_pos
+);
+
+reg [`INDEX_WIDTH-1:0] i;
+
+initial begin
+    $monitor("time %4t x[0] %32b x[1] %32b x[2] %32b res_val %32b res_pos %10b", $time, x[0][31:0], x[1][31:0], x[2][31:0], res_val[31:0], res_pos);
+    for(i=0; i<`OUTPUT_SIZE; i=i+1) begin
+        x[i] <= -20;        // every element starts well below the values used for the first three
+    end
+    x[0] <= 0;
+    x[1] <= 0;
+    x[2] <= 0;
+    #10
+    x[0] <= 1;
+    x[1] <= 2;
+    x[2] <= 3;
+    #10
+    x[0] <= 10;
+    #10
+    x[1] <= 10;             // tie between elements 0 and 1
+    #10
+    x[2] <= 20;
+    #10
+    x[1] <= 20;
+    #10
+    x[0] <= 20;             // three-way tie
+    #10
+    x[0] <= -1;             // all negative: exercises the signed comparison
+    x[1] <= -2;
+    x[2] <= -3;
+    #10
+    x[0] <= 1;
+    #10 $finish;    // wait one more step so the last stimulus is applied and printed before the simulation ends
+end
+
+endmodule
+
diff --git a/verilog/rtl/network.v b/verilog/rtl/network.v
new file mode 100644
index 0000000..6fec60c
--- /dev/null
+++ b/verilog/rtl/network.v
@@ -0,0 +1,389 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2022 Tamas Hubai
+
+`default_nettype none
+
+// neural network composed of an input layer,
+// two fully connected hidden layers with a leaky ReLU activation function
+// and a fully connected output layer with an approximated softmax activation function
+
+// Top level of the network: three fully connected synapse meshes
+// (input -> hidden 1, hidden 1 -> hidden 2, hidden 2 -> output) interleaved
+// with three neuron layers.
+//
+// Ports:
+//   clk          clock of every synapse and neuron register
+//   fp / fp_out  forward-propagation strobe; it is registered once per
+//                synapse mesh and once per neuron layer (six stages in
+//                total) before it appears on fp_out
+//   a0_pk        packed input activations
+//   a3_pk        packed activations of the output layer
+//   bp / bp_out  backward-propagation strobe, travelling from the output
+//                layer back to the first synapse mesh
+//   g3_pk        packed ground truth; a3 - g3 is fed back as the output error
+//   wu           write enable for the weight selected by w_layer/w_i/w_j
+//   w_layer      0: input->hidden 1, 1: hidden 1->hidden 2, 2: hidden 2->output
+//   w_i, w_j     source / destination neuron index of the selected synapse
+//   w_in         weight written into the selected synapse while wu is high
+//   w_out        weight of the selected synapse (all zeros if the selection
+//                matches no synapse)
+module neural_network (
+    input clk,
+    input fp,
+    output fp_out,
+    input [`INPUT_SIZE*`NUM_WIDTH-1:0] a0_pk,
+    output [`OUTPUT_SIZE*`NUM_WIDTH-1:0] a3_pk,
+    input bp,
+    output bp_out,
+    input [`OUTPUT_SIZE*`NUM_WIDTH-1:0] g3_pk, // ground truth
+    input wu,
+    input [1:0] w_layer,
+    input [`INDEX_WIDTH-1:0] w_i,
+    input [`INDEX_WIDTH-1:0] w_j,
+    input [`NUM_WIDTH-1:0] w_in,
+    output [`NUM_WIDTH-1:0] w_out
+);
+
+// per-element views of the packed ports (PACK_ARRAY / UNPACK_ARRAY are
+// macros defined elsewhere)
+wire [`NUM_WIDTH-1:0] a0[`INPUT_SIZE-1:0];
+`UNPACK_ARRAY(`NUM_WIDTH, `INPUT_SIZE, a0, a0_pk)
+wire [`NUM_WIDTH-1:0] a3[`OUTPUT_SIZE-1:0];
+`PACK_ARRAY(`NUM_WIDTH, `OUTPUT_SIZE, a3, a3_pk)
+wire [`NUM_WIDTH-1:0] g3[`OUTPUT_SIZE-1:0];
+`UNPACK_ARRAY(`NUM_WIDTH, `OUTPUT_SIZE, g3, g3_pk)
+
+generate genvar g; genvar h;
+
+// synapses between input layer & hidden layer 1
+// (g = source neuron in the input layer, h = destination neuron in hidden 1)
+wire bp_s01;
+wire [`NUM_WIDTH-1:0] e1[`HIDDEN1_SIZE-1:0];
+for (g=0; g<`INPUT_SIZE; g=g+1) begin:g_syn01_o
+    for (h=0; h<`HIDDEN1_SIZE; h=h+1) begin:g_syn01_i
+        // these local fp_out/bp_out shadow the module ports of the same name
+        wire fp_out;
+        wire bp_out;
+        wire [`NUM_WIDTH-1:0] zc;
+        wire [`NUM_WIDTH-1:0] tc; // ignored for input layer
+        wire wu_sel = (w_layer == 0) && (w_i == g) && (w_j == h);
+        wire [`NUM_WIDTH-1:0] w_out_r;
+        synapse i_syn01 (
+            .clk,
+            .fp,
+            .fp_out,
+            .a(a0[g]),
+            .zc,
+            .bp(bp_s01),
+            .bp_out,
+            .e(e1[h]),
+            .tc,
+            .wu(wu && wu_sel),
+            .w_in(w_in),
+            .w_out(w_out_r)
+        );
+        // read-back mux built as an OR chain: only the selected synapse
+        // contributes a non-zero value
+        wire [`NUM_WIDTH-1:0] w_out_s = wu_sel ? w_out_r : {(`NUM_WIDTH){1'b0}};
+        wire [`NUM_WIDTH-1:0] w_out_a;
+        if (h==0) begin
+            assign w_out_a = w_out_s;
+        end else begin
+            assign w_out_a = g_syn01_i[h-1].w_out_a | w_out_s;
+        end
+    end
+    wire [`NUM_WIDTH-1:0] w_out_b;
+    if (g==0) begin
+        assign w_out_b = g_syn01_i[`HIDDEN1_SIZE-1].w_out_a;
+    end else begin
+        assign w_out_b = g_syn01_o[g-1].w_out_b | g_syn01_i[`HIDDEN1_SIZE-1].w_out_a;
+    end
+end
+wire [`NUM_WIDTH-1:0] w_out_c01 = g_syn01_o[`INPUT_SIZE-1].w_out_b;
+
+// z1[g]: saturating sum of the contributions zc of all synapses ending in
+// hidden-1 neuron g, accumulated as a linear adder chain over the inputs
+wire [`NUM_WIDTH-1:0] z1[`HIDDEN1_SIZE-1:0];
+for (g=0; g<`HIDDEN1_SIZE; g=g+1) begin:g_z1_o
+    for (h=0; h<`INPUT_SIZE; h=h+1) begin:g_z1_i
+        wire [`NUM_WIDTH-1:0] zc = g_syn01_o[h].g_syn01_i[g].zc;
+        wire [`NUM_WIDTH-1:0] z1s;
+        if (h==0) begin
+            assign z1s = zc;
+        end else begin
+            add_sat_comb i_add_z1 (
+                .a(g_z1_i[h-1].z1s),
+                .b(zc),
+                .res(z1s)
+            );
+        end
+    end
+    assign z1[g] = g_z1_i[`INPUT_SIZE-1].z1s;
+end
+
+// every synapse of the mesh registers the same strobes, so instance [0][0]
+// is used as the representative
+wire fp_h1 = g_syn01_o[0].g_syn01_i[0].fp_out;
+assign bp_out = g_syn01_o[0].g_syn01_i[0].bp_out;
+
+// hidden layer 1
+wire bp_h1;
+wire [`NUM_WIDTH-1:0] a1[`HIDDEN1_SIZE-1:0];
+wire [`NUM_WIDTH-1:0] t1[`HIDDEN1_SIZE-1:0];
+for (g=0; g<`HIDDEN1_SIZE; g=g+1) begin:g_layer1
+    wire fp_out;
+    wire bp_out;
+    wire [`NUM_WIDTH-1:0] to_act;
+    wire [`NUM_WIDTH-1:0] from_act;
+    wire [`NUM_WIDTH-1:0] from_act_diff;
+    neuron i_neu_1 (
+        .clk,
+        .fp(fp_h1),
+        .fp_out,
+        .z(z1[g]),
+        .a(a1[g]),
+        .bp(bp_h1),
+        .bp_out,
+        .t(t1[g]),
+        .e(e1[g]),
+        .to_act,
+        .from_act,
+        .from_act_diff
+    );
+    leaky_relu_comb i_act_1 (
+        .x(to_act),
+        .res(from_act)
+    );
+    leaky_relu_diff_comb i_act_diff_1 (
+        .x(to_act),
+        .res(from_act_diff)
+    );
+end
+
+wire fp_s12 = g_layer1[0].fp_out;
+assign bp_s01 = g_layer1[0].bp_out;
+
+// synapses between hidden layers 1 & 2
+// (g = source neuron in hidden 1, h = destination neuron in hidden 2)
+wire bp_s12;
+wire [`NUM_WIDTH-1:0] e2[`HIDDEN2_SIZE-1:0];
+for (g=0; g<`HIDDEN1_SIZE; g=g+1) begin:g_syn12_o
+    for (h=0; h<`HIDDEN2_SIZE; h=h+1) begin:g_syn12_i
+        wire fp_out;
+        wire bp_out;
+        wire [`NUM_WIDTH-1:0] zc;
+        wire [`NUM_WIDTH-1:0] tc;
+        wire wu_sel = (w_layer == 1) && (w_i == g) && (w_j == h);
+        wire [`NUM_WIDTH-1:0] w_out_r;
+        synapse i_syn12 (
+            .clk,
+            .fp(fp_s12),
+            .fp_out,
+            .a(a1[g]),
+            .zc,
+            .bp(bp_s12),
+            .bp_out,
+            .e(e2[h]),
+            .tc,
+            .wu(wu && wu_sel),
+            .w_in(w_in),
+            .w_out(w_out_r)
+        );
+        wire [`NUM_WIDTH-1:0] w_out_s = wu_sel ? w_out_r : {(`NUM_WIDTH){1'b0}};
+        wire [`NUM_WIDTH-1:0] w_out_a;
+        if (h==0) begin
+            assign w_out_a = w_out_s;
+        end else begin
+            assign w_out_a = g_syn12_i[h-1].w_out_a | w_out_s;
+        end
+    end
+    wire [`NUM_WIDTH-1:0] w_out_b;
+    if (g==0) begin
+        assign w_out_b = g_syn12_i[`HIDDEN2_SIZE-1].w_out_a;
+    end else begin
+        assign w_out_b = g_syn12_o[g-1].w_out_b | g_syn12_i[`HIDDEN2_SIZE-1].w_out_a;
+    end
+end
+wire [`NUM_WIDTH-1:0] w_out_c12 = g_syn12_o[`HIDDEN1_SIZE-1].w_out_b;
+
+// z2[g]: saturating sum of the forward contributions into hidden-2 neuron g
+wire [`NUM_WIDTH-1:0] z2[`HIDDEN2_SIZE-1:0];
+for (g=0; g<`HIDDEN2_SIZE; g=g+1) begin:g_z2_o
+    for (h=0; h<`HIDDEN1_SIZE; h=h+1) begin:g_z2_i
+        wire [`NUM_WIDTH-1:0] zc = g_syn12_o[h].g_syn12_i[g].zc;
+        wire [`NUM_WIDTH-1:0] z2s;
+        if (h==0) begin
+            assign z2s = zc;
+        end else begin
+            add_sat_comb i_add_z2 (
+                .a(g_z2_i[h-1].z2s),
+                .b(zc),
+                .res(z2s)
+            );
+        end
+    end
+    assign z2[g] = g_z2_i[`HIDDEN1_SIZE-1].z2s;
+end
+
+// t1[g]: saturating sum of the backward contributions tc of all synapses
+// leaving hidden-1 neuron g
+for (g=0; g<`HIDDEN1_SIZE; g=g+1) begin:g_t1_o
+    for(h=0; h<`HIDDEN2_SIZE; h=h+1) begin:g_t1_i
+        wire [`NUM_WIDTH-1:0] tc = g_syn12_o[g].g_syn12_i[h].tc;
+        wire [`NUM_WIDTH-1:0] t1s;
+        if (h==0) begin
+            assign t1s = tc;
+        end else begin
+            add_sat_comb i_add_t1 (
+                .a(g_t1_i[h-1].t1s),
+                .b(tc),
+                .res(t1s)
+            );
+        end
+    end
+    assign t1[g] = g_t1_i[`HIDDEN2_SIZE-1].t1s;
+end
+
+wire fp_h2 = g_syn12_o[0].g_syn12_i[0].fp_out;
+assign bp_h1 = g_syn12_o[0].g_syn12_i[0].bp_out;
+
+// hidden layer 2
+wire bp_h2;
+wire [`NUM_WIDTH-1:0] a2[`HIDDEN2_SIZE-1:0];
+wire [`NUM_WIDTH-1:0] t2[`HIDDEN2_SIZE-1:0];
+for (g=0; g<`HIDDEN2_SIZE; g=g+1) begin:g_layer2
+    wire fp_out;
+    wire bp_out;
+    wire [`NUM_WIDTH-1:0] to_act;
+    wire [`NUM_WIDTH-1:0] from_act;
+    wire [`NUM_WIDTH-1:0] from_act_diff;
+    neuron i_neu_2 (
+        .clk,
+        .fp(fp_h2),
+        .fp_out,
+        .z(z2[g]),
+        .a(a2[g]),
+        .bp(bp_h2),
+        .bp_out,
+        .t(t2[g]),
+        .e(e2[g]),
+        .to_act,
+        .from_act,
+        .from_act_diff
+    );
+    leaky_relu_comb i_act_2 (
+        .x(to_act),
+        .res(from_act)
+    );
+    leaky_relu_diff_comb i_act_diff_2 (
+        .x(to_act),
+        .res(from_act_diff)
+    );
+end
+
+wire fp_s23 = g_layer2[0].fp_out;
+assign bp_s12 = g_layer2[0].bp_out;
+
+// synapses between hidden layer 2 & output layer
+// (g = source neuron in hidden 2, h = destination neuron in the output layer)
+wire bp_s23;
+wire [`NUM_WIDTH-1:0] e3[`OUTPUT_SIZE-1:0];
+for (g=0; g<`HIDDEN2_SIZE; g=g+1) begin:g_syn23_o
+    for (h=0; h<`OUTPUT_SIZE; h=h+1) begin:g_syn23_i
+        wire fp_out;
+        wire bp_out;
+        wire [`NUM_WIDTH-1:0] zc;
+        wire [`NUM_WIDTH-1:0] tc;
+        wire wu_sel = (w_layer == 2) && (w_i == g) && (w_j == h);
+        wire [`NUM_WIDTH-1:0] w_out_r;
+        synapse i_syn23 (
+            .clk,
+            .fp(fp_s23),
+            .fp_out,
+            .a(a2[g]),
+            .zc,
+            .bp(bp_s23),
+            .bp_out,
+            .e(e3[h]),
+            .tc,
+            .wu(wu && wu_sel),
+            .w_in(w_in),
+            .w_out(w_out_r)
+        );
+        wire [`NUM_WIDTH-1:0] w_out_s = wu_sel ? w_out_r : {(`NUM_WIDTH){1'b0}};
+        wire [`NUM_WIDTH-1:0] w_out_a;
+        if (h==0) begin
+            assign w_out_a = w_out_s;
+        end else begin
+            assign w_out_a = g_syn23_i[h-1].w_out_a | w_out_s;
+        end
+    end
+    wire [`NUM_WIDTH-1:0] w_out_b;
+    if (g==0) begin
+        assign w_out_b = g_syn23_i[`OUTPUT_SIZE-1].w_out_a;
+    end else begin
+        assign w_out_b = g_syn23_o[g-1].w_out_b | g_syn23_i[`OUTPUT_SIZE-1].w_out_a;
+    end
+end
+wire [`NUM_WIDTH-1:0] w_out_c23 = g_syn23_o[`HIDDEN2_SIZE-1].w_out_b;
+
+// z3[g]: saturating sum of the forward contributions into output neuron g
+wire [`NUM_WIDTH-1:0] z3[`OUTPUT_SIZE-1:0];
+for (g=0; g<`OUTPUT_SIZE; g=g+1) begin:g_z3_o
+    for (h=0; h<`HIDDEN2_SIZE; h=h+1) begin:g_z3_i
+        wire [`NUM_WIDTH-1:0] zc = g_syn23_o[h].g_syn23_i[g].zc;
+        wire [`NUM_WIDTH-1:0] z3s;
+        if (h==0) begin
+            assign z3s = zc;
+        end else begin
+            add_sat_comb i_add_z3 (
+                .a(g_z3_i[h-1].z3s),
+                .b(zc),
+                .res(z3s)
+            );
+        end
+    end
+    assign z3[g] = g_z3_i[`HIDDEN2_SIZE-1].z3s;
+end
+
+// t2[g]: saturating sum of the backward contributions tc of all synapses
+// leaving hidden-2 neuron g
+for (g=0; g<`HIDDEN2_SIZE; g=g+1) begin:g_t2_o
+    for(h=0; h<`OUTPUT_SIZE; h=h+1) begin:g_t2_i
+        wire [`NUM_WIDTH-1:0] tc = g_syn23_o[g].g_syn23_i[h].tc;
+        wire [`NUM_WIDTH-1:0] t2s;
+        if (h==0) begin
+            assign t2s = tc;
+        end else begin
+            add_sat_comb i_add_t2 (
+                .a(g_t2_i[h-1].t2s),
+                .b(tc),
+                .res(t2s)
+            );
+        end
+    end
+    assign t2[g] = g_t2_i[`OUTPUT_SIZE-1].t2s;
+end
+
+wire fp_h3 = g_syn23_o[0].g_syn23_i[0].fp_out;
+assign bp_h2 = g_syn23_o[0].g_syn23_i[0].bp_out;
+
+// output layer
+// The activation needs all pre-activations at once, so the neurons exchange
+// their activation signals through the to_softmax / from_softmax arrays
+// instead of per-neuron wires.
+wire [`NUM_WIDTH-1:0] t3[`OUTPUT_SIZE-1:0];
+wire [`NUM_WIDTH-1:0] to_softmax[`OUTPUT_SIZE-1:0];
+wire [`NUM_WIDTH-1:0] from_softmax[`OUTPUT_SIZE-1:0];
+wire [`NUM_WIDTH-1:0] from_softmax_diff[`OUTPUT_SIZE-1:0];
+for (g=0; g<`OUTPUT_SIZE; g=g+1) begin:g_layer3
+    wire fp_out;
+    wire bp_out;
+    neuron i_neu_3 (
+        .clk,
+        .fp(fp_h3),
+        .fp_out,
+        .z(z3[g]),
+        .a(a3[g]),
+        .bp, // the module-level bp input starts backward propagation here
+        .bp_out,
+        .t(t3[g]),
+        .e(e3[g]),
+        .to_act(to_softmax[g]),
+        .from_act(from_softmax[g]),
+        .from_act_diff(from_softmax_diff[g])
+    );
+    // feedback using ground truth
+    sub_sat_comb i_sub_fb (
+        .a(a3[g]),
+        .b(g3[g]),
+        .res(t3[g])
+    );
+end
+
+wire [`OUTPUT_SIZE*`NUM_WIDTH-1:0] to_softmax_pk;
+`PACK_ARRAY(`NUM_WIDTH, `OUTPUT_SIZE, to_softmax, to_softmax_pk)
+wire [`OUTPUT_SIZE*`NUM_WIDTH-1:0] from_softmax_pk;
+`UNPACK_ARRAY(`NUM_WIDTH, `OUTPUT_SIZE, from_softmax, from_softmax_pk)
+wire [`OUTPUT_SIZE*`NUM_WIDTH-1:0] from_softmax_diff_pk;
+`UNPACK_ARRAY(`NUM_WIDTH, `OUTPUT_SIZE, from_softmax_diff, from_softmax_diff_pk)
+
+approx_softmax_comb i_act_3 (
+    .x_pk(to_softmax_pk),
+    .res_pk(from_softmax_pk)
+);
+approx_softmax_diff_comb i_act_diff_3 (
+    .x_pk(to_softmax_pk),
+    .res_pk(from_softmax_diff_pk)
+);
+
+assign fp_out = g_layer3[0].fp_out;
+assign bp_s23 = g_layer3[0].bp_out;
+// at most one of the three meshes matches w_layer, the others contribute 0
+assign w_out = w_out_c01 | w_out_c12 | w_out_c23;
+
+endgenerate
+
+endmodule
+
diff --git a/verilog/rtl/network_tb.v b/verilog/rtl/network_tb.v
new file mode 100644
index 0000000..09fc949
--- /dev/null
+++ b/verilog/rtl/network_tb.v
@@ -0,0 +1,124 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2022 Tamas Hubai
+
+`default_nettype none
+
+// testbench for the single neural_network module in network.v
+
+// Smoke test for neural_network: preloads every weight through hierarchical
+// references, runs one forward pass, one backward pass and one weight write,
+// and prints the strobes, output activation 4 and the selected weight.
+module neural_network_tb ();
+
+// DUT stimulus
+reg clk;
+reg fp;
+reg bp;
+reg wu;
+reg [1:0] w_layer;
+reg [`INDEX_WIDTH-1:0] w_i;
+reg [`INDEX_WIDTH-1:0] w_j;
+reg [`NUM_WIDTH-1:0] w_in;
+reg [`NUM_WIDTH-1:0] a0[`INPUT_SIZE-1:0];
+reg [`NUM_WIDTH-1:0] g3[`OUTPUT_SIZE-1:0];
+
+// DUT responses
+wire fp_out;
+wire bp_out;
+wire [`NUM_WIDTH-1:0] w_out;
+wire [`NUM_WIDTH-1:0] a3[`OUTPUT_SIZE-1:0];
+
+// packed buses on the DUT boundary
+wire [`INPUT_SIZE*`NUM_WIDTH-1:0] a0_pk;
+`PACK_ARRAY(`NUM_WIDTH, `INPUT_SIZE, a0, a0_pk)
+wire [`OUTPUT_SIZE*`NUM_WIDTH-1:0] a3_pk;
+`UNPACK_ARRAY(`NUM_WIDTH, `OUTPUT_SIZE, a3, a3_pk)
+wire [`OUTPUT_SIZE*`NUM_WIDTH-1:0] g3_pk;
+`PACK_ARRAY(`NUM_WIDTH, `OUTPUT_SIZE, g3, g3_pk)
+
+neural_network dut (
+    .clk(clk),
+    .fp(fp),
+    .fp_out(fp_out),
+    .a0_pk(a0_pk),
+    .a3_pk(a3_pk),
+    .bp(bp),
+    .bp_out(bp_out),
+    .g3_pk(g3_pk),
+    .wu(wu),
+    .w_layer(w_layer),
+    .w_i(w_i),
+    .w_j(w_j),
+    .w_in(w_in),
+    .w_out(w_out)
+);
+
+// the output element shown by $monitor
+wire [`NUM_WIDTH-1:0] a_test = a3[4];
+
+generate genvar i; genvar j;
+
+// initial weights of the three synapse meshes: a base value plus small
+// offsets depending on the source (i) and destination (j) index
+for (i=0; i<`INPUT_SIZE; i=i+1) begin
+    for (j=0; j<`HIDDEN1_SIZE; j=j+1) begin
+        initial begin
+            dut.g_syn01_o[i].g_syn01_i[j].i_syn01.w <= (1 << 22) + (i << 10) + (j << 12);
+        end
+    end
+end
+for (i=0; i<`HIDDEN1_SIZE; i=i+1) begin
+    for (j=0; j<`HIDDEN2_SIZE; j=j+1) begin
+        initial begin
+            dut.g_syn12_o[i].g_syn12_i[j].i_syn12.w <= (1 << 22) + (i << 10) + (j << 12);
+        end
+    end
+end
+for (i=0; i<`HIDDEN2_SIZE; i=i+1) begin
+    for (j=0; j<`OUTPUT_SIZE; j=j+1) begin
+        initial begin
+            dut.g_syn23_o[i].g_syn23_i[j].i_syn23.w <= (1 << 22) + (i << 10) + (j << 12);
+        end
+    end
+end
+
+// input pattern: every fourth input is set to 1 << 24, the rest to 0
+for (i=0; i<`INPUT_SIZE; i=i+1) begin
+    initial begin
+        a0[i] <= (i % 4 == 0) << 24;
+    end
+end
+
+// ground truth: only element 4 is set to 1 << 24
+for (i=0; i<`OUTPUT_SIZE; i=i+1) begin
+    initial begin
+        g3[i] <= (i == 4) << 24;
+    end
+end
+
+endgenerate
+
+initial begin
+    $monitor("time %4t fp %1b fp_out %1b a3[4] %24b bp %1b bp_out %1b w1[1][2] %24b", $time, fp, fp_out, a_test[23:0], bp, bp_out, w_out[23:0]);
+    clk <= 0;
+    fp <= 0;
+    bp <= 0;
+    wu <= 0;
+    // select synapse (1, 2) of the first mesh for read-back and the write
+    w_layer <= 0;
+    w_i <= 1;
+    w_j <= 2;
+    #5 clk<=1; #5 clk<=0;
+
+    // forward pass: one-cycle strobe, then six cycles for it to travel
+    // through the network
+    fp <= 1;
+    #5 clk<=1; #5 clk<=0;
+    fp <= 0;
+    repeat (6) begin
+        #5 clk<=1; #5 clk<=0;
+    end
+
+    // backward pass: same pattern in the opposite direction
+    bp <= 1;
+    #5 clk<=1; #5 clk<=0;
+    bp <= 0;
+    repeat (6) begin
+        #5 clk<=1; #5 clk<=0;
+    end
+
+    // overwrite the selected weight
+    wu <= 1;
+    w_in <= 24'b111100001100110010101010;
+    #5 clk<=1; #5 clk<=0;
+    wu <= 0;
+    #5 clk<=1; #5 clk<=0;
+    $finish;
+end
+
+endmodule
+
diff --git a/verilog/rtl/neuron.v b/verilog/rtl/neuron.v
new file mode 100644
index 0000000..b84feb2
--- /dev/null
+++ b/verilog/rtl/neuron.v
@@ -0,0 +1,116 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2022 Tamas Hubai
+
+`default_nettype none
+
+// synapse and neuron primitives for building up the neural network layers
+
+
+// synapse is an edge between two neurons with two-way propagation
+// and an updatable weight
+
+// Weighted edge between two neurons.
+//   fp : zc <= a * w                      (forward contribution)
+//   bp : tc <= e * w                      (backward contribution, old weight)
+//        w  <= w - ((a * e) >>> `LEARN_SHIFT)
+//   wu : w  <= w_in                       (takes priority over the bp update)
+// fp_out / bp_out are the strobes delayed by one clock. All arithmetic is
+// done by the saturating helper modules.
+module synapse (
+    input clk,
+    input fp, // forward propagation
+    output reg fp_out,
+    input [`NUM_WIDTH-1:0] a,
+    output reg [`NUM_WIDTH-1:0] zc,
+    input bp, // backward propagation
+    output reg bp_out,
+    input [`NUM_WIDTH-1:0] e,
+    output reg [`NUM_WIDTH-1:0] tc,
+    input wu, // weight update
+    input [`NUM_WIDTH-1:0] w_in,
+    output [`NUM_WIDTH-1:0] w_out
+);
+
+// stored weight; testbenches preload it through a hierarchical reference,
+// so the name must stay `w`
+reg [`NUM_WIDTH-1:0] w;
+
+wire [`NUM_WIDTH-1:0] fwd_product;   // a * w
+wire [`NUM_WIDTH-1:0] bwd_product;   // e * w
+wire [`NUM_WIDTH-1:0] gradient;      // a * e
+wire [`NUM_WIDTH-1:0] trained_w;     // w - scaled gradient
+
+// gradient scaled down by the learning rate (arithmetic shift keeps the sign)
+wire [`NUM_WIDTH-1:0] gradient_scaled = $signed(gradient) >>> `LEARN_SHIFT;
+
+mul_sat_comb i_mul_z (
+    .a(a),
+    .b(w),
+    .res(fwd_product)
+);
+
+mul_sat_comb i_mul_t (
+    .a(e),
+    .b(w),
+    .res(bwd_product)
+);
+
+mul_sat_comb i_mul_c (
+    .a(a),
+    .b(e),
+    .res(gradient)
+);
+
+sub_sat_comb i_sub_w (
+    .a(w),
+    .b(gradient_scaled),
+    .res(trained_w)
+);
+
+assign w_out = w;
+
+// strobe pipeline
+always @(posedge clk) begin
+    fp_out <= fp;
+    bp_out <= bp;
+end
+
+// propagated values
+always @(posedge clk) begin
+    if (fp) begin
+        zc <= fwd_product;
+    end
+    if (bp) begin
+        tc <= bwd_product;
+    end
+end
+
+// weight register: an external write wins over the training update
+always @(posedge clk) begin
+    if (wu) begin
+        w <= w_in;
+    end else if (bp) begin
+        w <= trained_w;
+    end
+end
+
+endmodule
+
+
+// generic neuron with two-way propagation that needs to be connected to
+// the respective activation function and its derivative to make
+// either a ReLU or a softmax neuron
+
+// Neuron shell without a built-in activation function.
+// The pre-activation z is passed out unchanged on to_act; the surrounding
+// logic must return the activation on from_act and its derivative on
+// from_act_diff.
+//   fp : a <= from_act
+//   bp : e <= t * from_act_diff   (saturating multiply)
+// fp_out / bp_out are the strobes delayed by one clock.
+module neuron (
+    input clk,
+    input fp, // forward propagation
+    output reg fp_out,
+    input [`NUM_WIDTH-1:0] z,
+    output reg [`NUM_WIDTH-1:0] a,
+    input bp, // backward propagation
+    output reg bp_out,
+    input [`NUM_WIDTH-1:0] t,
+    output reg [`NUM_WIDTH-1:0] e,
+    output [`NUM_WIDTH-1:0] to_act, // to activation function
+    input [`NUM_WIDTH-1:0] from_act,
+    input [`NUM_WIDTH-1:0] from_act_diff
+);
+
+// error term that is captured on bp
+wire [`NUM_WIDTH-1:0] err_next;
+
+mul_sat_comb i_mul_e (
+    .a(t),
+    .b(from_act_diff),
+    .res(err_next)
+);
+
+assign to_act = z;
+
+// strobe pipeline
+always @(posedge clk) begin
+    fp_out <= fp;
+    bp_out <= bp;
+end
+
+// captured activation and error
+always @(posedge clk) begin
+    if (fp) begin
+        a <= from_act;
+    end
+    if (bp) begin
+        e <= err_next;
+    end
+end
+
+endmodule
+
diff --git a/verilog/rtl/neuron_tb.v b/verilog/rtl/neuron_tb.v
new file mode 100644
index 0000000..620669a
--- /dev/null
+++ b/verilog/rtl/neuron_tb.v
@@ -0,0 +1,137 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2022 Tamas Hubai
+
+`default_nettype none
+
+// testbenches for neuron.v
+
+
+// synapse is an edge between two neurons with two-way propagation
+// and an updatable weight
+
+// Testbench for synapse: writes a weight, runs one forward step and three
+// backward (training) steps, printing all ports through $monitor.
+module synapse_tb ();
+
+// stimulus
+reg clk;
+reg fp;
+reg bp;
+reg wu;
+reg [`NUM_WIDTH-1:0] a;
+reg [`NUM_WIDTH-1:0] e;
+reg [`NUM_WIDTH-1:0] w_in;
+
+// responses
+wire fp_out;
+wire bp_out;
+wire [`NUM_WIDTH-1:0] zc;
+wire [`NUM_WIDTH-1:0] tc;
+wire [`NUM_WIDTH-1:0] w_out;
+
+synapse dut (
+    .clk(clk),
+    .fp(fp),
+    .fp_out(fp_out),
+    .a(a),
+    .zc(zc),
+    .bp(bp),
+    .bp_out(bp_out),
+    .e(e),
+    .tc(tc),
+    .wu(wu),
+    .w_in(w_in),
+    .w_out(w_out)
+);
+
+// one full clock period: rising edge after 5 time units, falling after 10
+task tick;
+    begin
+        #5 clk<=1;
+        #5 clk<=0;
+    end
+endtask
+
+initial begin
+    $monitor("time %4t fp %1b fp_out %1b a %16b zc %16b bp %1b bp_out %1b e %16b tc %16b wu %1b w_in %16b w_out %16b", $time, fp, fp_out, a[31:16], zc[31:16], bp, bp_out, e[31:16], tc[31:16], wu, w_in[31:16], w_out[31:16]);
+    clk <= 0;
+    fp <= 0;
+    bp <= 0;
+    wu <= 0;
+    tick;
+
+    // write weight 3
+    wu <= 1;
+    w_in <= 3 << `FRAC_WIDTH;
+    tick;
+
+    // w_in changes while wu is low: the weight must keep its value
+    wu <= 0;
+    w_in <= 5 << `FRAC_WIDTH;
+    tick;
+
+    // forward step with a = 7
+    a <= 7 << `FRAC_WIDTH;
+    tick;
+    fp <= 1;
+    tick;
+
+    // a changes while fp is low: zc must keep its value
+    fp <= 0;
+    a <= 5 << `FRAC_WIDTH;
+    tick;
+
+    // three consecutive backward steps with e = 7
+    e <= 7 << `FRAC_WIDTH;
+    tick;
+    bp <= 1;
+    repeat (3) tick;
+
+    bp <= 0;
+    tick;
+    $finish;
+end
+
+endmodule
+
+
+// generic neuron with two-way propagation that needs to be connected to
+// the respective activation function and its derivative to make
+// either a ReLU or a softmax neuron
+
+// Testbench for neuron: uses simple multiplications as stand-ins for the
+// activation function and its derivative, runs one forward and one backward
+// step and prints all ports through $monitor.
+module neuron_tb ();
+
+// stimulus
+reg clk;
+reg fp;
+reg bp;
+reg [`NUM_WIDTH-1:0] z;
+reg [`NUM_WIDTH-1:0] t;
+
+// responses
+wire fp_out;
+wire bp_out;
+wire [`NUM_WIDTH-1:0] a;
+wire [`NUM_WIDTH-1:0] e;
+wire [`NUM_WIDTH-1:0] to_act;
+
+// placeholder "activation" and "derivative" fed back into the neuron
+wire [`NUM_WIDTH-1:0] from_act = to_act * 9;
+wire [`NUM_WIDTH-1:0] from_act_diff = to_act * 17;
+
+neuron dut (
+    .clk(clk),
+    .fp(fp),
+    .fp_out(fp_out),
+    .z(z),
+    .a(a),
+    .bp(bp),
+    .bp_out(bp_out),
+    .t(t),
+    .e(e),
+    .to_act(to_act),
+    .from_act(from_act),
+    .from_act_diff(from_act_diff)
+);
+
+// one full clock period: rising edge after 5 time units, falling after 10
+task tick;
+    begin
+        #5 clk<=1;
+        #5 clk<=0;
+    end
+endtask
+
+initial begin
+    $monitor("time %4t fp %1b fp_out %1b z %16b a %16b bp %1b bp_out %1b t %16b e %16b ta %16b fa %16b fad %16b", $time, fp, fp_out, z[35:20], a[35:20], bp, bp_out, t[35:20], e[35:20], to_act[35:20], from_act[35:20], from_act_diff[35:20]);
+    clk <= 0;
+    fp <= 0;
+    bp <= 0;
+    tick;
+
+    // forward step with z = 3
+    z <= 3 << `FRAC_WIDTH;
+    tick;
+    fp <= 1;
+    tick;
+
+    // z changes while fp is low: a must keep its value
+    fp <= 0;
+    z <= 5 << `FRAC_WIDTH;
+    tick;
+
+    // backward step with t = 7
+    t <= 7 << `FRAC_WIDTH;
+    tick;
+    bp <= 1;
+    tick;
+
+    // t changes while bp is low: e must keep its value
+    bp <= 0;
+    t <= 9 << `FRAC_WIDTH;
+    tick;
+    $finish;
+end
+
+endmodule
+
diff --git a/verilog/rtl/trainable_nn.v b/verilog/rtl/trainable_nn.v
new file mode 100644
index 0000000..367a04e
--- /dev/null
+++ b/verilog/rtl/trainable_nn.v
@@ -0,0 +1,70 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2022 Tamas Hubai
+
+`default_nettype none
+
+// User-project top level: connects neural_interface to the Wishbone slave
+// port and to the logic analyzer (LA) probes.
+//
+//   clock / reset  la_data_in[0] / la_data_in[1] when the matching la_oenb
+//                  bit is low, otherwise wb_clk_i / wb_rst_i
+//   address        la_data_in[31:8] when all of la_oenb[31:8] are low,
+//                  otherwise wbs_adr_i[23:0]
+//   write data     la_data_in[95:32] when both the LA address and the LA
+//                  data probes are driven, otherwise wbs_dat_i placed in
+//                  bits [43:12] of the 64-bit word (zero elsewhere)
+//   read data      data_out on la_data_out[95:32]; data_out[43:12] on
+//                  wbs_dat_o
+//   io_out/io_oeb  tied to 0, irq unused
+module trainable_nn (
+`ifdef USE_POWER_PINS
+    inout vccd1,	// User area 1 1.8V supply
+    inout vssd1,	// User area 1 digital ground
+`endif
+
+    // Wishbone Slave ports (WB MI A)
+    input wb_clk_i,
+    input wb_rst_i,
+    input wbs_stb_i,
+    input wbs_cyc_i,
+    input wbs_we_i,
+    input [3:0] wbs_sel_i,
+    input [31:0] wbs_dat_i,
+    input [31:0] wbs_adr_i,
+    // registered in the always block below, so it has to be a reg:
+    // a procedural assignment to a plain net output is illegal
+    output reg wbs_ack_o,
+    output [31:0] wbs_dat_o,
+
+    // Logic Analyzer Signals
+    input  [127:0] la_data_in,
+    output [127:0] la_data_out,
+    input  [127:0] la_oenb,
+
+    // IOs
+    input  [`MPRJ_IO_PADS-1:0] io_in,
+    output [`MPRJ_IO_PADS-1:0] io_out,
+    output [`MPRJ_IO_PADS-1:0] io_oeb,
+
+    // IRQ
+    output [2:0] irq
+);
+
+    assign io_out = {(`MPRJ_IO_PADS){1'b0}};
+    assign io_oeb = {(`MPRJ_IO_PADS){1'b0}};
+
+    assign irq = 3'b000;	// Unused
+
+    // clock and reset can be taken over by the logic analyzer
+    wire clk = (~la_oenb[0]) ? la_data_in[0]: wb_clk_i;
+    wire rst = (~la_oenb[1]) ? la_data_in[1]: wb_rst_i;
+
+    // a Wishbone cycle is active / the LA drives the whole address or data field
+    wire use_wbs = wbs_cyc_i & wbs_stb_i;
+    wire use_la_addr = &~la_oenb[31:8];
+    wire use_la_data_in = &~la_oenb[95:32];
+
+    wire [23:0] addr = use_la_addr ? la_data_in[31:8] : wbs_adr_i[23:0];
+    // the 32-bit Wishbone word occupies bits [43:12] of the 64-bit data word
+    wire [63:0] data_in = (use_la_addr & use_la_data_in) ? la_data_in[95:32] : {20'b0, wbs_dat_i, 12'b0};
+    // write whenever the LA drives both address and data, or on a Wishbone write
+    wire we = (use_la_addr & use_la_data_in) | (use_wbs & wbs_we_i);
+    wire [63:0] data_out;
+    assign la_data_out = {32'b0, data_out, 32'b0};
+    assign wbs_dat_o = data_out[43:12];
+
+    neural_interface i_ni (
+        .clk,
+        .addr,
+        .data_in,
+        .we,
+        .data_out
+    );
+
+    // acknowledge every Wishbone access one clock later, except during reset
+    always @(posedge clk) begin
+        wbs_ack_o <= (~rst) & use_wbs;
+    end
+
+endmodule
+
diff --git a/verilog/rtl/uprj_netlists.v b/verilog/rtl/uprj_netlists.v
index 3537de8..06a0b3a 100644
--- a/verilog/rtl/uprj_netlists.v
+++ b/verilog/rtl/uprj_netlists.v
@@ -21,8 +21,8 @@
     // Assume default net type to be wire because GL netlists don't have the wire definitions
     `default_nettype wire
     `include "gl/user_project_wrapper.v"
-    `include "gl/user_proj_example.v"
+    `include "gl/trainable_nn.v"
 `else
     `include "user_project_wrapper.v"
-    `include "user_proj_example.v"
+    `include "trainable_nn.v"
 `endif
\ No newline at end of file
diff --git a/verilog/rtl/user_defines.v b/verilog/rtl/user_defines.v
index ee44b08..66bc63d 100644
--- a/verilog/rtl/user_defines.v
+++ b/verilog/rtl/user_defines.v
@@ -52,41 +52,41 @@
 // up in a state that can be used immediately without depending on
 // the management SoC to run a startup program to configure the GPIOs.
 
-`define USER_CONFIG_GPIO_5_INIT  `GPIO_MODE_INVALID
-`define USER_CONFIG_GPIO_6_INIT  `GPIO_MODE_INVALID
-`define USER_CONFIG_GPIO_7_INIT  `GPIO_MODE_INVALID
-`define USER_CONFIG_GPIO_8_INIT  `GPIO_MODE_INVALID
-`define USER_CONFIG_GPIO_9_INIT  `GPIO_MODE_INVALID
-`define USER_CONFIG_GPIO_10_INIT `GPIO_MODE_INVALID
-`define USER_CONFIG_GPIO_11_INIT `GPIO_MODE_INVALID
-`define USER_CONFIG_GPIO_12_INIT `GPIO_MODE_INVALID
-`define USER_CONFIG_GPIO_13_INIT `GPIO_MODE_INVALID
+`define USER_CONFIG_GPIO_5_INIT  `GPIO_MODE_MGMT_STD_OUTPUT
+`define USER_CONFIG_GPIO_6_INIT  `GPIO_MODE_MGMT_STD_OUTPUT
+`define USER_CONFIG_GPIO_7_INIT  `GPIO_MODE_MGMT_STD_OUTPUT
+`define USER_CONFIG_GPIO_8_INIT  `GPIO_MODE_MGMT_STD_OUTPUT
+`define USER_CONFIG_GPIO_9_INIT  `GPIO_MODE_MGMT_STD_OUTPUT
+`define USER_CONFIG_GPIO_10_INIT `GPIO_MODE_MGMT_STD_OUTPUT
+`define USER_CONFIG_GPIO_11_INIT `GPIO_MODE_MGMT_STD_OUTPUT
+`define USER_CONFIG_GPIO_12_INIT `GPIO_MODE_MGMT_STD_OUTPUT
+`define USER_CONFIG_GPIO_13_INIT `GPIO_MODE_MGMT_STD_OUTPUT
 
 // Configurations of GPIO 14 to 24 are used on caravel but not caravan.
-`define USER_CONFIG_GPIO_14_INIT `GPIO_MODE_INVALID
-`define USER_CONFIG_GPIO_15_INIT `GPIO_MODE_INVALID
-`define USER_CONFIG_GPIO_16_INIT `GPIO_MODE_INVALID
-`define USER_CONFIG_GPIO_17_INIT `GPIO_MODE_INVALID
-`define USER_CONFIG_GPIO_18_INIT `GPIO_MODE_INVALID
-`define USER_CONFIG_GPIO_19_INIT `GPIO_MODE_INVALID
-`define USER_CONFIG_GPIO_20_INIT `GPIO_MODE_INVALID
-`define USER_CONFIG_GPIO_21_INIT `GPIO_MODE_INVALID
-`define USER_CONFIG_GPIO_22_INIT `GPIO_MODE_INVALID
-`define USER_CONFIG_GPIO_23_INIT `GPIO_MODE_INVALID
-`define USER_CONFIG_GPIO_24_INIT `GPIO_MODE_INVALID
+`define USER_CONFIG_GPIO_14_INIT `GPIO_MODE_MGMT_STD_OUTPUT
+`define USER_CONFIG_GPIO_15_INIT `GPIO_MODE_MGMT_STD_OUTPUT
+`define USER_CONFIG_GPIO_16_INIT `GPIO_MODE_MGMT_STD_OUTPUT
+`define USER_CONFIG_GPIO_17_INIT `GPIO_MODE_MGMT_STD_OUTPUT
+`define USER_CONFIG_GPIO_18_INIT `GPIO_MODE_MGMT_STD_OUTPUT
+`define USER_CONFIG_GPIO_19_INIT `GPIO_MODE_MGMT_STD_OUTPUT
+`define USER_CONFIG_GPIO_20_INIT `GPIO_MODE_MGMT_STD_OUTPUT
+`define USER_CONFIG_GPIO_21_INIT `GPIO_MODE_MGMT_STD_OUTPUT
+`define USER_CONFIG_GPIO_22_INIT `GPIO_MODE_MGMT_STD_OUTPUT
+`define USER_CONFIG_GPIO_23_INIT `GPIO_MODE_MGMT_STD_OUTPUT
+`define USER_CONFIG_GPIO_24_INIT `GPIO_MODE_MGMT_STD_OUTPUT
 
-`define USER_CONFIG_GPIO_25_INIT `GPIO_MODE_INVALID
-`define USER_CONFIG_GPIO_26_INIT `GPIO_MODE_INVALID
-`define USER_CONFIG_GPIO_27_INIT `GPIO_MODE_INVALID
-`define USER_CONFIG_GPIO_28_INIT `GPIO_MODE_INVALID
-`define USER_CONFIG_GPIO_29_INIT `GPIO_MODE_INVALID
-`define USER_CONFIG_GPIO_30_INIT `GPIO_MODE_INVALID
-`define USER_CONFIG_GPIO_31_INIT `GPIO_MODE_INVALID
-`define USER_CONFIG_GPIO_32_INIT `GPIO_MODE_INVALID
-`define USER_CONFIG_GPIO_33_INIT `GPIO_MODE_INVALID
-`define USER_CONFIG_GPIO_34_INIT `GPIO_MODE_INVALID
-`define USER_CONFIG_GPIO_35_INIT `GPIO_MODE_INVALID
-`define USER_CONFIG_GPIO_36_INIT `GPIO_MODE_INVALID
-`define USER_CONFIG_GPIO_37_INIT `GPIO_MODE_INVALID
+`define USER_CONFIG_GPIO_25_INIT `GPIO_MODE_MGMT_STD_OUTPUT
+`define USER_CONFIG_GPIO_26_INIT `GPIO_MODE_MGMT_STD_OUTPUT
+`define USER_CONFIG_GPIO_27_INIT `GPIO_MODE_MGMT_STD_OUTPUT
+`define USER_CONFIG_GPIO_28_INIT `GPIO_MODE_MGMT_STD_OUTPUT
+`define USER_CONFIG_GPIO_29_INIT `GPIO_MODE_MGMT_STD_OUTPUT
+`define USER_CONFIG_GPIO_30_INIT `GPIO_MODE_MGMT_STD_OUTPUT
+`define USER_CONFIG_GPIO_31_INIT `GPIO_MODE_MGMT_STD_OUTPUT
+`define USER_CONFIG_GPIO_32_INIT `GPIO_MODE_MGMT_STD_OUTPUT
+`define USER_CONFIG_GPIO_33_INIT `GPIO_MODE_MGMT_STD_OUTPUT
+`define USER_CONFIG_GPIO_34_INIT `GPIO_MODE_MGMT_STD_OUTPUT
+`define USER_CONFIG_GPIO_35_INIT `GPIO_MODE_MGMT_STD_OUTPUT
+`define USER_CONFIG_GPIO_36_INIT `GPIO_MODE_MGMT_STD_OUTPUT
+`define USER_CONFIG_GPIO_37_INIT `GPIO_MODE_MGMT_STD_OUTPUT
 
 `endif // __USER_DEFINES_H
diff --git a/verilog/rtl/user_proj_example.v b/verilog/rtl/user_proj_example.v
deleted file mode 100644
index 26081e9..0000000
--- a/verilog/rtl/user_proj_example.v
+++ /dev/null
@@ -1,165 +0,0 @@
-// SPDX-FileCopyrightText: 2020 Efabless Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-// SPDX-License-Identifier: Apache-2.0
-
-`default_nettype none
-/*
- *-------------------------------------------------------------
- *
- * user_proj_example
- *
- * This is an example of a (trivially simple) user project,
- * showing how the user project can connect to the logic
- * analyzer, the wishbone bus, and the I/O pads.
- *
- * This project generates an integer count, which is output
- * on the user area GPIO pads (digital output only).  The
- * wishbone connection allows the project to be controlled
- * (start and stop) from the management SoC program.
- *
- * See the testbenches in directory "mprj_counter" for the
- * example programs that drive this user project.  The three
- * testbenches are "io_ports", "la_test1", and "la_test2".
- *
- *-------------------------------------------------------------
- */
-
-module user_proj_example #(
-    parameter BITS = 32
-)(
-`ifdef USE_POWER_PINS
-    inout vccd1,	// User area 1 1.8V supply
-    inout vssd1,	// User area 1 digital ground
-`endif
-
-    // Wishbone Slave ports (WB MI A)
-    input wb_clk_i,
-    input wb_rst_i,
-    input wbs_stb_i,
-    input wbs_cyc_i,
-    input wbs_we_i,
-    input [3:0] wbs_sel_i,
-    input [31:0] wbs_dat_i,
-    input [31:0] wbs_adr_i,
-    output wbs_ack_o,
-    output [31:0] wbs_dat_o,
-
-    // Logic Analyzer Signals
-    input  [127:0] la_data_in,
-    output [127:0] la_data_out,
-    input  [127:0] la_oenb,
-
-    // IOs
-    input  [`MPRJ_IO_PADS-1:0] io_in,
-    output [`MPRJ_IO_PADS-1:0] io_out,
-    output [`MPRJ_IO_PADS-1:0] io_oeb,
-
-    // IRQ
-    output [2:0] irq
-);
-    wire clk;
-    wire rst;
-
-    wire [`MPRJ_IO_PADS-1:0] io_in;
-    wire [`MPRJ_IO_PADS-1:0] io_out;
-    wire [`MPRJ_IO_PADS-1:0] io_oeb;
-
-    wire [31:0] rdata; 
-    wire [31:0] wdata;
-    wire [BITS-1:0] count;
-
-    wire valid;
-    wire [3:0] wstrb;
-    wire [31:0] la_write;
-
-    // WB MI A
-    assign valid = wbs_cyc_i && wbs_stb_i; 
-    assign wstrb = wbs_sel_i & {4{wbs_we_i}};
-    assign wbs_dat_o = rdata;
-    assign wdata = wbs_dat_i;
-
-    // IO
-    assign io_out = count;
-    assign io_oeb = {(`MPRJ_IO_PADS-1){rst}};
-
-    // IRQ
-    assign irq = 3'b000;	// Unused
-
-    // LA
-    assign la_data_out = {{(127-BITS){1'b0}}, count};
-    // Assuming LA probes [63:32] are for controlling the count register  
-    assign la_write = ~la_oenb[63:32] & ~{BITS{valid}};
-    // Assuming LA probes [65:64] are for controlling the count clk & reset  
-    assign clk = (~la_oenb[64]) ? la_data_in[64]: wb_clk_i;
-    assign rst = (~la_oenb[65]) ? la_data_in[65]: wb_rst_i;
-
-    counter #(
-        .BITS(BITS)
-    ) counter(
-        .clk(clk),
-        .reset(rst),
-        .ready(wbs_ack_o),
-        .valid(valid),
-        .rdata(rdata),
-        .wdata(wbs_dat_i),
-        .wstrb(wstrb),
-        .la_write(la_write),
-        .la_input(la_data_in[63:32]),
-        .count(count)
-    );
-
-endmodule
-
-module counter #(
-    parameter BITS = 32
-)(
-    input clk,
-    input reset,
-    input valid,
-    input [3:0] wstrb,
-    input [BITS-1:0] wdata,
-    input [BITS-1:0] la_write,
-    input [BITS-1:0] la_input,
-    output ready,
-    output [BITS-1:0] rdata,
-    output [BITS-1:0] count
-);
-    reg ready;
-    reg [BITS-1:0] count;
-    reg [BITS-1:0] rdata;
-
-    always @(posedge clk) begin
-        if (reset) begin
-            count <= 0;
-            ready <= 0;
-        end else begin
-            ready <= 1'b0;
-            if (~|la_write) begin
-                count <= count + 1;
-            end
-            if (valid && !ready) begin
-                ready <= 1'b1;
-                rdata <= count;
-                if (wstrb[0]) count[7:0]   <= wdata[7:0];
-                if (wstrb[1]) count[15:8]  <= wdata[15:8];
-                if (wstrb[2]) count[23:16] <= wdata[23:16];
-                if (wstrb[3]) count[31:24] <= wdata[31:24];
-            end else if (|la_write) begin
-                count <= la_write & la_input;
-            end
-        end
-    end
-
-endmodule
-`default_nettype wire
diff --git a/verilog/rtl/user_project_wrapper.v b/verilog/rtl/user_project_wrapper.v
index 5ee1cee..a54beaa 100644
--- a/verilog/rtl/user_project_wrapper.v
+++ b/verilog/rtl/user_project_wrapper.v
@@ -82,7 +82,7 @@
 /* User project is instantiated  here   */
 /*--------------------------------------*/
 
-user_proj_example mprj (
+trainable_nn mprj (
 `ifdef USE_POWER_PINS
 	.vccd1(vccd1),	// User area 1 1.8V power
 	.vssd1(vssd1),	// User area 1 digital ground