Add verilog sources
diff --git a/verilog/rtl/actfn.v b/verilog/rtl/actfn.v
new file mode 100644
index 0000000..f7e5214
--- /dev/null
+++ b/verilog/rtl/actfn.v
@@ -0,0 +1,223 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2022 Tamas Hubai
+
+`default_nettype none
+
+// activation functions used in the neural network
+
+
+// leaky ReLU
+
+module leaky_relu_comb (
+ input [`NUM_WIDTH-1:0] x,
+ output [`NUM_WIDTH-1:0] res
+);
+
+// res = x                   for x >= 0
+// res = x * 2^-LEAK_SHIFT   for x <  0
+
+// negative branch: arithmetic right shift, the vacated top bits are
+// filled with copies of the sign bit
+wire signed [`NUM_WIDTH-1:0] x_leaked = $signed(x) >>> `LEAK_SHIFT;
+
+wire x_is_negative = x[`NUM_WIDTH-1];
+assign res = x_is_negative ? x_leaked : x;
+
+endmodule
+
+
+// derivative of leaky ReLU
+
+module leaky_relu_diff_comb (
+ input [`NUM_WIDTH-1:0] x,
+ output [`NUM_WIDTH-1:0] res
+);
+
+// slope of the positive part: 1.0 in fixed point (single bit at FRAC_WIDTH)
+wire [`NUM_WIDTH-1:0] slope_pos = {{(`NUM_WIDTH-1){1'b0}}, 1'b1} << `FRAC_WIDTH;
+// slope of the negative part: 2^-LEAK_SHIFT
+wire [`NUM_WIDTH-1:0] slope_neg = slope_pos >> `LEAK_SHIFT;
+
+// the sign bit of x selects the slope
+assign res = x[`NUM_WIDTH-1] ? slope_neg : slope_pos;
+
+endmodule
+
+
+// very rough approximation of 2^x, used in softmax
+
+module approx_exp_comb (
+ input [`NUM_WIDTH-1:0] x,
+ output [`NUM_WIDTH-1:0] res
+);
+
+// Only the integer part of x is used (the fractional bits are ignored), so
+// res is 2^floor(x): a single bit set at position FRAC_WIDTH + floor(x).
+// If that position would fall below bit 0, no bit is set and res is 0.
+
+// integer part of x, interpreted as a signed number
+wire signed [`INT_WIDTH-1:0] x_int = x[`NUM_WIDTH-1:`FRAC_WIDTH];
+
+// non-negative x whose power of two would not fit below the sign bit:
+// every bit below the sign bit is set instead (largest positive number)
+wire saturated = ~x[`NUM_WIDTH-1] & (x[`NUM_WIDTH-1:`FRAC_WIDTH] > `INT_WIDTH - 2);
+
+assign res[`NUM_WIDTH-1] = 1'b0; // the result is never negative
+generate genvar g;
+for (g=0; g<`NUM_WIDTH-1; g=g+1) begin:g_exp
+ // Signed comparison on purpose: g - FRAC_WIDTH is negative for the
+ // fractional bit positions. Comparing it with the unsigned slice only
+ // matched when INT_WIDTH was at least as wide as the 32-bit genvar
+ // expression; the signed form works for any INT_WIDTH.
+ assign res[g] = saturated | (x_int == g - `FRAC_WIDTH);
+end
+endgenerate
+
+endmodule
+
+
+// piecewise linear approximation of 1/x, used in softmax
+
+// Steps: find the most significant set bit of x, normalise x to a mantissa
+// in [1, 2), approximate 1/mantissa with one of four line segments, then
+// shift the result back according to the position of the leading bit.
+// If x is zero no leading bit is found and res is 0.
+module approx_inv_comb (
+ input [`NUM_WIDTH-1:0] x, // assuming x > 0
+ output [`NUM_WIDTH-1:0] res
+);
+
+wire [`NUM_WIDTH:0] bnd;
+wire [`NUM_WIDTH-1:0] msb;
+wire [`FRAC_WIDTH-1:0] m;
+
+assign bnd[`NUM_WIDTH] = 0;
+
+generate genvar g;
+
+// bnd[g] is set when any of x[NUM_WIDTH-1:g] is set;
+// msb is one-hot and marks the highest set bit of x
+for (g=`NUM_WIDTH-1; g>=0; g=g-1) begin:g_msb
+ assign bnd[g] = bnd[g+1] | x[g];
+ assign msb[g] = bnd[g] & ~bnd[g+1];
+end
+// mc: for the leading-bit position g, the FRAC_WIDTH bits of x just below
+// bit g (zero-filled at the bottom); zero for every other g.
+// ms: running OR of mc, which acts as a multiplexer because msb is one-hot
+for (g=0; g<`NUM_WIDTH; g=g+1) begin:g_mant
+ wire [`FRAC_WIDTH-1:0] mc = msb[g] ? ({x, {(`FRAC_WIDTH){1'b0}}} >> g) : {(`FRAC_WIDTH){1'b0}};
+ wire [`FRAC_WIDTH-1:0] ms;
+ if (g==0) begin:i_mantz
+ assign ms = mc;
+ end else begin:i_mantnz
+ assign ms = g_mant[g-1].ms | mc;
+ end
+end
+assign m = g_mant[`NUM_WIDTH-1].ms;
+
+// m contains the input bit-shifted to within [1, 2), with its integer part (i.e. 1) removed
+// each minv_* is FRAC_WIDTH+1 bits wide: one integer bit and FRAC_WIDTH fractional bits;
+// the leading 1 of the mantissa is restored by the {..'d1, m} concatenations
+// for 1 <= x < 1.25 we use 1/x ~= 115/64 - 51/64 x
+wire [`FRAC_WIDTH:0] minv_a = {7'd115, {(`FRAC_WIDTH-6){1'b0}}} - (({{(`FRAC_WIDTH){1'b0}}, 7'd51} * {7'd1, m}) >> 6);
+// for 1.25 <= x < 1.5 we use 1/x ~= 95/64 - 35/64 x
+wire [`FRAC_WIDTH:0] minv_b = {7'd95, {(`FRAC_WIDTH-6){1'b0}}} - (({{(`FRAC_WIDTH){1'b0}}, 7'd35} * {7'd1, m}) >> 6);
+// for 1.5 <= x < 1.75 we use 1/x ~= 157/128 - 3/8 x
+wire [`FRAC_WIDTH:0] minv_c = {8'd157, {(`FRAC_WIDTH-7){1'b0}}} - (({{(`FRAC_WIDTH){1'b0}}, 3'd3} * {3'd1, m}) >> 3);
+// for 1.75 <= x < 2 we use 1/x ~= 17/16 - 9/32 x
+wire [`FRAC_WIDTH:0] minv_d = {5'd17, {(`FRAC_WIDTH-4){1'b0}}} - (({{(`FRAC_WIDTH){1'b0}}, 5'd9} * {5'd1, m}) >> 5);
+// the two top bits of m select the quarter of [1, 2) the mantissa lies in
+wire [`FRAC_WIDTH:0] minv = m[`FRAC_WIDTH-1] ? (m[`FRAC_WIDTH-2] ? minv_d : minv_c) : (m[`FRAC_WIDTH-2] ? minv_b : minv_a);
+
+// undo the normalisation: minv is placed FRAC_WIDTH bits up and shifted
+// right by the leading-bit position g; mrs is again a one-hot OR multiplexer
+for (g=0; g<`NUM_WIDTH; g=g+1) begin:g_mrec
+ wire [`NUM_WIDTH-1:0] mrc = msb[g] ? ({{(`INT_WIDTH){1'b0}}, minv, {(`FRAC_WIDTH){1'b0}}} >> g) : {(`NUM_WIDTH){1'b0}};
+ wire [`NUM_WIDTH-1:0] mrs;
+ if (g==0) begin:i_mrecz
+ assign mrs = mrc;
+ end else begin:i_mrecnz
+ assign mrs = g_mrec[g-1].mrs | mrc;
+ end
+end
+assign res = g_mrec[`NUM_WIDTH-1].mrs;
+
+endgenerate
+
+endmodule
+
+
+// softmax using approximated 2^x and 1/x
+
+// Computes res[g] = 2^(x[g] - max(x)) / sum_k 2^(x[k] - max(x)) using
+// approx_exp_comb for the powers of two and approx_inv_comb for the
+// reciprocal of the sum. Inputs and outputs are OUTPUT_SIZE numbers of
+// NUM_WIDTH bits packed into one bus, element 0 in the lowest bits.
+module approx_softmax_comb (
+ input [`OUTPUT_SIZE*`NUM_WIDTH-1:0] x_pk,
+ output [`OUTPUT_SIZE*`NUM_WIDTH-1:0] res_pk
+);
+
+wire [`NUM_WIDTH-1:0] x[`OUTPUT_SIZE-1:0];
+wire [`NUM_WIDTH-1:0] res[`OUTPUT_SIZE-1:0];
+
+`UNPACK_ARRAY(`NUM_WIDTH, `OUTPUT_SIZE, x, x_pk)
+`PACK_ARRAY(`NUM_WIDTH, `OUTPUT_SIZE, res, res_pk)
+
+// largest input; the position of the maximum is not needed here
+wire [`NUM_WIDTH-1:0] xmax;
+wire [`INDEX_WIDTH-1:0] _ignore;
+
+max_comb i_max (
+ .x_pk,
+ .res_val(xmax),
+ .res_pos(_ignore)
+);
+
+wire [`NUM_WIDTH-1:0] dexp[`OUTPUT_SIZE-1:0];
+wire [`NUM_WIDTH-1:0] esum;
+
+generate genvar g;
+
+// dexp[g] = approx 2^(x[g] - xmax); psum is the running sum of dexp[0..g]
+// (plain addition, not saturating)
+for (g=0; g<`OUTPUT_SIZE; g=g+1) begin:g_expsum
+ wire [`NUM_WIDTH-1:0] diff;
+ sub_sat_comb i_sub (
+ .a(x[g]),
+ .b(xmax),
+ .res(diff)
+ );
+ approx_exp_comb i_exp (
+ .x(diff),
+ .res(dexp[g])
+ );
+ wire [`NUM_WIDTH-1:0] psum;
+ if (g==0) begin
+ assign psum = dexp[g];
+ end else begin
+ assign psum = g_expsum[g-1].psum + dexp[g];
+ end
+end
+assign esum = g_expsum[`OUTPUT_SIZE-1].psum;
+
+endgenerate
+
+// isum = approx 1 / esum
+wire [`NUM_WIDTH-1:0] isum;
+approx_inv_comb i_inv (
+ .x(esum),
+ .res(isum)
+);
+
+generate
+
+// normalise: res[g] = dexp[g] * isum (reduced-precision saturating multiply)
+for (g=0; g<`OUTPUT_SIZE; g=g+1) begin:g_div
+ mul_sat_comb i_mul (
+ .a(dexp[g]),
+ .b(isum),
+ .res(res[g])
+ );
+end
+
+endgenerate
+
+endmodule
+
+
+// derivative of softmax
+
+module approx_softmax_diff_comb (
+ input [`OUTPUT_SIZE*`NUM_WIDTH-1:0] x_pk,
+ output [`OUTPUT_SIZE*`NUM_WIDTH-1:0] res_pk
+);
+
+// res[g] = sm[g] - sm[g]^2 where sm = approx_softmax(x)
+
+// softmax of the inputs: packed bus and per-element view
+wire [`OUTPUT_SIZE*`NUM_WIDTH-1:0] sm_pk;
+wire [`NUM_WIDTH-1:0] sm[`OUTPUT_SIZE-1:0];
+`UNPACK_ARRAY(`NUM_WIDTH, `OUTPUT_SIZE, sm, sm_pk)
+
+// per-element results, packed onto the output bus
+wire [`NUM_WIDTH-1:0] res[`OUTPUT_SIZE-1:0];
+`PACK_ARRAY(`NUM_WIDTH, `OUTPUT_SIZE, res, res_pk)
+
+approx_softmax_comb i_sm (
+ .x_pk(x_pk),
+ .res_pk(sm_pk)
+);
+
+generate genvar g;
+
+for (g=0; g<`OUTPUT_SIZE; g=g+1) begin
+ // square of the softmax output, then subtract it from the output itself
+ wire [`NUM_WIDTH-1:0] sqr;
+ mul_sat_comb i_mul (.a(sm[g]), .b(sm[g]), .res(sqr));
+ sub_sat_comb i_sub (.a(sm[g]), .b(sqr), .res(res[g]));
+end
+
+endgenerate
+
+endmodule
+
diff --git a/verilog/rtl/actfn_tb.v b/verilog/rtl/actfn_tb.v
new file mode 100644
index 0000000..ba2c805
--- /dev/null
+++ b/verilog/rtl/actfn_tb.v
@@ -0,0 +1,253 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2022 Tamas Hubai
+
+`default_nettype none
+
+// testbenches for actfn.v
+
+
+// leaky ReLU
+
+module leaky_relu_comb_tb ();
+
+reg [`NUM_WIDTH-1:0] x;
+wire [`NUM_WIDTH-1:0] res;
+
+leaky_relu_comb dut (
+ .x,
+ .res
+);
+
+// applies a positive value, a small negative value and a negative value
+// shifted up by LEAK_SHIFT; $monitor prints x and res after every change
+initial begin
+ $monitor("time %4t x %64b res %64b", $time, x, res);
+ x <= 3;
+ #10
+ x <= -3;
+ #10
+ x <= -3 << `LEAK_SHIFT;
+ // let the last vector take effect and be printed before stopping
+ #10
+ $finish;
+end
+
+endmodule
+
+
+// derivative of leaky ReLU
+
+module leaky_relu_diff_comb_tb ();
+
+reg [`NUM_WIDTH-1:0] x;
+wire [`NUM_WIDTH-1:0] res;
+
+leaky_relu_diff_comb dut (
+ .x,
+ .res
+);
+
+// applies one positive and two negative inputs;
+// $monitor prints x and res after every change
+initial begin
+ $monitor("time %4t x %64b res %64b", $time, x, res);
+ x <= 3;
+ #10
+ x <= -3;
+ #10
+ x <= -3 << `LEAK_SHIFT;
+ // let the last vector take effect and be printed before stopping
+ #10
+ $finish;
+end
+
+endmodule
+
+
+// very rough approximation of 2^x, used in softmax
+
+module approx_exp_comb_tb ();
+
+reg [`NUM_WIDTH-1:0] x;
+wire [`NUM_WIDTH-1:0] res;
+
+approx_exp_comb dut (
+ .x,
+ .res
+);
+
+// covers a purely fractional input, small positive and negative integers,
+// the inputs around the saturation limit (INT_WIDTH-2, INT_WIDTH-1) and
+// the inputs around the smallest representable result (-FRAC_WIDTH, -FRAC_WIDTH-1)
+initial begin
+ $monitor("time %4t x %64b res %64b", $time, x, res);
+ x <= 3;
+ #10
+ x <= 3 << `FRAC_WIDTH;
+ #10
+ x <= -3 << `FRAC_WIDTH;
+ #10
+ x <= (`INT_WIDTH-2) << `FRAC_WIDTH;
+ #10
+ x <= (`INT_WIDTH-1) << `FRAC_WIDTH;
+ #10
+ x <= (-`FRAC_WIDTH) << `FRAC_WIDTH;
+ #10
+ x <= (-`FRAC_WIDTH-1) << `FRAC_WIDTH;
+ // let the last vector take effect and be printed before stopping
+ #10
+ $finish;
+end
+
+endmodule
+
+
+// piecewise linear approximation of 1/x, used in softmax
+
+module approx_inv_comb_tb ();
+
+reg [`NUM_WIDTH-1:0] x;
+wire [`NUM_WIDTH-1:0] res;
+integer k;
+
+approx_inv_comb dut (
+ .x(x),
+ .res(res)
+);
+
+initial begin
+ $monitor("time %4t x %64b res %64b m 1%24b minv %25b", $time, x, res, dut.m, dut.minv);
+ // smallest representable value, then powers of two around 1.0
+ x <= 1;
+ #10
+ x <= 1 << `FRAC_WIDTH;
+ #10
+ x <= 1 << (`FRAC_WIDTH + 2);
+ #10
+ x <= 1 << (`FRAC_WIDTH - 3);
+ // 8.0 up to 15.0 in steps of 1.0: the top mantissa bits run through
+ // 000 .. 111, so every linear segment is exercised twice
+ for (k=8; k<16; k=k+1) begin
+  #10
+  x <= k << `FRAC_WIDTH;
+ end
+ #10
+ $finish;
+end
+
+endmodule
+
+
+// softmax using approximated 2^x and 1/x
+
+// NOTE(review): this testbench drives and prints elements 0..2, so it
+// appears to expect OUTPUT_SIZE >= 3 - confirm against config.v
+module approx_softmax_comb_tb ();
+
+reg [`NUM_WIDTH-1:0] x[`OUTPUT_SIZE-1:0];
+wire [`NUM_WIDTH-1:0] res[`OUTPUT_SIZE-1:0];
+
+wire [`OUTPUT_SIZE*`NUM_WIDTH-1:0] x_pk;
+`PACK_ARRAY(`NUM_WIDTH, `OUTPUT_SIZE, x, x_pk)
+
+wire [`OUTPUT_SIZE*`NUM_WIDTH-1:0] res_pk;
+`UNPACK_ARRAY(`NUM_WIDTH, `OUTPUT_SIZE, res, res_pk)
+
+approx_softmax_comb dut (
+ .x_pk,
+ .res_pk
+);
+
+wire [`NUM_WIDTH-1:0] res0 = res[0]; // workaround for segfault in vvp
+wire [`NUM_WIDTH-1:0] res1 = res[1];
+wire [`NUM_WIDTH-1:0] res2 = res[2];
+
+reg [`INDEX_WIDTH-1:0] i;
+// a single 1 at the position of 1.0, printed once as a column ruler
+wire [31:0] hint = 32'b1 << `FRAC_WIDTH;
+
+initial begin
+ $display("TIME vvvv X[0] %32b X[1] %32b X[2] %32b RES[0] %32b RES[1] %32b RES[2] %32b", hint, hint, hint, hint, hint, hint);
+ $monitor("time %4t x[0] %32b x[1] %32b x[2] %32b res[0] %32b res[1] %32b res[2] %32b", $time, x[0][31:0], x[1][31:0], x[2][31:0], res0[31:0], res1[31:0], res2[31:0]);
+ // all inputs start at zero
+ for(i=0; i<`OUTPUT_SIZE; i=i+1) begin
+ x[i] <= 0;
+ end
+ x[0] <= 0;
+ x[1] <= 0;
+ x[2] <= 0;
+ #10
+ x[0] <= 1 << (`FRAC_WIDTH-4);
+ x[1] <= 2 << (`FRAC_WIDTH-4);
+ x[2] <= 3 << (`FRAC_WIDTH-4);
+ #10
+ x[0] <= 1 << `FRAC_WIDTH;
+ #10
+ x[1] <= 1 << `FRAC_WIDTH;
+ #10
+ x[2] <= 2 << `FRAC_WIDTH;
+ #10
+ x[1] <= 2 << `FRAC_WIDTH;
+ #10
+ x[0] <= 2 << `FRAC_WIDTH;
+ #10
+ x[0] <= 1 << (`FRAC_WIDTH+4);
+ #10
+ x[1] <= 2 << (`FRAC_WIDTH+4);
+ #10
+ x[2] <= 3 << (`FRAC_WIDTH+4);
+ // let the last vector take effect and be printed before stopping
+ #10
+ $finish;
+end
+
+endmodule
+
+
+// derivative of softmax
+
+// NOTE(review): this testbench drives and prints elements 0..2, so it
+// appears to expect OUTPUT_SIZE >= 3 - confirm against config.v
+module approx_softmax_diff_comb_tb ();
+
+reg [`NUM_WIDTH-1:0] x[`OUTPUT_SIZE-1:0];
+wire [`NUM_WIDTH-1:0] res[`OUTPUT_SIZE-1:0];
+
+wire [`OUTPUT_SIZE*`NUM_WIDTH-1:0] x_pk;
+`PACK_ARRAY(`NUM_WIDTH, `OUTPUT_SIZE, x, x_pk)
+wire [`OUTPUT_SIZE*`NUM_WIDTH-1:0] res_pk;
+`UNPACK_ARRAY(`NUM_WIDTH, `OUTPUT_SIZE, res, res_pk)
+
+approx_softmax_diff_comb dut (
+ .x_pk,
+ .res_pk
+);
+
+wire [`NUM_WIDTH-1:0] res0 = res[0]; // workaround for segfault in vvp
+wire [`NUM_WIDTH-1:0] res1 = res[1];
+wire [`NUM_WIDTH-1:0] res2 = res[2];
+
+reg [`INDEX_WIDTH-1:0] i;
+// a single 1 at the position of 1.0, printed once as a column ruler
+wire [31:0] hint = 32'b1 << `FRAC_WIDTH;
+
+initial begin
+ $display("TIME vvvv X[0] %32b X[1] %32b X[2] %32b RES[0] %32b RES[1] %32b RES[2] %32b", hint, hint, hint, hint, hint, hint);
+ $monitor("time %4t x[0] %32b x[1] %32b x[2] %32b res[0] %32b res[1] %32b res[2] %32b", $time, x[0][31:0], x[1][31:0], x[2][31:0], res0[31:0], res1[31:0], res2[31:0]);
+ // all inputs start at zero
+ for(i=0; i<`OUTPUT_SIZE; i=i+1) begin
+ x[i] <= 0;
+ end
+ x[0] <= 0;
+ x[1] <= 0;
+ x[2] <= 0;
+ #10
+ x[0] <= 1 << (`FRAC_WIDTH-4);
+ x[1] <= 2 << (`FRAC_WIDTH-4);
+ x[2] <= 3 << (`FRAC_WIDTH-4);
+ #10
+ x[0] <= 1 << `FRAC_WIDTH;
+ #10
+ x[1] <= 1 << `FRAC_WIDTH;
+ #10
+ x[2] <= 2 << `FRAC_WIDTH;
+ #10
+ x[1] <= 2 << `FRAC_WIDTH;
+ #10
+ x[0] <= 2 << `FRAC_WIDTH;
+ #10
+ x[0] <= 1 << (`FRAC_WIDTH+4);
+ #10
+ x[1] <= 2 << (`FRAC_WIDTH+4);
+ #10
+ x[2] <= 3 << (`FRAC_WIDTH+4);
+ // let the last vector take effect and be printed before stopping
+ #10
+ $finish;
+end
+
+endmodule
+
diff --git a/verilog/rtl/config.v b/verilog/rtl/config.v
new file mode 100644
index 0000000..e2c80d1
--- /dev/null
+++ b/verilog/rtl/config.v
@@ -0,0 +1,36 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2022 Tamas Hubai
+
+`default_nettype none
+
+// numbers are represented as fixed-point fractions
+// with an integral part of INT_WIDTH bits
+// and a fractional part of FRAC_WIDTH bits
+// (64 bits are an overkill here, but simulations were run on a 64-bit platform)
+// the top bit of the integral part is treated as the sign bit by the
+// arithmetic modules
+`define INT_WIDTH 40
+`define FRAC_WIDTH 24
+`define NUM_WIDTH (`INT_WIDTH + `FRAC_WIDTH)
+
+// multiplication is the main bottleneck in the circuit complexity
+// so we reduce integer & fractional widths for multiplications
+// (this didn't significantly affect learning speed in our tests)
+// mul_sat_comb cuts each operand down to MUL_INT_WIDTH integer bits and
+// PRE_MUL_FRAC_WIDTH fractional bits and keeps POST_MUL_FRAC_WIDTH
+// fractional bits of the product
+
+`define MUL_INT_WIDTH 6
+`define PRE_MUL_FRAC_WIDTH 12
+`define POST_MUL_FRAC_WIDTH 16
+
+// number of neurons in input, hidden 1, hidden 2 & output layers
+// NOTE(review): some testbenches index fixed elements (e.g. x[2], a3[4])
+// and so appear to expect larger sizes than the ones set here - confirm
+`define INPUT_SIZE 1
+`define HIDDEN1_SIZE 1
+`define HIDDEN2_SIZE 1
+`define OUTPUT_SIZE 1
+
+// bits required to describe the sizes above
+// (neural_interface takes its indices from two 10-bit address fields)
+`define INDEX_WIDTH 10
+
+// power of 1/2 used as the slope of leaky ReLU's negative part
+`define LEAK_SHIFT 7
+
+// power of 1/2 used as the learning rate
+`define LEARN_SHIFT 7
+
diff --git a/verilog/rtl/interface.v b/verilog/rtl/interface.v
new file mode 100644
index 0000000..4b4abab
--- /dev/null
+++ b/verilog/rtl/interface.v
@@ -0,0 +1,140 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2022 Tamas Hubai
+
+`default_nettype none
+
+// wrapper for neural_network that allows
+// - setting initial weights, inputs & ground truth
+// - retrieving outputs
+// - starting forward or backward propagation
+// - detecting when forward or backward propagation finishes
+// by using memory i/o within a single virtual address space
+
+// Address map (addr[23:20] = sel, addr[19:10] = index_h, addr[9:0] = index_l):
+//   sel 0..3 : synapse weight of layer sel[1:0], from neuron index_h to
+//              neuron index_l; a write stores data_in as the new weight
+//   sel 4    : network input a0[index_l] (read/write)
+//   sel 5    : network output a3[index_l] (read only, writes are ignored)
+//   sel 6    : ground truth g3[index_l] (read/write)
+//   sel 7    : control & status, selected by index_l:
+//                0 : forward propagation; write nonzero to start, write
+//                    zero to clear the "finished" flag, read returns all
+//                    ones once finished and all zeroes otherwise
+//                1 : backward propagation, same protocol
+//                2 : index of the largest element of a3, shifted left by FRAC_WIDTH
+//                3 : index of the largest element of g3, shifted left by FRAC_WIDTH
+//   anything else reads as zero
+// data_out is registered: it changes on the clock edge that samples addr.
+// There is no reset input; the hold flags keep whatever value they start with
+// until they are cleared by a write.
+module neural_interface (
+ input clk,
+ input [23:0] addr,
+ input [`NUM_WIDTH-1:0] data_in,
+ input we,
+ output reg [`NUM_WIDTH-1:0] data_out
+);
+
+// signals of the wrapped neural_network instance
+reg fp;
+wire fp_out;
+reg [`NUM_WIDTH-1:0] a0[`INPUT_SIZE-1:0];
+wire [`NUM_WIDTH-1:0] a3[`OUTPUT_SIZE-1:0];
+reg bp;
+wire bp_out;
+reg [`NUM_WIDTH-1:0] g3[`OUTPUT_SIZE-1:0];
+reg wu;
+reg [1:0] w_layer;
+reg [`INDEX_WIDTH-1:0] w_i;
+reg [`INDEX_WIDTH-1:0] w_j;
+reg [`NUM_WIDTH-1:0] w_in;
+wire [`NUM_WIDTH-1:0] w_out;
+
+wire [`INPUT_SIZE*`NUM_WIDTH-1:0] a0_pk;
+`PACK_ARRAY(`NUM_WIDTH, `INPUT_SIZE, a0, a0_pk)
+wire [`OUTPUT_SIZE*`NUM_WIDTH-1:0] a3_pk;
+`UNPACK_ARRAY(`NUM_WIDTH, `OUTPUT_SIZE, a3, a3_pk)
+wire [`OUTPUT_SIZE*`NUM_WIDTH-1:0] g3_pk;
+`PACK_ARRAY(`NUM_WIDTH, `OUTPUT_SIZE, g3, g3_pk)
+
+neural_network i_nn (
+ .clk,
+ .fp,
+ .fp_out,
+ .a0_pk,
+ .a3_pk,
+ .bp,
+ .bp_out,
+ .g3_pk,
+ .wu,
+ .w_layer,
+ .w_i,
+ .w_j,
+ .w_in,
+ .w_out
+);
+
+// position of the largest network output; the value itself is unused
+wire [`NUM_WIDTH-1:0] _ignore_a3;
+wire [`INDEX_WIDTH-1:0] argmax_a3;
+max_comb i_max_a3 (
+ .x_pk(a3_pk),
+ .res_val(_ignore_a3),
+ .res_pos(argmax_a3)
+);
+
+// position of the largest ground truth element; the value itself is unused
+wire [`NUM_WIDTH-1:0] _ignore_g3;
+wire [`INDEX_WIDTH-1:0] argmax_g3;
+max_comb i_max_g3 (
+ .x_pk(g3_pk),
+ .res_val(_ignore_g3),
+ .res_pos(argmax_g3)
+);
+
+// sticky "finished" flags: set when fp_out / bp_out is seen high and kept
+// until cleared by a write; the *_hn wires include the current cycle
+reg fp_out_hold;
+reg bp_out_hold;
+wire fp_out_hn = fp_out_hold | fp_out;
+wire bp_out_hn = bp_out_hold | bp_out;
+
+wire [3:0] sel = addr[23:20];
+wire [9:0] index_h = addr[19:10];
+wire [9:0] index_l = addr[9:0];
+
+always @(posedge clk) begin
+ // defaults; later non-blocking assignments in the branches below override them
+ fp <= 0;
+ bp <= 0;
+ wu <= 0;
+ fp_out_hold <= fp_out_hn;
+ bp_out_hold <= bp_out_hn;
+ data_out <= {(`NUM_WIDTH){1'b0}};
+ if (sel[3:2]==0) begin
+ // weight access: the selection registers are updated in the same cycle
+ // in which w_out is sampled, so data_out reflects the weight selected
+ // by the values the selection registers held before this clock edge
+ w_layer <= sel[1:0];
+ w_i <= index_h;
+ w_j <= index_l;
+ data_out <= w_out;
+ if (we) begin
+ w_in <= data_in;
+ wu <= 1;
+ end
+ end else if (sel==4) begin
+ data_out <= a0[index_l];
+ if (we) begin
+ a0[index_l] <= data_in;
+ end
+ end else if (sel==5) begin
+ data_out <= a3[index_l];
+ end else if (sel==6) begin
+ data_out <= g3[index_l];
+ if (we) begin
+ g3[index_l] <= data_in;
+ end
+ end else if (sel==7) begin
+ if (index_l == 0) begin
+ data_out <= {(`NUM_WIDTH){fp_out_hn}};
+ if (we) begin
+ if (|data_in) begin
+ fp <= 1; // one-cycle start pulse
+ end else begin
+ fp_out_hold <= 0;
+ end
+ end
+ end else if (index_l == 1) begin
+ data_out <= {(`NUM_WIDTH){bp_out_hn}};
+ if (we) begin
+ if (|data_in) begin
+ bp <= 1; // one-cycle start pulse
+ end else begin
+ bp_out_hold <= 0;
+ end
+ end
+ end else if (index_l == 2) begin
+ data_out <= argmax_a3 << `FRAC_WIDTH;
+ end else if (index_l == 3) begin
+ data_out <= argmax_g3 << `FRAC_WIDTH;
+ end
+ end
+end
+
+endmodule
+
diff --git a/verilog/rtl/interface_tb.v b/verilog/rtl/interface_tb.v
new file mode 100644
index 0000000..db60498
--- /dev/null
+++ b/verilog/rtl/interface_tb.v
@@ -0,0 +1,135 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2022 Tamas Hubai
+
+`default_nettype none
+
+// testbench for the single neural_interface module in interface.v
+
+// The clock is toggled by hand: every "#5 clk<=1; #5 clk<=0;" line is one
+// clock cycle. Bus activity is printed by $monitor.
+module neural_interface_tb ();
+
+reg clk;
+reg [23:0] addr;
+reg [`NUM_WIDTH-1:0] data_in;
+reg we;
+wire [`NUM_WIDTH-1:0] data_out;
+
+neural_interface dut (
+ .clk,
+ .addr,
+ .data_in,
+ .we,
+ .data_out
+);
+
+// initial state, written directly into the design through hierarchical
+// names: all weights 0, every input 1, every output activation 2 and
+// every ground truth element 3
+generate genvar i; genvar j;
+
+for (i=0; i<`INPUT_SIZE; i=i+1) begin
+ for (j=0; j<`HIDDEN1_SIZE; j=j+1) begin
+ initial begin
+ dut.i_nn.g_syn01_o[i].g_syn01_i[j].i_syn01.w <= 0;
+ end
+ end
+end
+for (i=0; i<`HIDDEN1_SIZE; i=i+1) begin
+ for (j=0; j<`HIDDEN2_SIZE; j=j+1) begin
+ initial begin
+ dut.i_nn.g_syn12_o[i].g_syn12_i[j].i_syn12.w <= 0;
+ end
+ end
+end
+for (i=0; i<`HIDDEN2_SIZE; i=i+1) begin
+ for (j=0; j<`OUTPUT_SIZE; j=j+1) begin
+ initial begin
+ dut.i_nn.g_syn23_o[i].g_syn23_i[j].i_syn23.w <= 0;
+ end
+ end
+end
+for (i=0; i<`INPUT_SIZE; i=i+1) begin
+ initial begin
+ dut.a0[i] <= 1;
+ end
+end
+for (i=0; i<`OUTPUT_SIZE; i=i+1) begin
+ initial begin
+ dut.i_nn.g_layer3[i].i_neu_3.a <= 2;
+ dut.g3[i] <= 3;
+ end
+end
+
+endgenerate
+
+initial begin
+ clk <= 0;
+ we <= 0;
+ $monitor("time %4t addr %24b data_in %64b we %1b data_out %64b", $time, addr, data_in, we, data_out);
+ #5 clk<=1; #5 clk<=0;
+ // weight of layer 2, from neuron 7 to neuron 3: read, write 15, read back
+ addr <= 24'b0010_0000000111_0000000011;
+ #5 clk<=1; #5 clk<=0;
+ data_in <= 15;
+ we <= 1;
+ #5 clk<=1; #5 clk<=0;
+ we <= 0;
+ #5 clk<=1; #5 clk<=0;
+ #5 clk<=1; #5 clk<=0;
+ // input a0[5]: write 33, read back
+ addr <= 24'b0100_0000000000_0000000101;
+ data_in <= 33;
+ we <= 1;
+ #5 clk<=1; #5 clk<=0;
+ we <= 0;
+ #5 clk<=1; #5 clk<=0;
+ // output a3[5]: the interface has no write path for this range
+ addr <= 24'b0101_0000000000_0000000101;
+ data_in <= 17;
+ we <= 1;
+ #5 clk<=1; #5 clk<=0;
+ we <= 0;
+ #5 clk<=1; #5 clk<=0;
+ // ground truth g3[5]: write 9, read back
+ addr <= 24'b0110_0000000000_0000000101;
+ data_in <= 9;
+ we <= 1;
+ #5 clk<=1; #5 clk<=0;
+ we <= 0;
+ #5 clk<=1; #5 clk<=0;
+ // forward propagation: clear the hold flag directly (the design has no
+ // reset), start with a nonzero write, poll, then clear with a zero write
+ dut.fp_out_hold <= 0;
+ addr <= 24'b0111_0000000000_0000000000;
+ data_in <= 1;
+ we <= 1;
+ #5 clk<=1; #5 clk<=0;
+ we <= 0;
+ #5 clk<=1; #5 clk<=0;
+ #5 clk<=1; #5 clk<=0;
+ #5 clk<=1; #5 clk<=0;
+ #5 clk<=1; #5 clk<=0;
+ #5 clk<=1; #5 clk<=0;
+ #5 clk<=1; #5 clk<=0;
+ data_in <= 0;
+ we <= 1;
+ #5 clk<=1; #5 clk<=0;
+ we <= 0;
+ #5 clk<=1; #5 clk<=0;
+ // backward propagation: same sequence on control register 1
+ dut.bp_out_hold <= 0;
+ addr <= 24'b0111_0000000000_0000000001;
+ data_in <= 1;
+ we <= 1;
+ #5 clk<=1; #5 clk<=0;
+ we <= 0;
+ #5 clk<=1; #5 clk<=0;
+ #5 clk<=1; #5 clk<=0;
+ #5 clk<=1; #5 clk<=0;
+ #5 clk<=1; #5 clk<=0;
+ #5 clk<=1; #5 clk<=0;
+ #5 clk<=1; #5 clk<=0;
+ data_in <= 0;
+ we <= 1;
+ #5 clk<=1; #5 clk<=0;
+ we <= 0;
+ #5 clk<=1; #5 clk<=0;
+ // argmax of the outputs, then argmax of the ground truth
+ addr <= 24'b0111_0000000000_0000000010;
+ #5 clk<=1; #5 clk<=0;
+ addr <= 24'b0111_0000000000_0000000011;
+ #5 clk<=1; #5 clk<=0;
+ #5 clk<=1; #5 clk<=0;
+ $finish;
+end
+
+endmodule
+
diff --git a/verilog/rtl/macros.v b/verilog/rtl/macros.v
new file mode 100644
index 0000000..26d3132
--- /dev/null
+++ b/verilog/rtl/macros.v
@@ -0,0 +1,21 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2022 Tamas Hubai
+
+`default_nettype none
+
+// macros for passing bus arrays to modules
+//
+// Module ports carry arrays as one flat bus with element 0 in the lowest
+// WIDTH bits. Each macro expands to a generate loop of continuous
+// assignments, so it must be used at module level, after both the array
+// and the bus have been declared.
+//
+// PACK_ARRAY(WIDTH, LEN, SRC, DEST)
+//   copies the LEN elements of array SRC onto bus DEST
+// UNPACK_ARRAY(WIDTH, LEN, DEST, SRC)
+//   copies bus SRC into the LEN elements of array DEST
+//
+// The genvar is named after the array (pa_<array> / ua_<array>), so the
+// same array can be packed or unpacked only once per module.
+`define PACK_ARRAY_INTERNAL(WIDTH,LEN,SRC,DEST,VAR) \
+ generate genvar VAR; \
+ for (VAR=0; VAR<(LEN); VAR=VAR+1) begin \
+ assign DEST[((WIDTH)*VAR+((WIDTH)-1)):((WIDTH)*VAR)] = SRC[VAR][((WIDTH)-1):0]; \
+ end \
+ endgenerate
+`define PACK_ARRAY(WIDTH,LEN,SRC,DEST) `PACK_ARRAY_INTERNAL(WIDTH,LEN,SRC,DEST,pa_``SRC)
+`define UNPACK_ARRAY_INTERNAL(WIDTH,LEN,DEST,SRC,VAR) \
+ generate genvar VAR; \
+ for (VAR=0; VAR<(LEN); VAR=VAR+1) begin \
+ assign DEST[VAR][((WIDTH)-1):0] = SRC[((WIDTH)*VAR+((WIDTH)-1)):((WIDTH)*VAR)]; \
+ end \
+ endgenerate
+`define UNPACK_ARRAY(WIDTH,LEN,DEST,SRC) `UNPACK_ARRAY_INTERNAL(WIDTH,LEN,DEST,SRC,ua_``DEST)
+
diff --git a/verilog/rtl/math.v b/verilog/rtl/math.v
new file mode 100644
index 0000000..a34db07
--- /dev/null
+++ b/verilog/rtl/math.v
@@ -0,0 +1,113 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2022 Tamas Hubai
+
+`default_nettype none
+
+// basic arithmetics with saturation
+
+
+// addition
+// res = a + b
+
+module add_sat_comb (
+ input [`NUM_WIDTH-1:0] a,
+ input [`NUM_WIDTH-1:0] b,
+ output [`NUM_WIDTH-1:0] res
+);
+
+// add with one extra bit of headroom: both operands are sign-extended by one bit
+wire [`NUM_WIDTH:0] wide = {a[`NUM_WIDTH-1], a} + {b[`NUM_WIDTH-1], b};
+
+// the extra bit is the true sign; if it disagrees with the top bit of the
+// truncated result, the sum does not fit
+wire true_sign = wide[`NUM_WIDTH];
+wire overflow = true_sign ^ wide[`NUM_WIDTH-1];
+
+// most positive / most negative representable number, depending on the sign
+wire [`NUM_WIDTH-1:0] limit = {true_sign, {(`NUM_WIDTH-1){~true_sign}}};
+
+assign res = overflow ? limit : wide[`NUM_WIDTH-1:0];
+
+endmodule
+
+
+// subtraction
+// res = a - b
+
+module sub_sat_comb (
+ input [`NUM_WIDTH-1:0] a,
+ input [`NUM_WIDTH-1:0] b,
+ output [`NUM_WIDTH-1:0] res
+);
+
+// subtract with one extra bit of headroom: both operands are sign-extended by one bit
+wire [`NUM_WIDTH:0] wide = {a[`NUM_WIDTH-1], a} - {b[`NUM_WIDTH-1], b};
+
+// the extra bit is the true sign; if it disagrees with the top bit of the
+// truncated result, the difference does not fit
+wire true_sign = wide[`NUM_WIDTH];
+wire overflow = true_sign ^ wide[`NUM_WIDTH-1];
+
+// most positive / most negative representable number, depending on the sign
+wire [`NUM_WIDTH-1:0] limit = {true_sign, {(`NUM_WIDTH-1){~true_sign}}};
+
+assign res = overflow ? limit : wide[`NUM_WIDTH-1:0];
+
+endmodule
+
+
+// multiplication
+// res = a * b
+
+// To keep the multiplier small, each operand is first reduced to
+// MUL_INT_WIDTH integer bits (sign included) and PRE_MUL_FRAC_WIDTH
+// fractional bits, saturating if it does not fit. The product keeps
+// POST_MUL_FRAC_WIDTH fractional bits and saturates if its integer part
+// needs more than MUL_INT_WIDTH bits.
+module mul_sat_comb (
+ input [`NUM_WIDTH-1:0] a,
+ input [`NUM_WIDTH-1:0] b,
+ output [`NUM_WIDTH-1:0] res
+);
+
+wire sig_a = a[`NUM_WIDTH-1];
+wire sig_b = b[`NUM_WIDTH-1];
+// an operand fits the reduced format only if all bits from the reduced
+// sign position upwards are equal (all 0 or all 1)
+wire sat_a = |a[`NUM_WIDTH-1:`MUL_INT_WIDTH+`FRAC_WIDTH-1] & ~&a[`NUM_WIDTH-1:`MUL_INT_WIDTH+`FRAC_WIDTH-1];
+wire sat_b = |b[`NUM_WIDTH-1:`MUL_INT_WIDTH+`FRAC_WIDTH-1] & ~&b[`NUM_WIDTH-1:`MUL_INT_WIDTH+`FRAC_WIDTH-1];
+// reduced operands: the largest magnitude of the right sign when
+// saturated, otherwise the bit slice around the binary point
+wire [`MUL_INT_WIDTH+`PRE_MUL_FRAC_WIDTH-1:0] short_a = sat_a ? {sig_a, {(`MUL_INT_WIDTH+`PRE_MUL_FRAC_WIDTH-1){~sig_a}}} : a[`MUL_INT_WIDTH+`FRAC_WIDTH-1:`FRAC_WIDTH-`PRE_MUL_FRAC_WIDTH];
+wire [`MUL_INT_WIDTH+`PRE_MUL_FRAC_WIDTH-1:0] short_b = sat_b ? {sig_b, {(`MUL_INT_WIDTH+`PRE_MUL_FRAC_WIDTH-1){~sig_b}}} : b[`MUL_INT_WIDTH+`FRAC_WIDTH-1:`FRAC_WIDTH-`PRE_MUL_FRAC_WIDTH];
+// the product, split from the top into: sign, excess integer bits,
+// the bits that are kept, and the discarded low fractional bits
+wire sig_mul;
+wire [`MUL_INT_WIDTH-1:0] mul_hi;
+wire [`MUL_INT_WIDTH+`POST_MUL_FRAC_WIDTH-2:0] mul_md;
+wire [2*`PRE_MUL_FRAC_WIDTH-`POST_MUL_FRAC_WIDTH-1:0] mul_lo;
+// both operands are sign-extended to the full product width before multiplying
+assign {sig_mul, mul_hi, mul_md, mul_lo} = {{(`MUL_INT_WIDTH+`PRE_MUL_FRAC_WIDTH){sig_a}}, short_a} * {{(`MUL_INT_WIDTH+`PRE_MUL_FRAC_WIDTH){sig_b}}, short_b};
+// the product fits only if the excess integer bits all equal the sign
+wire saturated = |{sig_mul, mul_hi} & ~&{sig_mul, mul_hi};
+// result: sign-extend the kept bits upwards and pad the fraction with zeroes
+assign res = saturated ? {sig_mul, {(`NUM_WIDTH-1){~sig_mul}}} : {{(`INT_WIDTH-`MUL_INT_WIDTH+1){sig_mul}}, mul_md, {(`FRAC_WIDTH-`POST_MUL_FRAC_WIDTH){1'b0}}};
+
+endmodule
+
+
+// maximum & argmax
+// res_val = max(x)
+// res_pos = argmax(x)
+
+// Compares every element with every other element (signed comparison).
+// x_pk holds OUTPUT_SIZE numbers of NUM_WIDTH bits, element 0 in the
+// lowest bits. When several elements share the maximum, res_pos is the
+// highest of their indices.
+module max_comb (
+ input [`OUTPUT_SIZE*`NUM_WIDTH-1:0] x_pk,
+ output [`NUM_WIDTH-1:0] res_val,
+ output [`INDEX_WIDTH-1:0] res_pos
+);
+
+wire [`NUM_WIDTH-1:0] x[`OUTPUT_SIZE-1:0];
+`UNPACK_ARRAY(`NUM_WIDTH, `OUTPUT_SIZE, x, x_pk)
+
+generate genvar g; genvar h;
+
+for (g=0; g<`OUTPUT_SIZE; g=g+1) begin:g_max_o
+ // is_greater[h]: element g is at least as large as element h
+ wire [`OUTPUT_SIZE-1:0] is_greater;
+ for (h=0; h<`OUTPUT_SIZE; h=h+1) begin:g_max_i
+ assign is_greater[h] = $signed(x[g]) >= $signed(x[h]);
+ end
+ // element g is a maximum if no other element is larger
+ wire is_max = &is_greater;
+ // value of element g if it is a maximum, zero otherwise; OR-ing these
+ // along the chain yields the maximum, since all maxima are equal
+ wire [`NUM_WIDTH-1:0] cur_val = is_max ? x[g] : {(`NUM_WIDTH){1'b0}};
+ wire [`NUM_WIDTH-1:0] max_val;
+ wire [`INDEX_WIDTH-1:0] pos;
+ if (g==0) begin
+ assign max_val = cur_val;
+ assign pos = 0;
+ end else begin
+ assign max_val = g_max_o[g-1].max_val | cur_val;
+ // a maximum at a higher index replaces the position found so far
+ assign pos = is_max ? g : g_max_o[g-1].pos;
+ end
+end
+assign res_val = g_max_o[`OUTPUT_SIZE-1].max_val;
+assign res_pos = g_max_o[`OUTPUT_SIZE-1].pos;
+
+endgenerate
+
+endmodule
+
diff --git a/verilog/rtl/math_tb.v b/verilog/rtl/math_tb.v
new file mode 100644
index 0000000..242274a
--- /dev/null
+++ b/verilog/rtl/math_tb.v
@@ -0,0 +1,168 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2022 Tamas Hubai
+
+`default_nettype none
+
+// testbenches for math.v
+
+
+// addition
+
+module add_sat_comb_tb ();
+
+reg [`NUM_WIDTH-1:0] a;
+reg [`NUM_WIDTH-1:0] b;
+wire [`NUM_WIDTH-1:0] res;
+
+add_sat_comb dut (
+ .a,
+ .b,
+ .res
+);
+
+// small operands first, then operands built by flipping the sign bit so
+// that they are close to the ends of the range
+initial begin
+ $monitor("time %4t a %64b b %64b res %64b", $time, a, b, res);
+ a <= 1;
+ b <= 2;
+ #10
+ a <= -1;
+ b <= -2;
+ #10
+ // clear the sign bits: both operands become large positive numbers
+ a[`NUM_WIDTH-1] <= 0;
+ b[`NUM_WIDTH-1] <= 0;
+ #10
+ // b becomes a large negative number (sign bit set, low bits 2)
+ b <= 2;
+ b[`NUM_WIDTH-1] <= 1;
+ #10
+ // a becomes a large negative number (sign bit set, low bits 1)
+ a <= 1;
+ a[`NUM_WIDTH-1] <= 1;
+ // let the last vector take effect and be printed before stopping
+ #10
+ $finish;
+end
+
+endmodule
+
+
+// subtraction
+
+module sub_sat_comb_tb ();
+
+reg [`NUM_WIDTH-1:0] a;
+reg [`NUM_WIDTH-1:0] b;
+wire [`NUM_WIDTH-1:0] res;
+
+sub_sat_comb dut (
+ .a,
+ .b,
+ .res
+);
+
+// small operands first, then operands built by flipping the sign bit so
+// that they are close to the ends of the range
+initial begin
+ $monitor("time %4t a %64b b %64b res %64b", $time, a, b, res);
+ a <= 1;
+ b <= 2;
+ #10
+ a <= -1;
+ b <= -2;
+ #10
+ // clear the sign bits: both operands become large positive numbers
+ a[`NUM_WIDTH-1] <= 0;
+ b[`NUM_WIDTH-1] <= 0;
+ #10
+ // b becomes a large negative number (sign bit set, low bits 2)
+ b <= 2;
+ b[`NUM_WIDTH-1] <= 1;
+ #10
+ // a becomes a large negative number (sign bit set, low bits 1)
+ a <= 1;
+ a[`NUM_WIDTH-1] <= 1;
+ // let the last vector take effect and be printed before stopping
+ #10
+ $finish;
+end
+
+endmodule
+
+
+// multiplication
+
+module mul_sat_comb_tb ();
+
+reg [`NUM_WIDTH-1:0] a;
+reg [`NUM_WIDTH-1:0] b;
+wire [`NUM_WIDTH-1:0] res;
+
+mul_sat_comb dut (
+ .a,
+ .b,
+ .res
+);
+
+// 1.0 * 2.0 and -1.0 * -2.0 first, then operands built by flipping the
+// sign bit so that they are close to the ends of the range
+initial begin
+ $monitor("time %4t a %64b b %64b res %64b", $time, a, b, res);
+ a <= 1 << `FRAC_WIDTH;
+ b <= 2 << `FRAC_WIDTH;
+ #10
+ a <= -1 << `FRAC_WIDTH;
+ b <= -2 << `FRAC_WIDTH;
+ #10
+ // clear the sign bits: both operands become large positive numbers
+ a[`NUM_WIDTH-1] <= 0;
+ b[`NUM_WIDTH-1] <= 0;
+ #10
+ // b becomes a large negative number
+ b <= 2 << `FRAC_WIDTH;
+ b[`NUM_WIDTH-1] <= 1;
+ #10
+ // a becomes a large negative number
+ a <= 1 << `FRAC_WIDTH;
+ a[`NUM_WIDTH-1] <= 1;
+ // let the last vector take effect and be printed before stopping
+ #10
+ $finish;
+end
+
+endmodule
+
+
+// maximum & argmax
+
+// NOTE(review): this testbench drives and prints elements 0..2, so it
+// appears to expect OUTPUT_SIZE >= 3 - confirm against config.v
+module max_comb_tb ();
+
+reg [`NUM_WIDTH-1:0] x[`OUTPUT_SIZE-1:0];
+wire [`NUM_WIDTH-1:0] res_val;
+wire [`INDEX_WIDTH-1:0] res_pos;
+
+wire [`OUTPUT_SIZE*`NUM_WIDTH-1:0] x_pk;
+`PACK_ARRAY(`NUM_WIDTH, `OUTPUT_SIZE, x, x_pk)
+
+max_comb dut (
+ .x_pk,
+ .res_val,
+ .res_pos
+);
+
+reg [`INDEX_WIDTH-1:0] i;
+
+initial begin
+ $monitor("time %4t x[0] %32b x[1] %32b x[2] %32b res_val %32b res_pos %10b", $time, x[0][31:0], x[1][31:0], x[2][31:0], res_val[31:0], res_pos);
+ // every element starts at -20; the first three are then raised to 0
+ for(i=0; i<`OUTPUT_SIZE; i=i+1) begin
+ x[i] <= -20;
+ end
+ x[0] <= 0;
+ x[1] <= 0;
+ x[2] <= 0;
+ #10
+ x[0] <= 1;
+ x[1] <= 2;
+ x[2] <= 3;
+ #10
+ x[0] <= 10;
+ #10
+ x[1] <= 10;
+ #10
+ x[2] <= 20;
+ #10
+ x[1] <= 20;
+ #10
+ x[0] <= 20;
+ #10
+ // negative values exercise the signed comparison
+ x[0] <= -1;
+ x[1] <= -2;
+ x[2] <= -3;
+ #10
+ x[0] <= 1;
+ // let the last vector take effect and be printed before stopping
+ #10
+ $finish;
+end
+
+endmodule
+
diff --git a/verilog/rtl/network.v b/verilog/rtl/network.v
new file mode 100644
index 0000000..6fec60c
--- /dev/null
+++ b/verilog/rtl/network.v
@@ -0,0 +1,389 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2022 Tamas Hubai
+
+`default_nettype none
+
+// neural network composed of an input layer,
+// two fully connected hidden layers with a ReLU activation function
+// and a fully connected output layer with a softmax activation function
+
+// Ports:
+//   fp / fp_out   start pulse and completion signal of forward propagation
+//   a0_pk, a3_pk  packed network inputs and outputs
+//   bp / bp_out   start pulse and completion signal of backward propagation
+//   g3_pk         packed ground truth, compared with the outputs
+//   wu, w_layer, w_i, w_j, w_in, w_out
+//                 weight access: layer w_layer (0, 1 or 2), from neuron w_i
+//                 to neuron w_j; w_out always shows the selected weight and
+//                 wu hands w_in to the selected synapse
+// The fp signal is passed from stage to stage in the order
+//   synapses 0-1, layer 1, synapses 1-2, layer 2, synapses 2-3, layer 3
+// and the bp signal travels the same chain in the opposite direction.
+// Only the handshake outputs of element [0] of each stage are used.
+// The synapse and neuron modules are defined outside this file.
+module neural_network (
+ input clk,
+ input fp,
+ output fp_out,
+ input [`INPUT_SIZE*`NUM_WIDTH-1:0] a0_pk,
+ output [`OUTPUT_SIZE*`NUM_WIDTH-1:0] a3_pk,
+ input bp,
+ output bp_out,
+ input [`OUTPUT_SIZE*`NUM_WIDTH-1:0] g3_pk, // ground truth
+ input wu,
+ input [1:0] w_layer,
+ input [`INDEX_WIDTH-1:0] w_i,
+ input [`INDEX_WIDTH-1:0] w_j,
+ input [`NUM_WIDTH-1:0] w_in,
+ output [`NUM_WIDTH-1:0] w_out
+);
+
+wire [`NUM_WIDTH-1:0] a0[`INPUT_SIZE-1:0];
+`UNPACK_ARRAY(`NUM_WIDTH, `INPUT_SIZE, a0, a0_pk)
+wire [`NUM_WIDTH-1:0] a3[`OUTPUT_SIZE-1:0];
+`PACK_ARRAY(`NUM_WIDTH, `OUTPUT_SIZE, a3, a3_pk)
+wire [`NUM_WIDTH-1:0] g3[`OUTPUT_SIZE-1:0];
+`UNPACK_ARRAY(`NUM_WIDTH, `OUTPUT_SIZE, g3, g3_pk)
+
+generate genvar g; genvar h;
+
+// synapses between input layer & hidden layer 1
+// one synapse per (input g, hidden-1 neuron h) pair; w_out_a / w_out_b
+// OR together the weight of the single synapse selected by wu_sel
+wire bp_s01;
+wire [`NUM_WIDTH-1:0] e1[`HIDDEN1_SIZE-1:0];
+for (g=0; g<`INPUT_SIZE; g=g+1) begin:g_syn01_o
+ for (h=0; h<`HIDDEN1_SIZE; h=h+1) begin:g_syn01_i
+ wire fp_out;
+ wire bp_out;
+ wire [`NUM_WIDTH-1:0] zc;
+ wire [`NUM_WIDTH-1:0] tc; // ignored for input layer
+ wire wu_sel = (w_layer == 0) && (w_i == g) && (w_j == h);
+ wire [`NUM_WIDTH-1:0] w_out_r;
+ synapse i_syn01 (
+ .clk,
+ .fp,
+ .fp_out,
+ .a(a0[g]),
+ .zc,
+ .bp(bp_s01),
+ .bp_out,
+ .e(e1[h]),
+ .tc,
+ .wu(wu && wu_sel),
+ .w_in(w_in),
+ .w_out(w_out_r)
+ );
+ wire [`NUM_WIDTH-1:0] w_out_s = wu_sel ? w_out_r : {(`NUM_WIDTH){1'b0}};
+ wire [`NUM_WIDTH-1:0] w_out_a;
+ if (h==0) begin
+ assign w_out_a = w_out_s;
+ end else begin
+ assign w_out_a = g_syn01_i[h-1].w_out_a | w_out_s;
+ end
+ end
+ wire [`NUM_WIDTH-1:0] w_out_b;
+ if (g==0) begin
+ assign w_out_b = g_syn01_i[`HIDDEN1_SIZE-1].w_out_a;
+ end else begin
+ assign w_out_b = g_syn01_o[g-1].w_out_b | g_syn01_i[`HIDDEN1_SIZE-1].w_out_a;
+ end
+end
+wire [`NUM_WIDTH-1:0] w_out_c01 = g_syn01_o[`INPUT_SIZE-1].w_out_b;
+
+// z1[g]: saturating sum of the zc outputs of all synapses leading to
+// hidden-1 neuron g
+wire [`NUM_WIDTH-1:0] z1[`HIDDEN1_SIZE-1:0];
+for (g=0; g<`HIDDEN1_SIZE; g=g+1) begin:g_z1_o
+ for (h=0; h<`INPUT_SIZE; h=h+1) begin:g_z1_i
+ wire [`NUM_WIDTH-1:0] zc = g_syn01_o[h].g_syn01_i[g].zc;
+ wire [`NUM_WIDTH-1:0] z1s;
+ if (h==0) begin
+ assign z1s = zc;
+ end else begin
+ add_sat_comb i_add_z1 (
+ .a(g_z1_i[h-1].z1s),
+ .b(zc),
+ .res(z1s)
+ );
+ end
+ end
+ assign z1[g] = g_z1_i[`INPUT_SIZE-1].z1s;
+end
+
+wire fp_h1 = g_syn01_o[0].g_syn01_i[0].fp_out;
+assign bp_out = g_syn01_o[0].g_syn01_i[0].bp_out;
+
+// hidden layer 1
+// each neuron is paired with a leaky ReLU and its derivative
+wire bp_h1;
+wire [`NUM_WIDTH-1:0] a1[`HIDDEN1_SIZE-1:0];
+wire [`NUM_WIDTH-1:0] t1[`HIDDEN1_SIZE-1:0];
+for (g=0; g<`HIDDEN1_SIZE; g=g+1) begin:g_layer1
+ wire fp_out;
+ wire bp_out;
+ wire [`NUM_WIDTH-1:0] to_act;
+ wire [`NUM_WIDTH-1:0] from_act;
+ wire [`NUM_WIDTH-1:0] from_act_diff;
+ neuron i_neu_1 (
+ .clk,
+ .fp(fp_h1),
+ .fp_out,
+ .z(z1[g]),
+ .a(a1[g]),
+ .bp(bp_h1),
+ .bp_out,
+ .t(t1[g]),
+ .e(e1[g]),
+ .to_act,
+ .from_act,
+ .from_act_diff
+ );
+ leaky_relu_comb i_act_1 (
+ .x(to_act),
+ .res(from_act)
+ );
+ leaky_relu_diff_comb i_act_diff_1 (
+ .x(to_act),
+ .res(from_act_diff)
+ );
+end
+
+wire fp_s12 = g_layer1[0].fp_out;
+assign bp_s01 = g_layer1[0].bp_out;
+
+// synapses between hidden layers 1 & 2
+wire bp_s12;
+wire [`NUM_WIDTH-1:0] e2[`HIDDEN2_SIZE-1:0];
+for (g=0; g<`HIDDEN1_SIZE; g=g+1) begin:g_syn12_o
+ for (h=0; h<`HIDDEN2_SIZE; h=h+1) begin:g_syn12_i
+ wire fp_out;
+ wire bp_out;
+ wire [`NUM_WIDTH-1:0] zc;
+ wire [`NUM_WIDTH-1:0] tc;
+ wire wu_sel = (w_layer == 1) && (w_i == g) && (w_j == h);
+ wire [`NUM_WIDTH-1:0] w_out_r;
+ synapse i_syn12 (
+ .clk,
+ .fp(fp_s12),
+ .fp_out,
+ .a(a1[g]),
+ .zc,
+ .bp(bp_s12),
+ .bp_out,
+ .e(e2[h]),
+ .tc,
+ .wu(wu && wu_sel),
+ .w_in(w_in),
+ .w_out(w_out_r)
+ );
+ wire [`NUM_WIDTH-1:0] w_out_s = wu_sel ? w_out_r : {(`NUM_WIDTH){1'b0}};
+ wire [`NUM_WIDTH-1:0] w_out_a;
+ if (h==0) begin
+ assign w_out_a = w_out_s;
+ end else begin
+ assign w_out_a = g_syn12_i[h-1].w_out_a | w_out_s;
+ end
+ end
+ wire [`NUM_WIDTH-1:0] w_out_b;
+ if (g==0) begin
+ assign w_out_b = g_syn12_i[`HIDDEN2_SIZE-1].w_out_a;
+ end else begin
+ assign w_out_b = g_syn12_o[g-1].w_out_b | g_syn12_i[`HIDDEN2_SIZE-1].w_out_a;
+ end
+end
+wire [`NUM_WIDTH-1:0] w_out_c12 = g_syn12_o[`HIDDEN1_SIZE-1].w_out_b;
+
+// z2[g]: saturating sum of the zc outputs of all synapses leading to
+// hidden-2 neuron g
+wire [`NUM_WIDTH-1:0] z2[`HIDDEN2_SIZE-1:0];
+for (g=0; g<`HIDDEN2_SIZE; g=g+1) begin:g_z2_o
+ for (h=0; h<`HIDDEN1_SIZE; h=h+1) begin:g_z2_i
+ wire [`NUM_WIDTH-1:0] zc = g_syn12_o[h].g_syn12_i[g].zc;
+ wire [`NUM_WIDTH-1:0] z2s;
+ if (h==0) begin
+ assign z2s = zc;
+ end else begin
+ add_sat_comb i_add_z2 (
+ .a(g_z2_i[h-1].z2s),
+ .b(zc),
+ .res(z2s)
+ );
+ end
+ end
+ assign z2[g] = g_z2_i[`HIDDEN1_SIZE-1].z2s;
+end
+
+// t1[g]: saturating sum of the tc outputs of all synapses leaving
+// hidden-1 neuron g
+for (g=0; g<`HIDDEN1_SIZE; g=g+1) begin:g_t1_o
+ for(h=0; h<`HIDDEN2_SIZE; h=h+1) begin:g_t1_i
+ wire [`NUM_WIDTH-1:0] tc = g_syn12_o[g].g_syn12_i[h].tc;
+ wire [`NUM_WIDTH-1:0] t1s;
+ if (h==0) begin
+ assign t1s = tc;
+ end else begin
+ add_sat_comb i_add_t1 (
+ .a(g_t1_i[h-1].t1s),
+ .b(tc),
+ .res(t1s)
+ );
+ end
+ end
+ assign t1[g] = g_t1_i[`HIDDEN2_SIZE-1].t1s;
+end
+
+wire fp_h2 = g_syn12_o[0].g_syn12_i[0].fp_out;
+assign bp_h1 = g_syn12_o[0].g_syn12_i[0].bp_out;
+
+// hidden layer 2
+// each neuron is paired with a leaky ReLU and its derivative
+wire bp_h2;
+wire [`NUM_WIDTH-1:0] a2[`HIDDEN2_SIZE-1:0];
+wire [`NUM_WIDTH-1:0] t2[`HIDDEN2_SIZE-1:0];
+for (g=0; g<`HIDDEN2_SIZE; g=g+1) begin:g_layer2
+ wire fp_out;
+ wire bp_out;
+ wire [`NUM_WIDTH-1:0] to_act;
+ wire [`NUM_WIDTH-1:0] from_act;
+ wire [`NUM_WIDTH-1:0] from_act_diff;
+ neuron i_neu_2 (
+ .clk,
+ .fp(fp_h2),
+ .fp_out,
+ .z(z2[g]),
+ .a(a2[g]),
+ .bp(bp_h2),
+ .bp_out,
+ .t(t2[g]),
+ .e(e2[g]),
+ .to_act,
+ .from_act,
+ .from_act_diff
+ );
+ leaky_relu_comb i_act_2 (
+ .x(to_act),
+ .res(from_act)
+ );
+ leaky_relu_diff_comb i_act_diff_2 (
+ .x(to_act),
+ .res(from_act_diff)
+ );
+end
+
+wire fp_s23 = g_layer2[0].fp_out;
+assign bp_s12 = g_layer2[0].bp_out;
+
+// synapses between hidden layer 2 & output layer
+wire bp_s23;
+wire [`NUM_WIDTH-1:0] e3[`OUTPUT_SIZE-1:0];
+for (g=0; g<`HIDDEN2_SIZE; g=g+1) begin:g_syn23_o
+ for (h=0; h<`OUTPUT_SIZE; h=h+1) begin:g_syn23_i
+ wire fp_out;
+ wire bp_out;
+ wire [`NUM_WIDTH-1:0] zc;
+ wire [`NUM_WIDTH-1:0] tc;
+ wire wu_sel = (w_layer == 2) && (w_i == g) && (w_j == h);
+ wire [`NUM_WIDTH-1:0] w_out_r;
+ synapse i_syn23 (
+ .clk,
+ .fp(fp_s23),
+ .fp_out,
+ .a(a2[g]),
+ .zc,
+ .bp(bp_s23),
+ .bp_out,
+ .e(e3[h]),
+ .tc,
+ .wu(wu && wu_sel),
+ .w_in(w_in),
+ .w_out(w_out_r)
+ );
+ wire [`NUM_WIDTH-1:0] w_out_s = wu_sel ? w_out_r : {(`NUM_WIDTH){1'b0}};
+ wire [`NUM_WIDTH-1:0] w_out_a;
+ if (h==0) begin
+ assign w_out_a = w_out_s;
+ end else begin
+ assign w_out_a = g_syn23_i[h-1].w_out_a | w_out_s;
+ end
+ end
+ wire [`NUM_WIDTH-1:0] w_out_b;
+ if (g==0) begin
+ assign w_out_b = g_syn23_i[`OUTPUT_SIZE-1].w_out_a;
+ end else begin
+ assign w_out_b = g_syn23_o[g-1].w_out_b | g_syn23_i[`OUTPUT_SIZE-1].w_out_a;
+ end
+end
+wire [`NUM_WIDTH-1:0] w_out_c23 = g_syn23_o[`HIDDEN2_SIZE-1].w_out_b;
+
+// z3[g]: saturating sum of the zc outputs of all synapses leading to
+// output neuron g
+wire [`NUM_WIDTH-1:0] z3[`OUTPUT_SIZE-1:0];
+for (g=0; g<`OUTPUT_SIZE; g=g+1) begin:g_z3_o
+ for (h=0; h<`HIDDEN2_SIZE; h=h+1) begin:g_z3_i
+ wire [`NUM_WIDTH-1:0] zc = g_syn23_o[h].g_syn23_i[g].zc;
+ wire [`NUM_WIDTH-1:0] z3s;
+ if (h==0) begin
+ assign z3s = zc;
+ end else begin
+ add_sat_comb i_add_z3 (
+ .a(g_z3_i[h-1].z3s),
+ .b(zc),
+ .res(z3s)
+ );
+ end
+ end
+ assign z3[g] = g_z3_i[`HIDDEN2_SIZE-1].z3s;
+end
+
+// t2[g]: saturating sum of the tc outputs of all synapses leaving
+// hidden-2 neuron g
+for (g=0; g<`HIDDEN2_SIZE; g=g+1) begin:g_t2_o
+ for(h=0; h<`OUTPUT_SIZE; h=h+1) begin:g_t2_i
+ wire [`NUM_WIDTH-1:0] tc = g_syn23_o[g].g_syn23_i[h].tc;
+ wire [`NUM_WIDTH-1:0] t2s;
+ if (h==0) begin
+ assign t2s = tc;
+ end else begin
+ add_sat_comb i_add_t2 (
+ .a(g_t2_i[h-1].t2s),
+ .b(tc),
+ .res(t2s)
+ );
+ end
+ end
+ assign t2[g] = g_t2_i[`OUTPUT_SIZE-1].t2s;
+end
+
+wire fp_h3 = g_syn23_o[0].g_syn23_i[0].fp_out;
+assign bp_h2 = g_syn23_o[0].g_syn23_i[0].bp_out;
+
+// output layer
+// the activation is a softmax over all output neurons, so the neurons'
+// activation ports are collected into arrays and routed through one
+// shared softmax instance (and one instance of its derivative) below
+wire [`NUM_WIDTH-1:0] t3[`OUTPUT_SIZE-1:0];
+wire [`NUM_WIDTH-1:0] to_softmax[`OUTPUT_SIZE-1:0];
+wire [`NUM_WIDTH-1:0] from_softmax[`OUTPUT_SIZE-1:0];
+wire [`NUM_WIDTH-1:0] from_softmax_diff[`OUTPUT_SIZE-1:0];
+for (g=0; g<`OUTPUT_SIZE; g=g+1) begin:g_layer3
+ wire fp_out;
+ wire bp_out;
+ wire [`NUM_WIDTH-1:0] to_act;
+ wire [`NUM_WIDTH-1:0] from_act;
+ wire [`NUM_WIDTH-1:0] from_act_diff;
+ neuron i_neu_3 (
+ .clk,
+ .fp(fp_h3),
+ .fp_out,
+ .z(z3[g]),
+ .a(a3[g]),
+ .bp,
+ .bp_out,
+ .t(t3[g]),
+ .e(e3[g]),
+ .to_act(to_softmax[g]),
+ .from_act(from_softmax[g]),
+ .from_act_diff(from_softmax_diff[g])
+ );
+ // feedback using ground truth
+ // t3[g] = a3[g] - g3[g], saturating
+ sub_sat_comb i_sub_fb (
+ .a(a3[g]),
+ .b(g3[g]),
+ .res(t3[g])
+ );
+end
+
+wire [`OUTPUT_SIZE*`NUM_WIDTH-1:0] to_softmax_pk;
+`PACK_ARRAY(`NUM_WIDTH, `OUTPUT_SIZE, to_softmax, to_softmax_pk)
+wire [`OUTPUT_SIZE*`NUM_WIDTH-1:0] from_softmax_pk;
+`UNPACK_ARRAY(`NUM_WIDTH, `OUTPUT_SIZE, from_softmax, from_softmax_pk)
+wire [`OUTPUT_SIZE*`NUM_WIDTH-1:0] from_softmax_diff_pk;
+`UNPACK_ARRAY(`NUM_WIDTH, `OUTPUT_SIZE, from_softmax_diff, from_softmax_diff_pk)
+
+approx_softmax_comb i_act_3 (
+ .x_pk(to_softmax_pk),
+ .res_pk(from_softmax_pk)
+);
+approx_softmax_diff_comb i_act_diff_3 (
+ .x_pk(to_softmax_pk),
+ .res_pk(from_softmax_diff_pk)
+);
+
+assign fp_out = g_layer3[0].fp_out;
+assign bp_s23 = g_layer3[0].bp_out;
+// at most one layer contains the selected synapse, the others contribute zero
+assign w_out = w_out_c01 | w_out_c12 | w_out_c23;
+
+endgenerate
+
+endmodule
+
diff --git a/verilog/rtl/network_tb.v b/verilog/rtl/network_tb.v
new file mode 100644
index 0000000..09fc949
--- /dev/null
+++ b/verilog/rtl/network_tb.v
@@ -0,0 +1,124 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2022 Tamas Hubai
+
+`default_nettype none
+
+// testbench for the single neural_network module in network.v
+
+module neural_network_tb (); // smoke test: one forward pass, one backward pass, one direct weight write
+// stimulus (reg) and observation (wire) signals; the unpacked arrays are bridged to the DUT's packed ports below
+reg clk;
+reg fp;
+wire fp_out;
+reg [`NUM_WIDTH-1:0] a0[`INPUT_SIZE-1:0];
+wire [`NUM_WIDTH-1:0] a3[`OUTPUT_SIZE-1:0];
+reg bp;
+wire bp_out;
+reg [`NUM_WIDTH-1:0] g3[`OUTPUT_SIZE-1:0];
+reg wu;
+reg [1:0] w_layer;
+reg [`INDEX_WIDTH-1:0] w_i;
+reg [`INDEX_WIDTH-1:0] w_j;
+reg [`NUM_WIDTH-1:0] w_in;
+wire [`NUM_WIDTH-1:0] w_out;
+// PACK_ARRAY / UNPACK_ARRAY are defined outside this file; presumably they flatten / split the arrays -- TODO confirm
+wire [`INPUT_SIZE*`NUM_WIDTH-1:0] a0_pk;
+`PACK_ARRAY(`NUM_WIDTH, `INPUT_SIZE, a0, a0_pk)
+wire [`OUTPUT_SIZE*`NUM_WIDTH-1:0] a3_pk;
+`UNPACK_ARRAY(`NUM_WIDTH, `OUTPUT_SIZE, a3, a3_pk)
+wire [`OUTPUT_SIZE*`NUM_WIDTH-1:0] g3_pk;
+`PACK_ARRAY(`NUM_WIDTH, `OUTPUT_SIZE, g3, g3_pk)
+
+neural_network dut (
+ .clk,
+ .fp,
+ .fp_out,
+ .a0_pk,
+ .a3_pk,
+ .bp,
+ .bp_out,
+ .g3_pk,
+ .wu,
+ .w_layer,
+ .w_i,
+ .w_j,
+ .w_in,
+ .w_out
+);
+// a3[4] is the output whose index matches the only non-zero ground-truth entry (g3[4]) set below
+wire [`NUM_WIDTH-1:0] a_test = a3[4];
+
+generate genvar i; genvar j;
+// preload every synapse weight through a hierarchical reference with a value that depends on (i, j)
+for (i=0; i<`INPUT_SIZE; i=i+1) begin
+ for (j=0; j<`HIDDEN1_SIZE; j=j+1) begin
+ initial begin
+ dut.g_syn01_o[i].g_syn01_i[j].i_syn01.w <= (1 << 22) + (i << 10) + (j << 12);
+ end
+ end
+end
+for (i=0; i<`HIDDEN1_SIZE; i=i+1) begin
+ for (j=0; j<`HIDDEN2_SIZE; j=j+1) begin
+ initial begin
+ dut.g_syn12_o[i].g_syn12_i[j].i_syn12.w <= (1 << 22) + (i << 10) + (j << 12);
+ end
+ end
+end
+for (i=0; i<`HIDDEN2_SIZE; i=i+1) begin
+ for (j=0; j<`OUTPUT_SIZE; j=j+1) begin
+ initial begin
+ dut.g_syn23_o[i].g_syn23_i[j].i_syn23.w <= (1 << 22) + (i << 10) + (j << 12);
+ end
+ end
+end
+for (i=0; i<`INPUT_SIZE; i=i+1) begin // inputs: bit 24 set on every fourth input, all others 0
+ initial begin
+ a0[i] <= (i % 4 == 0) << 24;
+ end
+end
+for (i=0; i<`OUTPUT_SIZE; i=i+1) begin // ground truth: bit 24 set for index 4 only
+ initial begin
+ g3[i] <= (i == 4) << 24;
+ end
+end
+// NOTE(review): the literal shifts (22, 24), index 4 and the 24-bit slices/literal below assume particular NUM_WIDTH/FRAC_WIDTH/OUTPUT_SIZE values -- confirm
+endgenerate
+
+initial begin // stimulus; every input change is scheduled together with a falling clock edge
+ clk <= 0;
+ fp <= 0;
+ bp <= 0;
+ wu <= 0;
+ w_layer <= 0; // w_layer / w_i / w_j presumably select which weight appears on w_out -- TODO confirm against neural_network
+ w_i <= 1;
+ w_j <= 2;
+ $monitor("time %4t fp %1b fp_out %1b a3[4] %24b bp %1b bp_out %1b w1[1][2] %24b", $time, fp, fp_out, a_test[23:0], bp, bp_out, w_out[23:0]);
+ #5 clk<=1; #5 clk<=0; // one idle clock
+ fp <= 1; // forward-propagation strobe, seen high by exactly one rising edge
+ #5 clk<=1; #5 clk<=0;
+ fp <= 0; // six further clocks follow before the backward strobe
+ #5 clk<=1; #5 clk<=0;
+ #5 clk<=1; #5 clk<=0;
+ #5 clk<=1; #5 clk<=0;
+ #5 clk<=1; #5 clk<=0;
+ #5 clk<=1; #5 clk<=0;
+ #5 clk<=1; #5 clk<=0;
+ bp <= 1; // backward-propagation strobe, seen high by exactly one rising edge
+ #5 clk<=1; #5 clk<=0;
+ bp <= 0; // six further clocks follow before the weight write
+ #5 clk<=1; #5 clk<=0;
+ #5 clk<=1; #5 clk<=0;
+ #5 clk<=1; #5 clk<=0;
+ #5 clk<=1; #5 clk<=0;
+ #5 clk<=1; #5 clk<=0;
+ #5 clk<=1; #5 clk<=0;
+ wu <= 1; // weight-update strobe with a recognisable bit pattern on w_in, high for one rising edge
+ w_in <= 24'b111100001100110010101010;
+ #5 clk<=1; #5 clk<=0;
+ wu <= 0; // one more clock so $monitor can show the result before the run ends
+ #5 clk<=1; #5 clk<=0;
+ $finish;
+end
+
+endmodule
+
diff --git a/verilog/rtl/neuron.v b/verilog/rtl/neuron.v
new file mode 100644
index 0000000..b84feb2
--- /dev/null
+++ b/verilog/rtl/neuron.v
@@ -0,0 +1,116 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2022 Tamas Hubai
+
+`default_nettype none
+
+// synapse and neuron primitives for building up the neural network layers
+
+
+// synapse is an edge between two neurons with two-way propagation
+// and an updatable weight
+
+module synapse (
+    input clk,
+    input fp,                         // forward propagation strobe
+    output reg fp_out,                // fp delayed by one clock
+    input [`NUM_WIDTH-1:0] a,         // activation arriving from the source neuron
+    output reg [`NUM_WIDTH-1:0] zc,   // registered forward contribution
+    input bp,                         // backward propagation strobe
+    output reg bp_out,                // bp delayed by one clock
+    input [`NUM_WIDTH-1:0] e,         // error arriving from the destination neuron
+    output reg [`NUM_WIDTH-1:0] tc,   // registered backward contribution
+    input wu,                         // weight update (direct load of w_in)
+    input [`NUM_WIDTH-1:0] w_in,
+    output [`NUM_WIDTH-1:0] w_out
+);
+
+// stored weight; it is always visible on w_out
+reg [`NUM_WIDTH-1:0] w;
+assign w_out = w;
+
+// combinational candidates for the registers below
+wire [`NUM_WIDTH-1:0] zn;  // mul_sat_comb(a, w): next zc
+wire [`NUM_WIDTH-1:0] tn;  // mul_sat_comb(e, w): next tc
+wire [`NUM_WIDTH-1:0] cn;  // mul_sat_comb(a, e): weight correction before scaling
+wire [`NUM_WIDTH-1:0] wn;  // w minus the scaled correction: next w when learning
+
+mul_sat_comb i_mul_z (.a(a), .b(w), .res(zn));
+mul_sat_comb i_mul_t (.a(e), .b(w), .res(tn));
+mul_sat_comb i_mul_c (.a(a), .b(e), .res(cn));
+
+// learning step: the correction is scaled by an arithmetic right shift of LEARN_SHIFT
+sub_sat_comb i_sub_w (
+    .a(w),
+    .b($signed(cn) >>> `LEARN_SHIFT),
+    .res(wn)
+);
+
+// forward path: capture zn while fp is high and pass the strobe on
+always @(posedge clk) begin
+    fp_out <= fp;
+    if (fp) begin
+        zc <= zn;
+    end
+end
+
+// backward path: capture tn while bp is high and pass the strobe on
+always @(posedge clk) begin
+    bp_out <= bp;
+    if (bp) begin
+        tc <= tn;
+    end
+end
+
+// weight register: a direct load (wu) takes priority over a learning step (bp)
+always @(posedge clk) begin
+    if (wu) begin
+        w <= w_in;
+    end else if (bp) begin
+        w <= wn;
+    end
+end
+
+endmodule
+
+
+// generic neuron with two-way propagation that needs to be connected to
+// the respective activation function and its derivative to make
+// either a ReLU or a softmax neuron
+
+module neuron (
+    input clk,
+    input fp,                             // forward propagation strobe
+    output reg fp_out,                    // fp delayed by one clock
+    input [`NUM_WIDTH-1:0] z,             // pre-activation input
+    output reg [`NUM_WIDTH-1:0] a,        // registered activation
+    input bp,                             // backward propagation strobe
+    output reg bp_out,                    // bp delayed by one clock
+    input [`NUM_WIDTH-1:0] t,             // incoming backward term
+    output reg [`NUM_WIDTH-1:0] e,        // registered error
+    output [`NUM_WIDTH-1:0] to_act,       // to activation function
+    input [`NUM_WIDTH-1:0] from_act,      // activation function result
+    input [`NUM_WIDTH-1:0] from_act_diff  // activation derivative result
+);
+
+// z is handed unchanged to the externally connected activation function
+assign to_act = z;
+
+// error candidate: t combined with the activation derivative through
+// mul_sat_comb
+wire [`NUM_WIDTH-1:0] en;
+mul_sat_comb i_mul_e (.a(t), .b(from_act_diff), .res(en));
+
+// forward path: latch the activation while fp is high, pass the strobe on
+always @(posedge clk) begin
+    fp_out <= fp;
+    if (fp) a <= from_act;
+end
+
+// backward path: latch the error while bp is high, pass the strobe on
+always @(posedge clk) begin
+    bp_out <= bp;
+    if (bp) e <= en;
+end
+
+endmodule
+
diff --git a/verilog/rtl/neuron_tb.v b/verilog/rtl/neuron_tb.v
new file mode 100644
index 0000000..620669a
--- /dev/null
+++ b/verilog/rtl/neuron_tb.v
@@ -0,0 +1,137 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2022 Tamas Hubai
+
+`default_nettype none
+
+// testbenches for neuron.v
+
+
+// synapse is an edge between two neurons with two-way propagation
+// and an updatable weight
+
+module synapse_tb (); // drives one synapse through a weight load, a forward step and three backward steps
+// stimulus (reg) and observation (wire) signals, named after the synapse ports they connect to
+reg clk;
+reg fp;
+wire fp_out;
+reg [`NUM_WIDTH-1:0] a;
+wire [`NUM_WIDTH-1:0] zc;
+reg bp;
+wire bp_out;
+reg [`NUM_WIDTH-1:0] e;
+wire [`NUM_WIDTH-1:0] tc;
+reg wu;
+reg [`NUM_WIDTH-1:0] w_in;
+wire [`NUM_WIDTH-1:0] w_out;
+
+synapse dut (
+ .clk,
+ .fp,
+ .fp_out,
+ .a,
+ .zc,
+ .bp,
+ .bp_out,
+ .e,
+ .tc,
+ .wu,
+ .w_in,
+ .w_out
+);
+// NOTE(review): $monitor prints bits [31:16] of every value, which assumes NUM_WIDTH >= 32 -- confirm
+initial begin
+ clk <= 0;
+ fp <= 0;
+ bp <= 0;
+ wu <= 0;
+ $monitor("time %4t fp %1b fp_out %1b a %16b zc %16b bp %1b bp_out %1b e %16b tc %16b wu %1b w_in %16b w_out %16b", $time, fp, fp_out, a[31:16], zc[31:16], bp, bp_out, e[31:16], tc[31:16], wu, w_in[31:16], w_out[31:16]);
+ #5 clk<=1; #5 clk<=0; // idle clock; a, e and w_in have not been driven yet
+ wu <= 1; // load the weight with 3 << FRAC_WIDTH on the next rising edge
+ w_in <= 3 << `FRAC_WIDTH;
+ #5 clk<=1; #5 clk<=0;
+ wu <= 0; // wu is low, so the new w_in value must not reach the weight
+ w_in <= 5 << `FRAC_WIDTH;
+ #5 clk<=1; #5 clk<=0;
+ a <= 7 << `FRAC_WIDTH;
+ #5 clk<=1; #5 clk<=0;
+ fp <= 1; // forward step: zc captures the mul_sat_comb result of a and w
+ #5 clk<=1; #5 clk<=0;
+ fp <= 0; // a changes while fp is low, so zc must hold its value
+ a <= 5 << `FRAC_WIDTH;
+ #5 clk<=1; #5 clk<=0;
+ e <= 7 << `FRAC_WIDTH;
+ #5 clk<=1; #5 clk<=0;
+ bp <= 1; // held for three rising edges: each captures tc and applies one learning step to w
+ #5 clk<=1; #5 clk<=0;
+ #5 clk<=1; #5 clk<=0;
+ #5 clk<=1; #5 clk<=0;
+ bp <= 0; // final clock with all strobes low
+ #5 clk<=1; #5 clk<=0;
+ $finish;
+end
+
+endmodule
+
+
+// generic neuron with two-way propagation that needs to be connected to
+// the respective activation function and its derivative to make
+// either a ReLU or a softmax neuron
+
+module neuron_tb (); // drives one neuron through a forward step and a backward step
+// stimulus (reg) and observation (wire) signals, named after the neuron ports they connect to
+reg clk;
+reg fp;
+wire fp_out;
+reg [`NUM_WIDTH-1:0] z;
+wire [`NUM_WIDTH-1:0] a;
+reg bp;
+wire bp_out;
+reg [`NUM_WIDTH-1:0] t;
+wire [`NUM_WIDTH-1:0] e;
+wire [`NUM_WIDTH-1:0] to_act;
+wire [`NUM_WIDTH-1:0] from_act;
+wire [`NUM_WIDTH-1:0] from_act_diff;
+
+neuron dut (
+ .clk,
+ .fp,
+ .fp_out,
+ .z,
+ .a,
+ .bp,
+ .bp_out,
+ .t,
+ .e,
+ .to_act,
+ .from_act,
+ .from_act_diff
+);
+// simple multiplications stand in for the activation function and its derivative
+assign from_act = to_act * 9;
+assign from_act_diff = to_act * 17;
+// NOTE(review): $monitor prints bits [35:20] of every value, which assumes NUM_WIDTH >= 36 -- confirm
+initial begin
+ clk <= 0;
+ fp <= 0;
+ bp <= 0;
+ $monitor("time %4t fp %1b fp_out %1b z %16b a %16b bp %1b bp_out %1b t %16b e %16b ta %16b fa %16b fad %16b", $time, fp, fp_out, z[35:20], a[35:20], bp, bp_out, t[35:20], e[35:20], to_act[35:20], from_act[35:20], from_act_diff[35:20]);
+ #5 clk<=1; #5 clk<=0; // idle clock; z and t have not been driven yet
+ z <= 3 << `FRAC_WIDTH;
+ #5 clk<=1; #5 clk<=0;
+ fp <= 1; // forward step: a captures from_act on the next rising edge
+ #5 clk<=1; #5 clk<=0;
+ fp <= 0; // z changes while fp is low, so a must hold its value
+ z <= 5 << `FRAC_WIDTH;
+ #5 clk<=1; #5 clk<=0;
+ t <= 7 << `FRAC_WIDTH;
+ #5 clk<=1; #5 clk<=0;
+ bp <= 1; // backward step: e captures the mul_sat_comb result of t and from_act_diff
+ #5 clk<=1; #5 clk<=0;
+ bp <= 0; // t changes while bp is low, so e must hold its value
+ t <= 9 << `FRAC_WIDTH;
+ #5 clk<=1; #5 clk<=0;
+ $finish;
+end
+
+endmodule
+
diff --git a/verilog/rtl/trainable_nn.v b/verilog/rtl/trainable_nn.v
new file mode 100644
index 0000000..367a04e
--- /dev/null
+++ b/verilog/rtl/trainable_nn.v
@@ -0,0 +1,70 @@
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: 2022 Tamas Hubai
+
+`default_nettype none
+
+module trainable_nn (  // Caravel user project top: neural_interface behind Wishbone and the logic analyzer
+`ifdef USE_POWER_PINS
+    inout vccd1,	// User area 1 1.8V supply
+    inout vssd1,	// User area 1 digital ground
+`endif
+
+    // Wishbone Slave ports (WB MI A)
+    input wb_clk_i,
+    input wb_rst_i,
+    input wbs_stb_i,
+    input wbs_cyc_i,
+    input wbs_we_i,
+    input [3:0] wbs_sel_i,     // not used by this module
+    input [31:0] wbs_dat_i,
+    input [31:0] wbs_adr_i,    // only bits [23:0] are used
+    output reg wbs_ack_o,      // assigned in the clocked block below, so it must be a variable, not a net
+    output [31:0] wbs_dat_o,
+
+    // Logic Analyzer Signals
+    input  [127:0] la_data_in,
+    output [127:0] la_data_out,
+    input  [127:0] la_oenb,
+
+    // IOs
+    input  [`MPRJ_IO_PADS-1:0] io_in,   // not used by this module
+    output [`MPRJ_IO_PADS-1:0] io_out,
+    output [`MPRJ_IO_PADS-1:0] io_oeb,
+
+    // IRQ
+    output [2:0] irq
+);
+    // the GPIO pads carry no data: both pad buses are tied to 0
+    assign io_out = {(`MPRJ_IO_PADS){1'b0}};
+    assign io_oeb = {(`MPRJ_IO_PADS){1'b0}};
+
+    assign irq = 3'b000;	// Unused
+    // clock and reset are taken from LA bits 0 / 1 whenever the matching oenb bit is low, else from Wishbone
+    wire clk = (~la_oenb[0]) ? la_data_in[0]: wb_clk_i;
+    wire rst = (~la_oenb[1]) ? la_data_in[1]: wb_rst_i;
+    // source selection: an active Wishbone cycle, or LA address / data ranges whose oenb bits are all low
+    wire use_wbs = wbs_cyc_i & wbs_stb_i;
+    wire use_la_addr = &(~la_oenb[31:8]);
+    wire use_la_data_in = &(~la_oenb[95:32]);
+    // the 32-bit Wishbone word occupies bits [43:12] of the 64-bit interface word, for writes and reads alike
+    wire [23:0] addr = use_la_addr ? la_data_in[31:8] : wbs_adr_i[23:0];
+    wire [63:0] data_in = (use_la_addr & use_la_data_in) ? la_data_in[95:32] : {20'b0, wbs_dat_i, 12'b0};
+    wire we = (use_la_addr & use_la_data_in) | (use_wbs & wbs_we_i);
+    wire [63:0] data_out;
+    assign la_data_out = {32'b0, data_out, 32'b0};
+    assign wbs_dat_o = data_out[43:12];
+
+    neural_interface i_ni (
+        .clk,
+        .addr,
+        .data_in,
+        .we,
+        .data_out
+    );
+
+    always @(posedge clk) begin  // acknowledge one clock after the request is seen, never while in reset
+        wbs_ack_o <= (~rst) & use_wbs;
+    end
+
+endmodule
+
diff --git a/verilog/rtl/uprj_netlists.v b/verilog/rtl/uprj_netlists.v
index 3537de8..06a0b3a 100644
--- a/verilog/rtl/uprj_netlists.v
+++ b/verilog/rtl/uprj_netlists.v
@@ -21,8 +21,8 @@
// Assume default net type to be wire because GL netlists don't have the wire definitions
`default_nettype wire
`include "gl/user_project_wrapper.v"
- `include "gl/user_proj_example.v"
+ `include "gl/trainable_nn.v"
`else
`include "user_project_wrapper.v"
- `include "user_proj_example.v"
+ `include "trainable_nn.v"
`endif
\ No newline at end of file
diff --git a/verilog/rtl/user_defines.v b/verilog/rtl/user_defines.v
index ee44b08..66bc63d 100644
--- a/verilog/rtl/user_defines.v
+++ b/verilog/rtl/user_defines.v
@@ -52,41 +52,41 @@
// up in a state that can be used immediately without depending on
// the management SoC to run a startup program to configure the GPIOs.
-`define USER_CONFIG_GPIO_5_INIT `GPIO_MODE_INVALID
-`define USER_CONFIG_GPIO_6_INIT `GPIO_MODE_INVALID
-`define USER_CONFIG_GPIO_7_INIT `GPIO_MODE_INVALID
-`define USER_CONFIG_GPIO_8_INIT `GPIO_MODE_INVALID
-`define USER_CONFIG_GPIO_9_INIT `GPIO_MODE_INVALID
-`define USER_CONFIG_GPIO_10_INIT `GPIO_MODE_INVALID
-`define USER_CONFIG_GPIO_11_INIT `GPIO_MODE_INVALID
-`define USER_CONFIG_GPIO_12_INIT `GPIO_MODE_INVALID
-`define USER_CONFIG_GPIO_13_INIT `GPIO_MODE_INVALID
+`define USER_CONFIG_GPIO_5_INIT `GPIO_MODE_MGMT_STD_OUTPUT
+`define USER_CONFIG_GPIO_6_INIT `GPIO_MODE_MGMT_STD_OUTPUT
+`define USER_CONFIG_GPIO_7_INIT `GPIO_MODE_MGMT_STD_OUTPUT
+`define USER_CONFIG_GPIO_8_INIT `GPIO_MODE_MGMT_STD_OUTPUT
+`define USER_CONFIG_GPIO_9_INIT `GPIO_MODE_MGMT_STD_OUTPUT
+`define USER_CONFIG_GPIO_10_INIT `GPIO_MODE_MGMT_STD_OUTPUT
+`define USER_CONFIG_GPIO_11_INIT `GPIO_MODE_MGMT_STD_OUTPUT
+`define USER_CONFIG_GPIO_12_INIT `GPIO_MODE_MGMT_STD_OUTPUT
+`define USER_CONFIG_GPIO_13_INIT `GPIO_MODE_MGMT_STD_OUTPUT
// Configurations of GPIO 14 to 24 are used on caravel but not caravan.
-`define USER_CONFIG_GPIO_14_INIT `GPIO_MODE_INVALID
-`define USER_CONFIG_GPIO_15_INIT `GPIO_MODE_INVALID
-`define USER_CONFIG_GPIO_16_INIT `GPIO_MODE_INVALID
-`define USER_CONFIG_GPIO_17_INIT `GPIO_MODE_INVALID
-`define USER_CONFIG_GPIO_18_INIT `GPIO_MODE_INVALID
-`define USER_CONFIG_GPIO_19_INIT `GPIO_MODE_INVALID
-`define USER_CONFIG_GPIO_20_INIT `GPIO_MODE_INVALID
-`define USER_CONFIG_GPIO_21_INIT `GPIO_MODE_INVALID
-`define USER_CONFIG_GPIO_22_INIT `GPIO_MODE_INVALID
-`define USER_CONFIG_GPIO_23_INIT `GPIO_MODE_INVALID
-`define USER_CONFIG_GPIO_24_INIT `GPIO_MODE_INVALID
+`define USER_CONFIG_GPIO_14_INIT `GPIO_MODE_MGMT_STD_OUTPUT
+`define USER_CONFIG_GPIO_15_INIT `GPIO_MODE_MGMT_STD_OUTPUT
+`define USER_CONFIG_GPIO_16_INIT `GPIO_MODE_MGMT_STD_OUTPUT
+`define USER_CONFIG_GPIO_17_INIT `GPIO_MODE_MGMT_STD_OUTPUT
+`define USER_CONFIG_GPIO_18_INIT `GPIO_MODE_MGMT_STD_OUTPUT
+`define USER_CONFIG_GPIO_19_INIT `GPIO_MODE_MGMT_STD_OUTPUT
+`define USER_CONFIG_GPIO_20_INIT `GPIO_MODE_MGMT_STD_OUTPUT
+`define USER_CONFIG_GPIO_21_INIT `GPIO_MODE_MGMT_STD_OUTPUT
+`define USER_CONFIG_GPIO_22_INIT `GPIO_MODE_MGMT_STD_OUTPUT
+`define USER_CONFIG_GPIO_23_INIT `GPIO_MODE_MGMT_STD_OUTPUT
+`define USER_CONFIG_GPIO_24_INIT `GPIO_MODE_MGMT_STD_OUTPUT
-`define USER_CONFIG_GPIO_25_INIT `GPIO_MODE_INVALID
-`define USER_CONFIG_GPIO_26_INIT `GPIO_MODE_INVALID
-`define USER_CONFIG_GPIO_27_INIT `GPIO_MODE_INVALID
-`define USER_CONFIG_GPIO_28_INIT `GPIO_MODE_INVALID
-`define USER_CONFIG_GPIO_29_INIT `GPIO_MODE_INVALID
-`define USER_CONFIG_GPIO_30_INIT `GPIO_MODE_INVALID
-`define USER_CONFIG_GPIO_31_INIT `GPIO_MODE_INVALID
-`define USER_CONFIG_GPIO_32_INIT `GPIO_MODE_INVALID
-`define USER_CONFIG_GPIO_33_INIT `GPIO_MODE_INVALID
-`define USER_CONFIG_GPIO_34_INIT `GPIO_MODE_INVALID
-`define USER_CONFIG_GPIO_35_INIT `GPIO_MODE_INVALID
-`define USER_CONFIG_GPIO_36_INIT `GPIO_MODE_INVALID
-`define USER_CONFIG_GPIO_37_INIT `GPIO_MODE_INVALID
+`define USER_CONFIG_GPIO_25_INIT `GPIO_MODE_MGMT_STD_OUTPUT
+`define USER_CONFIG_GPIO_26_INIT `GPIO_MODE_MGMT_STD_OUTPUT
+`define USER_CONFIG_GPIO_27_INIT `GPIO_MODE_MGMT_STD_OUTPUT
+`define USER_CONFIG_GPIO_28_INIT `GPIO_MODE_MGMT_STD_OUTPUT
+`define USER_CONFIG_GPIO_29_INIT `GPIO_MODE_MGMT_STD_OUTPUT
+`define USER_CONFIG_GPIO_30_INIT `GPIO_MODE_MGMT_STD_OUTPUT
+`define USER_CONFIG_GPIO_31_INIT `GPIO_MODE_MGMT_STD_OUTPUT
+`define USER_CONFIG_GPIO_32_INIT `GPIO_MODE_MGMT_STD_OUTPUT
+`define USER_CONFIG_GPIO_33_INIT `GPIO_MODE_MGMT_STD_OUTPUT
+`define USER_CONFIG_GPIO_34_INIT `GPIO_MODE_MGMT_STD_OUTPUT
+`define USER_CONFIG_GPIO_35_INIT `GPIO_MODE_MGMT_STD_OUTPUT
+`define USER_CONFIG_GPIO_36_INIT `GPIO_MODE_MGMT_STD_OUTPUT
+`define USER_CONFIG_GPIO_37_INIT `GPIO_MODE_MGMT_STD_OUTPUT
`endif // __USER_DEFINES_H
diff --git a/verilog/rtl/user_proj_example.v b/verilog/rtl/user_proj_example.v
deleted file mode 100644
index 26081e9..0000000
--- a/verilog/rtl/user_proj_example.v
+++ /dev/null
@@ -1,165 +0,0 @@
-// SPDX-FileCopyrightText: 2020 Efabless Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-// SPDX-License-Identifier: Apache-2.0
-
-`default_nettype none
-/*
- *-------------------------------------------------------------
- *
- * user_proj_example
- *
- * This is an example of a (trivially simple) user project,
- * showing how the user project can connect to the logic
- * analyzer, the wishbone bus, and the I/O pads.
- *
- * This project generates an integer count, which is output
- * on the user area GPIO pads (digital output only). The
- * wishbone connection allows the project to be controlled
- * (start and stop) from the management SoC program.
- *
- * See the testbenches in directory "mprj_counter" for the
- * example programs that drive this user project. The three
- * testbenches are "io_ports", "la_test1", and "la_test2".
- *
- *-------------------------------------------------------------
- */
-
-module user_proj_example #(
- parameter BITS = 32
-)(
-`ifdef USE_POWER_PINS
- inout vccd1, // User area 1 1.8V supply
- inout vssd1, // User area 1 digital ground
-`endif
-
- // Wishbone Slave ports (WB MI A)
- input wb_clk_i,
- input wb_rst_i,
- input wbs_stb_i,
- input wbs_cyc_i,
- input wbs_we_i,
- input [3:0] wbs_sel_i,
- input [31:0] wbs_dat_i,
- input [31:0] wbs_adr_i,
- output wbs_ack_o,
- output [31:0] wbs_dat_o,
-
- // Logic Analyzer Signals
- input [127:0] la_data_in,
- output [127:0] la_data_out,
- input [127:0] la_oenb,
-
- // IOs
- input [`MPRJ_IO_PADS-1:0] io_in,
- output [`MPRJ_IO_PADS-1:0] io_out,
- output [`MPRJ_IO_PADS-1:0] io_oeb,
-
- // IRQ
- output [2:0] irq
-);
- wire clk;
- wire rst;
-
- wire [`MPRJ_IO_PADS-1:0] io_in;
- wire [`MPRJ_IO_PADS-1:0] io_out;
- wire [`MPRJ_IO_PADS-1:0] io_oeb;
-
- wire [31:0] rdata;
- wire [31:0] wdata;
- wire [BITS-1:0] count;
-
- wire valid;
- wire [3:0] wstrb;
- wire [31:0] la_write;
-
- // WB MI A
- assign valid = wbs_cyc_i && wbs_stb_i;
- assign wstrb = wbs_sel_i & {4{wbs_we_i}};
- assign wbs_dat_o = rdata;
- assign wdata = wbs_dat_i;
-
- // IO
- assign io_out = count;
- assign io_oeb = {(`MPRJ_IO_PADS-1){rst}};
-
- // IRQ
- assign irq = 3'b000; // Unused
-
- // LA
- assign la_data_out = {{(127-BITS){1'b0}}, count};
- // Assuming LA probes [63:32] are for controlling the count register
- assign la_write = ~la_oenb[63:32] & ~{BITS{valid}};
- // Assuming LA probes [65:64] are for controlling the count clk & reset
- assign clk = (~la_oenb[64]) ? la_data_in[64]: wb_clk_i;
- assign rst = (~la_oenb[65]) ? la_data_in[65]: wb_rst_i;
-
- counter #(
- .BITS(BITS)
- ) counter(
- .clk(clk),
- .reset(rst),
- .ready(wbs_ack_o),
- .valid(valid),
- .rdata(rdata),
- .wdata(wbs_dat_i),
- .wstrb(wstrb),
- .la_write(la_write),
- .la_input(la_data_in[63:32]),
- .count(count)
- );
-
-endmodule
-
-module counter #(
- parameter BITS = 32
-)(
- input clk,
- input reset,
- input valid,
- input [3:0] wstrb,
- input [BITS-1:0] wdata,
- input [BITS-1:0] la_write,
- input [BITS-1:0] la_input,
- output ready,
- output [BITS-1:0] rdata,
- output [BITS-1:0] count
-);
- reg ready;
- reg [BITS-1:0] count;
- reg [BITS-1:0] rdata;
-
- always @(posedge clk) begin
- if (reset) begin
- count <= 0;
- ready <= 0;
- end else begin
- ready <= 1'b0;
- if (~|la_write) begin
- count <= count + 1;
- end
- if (valid && !ready) begin
- ready <= 1'b1;
- rdata <= count;
- if (wstrb[0]) count[7:0] <= wdata[7:0];
- if (wstrb[1]) count[15:8] <= wdata[15:8];
- if (wstrb[2]) count[23:16] <= wdata[23:16];
- if (wstrb[3]) count[31:24] <= wdata[31:24];
- end else if (|la_write) begin
- count <= la_write & la_input;
- end
- end
- end
-
-endmodule
-`default_nettype wire
diff --git a/verilog/rtl/user_project_wrapper.v b/verilog/rtl/user_project_wrapper.v
index 5ee1cee..a54beaa 100644
--- a/verilog/rtl/user_project_wrapper.v
+++ b/verilog/rtl/user_project_wrapper.v
@@ -82,7 +82,7 @@
/* User project is instantiated here */
/*--------------------------------------*/
-user_proj_example mprj (
+trainable_nn mprj (
`ifdef USE_POWER_PINS
.vccd1(vccd1), // User area 1 1.8V power
.vssd1(vssd1), // User area 1 digital ground