// SPDX-FileCopyrightText: 2020 Efabless Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// SPDX-License-Identifier: Apache-2.0

`default_nettype none
/*
 *-------------------------------------------------------------
 *
 * chaos_automaton
 *
 * This chip is a pure asynchronous cellular automaton.  Each cell has
 * four inputs from N, S, E, W and generates four outputs to N, S, E, W.
 * Each output can be configured for any boolean function of the four
 * inputs (16 bits each).
 * 
 * Outputs on the periphery (or some selection thereof) are passed to the
 * chip GPIO.  Inputs may also come from the chip periphery;  choice of
 * input or output is programmable like the cell boolean function.
 * 
 * All periphery inputs and outputs may be channeled through the logic
 * analyzer to apply input to or monitor output from the array.
 * 
 * The wishbone bus may be used to program the cell functions.
 * 
 * This can be used in a loop with an evolutionary algorithm to tune the
 * chip functions to achieve a specific behavior.
 * 
 * Most of the core circuitry is straightforward.  The total number of
 * cells is parameterized, so that the largest number of cells that will
 * fit in the caravel user project space can be determined.
 *
 * Version v1:  To avoid massive amounts of wiring (e.g., 16 or 32
 * data wires + 10 address wires to every single cell), all of the
 * LUT configuration memory is stored in a (very long) serial chain
 * in a full loop.  The scan chain is 64 bits longer than the number
 * of cells and allows 64 bits to be transferred to and from the
 * wishbone bus independently of the cells.  Every cell has 64 latches
 * in addition to the 64 flops so that the scan chain can be cycled
 * without affecting ongoing operation of the automaton.
 *
 * Version v2:  The logic analyzer is replaced by a local version that
 * has the same number of bits as periphery I/O.  There are two registers
 * per signal, one for output, and one for input.  All registers update
 * simultaneously.  Every periphery input is connected to three sources,
 * XOR'd together:  A periphery output, a GPIO input, and a register.
 * Every periphery output is connected to three sinks:  A periphery
 * input, a GPIO output, and a register.  The periphery output-to-input
 * connections can be a loop-back or neighbor loop-back.
 *
 * Memory mapped address space:
 *
 *	BASE_ADR + 7 to BASE_ADR + 0:   Configuration data to read or write
 *	BASE_ADR + 11 to BASE_ADR + 8:	Core cell address for read/write
 *	BASE_ADR + 12:			Triggers
 *	BASE_ADR + 17 to BASE_ADR + 16: Per-side input configuration
 *	BASE_ADR + 18:			GPIO input and output slice selection
 *	BASE_ADR + 19:			GPIO direction
 *	BASE_ADR + ?? to BASE_ADR + 20: Operational data
 *	(BASE_ADR + 39 for 50x30 array)
 *
 * Trigger bits:
 *	bit 0:  Shift by (address) cells (64 bits).
 *	bit 1:  Finish cycle.  Return shift register to run state, toggle "hold"
 *
 * (to be done:)
 *	bit 2:  Capture data
 *	bit 3:  Apply data
 *
 * All trigger bits are self-resetting.  The trigger bit (as read) remains
 * high until the transfer has completed.  The trigger bit can be polled to
 * determine when the cycle has completed.
 *
 * The shift cycle bit can be used to load the configuration of the array
 * cell by cell.  The typical case is to set address = 1 and apply or read
 * each cell's configuration in turn.  However, it can also be used piecemeal,
 * for example, to read out a block of configurations, without having
 * to loop a full cycle for each one.  The counter tracks what the
 * current offset is, and can return to the run-state position on
 * application of bit 1, "Finish cycle".  At the end of "Finish cycle"
 * the hold bit is toggled to latch and apply any new configuration
 * data.
 *
 * Reading and writing a single cell's configuration can be accomplished
 * by a sequence of shift cycles and reads/writes.  To change the
 * configuration of a single cell:  (1) Write the cell address, (2) Apply
 * the shift cycle, (3) Write the configuration data, (4) Apply the
 * finish cycle.  To read the configuration of a single cell:  (1) Write
 * the cell address, (2) Apply the shift cycle, (3) Read the configuration
 * data, (4) Apply the finish cycle.
 *
 *
 * This version uses the chaos_subarray, which is intended to be
 * prehardened as a macro and tiled in the top level.
 *-------------------------------------------------------------
 */

// NOTE:  Uncomment the following lines for syntax checking
// `define MPRJ_IO_PADS 38
// `include "chaos_subarray.v"

/*
 *-----------------------------------------------------------------
 * User project top level 
 *-----------------------------------------------------------------
 */

/*
 * chaos_automaton:  User project top level.  Wraps a chaos_array instance
 * with a wishbone slave interface (configuration, periphery data, GPIO
 * direction and routing registers) and connects the array periphery to
 * the user GPIO pads.
 */
module chaos_automaton #(
    parameter XSIZE = 30,	// Total number of cells left to right
    parameter YSIZE = 50,	// Total number of cells top to bottom
    parameter XTOP = 3,		// Number of sub-arrays left to right
    parameter YTOP = 5,		// Number of sub-arrays top to bottom
    parameter ASIZE = 11,	// Width of the cell address registers;
				// enough bits to count XSIZE * YSIZE
    parameter BASE_ADR = 32'h 3000_0000 // Wishbone base address; only
				// bits [31:8] are compared on access
)(
`ifdef USE_POWER_PINS
    inout vdda1,	// User area 1 3.3V supply
    inout vdda2,	// User area 2 3.3V supply
    inout vssa1,	// User area 1 analog ground
    inout vssa2,	// User area 2 analog ground
    inout vccd1,	// User area 1 1.8V supply
    inout vccd2,	// User area 2 1.8v supply
    inout vssd1,	// User area 1 digital ground
    inout vssd2,	// User area 2 digital ground
`endif

    // Wishbone Slave ports (WB MI A)
    input wb_clk_i,		// Bus clock; also clocks the transfer state machine
    input wb_rst_i,		// Asynchronous active-high reset
    input wbs_stb_i,		// Strobe; access is valid when stb and cyc are both high
    input wbs_cyc_i,		// Cycle
    input wbs_we_i,		// Write enable (qualifies wbs_sel_i)
    input [3:0] wbs_sel_i,	// Byte lane selects
    input [31:0] wbs_dat_i,	// Write data
    input [31:0] wbs_adr_i,	// Address; bits [7:2] select the register word
    output wbs_ack_o,		// Acknowledge (registered, one cycle per access)
    output [31:0] wbs_dat_o,	// Read data (registered)

    // Logic Analyzer Signals (unused)
    input  [127:0] la_data_in,
    output [127:0] la_data_out,
    input  [127:0] la_oenb,

    // IOs
    input  [`MPRJ_IO_PADS-1:0] io_in,	// Pad inputs
    output [`MPRJ_IO_PADS-1:0] io_out,	// Pad outputs
    output [`MPRJ_IO_PADS-1:0] io_oeb,	// Pad output enables (active low)

    // IRQ
    output [2:0] irq		// Interrupt requests (held at zero)
);

// Transfer state machine encodings (xfer_state)
`define IDLE	3'b000		/* Waiting for a trigger bit */
`define START	3'b001		/* Load shift count from cell_addr */
`define FINISH	3'b010		/* Load shift count to return to run position */
`define XDATAS	3'b011		/* Shifting after START */
`define XDATAF	3'b100		/* Shifting after FINISH */
`define LOAD	3'b101		/* Pulse "hold" low to latch the configuration */

// Wishbone register word offsets.  These are compared against
// wbs_adr_i[7:2], so the byte offset from BASE_ADR is four times the
// value given here.  Every register must have a unique offset;  the
// address decoder is a priority chain, so two registers sharing an
// offset would leave the later one unreachable.
`define CONFIGL	8'h00		/* Address offset of configuration data low word */
`define CONFIGH	8'h01		/* Address offset of configuration data high word */
`define ADDRESS	8'h02		/* Address offset of cell address value */
`define XFER	8'h03		/* Address offset of transfer bits */
`define DIRECT  8'h04		/* Address offset of GPIO directions */
`define DATATOP	8'h05		/* Address offset of start of data section */
				/* (five words, offsets 8'h05 to 8'h09) */
`define SOURCE  8'h0a		/* Address offset of GPIO source selection; */
				/* placed in the first free word after the */
				/* data section so that DIRECT and DATATOP */
				/* keep their addresses */

`define MAXADDR (XSIZE * YSIZE)	/* Highest cell address plus one */

    // Configuration shift-register control
    reg clk;			/* serial clock to transfer data 	*/
    reg hold;			/* trigger to hold transferred data 	*/
    reg [2:0] xfer_state;	/* state of the data transfer		*/
    reg [1:0] xfer_ctrl;	/* Configuration transfer trigger bits	*/
    reg [63:0] config_data;	/* 64 bits to read or write configuration */

    reg [ASIZE - 1:0] cell_addr;	/* Core cell to address	*/
    reg [ASIZE - 1:0] cell_offset;	/* Current offset of shift register */
    reg [ASIZE + 6:0] bit_count;	/* Full count (cell address + bits) */

    // NOTE:  io_in, io_out and io_oeb are fully declared in the ANSI-style
    // port list and are therefore not declared again here.

    // Wishbone register select strobes (decoded from wbs_adr_i[7:2])
    wire [1:0] config_sel;
    wire address_sel;
    wire xfer_sel;
    wire direct_sel;
    wire source_sel;

    // NOTE:  This should be parameterized.
    // For the 50x30 array, there are 50+50+30+30 = 160 periphery bits =
    // 5 words of 32 bits.  This is hard-coded for convenience.  If the
    // array size changes, this needs to be changed as well.  Needs to be
    // converted to a "generate" block.
    wire [4:0] data_sel;

    // Wishbone handshake and data path
    wire valid;			/* cyc and stb both asserted		*/
    reg ready;			/* registered acknowledge		*/
    wire [3:0] iomem_we;	/* per-byte write enables		*/
    wire [1:0] busy;		/* transfer-in-progress status bits	*/
    reg [31:0] rdata_pre;	/* read mux output, before registering	*/
    wire [63:0] rdata;		/* configuration data read from the array */
    reg [31:0] wbs_dat_o;	/* registered read data			*/
    reg [63:0] wdata;		/* configuration data written to the array */
    reg write;			/* one-cycle strobe after a config write */

    // Direction for each GPIO (32 used)
    reg [31:0] gpio_oeb;

    // Data to and from array periphery I/O
    wire [YSIZE-1: 0] data_in_east;
    wire [YSIZE-1: 0] data_in_west;
    wire [XSIZE-1: 0] data_in_north;
    wire [XSIZE-1: 0] data_in_south;

    wire [YSIZE-1: 0] data_out_east;
    wire [YSIZE-1: 0] data_out_west;
    wire [XSIZE-1: 0] data_out_north;
    wire [XSIZE-1: 0] data_out_south;

    // Latched output for wishbone read-back (to be done)
    // TBD

    // Latched input from wishbone (to do:  Make shadow register)
    wire [YSIZE-1: 0] latched_in_east;
    wire [YSIZE-1: 0] latched_in_west;
    wire [XSIZE-1: 0] latched_in_north;
    wire [XSIZE-1: 0] latched_in_south;

    // Shadow registers for wishbone input (to be done)
    // TBD

    // Register array mapping latched data to 32-bit sections for data
    // transfer through the wishbone
    reg [XSIZE*2 + YSIZE*2 - 1:0] latched_in;

    // Wire array mapping output data to 32-bit sections for data
    // transfer through the wishbone
    wire [XSIZE*2 + YSIZE*2 - 1:0] data_out;

    // Periphery output-to-input loop-back selection
    reg [2:0] north_loopback;
    reg [2:0] east_loopback;
    reg [2:0] south_loopback;
    reg [2:0] west_loopback;

// Loopback value definitions

`define INPUT_LOW	3'b000
`define INPUT_HIGH	3'b001
`define LOOPBACK	3'b010
`define NEIGHBOR_LEFT	3'b011
`define NEIGHBOR_RIGHT  3'b100

    // GPIO slicing (because there are many fewer GPIO than array outputs)
    // GPIOs can be clustered on either end or in the center of the array
    // side, or distributed along the side (1 GPIO per 5 array cells)
    reg [1:0] gpio_output_slice;
    reg [1:0] gpio_input_slice;

    // Registered GPIO directions go directly to io_oeb[37:6].  The lower
    // 6 GPIO are left to the management processor:  their output enables
    // are held inactive (oeb = 1) and their outputs are tied low.  The
    // constant is written out in full because 6'b1 is 6'b000001, which
    // would enable the output drivers of pads 5 to 1.
    assign io_oeb = {gpio_oeb, 6'b111111};
    assign io_out[5:0] = 6'b000000;

    // The logic analyzer is not used;  tie off its outputs so that they
    // are not left floating.
    assign la_data_out = 128'b0;

    // Wishbone address select indicators.  The register word is chosen
    // by address bits [7:2].
    wire [5:0] wb_word_adr = wbs_adr_i[7:2];

    assign config_sel[0] = (wb_word_adr == `CONFIGL);
    assign config_sel[1] = (wb_word_adr == `CONFIGH);
    assign address_sel = (wb_word_adr == `ADDRESS);
    assign xfer_sel = (wb_word_adr == `XFER);
    assign direct_sel = (wb_word_adr == `DIRECT);
    assign source_sel = (wb_word_adr == `SOURCE);

    // Hard-coded to 5 words;  see the note at the declaration of data_sel
    assign data_sel[0] = (wb_word_adr == (`DATATOP + 0));
    assign data_sel[1] = (wb_word_adr == (`DATATOP + 1));
    assign data_sel[2] = (wb_word_adr == (`DATATOP + 2));
    assign data_sel[3] = (wb_word_adr == (`DATATOP + 3));
    assign data_sel[4] = (wb_word_adr == (`DATATOP + 4));

    // Bus handshake:  an access is valid when both cyc and stb are high;
    // the acknowledge is the registered "ready" flag, and a byte lane is
    // written only when its select bit and the write enable are both set.
    assign valid = wbs_cyc_i && wbs_stb_i;
    assign wbs_ack_o = ready;
    assign iomem_we = wbs_sel_i & {4{wbs_we_i}};

    // IRQ
    assign irq = 3'b000;	// Unused

    // The cell array itself.  All connections are made by name:  the
    // serial configuration interface first, then the periphery inputs
    // and outputs for each of the four sides.

    chaos_array #(
	.BASE_ADR(BASE_ADR),
	.XSIZE(XSIZE),
	.YSIZE(YSIZE),
	.XTOP(XTOP),
	.YTOP(YTOP)
    ) chaos_array_inst (
`ifdef USE_POWER_PINS
	.vccd1(vccd1),
	.vssd1(vssd1),
`endif
	// Serial configuration chain
	.reset(wb_rst_i),
	.clk(clk),
	.hold(hold),
	.write(write),
	.wdata(wdata),
	.rdata(rdata),

	// Periphery inputs
	.data_in_north(data_in_north),
	.data_in_east(data_in_east),
	.data_in_south(data_in_south),
	.data_in_west(data_in_west),

	// Periphery outputs
	.data_out_north(data_out_north),
	.data_out_east(data_out_east),
	.data_out_south(data_out_south),
	.data_out_west(data_out_west)
    );

    // GPIO pad values as presented to each side of the array
    // (one bit per periphery cell)
    wire [YSIZE-1:0] gpio_east;
    wire [YSIZE-1:0] gpio_west;
    wire [XSIZE-1:0] gpio_north;
    wire [XSIZE-1:0] gpio_south;

    // Array outputs after the loop-back selection for each side
    wire [YSIZE-1:0] data_muxed_east;
    wire [YSIZE-1:0] data_muxed_west;
    wire [XSIZE-1:0] data_muxed_north;
    wire [XSIZE-1:0] data_muxed_south;

    // Each array input is the XOR of three sources:
    //   (1) the looped-back array outputs (data_muxed_*),
    //   (2) the GPIO pads (gpio_*), and
    //   (3) the value written over the wishbone bus (latched_in_*).

    assign data_in_north = data_muxed_north ^ gpio_north ^ latched_in_north;
    assign data_in_east  = data_muxed_east  ^ gpio_east  ^ latched_in_east;
    assign data_in_south = data_muxed_south ^ gpio_south ^ latched_in_south;
    assign data_in_west  = data_muxed_west  ^ gpio_west  ^ latched_in_west;

// Loop-back selection for each side of the array.  The selection codes
// (`INPUT_LOW, `INPUT_HIGH, `LOOPBACK, `NEIGHBOR_LEFT, `NEIGHBOR_RIGHT)
// are the macros defined with the *_loopback registers.

    // Per-side loop-back inputs:
    //   NEIGHBOR_LEFT:   each input takes the output one index below it
    //			  (index 0 receives zero)
    //   NEIGHBOR_RIGHT:  each input takes the output one index above it
    //			  (the top index receives zero)
    //   LOOPBACK:	  each input takes its own output
    //   INPUT_HIGH:	  every input on the side is driven high
    //   anything else:   every input on the side is driven low
    // The constant cases use replications sized to the side so that all
    // bits are driven (an unsized 'b1 would only set bit 0).

    assign data_muxed_west =
	(west_loopback == `NEIGHBOR_LEFT) ? {data_out_west[YSIZE-2:0], 1'b0} :
	(west_loopback == `NEIGHBOR_RIGHT) ? {1'b0, data_out_west[YSIZE-1:1]} :
	(west_loopback == `LOOPBACK) ?  data_out_west :
	(west_loopback == `INPUT_HIGH) ? {YSIZE{1'b1}} : {YSIZE{1'b0}};

    assign data_muxed_east =
	(east_loopback == `NEIGHBOR_LEFT) ? {data_out_east[YSIZE-2:0], 1'b0} :
	(east_loopback == `NEIGHBOR_RIGHT) ? {1'b0, data_out_east[YSIZE-1:1]} :
	(east_loopback == `LOOPBACK) ?  data_out_east :
	(east_loopback == `INPUT_HIGH) ? {YSIZE{1'b1}} : {YSIZE{1'b0}};

    assign data_muxed_south =
	(south_loopback == `NEIGHBOR_LEFT) ? {data_out_south[XSIZE-2:0], 1'b0} :
	(south_loopback == `NEIGHBOR_RIGHT) ?  {1'b0, data_out_south[XSIZE-1:1]} :
	(south_loopback == `LOOPBACK) ? data_out_south :
	(south_loopback == `INPUT_HIGH) ? {XSIZE{1'b1}} : {XSIZE{1'b0}};

    // The final term tests north_loopback (not south_loopback) so that
    // the north side honors its own INPUT_HIGH setting.
    assign data_muxed_north =
	(north_loopback == `NEIGHBOR_LEFT) ? {data_out_north[XSIZE-2:0], 1'b0} :
	(north_loopback == `NEIGHBOR_RIGHT) ?  {1'b0, data_out_north[XSIZE-1:1]} :
	(north_loopback == `LOOPBACK) ? data_out_north :
	(north_loopback == `INPUT_HIGH) ? {XSIZE{1'b1}} : {XSIZE{1'b0}};

    // Define I/O input slices
    // NOTE:  This is hard-coded.  There are 38 GPIOs.  Assigning 32 of them
    // (GPIO 37 to 6) to array inputs and outputs.  These are arranged as
    // 10 on the sides and 6 on the top and bottom.  These are further sub-
    // divided into 5 inputs and 5 outputs on the sides, and 3 inputs and
    // 3 outputs on top and bottom.  Depending on the selection, these
    // can be injected into various places around the array.

    // Another note:  It probably makes more sense to define vectors for
    // io_in_east, io_in_north, etc., and align them in the direction of
    // the arrays (high to low index is top to bottom, or right to left).

    // Every concatenation below is written at the full width of its side
    // (50 bits for east/west, 30 bits for north/south, matching the
    // default YSIZE and XSIZE) so that the pads land on the intended
    // cells.  gpio_input_slice selects the placement:
    //   0: distributed (one pad every fifth cell, starting at cell 2)
    //   1: packed at the low-index end of the side
    //   2: centered
    //   3: packed at the high-index end of the side

    assign gpio_east = 	// I/O 15 to 6
	(gpio_input_slice == 0) ?	// Distributed
		{2'b0, io_in[15], 4'b0, io_in[14], 4'b0, io_in[13],
		 4'b0, io_in[12], 4'b0, io_in[11], 4'b0, io_in[10],
		 4'b0, io_in[9],  4'b0, io_in[8],  4'b0, io_in[7],
		 4'b0, io_in[6],  2'b0} :
	(gpio_input_slice == 1) ? {40'b0, io_in[15:6]} :	// Bottom shifted
	(gpio_input_slice == 2) ? {20'b0, io_in[15:6], 20'b0} : // Centered
	{io_in[15:6], 40'b0};					// Top shifted

    assign gpio_north = 	// I/O 21 to 16
	(gpio_input_slice == 0) ?	// Distributed
		{2'b0, io_in[16], 4'b0, io_in[17], 4'b0, io_in[18],
		 4'b0, io_in[19], 4'b0, io_in[20], 4'b0, io_in[21], 2'b0} :
	(gpio_input_slice == 1) ?	// Right shifted (cells 5 to 0)
		{24'b0, io_in[16], io_in[17], io_in[18], io_in[19],
		io_in[20], io_in[21]} :
	(gpio_input_slice == 2) ?	// Centered (cells 17 to 12)
		{12'b0, io_in[16], io_in[17], io_in[18], io_in[19],
		io_in[20], io_in[21], 12'b0} :
	{io_in[16], io_in[17], io_in[18], io_in[19], io_in[20],
		io_in[21], 24'b0};	// Left shifted (cells 29 to 24)

    assign gpio_west = 	// I/O 22 to 31
	(gpio_input_slice == 0) ?	// Distributed
		{2'b0, io_in[22], 4'b0, io_in[23], 4'b0, io_in[24],
		 4'b0, io_in[25], 4'b0, io_in[26], 4'b0, io_in[27],
		 4'b0, io_in[28], 4'b0, io_in[29], 4'b0, io_in[30],
		 4'b0, io_in[31],  2'b0} :
	(gpio_input_slice == 1) ?	// Bottom shifted
		{40'b0, io_in[22], io_in[23], io_in[24], io_in[25],
		io_in[26], io_in[27], io_in[28], io_in[29], io_in[30],
		io_in[31]} :
	(gpio_input_slice == 2) ?	// Centered
		{20'b0, io_in[22], io_in[23], io_in[24], io_in[25],
		io_in[26], io_in[27], io_in[28], io_in[29], io_in[30],
		io_in[31], 20'b0} :
	{io_in[22], io_in[23], io_in[24], io_in[25], io_in[26],
		io_in[27], io_in[28], io_in[29], io_in[30], io_in[31],
		40'b0};					// Top shifted

    assign gpio_south = 	// I/O 32 to 37
	(gpio_input_slice == 0) ?	// Distributed
		{2'b0, io_in[37], 4'b0, io_in[36], 4'b0, io_in[35],
		 4'b0, io_in[34], 4'b0, io_in[33], 4'b0, io_in[32], 2'b0} :
	(gpio_input_slice == 1) ? {24'b0, io_in[37:32]} :	// Right shifted
	(gpio_input_slice == 2) ? {12'b0, io_in[37:32], 12'b0} : // Centered
	{io_in[37:32], 24'b0};					// Left shifted

    // GPIO outputs.  Each side drives its group of pads from one of four
    // selections of that side's array outputs, chosen by gpio_output_slice:
    //   0: distributed (every fifth cell, cells 2, 7, 12, ...)
    //   1: a contiguous group near the center of the side
    //   2: the contiguous group at the high-index end of the side
    //   3: the contiguous group at the low-index end of the side
    // Every selection for a side reads only that side's outputs.  The
    // cell indices are hard-coded for the 50x30 array.

    // East side:  io_out[6] is the lowest-numbered cell of the group,
    // io_out[15] the highest.
    assign io_out[15:6] =
	(gpio_output_slice == 0) ?				// Distributed
		{data_out_east[47], data_out_east[42], data_out_east[37],
		 data_out_east[32], data_out_east[27], data_out_east[22],
		 data_out_east[17], data_out_east[12], data_out_east[7],
		 data_out_east[2]} :
	(gpio_output_slice == 1) ? data_out_east[29:20] :	// Center
	(gpio_output_slice == 2) ? data_out_east[49:40] :	// Top
	data_out_east[9:0];					// Bottom

    // North side:  pad order is reversed with respect to the cell index;
    // io_out[16] is the highest-numbered cell of the group, io_out[21]
    // the lowest.
    assign io_out[21:16] =
	(gpio_output_slice == 0) ?				// Distributed
		{data_out_north[2],  data_out_north[7],  data_out_north[12],
		 data_out_north[17], data_out_north[22], data_out_north[27]} :
	(gpio_output_slice == 1) ?				// Center
		{data_out_north[11], data_out_north[12], data_out_north[13],
		 data_out_north[14], data_out_north[15], data_out_north[16]} :
	(gpio_output_slice == 2) ?				// Right
		{data_out_north[24], data_out_north[25], data_out_north[26],
		 data_out_north[27], data_out_north[28], data_out_north[29]} :
		{data_out_north[0],  data_out_north[1],  data_out_north[2],
		 data_out_north[3],  data_out_north[4],  data_out_north[5]};
								// Left

    // West side:  pad order is reversed with respect to the cell index;
    // io_out[22] is the highest-numbered cell of the group, io_out[31]
    // the lowest.
    assign io_out[31:22] =
	(gpio_output_slice == 0) ?				// Distributed
		{data_out_west[2],  data_out_west[7],  data_out_west[12],
		 data_out_west[17], data_out_west[22], data_out_west[27],
		 data_out_west[32], data_out_west[37], data_out_west[42],
		 data_out_west[47]} :
	(gpio_output_slice == 1) ?				// Center
		{data_out_west[20], data_out_west[21], data_out_west[22],
		 data_out_west[23], data_out_west[24], data_out_west[25],
		 data_out_west[26], data_out_west[27], data_out_west[28],
		 data_out_west[29]} :
	(gpio_output_slice == 2) ?				// Top
		{data_out_west[40], data_out_west[41], data_out_west[42],
		 data_out_west[43], data_out_west[44], data_out_west[45],
		 data_out_west[46], data_out_west[47], data_out_west[48],
		 data_out_west[49]} :
		{data_out_west[0],  data_out_west[1],  data_out_west[2],
		 data_out_west[3],  data_out_west[4],  data_out_west[5],
		 data_out_west[6],  data_out_west[7],  data_out_west[8],
		 data_out_west[9]};				// Bottom

    // South side:  io_out[32] is the lowest-numbered cell of the group,
    // io_out[37] the highest.
    assign io_out[37:32] =
	(gpio_output_slice == 0) ?				// Distributed
		{data_out_south[27], data_out_south[22], data_out_south[17],
		 data_out_south[12], data_out_south[7],  data_out_south[2]} :
	(gpio_output_slice == 1) ? data_out_south[16:11] :	// Center
	(gpio_output_slice == 2) ? data_out_south[29:24] :	// Right
	data_out_south[5:0];					// Left

    // Pack the four sides' outputs into one vector that the wishbone read
    // path slices into 32-bit words.  From the least significant bit
    // upward the order is west, south, east, north.

    assign data_out[YSIZE - 1 : 0] = data_out_west;
    assign data_out[YSIZE + XSIZE - 1 : YSIZE] = data_out_south;
    assign data_out[2*YSIZE + XSIZE - 1 : YSIZE + XSIZE] = data_out_east;
    assign data_out[2*YSIZE + 2*XSIZE - 1 : 2*YSIZE + XSIZE] = data_out_north;

    /* Read multiplexer.  Selects the 32-bit word returned for the	*/
    /* addressed register.  The case items are tested in order, so an	*/
    /* earlier select takes priority over a later one;  when no select	*/
    /* is active the result is zero.					*/

    always @* begin
	case (1'b1)
	    /* Transfer status (busy flags) */
	    xfer_sel:		rdata_pre = {30'b0, busy};

	    /* Configuration data as read back from the array */
	    config_sel[0]:	rdata_pre = rdata[31:0];
	    config_sel[1]:	rdata_pre = rdata[63:32];

	    /* When ADDRESS is selected, pass back the existing cell	*/
	    /* count rather than what was written into cell_addr.	*/
	    address_sel:	rdata_pre = bit_count[ASIZE + 6: 7];

	    /* GPIO directions */
	    direct_sel:		rdata_pre = gpio_oeb;

	    /* GPIO slice selection and per-side loop-back settings */
	    source_sel:		rdata_pre = {10'b0, gpio_output_slice,
					2'b0, gpio_input_slice,
					1'b0, north_loopback,
					1'b0, east_loopback,
					1'b0, south_loopback,
					1'b0, west_loopback};

	    /* Array periphery outputs, 32 bits at a time */
	    data_sel[0]:	rdata_pre = data_out[31:0];
	    data_sel[1]:	rdata_pre = data_out[63:32];
	    data_sel[2]:	rdata_pre = data_out[95:64];
	    data_sel[3]:	rdata_pre = data_out[127:96];
	    data_sel[4]:	rdata_pre = data_out[159:128];

	    default:		rdata_pre = 'b0;
	endcase
    end

    /* Bus acknowledge and registered read data.  An access addressed	*/
    /* to this block (upper 24 address bits equal to those of BASE_ADR)	*/
    /* is acknowledged for exactly one cycle, and the read mux output	*/
    /* is captured at the same time.  "ready" is part of the condition	*/
    /* so that the acknowledge drops again on the following cycle.	*/

    always @(posedge wb_clk_i or posedge wb_rst_i) begin
	if (wb_rst_i) begin
	    ready <= 0;
	    wbs_dat_o <= 0;
	end else if (valid && !ready && wbs_adr_i[31:8] == BASE_ADR[31:8]) begin
	    ready <= 1'b1;
	    wbs_dat_o <= rdata_pre;
	end else begin
	    ready <= 0;
	end
    end

    // Split the wishbone-written vector (latched_in) into one field per
    // side.  The layout mirrors the packing of data_out:  from the least
    // significant bit upward the fields are west (YSIZE bits), south
    // (XSIZE bits), east (YSIZE bits) and north (XSIZE bits).  The fields
    // must not overlap, so the north field starts immediately above the
    // east field at bit 2*YSIZE + XSIZE.

    assign latched_in_north = latched_in[2*XSIZE+2*YSIZE-1:2*YSIZE+XSIZE];
    assign latched_in_east = latched_in[2*YSIZE+XSIZE-1:YSIZE+XSIZE];
    assign latched_in_south = latched_in[YSIZE+XSIZE-1:YSIZE];
    assign latched_in_west = latched_in[YSIZE-1:0];

    /* Write data */

    // Byte-lane merge for the cell address register.  The address may be
    // wider than one byte (XSIZE * YSIZE exceeds 256 for the default
    // array), so both of the low two byte lanes are honored.  The merge
    // is done on a fixed 16-bit value and then assigned to cell_addr, so
    // it is legal for any ASIZE;  address bits above bit 15 are not
    // writable.
    wire [15:0] cell_addr_cur = cell_addr;
    wire [15:0] cell_addr_wr = {
	iomem_we[1] ? wbs_dat_i[15:8] : cell_addr_cur[15:8],
	iomem_we[0] ? wbs_dat_i[7:0]  : cell_addr_cur[7:0]};

    // Wishbone-writable registers.  A write is taken in the first cycle
    // of a valid access to this block (before "ready" is raised).
    //
    // SOURCE register layout (identical to the read-back layout):
    //   [2:0]   west_loopback	    [6:4]   south_loopback
    //   [10:8]  east_loopback	    [14:12] north_loopback
    //   [17:16] gpio_input_slice   [21:20] gpio_output_slice

    always @(posedge wb_clk_i or posedge wb_rst_i) begin
        if (wb_rst_i) begin
            xfer_ctrl <= 0;
	    wdata <= 0;
	    write <= 1'b0;
	    // Put every configuration register in a defined state.  All
	    // user GPIO start as inputs (oeb high), nothing is injected
	    // at the array periphery, and loop-back is off.
	    cell_addr <= 0;
	    gpio_oeb <= 32'hffff_ffff;
	    west_loopback <= `INPUT_LOW;
	    south_loopback <= `INPUT_LOW;
	    east_loopback <= `INPUT_LOW;
	    north_loopback <= `INPUT_LOW;
	    gpio_input_slice <= 2'b00;
	    gpio_output_slice <= 2'b00;
	    latched_in <= 0;
        end else begin
	    // "write" is a one-cycle strobe raised by a configuration
	    // data write
	    write <= 1'b0;
            if (valid && !ready && wbs_adr_i[31:8] == BASE_ADR[31:8]) begin
                if (xfer_sel) begin
                    if (iomem_we[0]) xfer_ctrl <= wbs_dat_i[1:0];
		end else if (config_sel[0]) begin
                    if (iomem_we[0]) wdata[7:0] <= wbs_dat_i[7:0];
                    if (iomem_we[1]) wdata[15:8] <= wbs_dat_i[15:8];
                    if (iomem_we[2]) wdata[23:16] <= wbs_dat_i[23:16];
                    if (iomem_we[3]) wdata[31:24] <= wbs_dat_i[31:24];
		    if (|iomem_we) write <= 1'b1;
		end else if (config_sel[1]) begin
                    if (iomem_we[0]) wdata[39:32] <= wbs_dat_i[7:0];
                    if (iomem_we[1]) wdata[47:40] <= wbs_dat_i[15:8];
                    if (iomem_we[2]) wdata[55:48] <= wbs_dat_i[23:16];
                    if (iomem_we[3]) wdata[63:56] <= wbs_dat_i[31:24];
		    if (|iomem_we) write <= 1'b1;
		end else if (address_sel) begin
		    /* Lanes 0 and 1 are merged in cell_addr_wr */
		    if (|iomem_we[1:0]) cell_addr <= cell_addr_wr;
		end else if (direct_sel) begin
                    if (iomem_we[0]) gpio_oeb[7:0] <= wbs_dat_i[7:0];
                    if (iomem_we[1]) gpio_oeb[15:8] <= wbs_dat_i[15:8];
                    if (iomem_we[2]) gpio_oeb[23:16] <= wbs_dat_i[23:16];
                    if (iomem_we[3]) gpio_oeb[31:24] <= wbs_dat_i[31:24];
		end else if (source_sel) begin
		    /* Each field is taken from its own byte lane */
                    if (iomem_we[0]) begin
			 west_loopback <= wbs_dat_i[2:0];
			 south_loopback <= wbs_dat_i[6:4];
		    end
                    if (iomem_we[1]) begin
			 east_loopback <= wbs_dat_i[10:8];
			 north_loopback <= wbs_dat_i[14:12];
		    end
                    if (iomem_we[2]) begin
			 gpio_input_slice <= wbs_dat_i[17:16];
			 gpio_output_slice <= wbs_dat_i[21:20];
		    end
		end else if (data_sel[0]) begin
                    if (iomem_we[0]) latched_in[7:0] <= wbs_dat_i[7:0];
                    if (iomem_we[1]) latched_in[15:8] <= wbs_dat_i[15:8];
                    if (iomem_we[2]) latched_in[23:16] <= wbs_dat_i[23:16];
                    if (iomem_we[3]) latched_in[31:24] <= wbs_dat_i[31:24];
		end else if (data_sel[1]) begin
                    if (iomem_we[0]) latched_in[39:32] <= wbs_dat_i[7:0];
                    if (iomem_we[1]) latched_in[47:40] <= wbs_dat_i[15:8];
                    if (iomem_we[2]) latched_in[55:48] <= wbs_dat_i[23:16];
                    if (iomem_we[3]) latched_in[63:56] <= wbs_dat_i[31:24];
		end else if (data_sel[2]) begin
                    if (iomem_we[0]) latched_in[71:64] <= wbs_dat_i[7:0];
                    if (iomem_we[1]) latched_in[79:72] <= wbs_dat_i[15:8];
                    if (iomem_we[2]) latched_in[87:80] <= wbs_dat_i[23:16];
                    if (iomem_we[3]) latched_in[95:88] <= wbs_dat_i[31:24];
		end else if (data_sel[3]) begin
                    if (iomem_we[0]) latched_in[103:96] <= wbs_dat_i[7:0];
                    if (iomem_we[1]) latched_in[111:104] <= wbs_dat_i[15:8];
                    if (iomem_we[2]) latched_in[119:112] <= wbs_dat_i[23:16];
                    if (iomem_we[3]) latched_in[127:120] <= wbs_dat_i[31:24];
		end else if (data_sel[4]) begin
                    if (iomem_we[0]) latched_in[135:128] <= wbs_dat_i[7:0];
                    if (iomem_we[1]) latched_in[143:136] <= wbs_dat_i[15:8];
                    if (iomem_we[2]) latched_in[151:144] <= wbs_dat_i[23:16];
                    if (iomem_we[3]) latched_in[159:152] <= wbs_dat_i[31:24];
                end
            end else begin
                xfer_ctrl <= 0;      // Immediately self-resetting
            end
        end
    end

    /* Transfer status, as read back through the XFER register.	*/
    /*   busy[0]:  a "shift" cycle is in progress (START or XDATAS)	*/
    /*   busy[1]:  a "finish" cycle is in progress (FINISH, XDATAF,	*/
    /*		   or LOAD)						*/

    assign busy = {
	(xfer_state == `FINISH || xfer_state == `XDATAF ||
		xfer_state == `LOAD),
	(xfer_state == `START || xfer_state == `XDATAS)
    };

    /* Transfer cycles */

    /* State machine that generates the serial clock ("clk") and the	*/
    /* "hold" pulse for the configuration shift register.  It runs on	*/
    /* wb_clk_i and is started by the trigger bits in xfer_ctrl.	*/
    /*									*/
    /* bit_count is a down-counter whose upper ASIZE bits are loaded	*/
    /* with a cell count and whose lower 7 bits are loaded with 126.	*/
    /* cell_offset counts the number of times the lower 7 bits of	*/
    /* bit_count have been seen at zero since the last LOAD.		*/

    always @(posedge wb_clk_i or posedge wb_rst_i) begin
	if (wb_rst_i == 1'b1) begin
	    xfer_state <= `IDLE;
	    bit_count <= 'd0;
	    cell_offset <= 'd0;
	    clk <= 1'b0;
	    hold <= 1'b1;
	end else begin
	    /* Defaults for every cycle:  serial clock low, hold high.	*/
	    /* The shifting and LOAD states below override these.	*/
	    clk <= 1'b0;
	    hold <= 1'b1;
	    if (xfer_state == `IDLE) begin
		/* Wait for a trigger.  Bit 0 (shift) takes priority	*/
		/* over bit 1 (finish) if both are set.			*/
		if (xfer_ctrl[0] == 1'b1) begin
		    xfer_state <= `START;
		end else if (xfer_ctrl[1] == 1'b1) begin
		    xfer_state <= `FINISH;
		end
	    end else if (xfer_state == `START) begin
		/* Shift cycle:  the cell count comes from cell_addr	*/
		bit_count[ASIZE + 6:7] <= cell_addr;
		bit_count[6:0] <= 7'b1111110;
		xfer_state <= `XDATAS;
	    end else if (xfer_state == `FINISH) begin
		/* Finish cycle:  the cell count is the distance from	*/
		/* the current offset to MAXADDR			*/
		bit_count[ASIZE + 6:7] <= `MAXADDR - cell_offset;
		bit_count[6:0] <= 7'b1111110;
		xfer_state <= `XDATAF;
	    end else if (xfer_state == `XDATAS) begin
		/* Shifting:  toggle the serial clock and count down	*/
		/* once per wb_clk_i cycle.				*/
		clk <= ~clk;
		bit_count <= bit_count - 1;
		/* Lower 7 bits at zero:  advance the offset count	*/
		if (bit_count[6:0] == 0) begin
		    cell_offset <= cell_offset + 1;
		end
		/* The exit test is made only while clk is low */
		if (clk == 1'b0) begin
		    if (bit_count == 0) begin
			xfer_state <= `IDLE;
		    end
		end
	    end else if (xfer_state == `XDATAF) begin
		/* Same shifting as XDATAS, but exits to LOAD */
		clk <= ~clk;
		bit_count <= bit_count - 1;
		if (bit_count[6:0] == 0) begin
		    cell_offset <= cell_offset + 1;
		end
		if (clk == 1'b0) begin
		    if (bit_count == 0) begin
			xfer_state <= `LOAD;
		    end
		end
	    end else if (xfer_state == `LOAD) begin
		/* Drive hold low for one cycle, clear the offset	*/
		/* count, and return to idle.				*/
		hold <= 1'b0;
		xfer_state <= `IDLE;
		cell_offset <= 'd0;
	    end
	end
    end
endmodule

/*
 *-----------------------------------------------------------------
 * Chaos array (XSIZE * YSIZE)
 *-----------------------------------------------------------------
 */

module chaos_array #(
    parameter XSIZE = 30,   /* Total number of cells in X */
    parameter YSIZE = 30,   /* Total number of cells in Y */
    parameter XTOP = 3,	    /* Number of sub-arrays in X */
    parameter YTOP = 3,	    /* Number of sub-arrays in Y */
    parameter BASE_ADR = 32'h3000_0000	/* Not referenced inside this module */
)(
`ifdef USE_POWER_PINS
    inout vccd1,	// User area 1 1.8V supply
    inout vssd1, 	// User area 1 digital ground
`endif

    input clk,				// Shift clock for the configuration chain
    input reset,			// Passed unchanged to every sub-array
    input hold,				// Passed unchanged to every sub-array
    input write,			// Asynchronous load of wdata into lutdata
    input [63:0] wdata,			// Value loaded into lutdata on "write"
    output [63:0] rdata,		// Current contents of lutdata
    input [YSIZE-1:0] data_in_east,	// Perimeter input
    input [YSIZE-1:0] data_in_west,
    input [XSIZE-1:0] data_in_north,
    input [XSIZE-1:0] data_in_south,
    output [YSIZE-1:0] data_out_east,	// Perimeter output
    output [YSIZE-1:0] data_out_west,
    output [XSIZE-1:0] data_out_north,
    output [XSIZE-1:0] data_out_south
);
    /* Dimensions of one sub-array.  These are integer divisions:  if	*/
    /* XSIZE is not a multiple of XTOP (or YSIZE of YTOP), the slices	*/
    /* below cover only the low XTOP * XSUB (YTOP * YSUB) bits of each	*/
    /* bus and the remaining upper bits connect to no sub-array.	*/

    localparam XSUB = XSIZE / XTOP;
    localparam YSUB = YSIZE / YTOP;

    /* Inter-sub-array connections.  uconn/dconn have one XSIZE-wide	*/
    /* row per horizontal boundary (YTOP + 1 of them);  rconn/lconn	*/
    /* have one YSIZE-wide column per vertical boundary (XTOP + 1).	*/

    wire [XSIZE - 1: 0] uconn [YTOP: 0];
    wire [XSIZE - 1: 0] dconn [YTOP: 0];
    wire [YSIZE - 1: 0] rconn [XTOP: 0];
    wire [YSIZE - 1: 0] lconn [XTOP: 0];

    /* Configuration data and clock chains, indexed [x boundary][row] */

    wire [YTOP - 1: 0] shiftreg [XTOP: 0];
    wire [YTOP - 1: 0] clkarray [XTOP: 0];

    /* The clock enters the chain at sub-array (0, 0) and is handed	*/
    /* from each sub-array's oclk to the next sub-array's iclk.		*/

    assign clkarray[0][0] = clk;

    // Sub-array architecture:
    //
    //       dudu      dudu      dudu
    //       |^|^      |^|^      |^|^   
    //       v|v|      v|v|      v|v|   
    //     +------+  +------+  +------+
    //  l->|      |->|      |->|      |->l
    //  r<-|      |<-|      |<-|      |<-r
    //  l->|      |->|      |->|      |->l
    //  r<-|      |<-|      |<-|      |<-r
    //     +------+  +------+  +------+
    //       |^|^      |^|^      |^|^   
    //       v|v|      v|v|      v|v|   
    //     +------+  +------+  +------+
    //  l->|      |->|      |->|      |->l
    //  r<-|      |<-|      |<-|      |<-r
    //  l->|      |->|      |->|      |->l
    //  r<-|      |<-|      |<-|      |<-r
    //     +------+  +------+  +------+
    //       |^|^      |^|^      |^|^   
    //       v|v|      v|v|      v|v|   
    //       dudu      dudu      dudu
    //
    // Each box in the above diagram is a sub-array size 2x2.
    // The top level has XSIZE = 6, YSIZE = 4 with XTOP = 3
    // and YTOP = 2.
    //
    // The top-level inputs and outputs are the perimeter values
    // on the four edges of the top level array.
    //
    // To represent all the connections among the sub-arrays, it
    // can be seen from the above that d and u (dconn and uconn)
    // are arrays of size (XSIZE, YTOP + 1), while l and r (lconn
    // and rconn) are arrays of size (XTOP + 1, YSIZE).
    //
    // NOTE(review):  In the instantiation below, rconn is driven by
    // "oeast" and read by "iwest" of the next column, while lconn is
    // driven by "owest" and read by "ieast" of the previous column.
    // If the diagram is drawn with column 0 on the left, its "l" and
    // "r" labels appear swapped relative to the code -- confirm.

    // NOTE:  For viewing internal signals in gtkwave,
    // some 2D arrays may need to be copied into 1D arrays.
    // See the original verilog for examples.

    /* The perimeter inputs and outputs connect the array to the
     * parent module.  Note that this hides all the interior data,
     * which could be an issue with understanding how the circuit
     * works.
     */

    assign data_out_north = uconn[YTOP][XSIZE - 1:0];
    assign data_out_south = dconn[0][XSIZE - 1:0];
    assign data_out_east = rconn[XTOP][YSIZE - 1:0];
    assign data_out_west = lconn[0][YSIZE - 1:0];

    /* The perimeter inputs are attached to the boundary from which a
     * signal of that name would start:  data_in_south drives
     * dconn[YTOP], which feeds "inorth" of the last row, and
     * data_in_east drives rconn[0], which feeds "iwest" of column 0
     * (and likewise for the other two).
     */

    assign dconn[YTOP][XSIZE - 1:0] = data_in_south;
    assign uconn[0][XSIZE - 1:0] = data_in_north;
    assign rconn[0][YSIZE - 1:0] = data_in_east;
    assign lconn[XTOP][YSIZE - 1:0] = data_in_west;

    genvar i, j;

    /* NOTE:  To see the internal cell values in gtkwave, it is necessary
     * to split out a few individual instances from the 2D array.  Loop
     * from j = 1 in 2D generate loop, then add a 1D generate loop for
     * i = N to XTOP - 1 with j set to zero, then add individual instances
     * for i = 0 to N - 1 with j set to zero.
     */

    /* Connected array of subarrays */
    generate
	for (j = 0; j < YTOP; j=j+1) begin: subarrayy
	    for (i = 0; i < XTOP; i=i+1) begin: subarrayx
		/* Sub-array (i, j) owns columns [i*XSUB +: XSUB] of	*/
		/* the u/d buses and rows [j*YSUB +: YSUB] of the	*/
		/* l/r buses.  Boundary index n is the low side and	*/
		/* n + 1 the high side of the sub-array.		*/
		chaos_subarray #(
		    .XSIZE(XSUB),
		    .YSIZE(YSUB)
		) chaos_subarray_inst (
		    `ifdef USE_POWER_PINS
			.vccd1(vccd1),
			.vssd1(vssd1),
		    `endif
		    .inorth(dconn[j+1][(i+1)*XSUB-1:i*XSUB]),
		    .isouth(uconn[j][(i+1)*XSUB-1:i*XSUB]),
		    .ieast(lconn[i+1][(j+1)*YSUB-1:j*YSUB]),
		    .iwest(rconn[i][(j+1)*YSUB-1:j*YSUB]),
		    .onorth(uconn[j+1][(i+1)*XSUB-1:i*XSUB]),
		    .osouth(dconn[j][(i+1)*XSUB-1:i*XSUB]),
		    .oeast(rconn[i+1][(j+1)*YSUB-1:j*YSUB]),
		    .owest(lconn[i][(j+1)*YSUB-1:j*YSUB]),
		    .reset(reset),
		    .hold(hold),
		    .iclk(clkarray[i][j]),
		    .oclk(clkarray[i+1][j]),
		    .idata(shiftreg[i][j]),
		    .odata(shiftreg[i+1][j])
		);
	    end
	end

	/* NOTE:  This would work better topologically if each	*/
	/* row switched the direction of the shift register.	*/

	/* The data and clock leaving the last sub-array of row j	*/
	/* re-enter at the first sub-array of row j + 1, so the whole	*/
	/* array forms one chain.					*/

	for (j = 0; j < YTOP - 1; j=j+1) begin: shifty
	    assign shiftreg[0][j+1] = shiftreg[XTOP][j];
	    assign clkarray[0][j+1] = clkarray[XTOP][j];
	end
    endgenerate

    /* Storage for data transfers to and from the processor.  This is	*/
    /* 64 bits, so can hold the configuration data for one core cell.	*/
   
    reg [63:0] lutdata;

    /* Wire up the lutdata registers as a shift register and connect the */
    /* ends to the array's shift register to form a loop.  "write" is	*/
    /* in the sensitivity list, so it loads wdata asynchronously and	*/
    /* takes priority over shifting.  Note that lutdata is clocked by	*/
    /* the "clk" input directly, not by the end of the clkarray chain.	*/

    always @(posedge clk or posedge write) begin
	if (write) begin
	    /* Copy data from wdata to lutdata on write */
	    lutdata <= wdata;
	end else begin
	    /* Shift left by one on clock when "write" is not raised;	*/
	    /* the bit leaving the end of the array chain enters at	*/
	    /* bit 0.							*/
	    lutdata <= {lutdata[62:0], shiftreg[XTOP][YTOP-1]};
	end
    end

    /* The MSB of lutdata feeds the start of the array chain */
    assign shiftreg[0][0] = lutdata[63];

    assign rdata = lutdata;	/* Data to read back */

endmodule
`default_nettype wire
