// ============================================================================
// Whisk: a 16-bit bit-serial RISC processor (c) Luke Wren 2022
// SPDX-License-Identifier: Apache-2.0
// ============================================================================
// Whisk is a 16-bit bit-serial processor, with external SPI SRAM interface,
// designed in a hurry for Tiny Tapeout 2. See the project README for an
// overview of the instruction set. Supporting hardware:
// - SPI SRAM with sequential mode and 16-bit addressing, e.g. Microchip
// 23K256T-I (32 kiB SRAM)
// - One 8-bit parallel-to-serial shift register, for input port
// - Two 8-bit serial-to-parallel shift registers, for output port
// - A host device capable of loading the SPI SRAM, setting it to sequential
// mode, and releasing Whisk's reset. I'll probably use a Pico.
// There will be a board with all of these components ready for bringup, and
// it will be added to this repository (also I will probably make a few of
// them, and will gladly send you one if you ask). However this will not be
// done before tapeout, as I started this project a week before the
// deadline!
`default_nettype none
`define WHISK_CELLS_SKY130
// ============================================================================
// Module wren6991_whisk_tt2_io_wrapper: Top level for TT2 synthesis.
// instantiate whisk_top, and map named ports to numbered TT2 inputs/outputs
// ============================================================================
module wren6991_whisk_tt2_io_wrapper (
    input  wire [7:0] io_in,
    output wire [7:0] io_out
);

// Pin map between the numbered TT2 pins and the named whisk_top ports:
//
//   io_in[0]   clock                 io_out[0]  SPI memory CSn
//   io_in[1]   reset (active low)    io_out[1]  SPI memory SCK
//   io_in[2]   SPI memory SDI        io_out[2]  SPI memory SDO
//   io_in[3]   IO port SDI           io_out[3]  IO port SCK
//   io_in[7:4] unused                io_out[4]  IO port SDO
//                                    io_out[5]  IO port latch_i
//                                    io_out[6]  IO port latch_o
//                                    io_out[7]  tied low

// Global signals
wire io_clk   = io_in[0];
wire io_rst_n = io_in[1];

// SPI memory interface
wire io_mem_sdi = io_in[2];
wire io_mem_csn;
wire io_mem_sck;
wire io_mem_sdo;

assign io_out[0] = io_mem_csn;
assign io_out[1] = io_mem_sck;
assign io_out[2] = io_mem_sdo;

// IO port (shift register interface)
wire io_ioport_sdi = io_in[3];
wire io_ioport_sck;
wire io_ioport_sdo;
wire io_ioport_latch_i;
wire io_ioport_latch_o;

assign io_out[3] = io_ioport_sck;
assign io_out[4] = io_ioport_sdo;
assign io_out[5] = io_ioport_latch_i;
assign io_out[6] = io_ioport_latch_o;

// Be a good neighbour: drive the unused output to a constant.
assign io_out[7] = 1'b0;

whisk_top top_u (
    .io_clk            (io_clk),
    .io_rst_n          (io_rst_n),

    .io_mem_sdi        (io_mem_sdi),
    .io_mem_csn        (io_mem_csn),
    .io_mem_sck        (io_mem_sck),
    .io_mem_sdo        (io_mem_sdo),

    .io_ioport_sdi     (io_ioport_sdi),
    .io_ioport_sck     (io_ioport_sck),
    .io_ioport_sdo     (io_ioport_sdo),
    .io_ioport_latch_i (io_ioport_latch_i),
    .io_ioport_latch_o (io_ioport_latch_o)
);

endmodule
// ============================================================================
// Module whisk_top: instantiate the CPU core together with the SPI mem
// serdes and IO port serdes.
// ============================================================================
module whisk_top (
    input  wire io_clk,
    input  wire io_rst_n,

    input  wire io_mem_sdi,
    output wire io_mem_csn,
    output wire io_mem_sck,
    output wire io_mem_sdo,

    input  wire io_ioport_sdi,
    output wire io_ioport_sck,
    output wire io_ioport_sdo,
    output wire io_ioport_latch_i,
    output wire io_ioport_latch_o
);

// ----------------------------------------------------------------------------
// Clock/reset wrangling

// Don't buffer the clock -- seems like the scripts define a clock on io_in[0]?
wire clk = io_clk;

// Synchronise reset removal to clk. Assertion is asynchronous (both flops
// clear as soon as io_rst_n falls); on release, ones are shifted in from the
// LSB, so rst_n goes high on the second clk edge after io_rst_n goes high.
reg [1:0] reset_sync;
wire rst_n = reset_sync[1];

always @ (posedge clk or negedge io_rst_n) begin
    if (!io_rst_n) begin
        reset_sync <= 2'd00;
    end else begin
        // Equivalent to {reset_sync[0], 1'b1}
        reset_sync <= ~(~reset_sync << 1);
    end
end

// ----------------------------------------------------------------------------
// Processor instantiation

// Naming follows the port names: *_next signals are core outputs that the
// serdes blocks register before driving pads, and *_prev signals are pad
// inputs as returned by the serdes blocks.
wire mem_sck_en_next;
wire mem_sdo_next;
wire mem_csn_next;
wire mem_sdi_prev;

wire ioport_sck_en_next;
wire ioport_sdo_next;
wire ioport_sdi_prev;
wire ioport_latch_i_next;
wire ioport_latch_o_next;

whisk_cpu cpu (
    .clk                 (clk),
    .rst_n               (rst_n),

    .mem_sck_en_next     (mem_sck_en_next),
    .mem_sdo_next        (mem_sdo_next),
    .mem_csn_next        (mem_csn_next),
    .mem_sdi_prev        (mem_sdi_prev),

    .ioport_sck_en_next  (ioport_sck_en_next),
    .ioport_sdo_next     (ioport_sdo_next),
    .ioport_sdi_prev     (ioport_sdi_prev),
    .ioport_latch_i_next (ioport_latch_i_next),
    .ioport_latch_o_next (ioport_latch_o_next)
);

// ----------------------------------------------------------------------------
// Serdes (IO registers)

whisk_spi_serdes mem_serdes_u (
    .clk        (clk),
    .rst_n      (rst_n),

    .sdo        (mem_sdo_next),
    .sck_en     (mem_sck_en_next),
    .csn        (mem_csn_next),
    .sdi        (mem_sdi_prev),

    .padout_sck (io_mem_sck),
    .padout_csn (io_mem_csn),
    .padout_sdo (io_mem_sdo),
    .padin_sdi  (io_mem_sdi)
);

whisk_ioport_serdes io_serdes_u (
    .clk            (clk),
    .rst_n          (rst_n),

    .sdo            (ioport_sdo_next),
    .sck_en         (ioport_sck_en_next),
    .latch_i        (ioport_latch_i_next),
    .latch_o        (ioport_latch_o_next),
    .sdi            (ioport_sdi_prev),

    .padout_sdo     (io_ioport_sdo),
    .padout_sck     (io_ioport_sck),
    .padout_latch_i (io_ioport_latch_i),
    .padout_latch_o (io_ioport_latch_o),
    .padin_sdi      (io_ioport_sdi)
);

endmodule
// ============================================================================
// Module whisk_cpu: top-level for the Whisk processor, minus the IO wrapper
// and the SPI/IOPORT serdes
// ============================================================================
// Bit-serial datapath and control: every state lasts 16 clk cycles (one pass
// of bit_ctr), during which the registers, PC and ALU process one bit per
// cycle, LSB first.
module whisk_cpu (
    input  wire clk,
    input  wire rst_n,

    // SPI SRAM interface
    output wire mem_sck_en_next,
    output wire mem_sdo_next,
    output wire mem_csn_next,
    input  wire mem_sdi_prev,

    // Shift registers for IO port
    output wire ioport_sck_en_next,
    output wire ioport_sdo_next,
    input  wire ioport_sdi_prev,
    output wire ioport_latch_i_next,
    output wire ioport_latch_o_next
);

// ----------------------------------------------------------------------------
// Constants

// Machine size
localparam W_INSTR = 16;
localparam W_DATA  = 16;
localparam N_REGS  = 6;

// Instruction layout
localparam INSTR_OP_LSB   = 0;
localparam INSTR_OP_MSB   = 3;
localparam INSTR_COND_LSB = 4;
localparam INSTR_COND_MSB = 6;
localparam INSTR_RT_LSB   = 7;
localparam INSTR_RT_MSB   = 9;
localparam INSTR_RS_LSB   = 10;
localparam INSTR_RS_MSB   = 12;
localparam INSTR_RD_LSB   = 13;
localparam INSTR_RD_MSB   = 15;

// Major opcodes (instr[3:0])
localparam [3:0] OP_ADD    = 4'h0; // rd = rs + rt
localparam [3:0] OP_SUB    = 4'h1; // rd = rs - rt
localparam [3:0] OP_AND    = 4'h2; // rd = rs & rt
localparam [3:0] OP_ANDN   = 4'h3; // rd = ~rs & rt
localparam [3:0] OP_OR     = 4'h4; // rd = rs | rt
localparam [3:0] OP_SHIFT  = 4'h5; // Minor opcode in rt
localparam [3:0] OP_INOUT  = 4'h6; // Minor opcode in rs

localparam [3:0] OP_LD     = 4'h8; // rd = mem[rs     ];
localparam [3:0] OP_LD_IA  = 4'h9; // rd = mem[rs     ]; rs += rt;
localparam [3:0] OP_LD_ADD = 4'ha; // rd = mem[rs + rt];
localparam [3:0] OP_LD_IB  = 4'hb; // rd = mem[rs + rt]; rs += rt;

localparam [3:0] OP_ST     = 4'hc; // mem[rs     ] = rd;
localparam [3:0] OP_ST_IA  = 4'hd; // mem[rs     ] = rd; rs += rt;
localparam [3:0] OP_ST_ADD = 4'he; // mem[rs + rt] = rd;
localparam [3:0] OP_ST_IB  = 4'hf; // mem[rs + rt] = rd; rs += rt;

// Minor opcodes (rt)
localparam [2:0] OP2_SRL = 3'h0;
localparam [2:0] OP2_SRA = 3'h1;
localparam [2:0] OP2_SLL = 3'h4;

// Minor opcodes (rs)
localparam [2:0] OP2_IN  = 3'h0;
localparam [2:0] OP2_OUT = 3'h4;

// ----------------------------------------------------------------------------
// Main control state machine

reg [W_INSTR-1:0] instr;

wire [INSTR_OP_MSB  -INSTR_OP_LSB  :0] instr_op;
wire [INSTR_COND_MSB-INSTR_COND_LSB:0] instr_cond;
wire [INSTR_RT_MSB  -INSTR_RT_LSB  :0] instr_rt;
wire [INSTR_RS_MSB  -INSTR_RS_LSB  :0] instr_rs;
wire [INSTR_RD_MSB  -INSTR_RD_LSB  :0] instr_rd;

assign {instr_rd, instr_rs, instr_rt, instr_cond, instr_op} = instr;

wire instr_op_ls      = instr_op[3]; // Whether an instruction is a load/store
wire instr_op_st_nld  = instr_op[2]; // Whether a load/store is a load or store
wire instr_op_ls_suma = instr_op[1]; // Whether sum is used for address
wire instr_op_ls_sumr = instr_op[0]; // Whether sum is written back to register

reg [3:0] bit_ctr;
reg [2:0] state;
reg       instr_cond_true;
reg       instr_has_imm_operand;

// Note there is a 2 cycle delay from issuing a bit on SDO to getting a bit
// back on SDI. This is handled with a 1-cycle gap after issuing a read
// address, so that e.g. S_FETCH always has the first instruction bit
// available on the first cycle.

// Every state needs its own encoding: the next-state case statement and the
// `state == S_...` decodes throughout this module rely on it. (S_LS_ADDR0 and
// S_LS_ADDR1 previously both encoded 3'd4, which made the S_LS_ADDR1 case
// item unreachable and left load/stores stuck in state 4.)
localparam [2:0] S_FETCH      = 3'd0; // Sample 16 instr bits, increment PC
localparam [2:0] S_EXEC       = 3'd1; // Loop all GPRs, write one GPR
localparam [2:0] S_PC_NONSEQ0 = 3'd2; // Issue cmd, then issue 1 PC bit
localparam [2:0] S_PC_NONSEQ1 = 3'd3; // Issue rest of PC, then 1 cyc delay
localparam [2:0] S_LS_ADDR0   = 3'd4; // Deferred LS SPI cmd following immediate
localparam [2:0] S_LS_ADDR1   = 3'd5; // Issue addr then, if load, 1 cyc delay
localparam [2:0] S_LS_DATA    = 3'd6; // Issue store data, or sample load data
localparam [2:0] S_SKIP_IMM   = 3'd7; // Skip immediate following false condition

// state_nxt_wrap: the state to enter once the current 16-cycle pass finishes.
// state_nxt:      the registered value, which only changes when bit_ctr wraps.
reg [2:0] state_nxt_wrap;
reg [2:0] state_nxt;

always @ (*) begin
    state_nxt_wrap = state;
    case (state)
    S_FETCH: begin
        if (!instr_cond_true) begin
            if (instr_has_imm_operand) begin
                state_nxt_wrap = S_SKIP_IMM;
            end else begin
                state_nxt_wrap = S_FETCH;
            end
        end else begin
            state_nxt_wrap = S_EXEC;
        end
    end
    S_EXEC: begin
        if (instr_op_ls && instr_has_imm_operand) begin
            // Command was deferred due to immediate read keeping SPI busy
            state_nxt_wrap = S_LS_ADDR0;
        end else if (instr_op_ls) begin
            // Command was issued concurrently, skip straight to address issue
            state_nxt_wrap = S_LS_ADDR1;
        end else if (instr_rd == 3'd7) begin
            // rd index 7 is the PC: a PC write needs a nonsequential fetch
            state_nxt_wrap = S_PC_NONSEQ0;
        end else begin
            state_nxt_wrap = S_FETCH;
        end
    end
    S_PC_NONSEQ0: begin
        state_nxt_wrap = S_PC_NONSEQ1;
    end
    S_PC_NONSEQ1: begin
        // Go to S_FETCH whether or not instr is valid. (instr_cond_true is
        // low here only when we have just been reset, in which case instr is
        // invalid, but the next state is the same either way.)
        state_nxt_wrap = S_FETCH;
    end
    S_LS_ADDR0: begin
        state_nxt_wrap = S_LS_ADDR1;
    end
    S_LS_ADDR1: begin
        state_nxt_wrap = S_LS_DATA;
    end
    S_LS_DATA: begin
        // The SPI address sequence was broken by the load/store, so refetch
        // nonsequentially from PC.
        state_nxt_wrap = S_PC_NONSEQ0;
    end
    S_SKIP_IMM: begin
        state_nxt_wrap = S_FETCH;
    end
    endcase
    state_nxt = &bit_ctr ? state_nxt_wrap : state;
end

// Start of day:
//
// - The only resettable flops are state, bit_ctr, and instr_cond_true.
//
// - We reset state/bit_ctr to a nonsequential fetch, and reset
//   instr_cond_true=0 (usually unreachable)
//
// - instr_cond_true=0 masks the fetch address to 0, regardless of PC
//
// - The first instruction must be `add pc, zero, #4` to initialise PC

always @ (posedge clk or negedge rst_n) begin
    if (!rst_n) begin
        state   <= S_PC_NONSEQ0;
        bit_ctr <= 4'h0;
    end else begin
        state   <= state_nxt;
        bit_ctr <= bit_ctr + 4'h1;
    end
end

// ----------------------------------------------------------------------------
// Instruction shifter and early decode

// Instruction bits arrive LSB-first and enter at the MSB end, so after 16
// S_FETCH cycles bit 0 of the instruction sits in instr[0].
always @ (posedge clk) begin
    if (state == S_FETCH) begin
        instr <= {mem_sdi_prev, instr[15:1]};
    end
end

// Decode condition and imm operand flags as the instruction comes in, so we
// can use them to steer the state machine at the end of S_FETCH.

reg instr_has_imm_operand_nxt;
reg instr_cond_true_nxt;

// From ALU:
wire [7:0] condition_vec8;

always @ (*) begin
    instr_has_imm_operand_nxt = instr_has_imm_operand;
    instr_cond_true_nxt       = instr_cond_true;

    if (instr_has_imm_operand && !instr_cond_true) begin
        // In this case we must be in S_FETCH. Hold instr_cond_true for an
        // additional fetch cycle so that the immediate operand is also
        // dumped, but clear the operand flag so we don't loop forever.
        if (&bit_ctr) begin
            instr_has_imm_operand_nxt = 1'b0;
        end
    end else if (state == S_FETCH) begin
        if (bit_ctr == (INSTR_RT_MSB + 1)) begin
            // Grab rt as it goes past (this is why rt is not the MSBs!)
            // rt index 6 selects the immediate (see alu_op_t).
            instr_has_imm_operand_nxt = instr[W_INSTR-1 -: 3] == 3'd6;
        end
        if (bit_ctr == (INSTR_COND_MSB + 1)) begin
            // Decode condition as it goes past
            instr_cond_true_nxt = condition_vec8[instr[W_INSTR-1 -: 3]];
        end
    end
end

// instr_cond_true must reset to 0, because we use it to recognise the first
// fetch after reset. We don't care about instr_has_imm_operand, because it
// is initialised during S_FETCH before first use.

always @ (posedge clk or negedge rst_n) begin
    if (!rst_n) begin
        instr_cond_true <= 1'b0;
    end else begin
        instr_cond_true <= instr_cond_true_nxt;
    end
end

always @ (posedge clk) begin
    instr_has_imm_operand <= instr_has_imm_operand_nxt;
end

// ----------------------------------------------------------------------------
// Register file

wire reg_rd_qr;
wire reg_rs_qr, reg_rs_qr_next;
wire reg_rt_qr;

wire alu_result;

wire writeback_wen =
    state == S_EXEC     && !(instr_op_ls && !instr_op_ls_sumr) ||
    state == S_LS_ADDR0 && instr_op_ls_sumr                    ||
    state == S_LS_DATA  && !instr_op_st_nld;

wire writeback_data = state == S_LS_DATA ? mem_sdi_prev : alu_result;

// Load/stores write the address sum back to rs; only the load data phase (and
// non-load/store instructions) target rd.
wire [INSTR_RD_MSB-INSTR_RD_LSB:0] writeback_reg =
    instr_op_ls && state != S_LS_DATA ? instr_rs : instr_rd;

whisk_regfile #(
    .W (W_DATA),
    .N (N_REGS)
) regfile_u (
    .clk       (clk),

    .rd        (writeback_reg),
    .rd_q      (reg_rd_qr),
    .rd_wen    (writeback_wen),
    .rd_d      (writeback_data),

    .rs        (instr_rs),
    .rs_q      (reg_rs_qr),
    .rs_q_next (reg_rs_qr_next),

    .rt        (instr_rt),
    .rt_q      (reg_rt_qr)
);

// ----------------------------------------------------------------------------
// Program counter

wire        pc_dl;
wire        pc_qr;
wire [15:0] pc_q_all;
wire        pc_qr_next = pc_q_all[1];

whisk_shiftreg_right #(
    .W (16)
) pc_u (
    .clk   (clk),
    .dl    (pc_dl),
    .q_all (pc_q_all),
    .qr    (pc_qr)
);

wire pc_increment =
    state == S_FETCH ||
    state == S_EXEC && instr_has_imm_operand ||
    state == S_SKIP_IMM;

// Serial incrementer. The increment is injected when bit_ctr == 1, i.e. into
// bit 1, so the PC advances by 2; bit 0 passes through unchanged and later
// bits take the registered carry.
reg  pc_ci;
wire pc_co, pc_sum;

assign {pc_co, pc_sum} = pc_qr + (~|bit_ctr[3:1] ? bit_ctr[0] && pc_increment : pc_ci);

always @ (posedge clk) begin
    pc_ci <= pc_co;
end

wire rd_is_pc = instr_rd == 3'd7;

assign pc_dl =
    state == S_EXEC    && rd_is_pc                     ? alu_result   :
    state == S_LS_DATA && rd_is_pc && !instr_op_st_nld ? mem_sdi_prev : pc_sum;

// ----------------------------------------------------------------------------
// ALU

// Operand selection: index 7 reads the PC on either port; index 6 on the rt
// port reads the immediate straight off the SPI input.
wire alu_op_s =
    instr_rs == 3'd7 ? pc_qr : reg_rs_qr;

wire alu_op_s_next =
    instr_rs == 3'd7 ? pc_qr_next : reg_rs_qr_next;

wire alu_op_t =
    instr_rt == 3'd7 ? pc_qr        :
    instr_rt == 3'd6 ? mem_sdi_prev : reg_rt_qr;

reg alu_ci;

// Each 2-bit result below is {carry_out, result_bit}.
wire [1:0] alu_add = alu_op_s +  alu_op_t + (~|bit_ctr ? 1'b0 : alu_ci);
wire [1:0] alu_sub = alu_op_s + !alu_op_t + (~|bit_ctr ? 1'b1 : alu_ci);

// Left shift uses the carry flop as a 1-cycle delay, counter to the
// register's rightward rotation. Right shift looks ahead to advance its
// rotation. The final carry flag is the bit shifted "out of" the register.

// Left: the current operand bit goes into the carry flop and the previous
// one comes out as the result (zero on the first cycle).
wire [1:0] alu_shift_l = {
    alu_op_s,
    |alu_ci && |bit_ctr
};

// Right: the carry captures bit 0 on the first cycle and then holds. The
// result is the next operand bit, except on the last cycle where it is the
// sign bit for SRA (rt[0] set) or zero for SRL.
wire [1:0] alu_shift_r = {
    |bit_ctr ? alu_ci                   : alu_op_s,
    &bit_ctr ? alu_op_s && instr_rt[0]  : alu_op_s_next
};

// Carry is an all-ones flag for bitwise ops
wire bit_co = alu_result && (alu_ci || ~|bit_ctr);

wire alu_co;

assign {alu_co, alu_result} =
    instr_op_ls                          ? alu_add                          :
    instr_op == OP_ADD                   ? alu_add                          :
    instr_op == OP_SUB                   ? alu_sub                          :
    instr_op == OP_AND                   ? {bit_co,  alu_op_s && alu_op_t}  :
    instr_op == OP_ANDN                  ? {bit_co, !alu_op_s && alu_op_t}  :
    instr_op == OP_OR                    ? {bit_co,  alu_op_s || alu_op_t}  :
    instr_op == OP_SHIFT &&  instr_rt[2] ? alu_shift_l                      :
    instr_op == OP_SHIFT && !instr_rt[2] ? alu_shift_r                      :
    instr_op == OP_INOUT                 ? ioport_sdi_prev                  : alu_add;

always @ (posedge clk) begin
    alu_ci <= alu_co;
end

// ----------------------------------------------------------------------------
// Flags

reg flag_z;
reg flag_c;
reg flag_n;

// Flags are only updated by instructions whose condition field is 0.
wire update_flag_zn = (state == S_EXEC || state == S_LS_DATA) && ~|instr_cond;
wire update_flag_c  = update_flag_zn && state == S_EXEC;

always @ (posedge clk) begin
    if (update_flag_zn) begin
        // Z accumulates over the pass: it restarts on the first bit and is
        // cleared by any nonzero result bit. N ends as the last (MSB) bit.
        flag_z <= (flag_z || ~|bit_ctr) && !alu_result;
        flag_n <= alu_result;
    end
    if (update_flag_c) begin
        flag_c <= alu_co;
    end
end

// Indexed by the 3-bit condition field; entry 7 is the leftmost element.
// Conditions 0 and 1 are both always-true.
assign condition_vec8 = {
    !flag_z, flag_z,
    !flag_c, flag_c,
    !flag_n, flag_n,
    1'b1,    1'b1
};

// ----------------------------------------------------------------------------
// Address register

// Captures address calculations LSB-first and then replays them MSB-first.

wire ar_l_nr;
wire ar_dl;
wire ar_dr;
wire ar_ql;
wire ar_qr;

// Need to look ahead by one bit to get correct timing for read addresses:
wire [15:0] ar_q_all;
wire        ar_ql_next = ar_q_all[14];

whisk_shiftreg_leftright #(
    .W (16)
) ar_u (
    .clk   (clk),
    .l_nr  (ar_l_nr),
    .dl    (ar_dl),
    .ql    (ar_ql),
    .dr    (ar_dr),
    .qr    (ar_qr),
    .q_all (ar_q_all)
);

// Shift left when replaying addresses.
assign ar_l_nr = state == S_LS_ADDR1 || state == S_PC_NONSEQ1;

// Nothing in this module drove ar_dr, so the bit shifted into the right-hand
// end while replaying was undefined. Tie it off to a known value.
assign ar_dr = 1'b0;

assign ar_dl =
    state == S_PC_NONSEQ0 ? pc_qr   :
    instr_op_ls_suma      ? alu_add : reg_rs_qr;

// ----------------------------------------------------------------------------
// SPI controls

// Deassert CSn before issuing a nonsequential address.

// Note LS_ADDR0 state is skipped if we are able to issue from EXEC:
wire issue_ls_addr_ph0 =
    state == S_LS_ADDR0 ||
    state == S_EXEC && instr_op_ls && !instr_has_imm_operand;

// First bit_ctr value at which CSn is low. Reads start one cycle earlier
// than writes so that the first address bit fits in the same pass.
wire [3:0] spi_cmd_start_cycle =
    state == S_PC_NONSEQ0 ? 4'h7 :
    instr_op_st_nld       ? 4'h8 : 4'h7;

assign mem_csn_next = bit_ctr < spi_cmd_start_cycle && (
    state == S_PC_NONSEQ0 || issue_ls_addr_ph0
);

// Pedal to the metal on SCK except when pulling CSn for a nonsequential
// access, or when executing an unskipped instruction with no immediate.
assign mem_sck_en_next = !(
    mem_csn_next ||
    state == (&bit_ctr[3:1] ? S_FETCH : S_EXEC) && !instr_has_imm_operand && instr_cond_true
);

// Store address replays entirely in LS_ADDR1, but load/fetch extend one cycle
// into previous state, so carefully pick what delay to observe the address
// with. (Also mask address to zero for very first fetch at start of day.)

wire mem_spi_addr =
    !instr_cond_true                          ? 1'b0       :
    state == S_PC_NONSEQ1                     ? ar_ql_next :
    state == S_LS_ADDR1 &&  instr_op_st_nld   ? ar_ql      :
    state == S_LS_ADDR1 && !instr_op_st_nld   ? ar_ql_next : ar_dl;

// Note: SPI commands are MSB-first (the commands here are 03h and 02h).
//
// These are indexed by bit_ctr, so bit N is driven when bit_ctr == N:
// - READ:  CSn falls at bit_ctr 7, command occupies 7..14  -> ones at 13, 14
// - WRITE: CSn falls at bit_ctr 8, command occupies 8..15  -> one at 14
// (WRITE was previously 16'h8000, whose only set bit lands on the *last*
// command bit, i.e. it sent 01h rather than 02h.)
localparam [15:0] SPI_INSTR_READ  = 16'hc000 >> 1;
localparam [15:0] SPI_INSTR_WRITE = 16'h8000 >> 1;

wire mem_sdo_ls_addr_ph0 =
    instr_op_st_nld ? SPI_INSTR_WRITE[bit_ctr] :
    &bit_ctr        ? mem_spi_addr             : SPI_INSTR_READ[bit_ctr];

assign mem_sdo_next =
    state == S_PC_NONSEQ0 ? (&bit_ctr ? pc_qr : SPI_INSTR_READ[bit_ctr]) :
    state == S_PC_NONSEQ1 ? mem_spi_addr                                 :
    issue_ls_addr_ph0     ? mem_sdo_ls_addr_ph0                          :
    state == S_LS_ADDR1   ? mem_spi_addr                                 :
    state == S_LS_DATA    ? reg_rd_qr                                    : 1'b0;

// ----------------------------------------------------------------------------
// IO port

// Expected hardware is a 1x 8-bit PISO, and 2x 8-bit SIPO shift registers:
//
// - OUT: Clock out 16 bits from rt[15:0]/imm[15:0], then pulse latch_o high.
//
// - IN: Clock 8 bits into rd[15:8], with latch_i low for the first clock.
//
// The IN interface is still driven when executing an OUT, with more clocks.
// Abusable for 6 extra inputs if a second PISO register is chained.
//
// rt[13:6] is actually clocked out on an IN, there's just no latch_o pulse.
// Abusable to drive longer SIPO chains using multiple INs and a final OUT.

wire exec_io_instr = state == S_EXEC && instr_op == OP_INOUT;
wire io_instr_out  = (instr_rs & (OP2_OUT | OP2_IN)) == OP2_OUT;

// The instruction is still valid on the first cycle of FETCH. This lets us
// latch outputs *after* the last clock pulse, without spending a flop.
assign ioport_latch_o_next = state == S_FETCH && ~|bit_ctr &&
    instr_op == OP_INOUT && io_instr_out && instr_cond_true;

assign ioport_latch_i_next = !(exec_io_instr && bit_ctr == 4'h6);

assign ioport_sdo_next = exec_io_instr && alu_op_t;

// IN clocks only during bit_ctr 6..13 (8 pulses); OUT clocks on every cycle
// of the pass.
assign ioport_sck_en_next = exec_io_instr && (
    (bit_ctr >= 4'h6 && bit_ctr < 4'he) ||
    io_instr_out
);

endmodule
// ============================================================================
// Module whisk_regfile: a register file of multiple shift registers, with 3
// read ports (rd/rs/rt) and one write port (rd).
// ============================================================================
// All registers rotate right by one bit every cycle. No enable, so do things
// in multiples of 16 cycles. Registers not written to are recirculated.
// q is the value of the rightmost flop in each register. The rs port also has
// a q_next value, which taps in one flop from the end, and is required for
// performing right-shift-by-one in 16 cycles.
// Out-of-range indices read as 0, and ignore writes.
// Parameters:
//   W  width of each register in bits
//   N  number of implemented registers; indices N and above read as zero
module whisk_regfile #(
    parameter W = 16,
    parameter N = 6
) (
    input  wire                 clk,

    // Write port, which also provides a read of the addressed register
    input  wire [$clog2(N)-1:0] rd,
    output wire                 rd_q,
    input  wire                 rd_wen,
    input  wire                 rd_d,

    // Read port with one-bit lookahead
    input  wire [$clog2(N)-1:0] rs,
    output wire                 rs_q,
    output wire                 rs_q_next,

    // Read port
    input  wire [$clog2(N)-1:0] rt,
    output wire                 rt_q
);

// Round the register count up to a power of two so that every possible index
// value has an entry in q[] to read from.
localparam N_PADDED = 1 << $clog2(N);

wire [N-1:0] d;
wire [W-1:0] q [N_PADDED-1:0];

// Bit 0 is the rightmost flop, i.e. the bit currently being shifted out.
assign rd_q      = q[rd][0];
assign rs_q      = q[rs][0];
assign rs_q_next = q[rs][1];
assign rt_q      = q[rt][0];

genvar g;
generate
for (g = 0; g < N_PADDED; g = g + 1) begin: loop_gprs
    if (g >= N) begin: gpr_tieoff

        // Unimplemented index: reads as zero, and has no storage to write.
        assign q[g] = {W{1'b0}};

    end else begin: gpr_shifter

        // Recirculate unless register is addressed as rd.
        wire qr;
        assign d[g] = rd_wen && rd == g ? rd_d : qr;

        whisk_shiftreg_right #(
            .W (W)
        ) reg_u (
            .clk   (clk),
            .dl    (d[g]),
            .qr    (qr),
            .q_all (q[g])
        );

    end
end
endgenerate

endmodule
// ============================================================================
// Module whisk_shiftreg_leftright: a shift register that always shifts left
// or right each cycle.
// ============================================================================
// Note there is no enable because the underlying scan flops do not have an
// enable (there is an enable version, but it's larger, and more routing
// required!). If you don't want to shift, just shift back and forth for an
// even number of cycles, or do a full loop :)
// dl and ql are the leftmost inputs and outputs. If l_nr is low (right), ql
// becomes dl on every posedge of clk. (Yes, it's confusing!)
// dr and qr are the rightmost inputs and outputs. If l_nr is high (left), qr
// becomes dr on every posedge of clk.
module whisk_shiftreg_leftright #(
    parameter W = 16
) (
    input  wire         clk,
    input  wire         l_nr,  // 1: shift left, 0: shift right
    input  wire         dl,    // Enters the leftmost flop when shifting right
    input  wire         dr,    // Enters the rightmost flop when shifting left
    output wire         ql,    // Leftmost flop
    output wire         qr,    // Rightmost flop
    output wire [W-1:0] q_all  // All flops; bit 0 is the rightmost
);

// The flops are chain_q[W:1]. The two extra entries at either end hold the
// serial inputs, so that every stage can index both neighbours uniformly.
wire [W+1:0] chain_q;

assign chain_q[0    ] = dr;
assign chain_q[W + 1] = dl;

assign qr    = chain_q[1];
assign ql    = chain_q[W];
assign q_all = chain_q[W:1];

genvar g;
generate
for (g = 1; g < W + 1; g = g + 1) begin: shift_stage
    // Shift-to-left means select the input to your right, and vice versa.
    // d[1] (chosen when sel is high) is the right-hand neighbour.
    whisk_flop_scanmux flop_u (
        .clk (clk),
        .sel (l_nr),
        .d   ({chain_q[g - 1], chain_q[g + 1]}),
        .q   (chain_q[g])
    );
end
endgenerate

endmodule
// ============================================================================
// Module whisk_shiftreg_right: register that only shifts right, like Zoolander
// ============================================================================
// Cost per bit is lower than whisk_shiftreg_leftright
module whisk_shiftreg_right #(
    parameter W = 16
) (
    input  wire         clk,
    input  wire         dl,    // Serial input, enters at bit W-1
    output wire         qr,    // Serial output, bit 0
    output reg  [W-1:0] q_all  // Full register contents
);

// No reset and no enable: the contents move one place towards bit 0 on every
// posedge of clk, with dl filling the vacated MSB.
always @ (posedge clk) begin
    q_all <= {dl, q_all[W-1:1]};
end

assign qr = q_all[0];

endmodule
// ============================================================================
// Module whisk_flop_scanmux: a flop with a mux on its input. Usually reserved
// for DFT scan insertion, but we don't need that where we're going >:)
// ============================================================================
// q takes the value of d[sel] on every posedge of clk.
module whisk_flop_scanmux (
    input  wire       clk,
    input  wire       sel,
    input  wire [1:0] d,
    output wire       q
);

// Exactly one of the two implementations below may drive q, so they are
// selected on the WHISK_CELLS_SKY130 macro.

`ifdef WHISK_CELLS_SKY130

// (scanchain in TT2 uses sky130_fd_sc_hd__sdfxtp, a simple flop with scan
// mux. An enable version, sky130_fd_sc_hd__sedfxtp, is also available, but
// this is significantly larger. Instantiate the unit-drive version because
// we have a ridiculously long clock period; not sure whether the backend is
// allowed to change the drive.)

sky130_fd_sc_hd__sdfxtp_1 sdff_u (
    .CLK  (clk),
    .D    (d[0]),
    .SCD  (d[1]),
    .SCE  (sel),
    .Q    (q),
    .VPWR (1'b1),
    .VGND (1'b0)
);

`else

// Synthesisable model

reg q_r;
always @ (posedge clk) begin
    q_r <= d[sel];
end

assign q = q_r;

`endif

endmodule
// ============================================================================
// Module whisk_flop_en: a flop with an input enable (DFFE). For some reason
// these are not mapped automatically, so we get a DFF, a mux and two buffers
// ============================================================================
// q takes the value of d on a posedge of clk when e is high, and otherwise
// holds its value.
module whisk_flop_en (
    input  wire clk,
    input  wire d,
    input  wire e,
    output wire q
);

// Exactly one of the two implementations below may drive q, so they are
// selected on the WHISK_CELLS_SKY130 macro.

`ifdef WHISK_CELLS_SKY130

sky130_fd_sc_hd__edfxtp_1 dffe_u (
    .CLK  (clk),
    .D    (d),
    .DE   (e),
    .Q    (q),
    .VPWR (1'b1),
    .VGND (1'b0)
);

`else

// Synthesisable model

reg q_r;
always @ (posedge clk) begin
    if (e) begin
        q_r <= d;
    end
end

assign q = q_r;

`endif

endmodule
// ============================================================================
// Module whisk_spi_serdes: handle the timing of the SPI interface, and
// provide a slightly abstracted interface to the whisk core, with all
// signals on posedge of clk.
// ============================================================================
module whisk_spi_serdes(
    input  wire clk,
    input  wire rst_n,

    // Core
    input  wire sdo,
    input  wire sck_en,
    input  wire csn,
    output wire sdi,

    // IOs
    output wire padout_sck,
    output wire padout_csn,
    output wire padout_sdo,
    input  wire padin_sdi
);

// ----------------------------------------------------------------------------
// Output paths

// All core outputs are registered once before reaching the pads. Reset
// leaves the memory deselected (CSn high) with SCK disabled.
reg sdo_r;
reg sck_en_r;
reg csn_r;

always @ (posedge clk or negedge rst_n) begin
    if (!rst_n) begin
        sdo_r    <= 1'b0;
        csn_r    <= 1'b1;
        sck_en_r <= 1'b0;
    end else begin
        sdo_r    <= sdo;
        csn_r    <= csn;
        sck_en_r <= sck_en;
    end
end

assign padout_sdo = sdo_r;
assign padout_csn = csn_r;

// Through-path for clock input to SCK output. TODO clock gating cell
// required? This is sampled by the scan flops at the tile output.
// SCK is the inverse of clk while enabled, so it rises on negedge clk.
assign padout_sck = sck_en_r && !clk;

// ----------------------------------------------------------------------------
// Input paths

// Exactly one of the two versions below may drive sdi, so they are selected
// on the WHISK_CELLS_SKY130 macro.

`ifdef WHISK_CELLS_SKY130

// ASIC version

// TODO find a suitable delay buffer cell for hold buffering, and decide how to
// dimension it against i[7:0] skew
wire padin_sdi_delay = padin_sdi;

// TODO find a suitable latch cell (possibly sky130_fd_sc_hd__dlxtp)
// Behavioural latch: transparent while clk is high, holds while clk is low.
reg sdi_latch;

always @ (*) begin
    if (clk) begin
        sdi_latch <= padin_sdi_delay;
    end
end

assign sdi = sdi_latch;

`else

// Dodgy sim-only version

// Note padin_sdi_reg is currently unused: see the FIXME below.
reg padin_sdi_reg;
always @ (negedge clk) begin
    padin_sdi_reg <= padin_sdi;
end

// FIXME there is something I don't understand here with the CXXRTL delta cycles
// assign sdi = padin_sdi_reg;
assign sdi = padin_sdi;

`endif

endmodule
// ============================================================================
// Module whisk_ioport_serdes: similar to whisk_spi_serdes, but for the
// shift-register-based IO port.
// ============================================================================
module whisk_ioport_serdes(
    input  wire clk,
    input  wire rst_n,

    // Core
    input  wire sdo,
    input  wire sck_en,
    input  wire latch_i,
    input  wire latch_o,
    output wire sdi,

    // IOs
    output wire padout_sdo,
    output wire padout_sck,
    output wire padout_latch_i,
    output wire padout_latch_o,
    input  wire padin_sdi
);

// ----------------------------------------------------------------------------
// Output paths

// All core outputs are registered once before reaching the pads.
reg sdo_r;
reg sck_en_r;
reg latch_i_r;
reg latch_o_r;

always @ (posedge clk or negedge rst_n) begin
    if (!rst_n) begin
        sdo_r     <= 1'b0;
        sck_en_r  <= 1'b0;
        // NOTE(review): whisk_cpu drives latch_i high except on one cycle of
        // an IO instruction, so this reset value holds padout_latch_i at its
        // asserted level for the duration of reset. Confirm that is intended.
        latch_i_r <= 1'b0;
        latch_o_r <= 1'b0;
    end else begin
        sdo_r     <= sdo;
        sck_en_r  <= sck_en;
        latch_i_r <= latch_i;
        latch_o_r <= latch_o;
    end
end

assign padout_sdo     = sdo_r;
assign padout_latch_i = latch_i_r;
assign padout_latch_o = latch_o_r;

// TODO clock gating cell?
// SCK is the inverse of clk while enabled, so it rises on negedge clk.
assign padout_sck = sck_en_r && !clk;

// ----------------------------------------------------------------------------
// Input paths

// FIXME this is actually different from SPI, right? Probably transitions on
// posedge? Need to find some actual datasheets for candidate shift
// registers.

// Exactly one of the two versions below may drive sdi, so they are selected
// on the WHISK_CELLS_SKY130 macro.

`ifdef WHISK_CELLS_SKY130

// ASIC version

// TODO find a suitable delay buffer cell for hold buffering, and decide how to
// dimension it against i[7:0] skew
wire padin_sdi_delay = padin_sdi;

// TODO find a suitable latch cell (possibly sky130_fd_sc_hd__dlxtp)
// Behavioural latch: transparent while clk is high, holds while clk is low.
reg sdi_latch;

always @ (*) begin
    if (clk) begin
        sdi_latch <= padin_sdi_delay;
    end
end

assign sdi = sdi_latch;

`else

// Dodgy sim-only version

reg padin_sdi_reg;
always @ (negedge clk) begin
    padin_sdi_reg <= padin_sdi;
end

assign sdi = padin_sdi_reg;

`endif

endmodule
// ============================================================================
// _ _ _
// | | (_) | |
// __ _| |__ _ ___| | __
// \ \ /\ / / '_ \| / __| |/ /
// \ V V /| | | | \__ \ <
// \_/\_/ |_| |_|_|___/_|\_\
// When I was 16 I designed a 7400-series breadboard processor called Fork,
// with a language called Spoon. Now I'm 26 and I'm designing a processor
// called Whisk. I wonder what I'll do when I grow up.
// Many mistakes were made in this ISA. What did you think? My aim with this
// version of Whisk is to run enough software to discover exactly why my
// instruction set is bad. Hopefully Tiny Tapeout 3 will bring faster IOs,
// with 2D muxing instead of a scan chain, and then I can try getting some
// serious software running on Whisk v2, at a few MHz instead of 12 kHz.