| // ============================================================================ |
| // Whisk: a 16-bit bit-serial RISC processor (c) Luke Wren 2022 |
| // SPDX-License-Identifier: Apache-2.0 |
| // ============================================================================ |
| |
| // Whisk is a 16-bit bit-serial processor, with external SPI SRAM interface, |
| // designed in a hurry for Tiny Tapeout 2. See README.md for an overview of |
| // the instruction set. Supporting hardware: |
| // |
| // - SPI SRAM with sequential mode and 16-bit addressing, e.g. Microchip |
| // 23K256T-I (32 kiB SRAM) |
| // |
| // - One 8-bit parallel-to-serial shift register, for input port |
| // |
| // - Two 8-bit serial-to-parallel shift registers, for output port |
| // |
| // - A host device capable of loading the SPI SRAM, setting it to sequential |
| // mode, and releasing Whisk's reset. I'll probably use a Pico. |
| // |
| // There will be a board with all of these components ready for bringup, and |
| // it will be added to this repository (also I will probably make a few of |
| // them, and will gladly send you one if you ask). However this will not be |
| // done before tapeout, as I started this project a week before the |
| // deadline! |
| |
| // Optionally disable implicit net declarations for everything that follows, |
| // so a misspelled signal name becomes an error instead of a silent 1-bit wire. |
| `ifdef WHISK_DEFAULT_NETTYPE_NONE |
| `default_nettype none |
| `endif |
| |
| // Unless WHISK_NO_CELLS is defined, enable the WHISK_CELLS_SKY130 branches |
| // further down this file (directly-instantiated sky130 flop cells and the |
| // "ASIC version" serdes input paths) in place of the generic models. |
| `ifndef WHISK_NO_CELLS |
| `define WHISK_CELLS_SKY130 |
| `endif |
| |
| // ============================================================================ |
| // Module wren6991_whisk_tt2_io_wrapper: Top level for TT2 synthesis. |
| // Instantiates whisk_top, and maps named ports to numbered TT2 inputs/outputs. |
| // ============================================================================ |
| |
| // Pin mapping between the numbered Tiny Tapeout 2 IOs and whisk_top: |
| // |
| //   io_in[0]  clock                io_out[0]  SPI memory CSn |
| //   io_in[1]  reset (active low)   io_out[1]  SPI memory SCK |
| //   io_in[2]  SPI memory SDI       io_out[2]  SPI memory SDO |
| //   io_in[3]  IO port SDI          io_out[3]  IO port SCK |
| //                                  io_out[4]  IO port SDO |
| //                                  io_out[5]  IO port input latch |
| //                                  io_out[6]  IO port output latch |
| //                                  io_out[7]  unused, driven low |
| // |
| // io_in[7:4] are not used. |
| module wren6991_whisk_tt2_io_wrapper ( |
|     input  wire [7:0] io_in, |
|     output wire [7:0] io_out |
| ); |
| |
| whisk_top top_u ( |
|     // Global signals |
|     .io_clk            (io_in[0]), |
|     .io_rst_n          (io_in[1]), |
| |
|     // SPI memory interface |
|     .io_mem_sdi        (io_in[2]), |
|     .io_mem_csn        (io_out[0]), |
|     .io_mem_sck        (io_out[1]), |
|     .io_mem_sdo        (io_out[2]), |
| |
|     // IO port (shift register interface) |
|     .io_ioport_sdi     (io_in[3]), |
|     .io_ioport_sck     (io_out[3]), |
|     .io_ioport_sdo     (io_out[4]), |
|     .io_ioport_latch_i (io_out[5]), |
|     .io_ioport_latch_o (io_out[6]) |
| ); |
| |
| // Be a good neighbour: never leave the spare output floating. |
| assign io_out[7] = 1'b0; |
| |
| endmodule |
| |
| // ============================================================================ |
| // Module whisk_top: instantiate the CPU core together with the SPI mem |
| // serdes and IO port serdes. |
| // ============================================================================ |
| |
| // Ties together the CPU core and the two pad-facing serdes blocks, and |
| // generates the internal reset. All *_next signals leave the core one cycle |
| // before they appear on the pads; all *_prev signals are pad inputs as seen |
| // by the core after the serdes input path. |
| module whisk_top ( |
|     input  wire io_clk, |
|     input  wire io_rst_n, |
| |
|     input  wire io_mem_sdi, |
|     output wire io_mem_csn, |
|     output wire io_mem_sck, |
|     output wire io_mem_sdo, |
| |
|     input  wire io_ioport_sdi, |
|     output wire io_ioport_sck, |
|     output wire io_ioport_sdo, |
|     output wire io_ioport_latch_i, |
|     output wire io_ioport_latch_o |
| ); |
| |
| // ---------------------------------------------------------------------------- |
| // Clock/reset wrangling |
| |
| // The clock is used unbuffered -- seems like the scripts define a clock on |
| // io_in[0]? |
| wire clk = io_clk; |
| |
| // Reset asserts asynchronously, and is released synchronously: a 1 is shifted |
| // through two flops, so rst_n rises two clk edges after io_rst_n goes high. |
| reg [1:0] reset_sync; |
| |
| always @ (posedge clk or negedge io_rst_n) begin |
|     if (!io_rst_n) |
|         reset_sync <= 2'b00; |
|     else |
|         reset_sync <= {reset_sync[0], 1'b1}; |
| end |
| |
| wire rst_n = reset_sync[1]; |
| |
| // ---------------------------------------------------------------------------- |
| // Core <-> serdes interconnect |
| |
| // SPI memory |
| wire mem_sck_en_next; |
| wire mem_sdo_next; |
| wire mem_csn_next; |
| wire mem_sdi_prev; |
| |
| // IO port |
| wire ioport_sck_en_next; |
| wire ioport_sdo_next; |
| wire ioport_sdi_prev; |
| wire ioport_latch_i_next; |
| wire ioport_latch_o_next; |
| |
| // ---------------------------------------------------------------------------- |
| // Processor instantiation |
| |
| whisk_cpu cpu ( |
|     .clk                 (clk), |
|     .rst_n               (rst_n), |
| |
|     .mem_sck_en_next     (mem_sck_en_next), |
|     .mem_sdo_next        (mem_sdo_next), |
|     .mem_csn_next        (mem_csn_next), |
|     .mem_sdi_prev        (mem_sdi_prev), |
| |
|     .ioport_sck_en_next  (ioport_sck_en_next), |
|     .ioport_sdo_next     (ioport_sdo_next), |
|     .ioport_sdi_prev     (ioport_sdi_prev), |
|     .ioport_latch_i_next (ioport_latch_i_next), |
|     .ioport_latch_o_next (ioport_latch_o_next) |
| ); |
| |
| // ---------------------------------------------------------------------------- |
| // Serdes (IO registers) |
| |
| whisk_spi_serdes mem_serdes_u ( |
|     .clk        (clk), |
|     .rst_n      (rst_n), |
| |
|     // Core side |
|     .sdo        (mem_sdo_next), |
|     .sck_en     (mem_sck_en_next), |
|     .csn        (mem_csn_next), |
|     .sdi        (mem_sdi_prev), |
| |
|     // Pad side |
|     .padout_sck (io_mem_sck), |
|     .padout_csn (io_mem_csn), |
|     .padout_sdo (io_mem_sdo), |
|     .padin_sdi  (io_mem_sdi) |
| ); |
| |
| whisk_ioport_serdes io_serdes_u ( |
|     .clk            (clk), |
|     .rst_n          (rst_n), |
| |
|     // Core side |
|     .sdo            (ioport_sdo_next), |
|     .sck_en         (ioport_sck_en_next), |
|     .latch_i        (ioport_latch_i_next), |
|     .latch_o        (ioport_latch_o_next), |
|     .sdi            (ioport_sdi_prev), |
| |
|     // Pad side |
|     .padout_sdo     (io_ioport_sdo), |
|     .padout_sck     (io_ioport_sck), |
|     .padout_latch_i (io_ioport_latch_i), |
|     .padout_latch_o (io_ioport_latch_o), |
|     .padin_sdi      (io_ioport_sdi) |
| ); |
| |
| endmodule |
| |
| // ============================================================================ |
| // Module whisk_cpu: top-level for the Whisk processor, minus the IO wrapper |
| // and the SPI/IOPORT serdes |
| // ============================================================================ |
| |
| // Bit-serial core. Every state of the control FSM lasts exactly 16 clocks |
| // (one pass of bit_ctr), during which all shift registers rotate once and one |
| // bit of each operand is processed per clock, LSB first. |
| // |
| // Ports: |
| //   clk, rst_n           Clock and active-low asynchronous reset. |
| //   mem_*_next           SPI SRAM outputs, registered once by the serdes before |
| //                        reaching the pads. |
| //   mem_sdi_prev         SPI SRAM read data as presented by the serdes. |
| //   ioport_*_next        IO port shift-register outputs (registered by serdes). |
| //   ioport_sdi_prev      IO port serial input as presented by the serdes. |
| module whisk_cpu ( |
|     input  wire clk, |
|     input  wire rst_n, |
| |
|     // SPI SRAM interface |
|     output wire mem_sck_en_next, |
|     output wire mem_sdo_next, |
|     output wire mem_csn_next, |
|     input  wire mem_sdi_prev, |
| |
|     // Shift registers for IO port |
|     output wire ioport_sck_en_next, |
|     output wire ioport_sdo_next, |
|     input  wire ioport_sdi_prev, |
|     output wire ioport_latch_i_next, |
|     output wire ioport_latch_o_next |
| ); |
| |
| // ---------------------------------------------------------------------------- |
| // Constants |
| |
| // Machine size |
| localparam W_INSTR = 16; |
| localparam W_DATA  = 16; |
| localparam N_REGS  = 6; |
| |
| // Instruction layout |
| localparam INSTR_OP_LSB   = 0; |
| localparam INSTR_OP_MSB   = 3; |
| localparam INSTR_COND_LSB = 4; |
| localparam INSTR_COND_MSB = 6; |
| localparam INSTR_RT_LSB   = 7; |
| localparam INSTR_RT_MSB   = 9; |
| localparam INSTR_RS_LSB   = 10; |
| localparam INSTR_RS_MSB   = 12; |
| localparam INSTR_RD_LSB   = 13; |
| localparam INSTR_RD_MSB   = 15; |
| |
| // Major opcodes (instr[3:0]) |
| localparam [3:0] OP_ADD    = 4'h0; // rd = rs + rt |
| localparam [3:0] OP_SUB    = 4'h1; // rd = rs - rt |
| localparam [3:0] OP_AND    = 4'h2; // rd = rs & rt |
| localparam [3:0] OP_ANDN   = 4'h3; // rd = ~rs & rt |
| localparam [3:0] OP_OR     = 4'h4; // rd = rs | rt |
| localparam [3:0] OP_SHIFT  = 4'h5; // Minor opcode in rt |
| localparam [3:0] OP_INOUT  = 4'h6; // Minor opcode in rs |
| |
| localparam [3:0] OP_LD     = 4'h8; // rd = mem[rs ]; |
| localparam [3:0] OP_LD_IA  = 4'h9; // rd = mem[rs ]; rs += rt; |
| localparam [3:0] OP_LD_ADD = 4'ha; // rd = mem[rs + rt]; |
| localparam [3:0] OP_LD_IB  = 4'hb; // rd = mem[rs + rt]; rs += rt; |
| |
| localparam [3:0] OP_ST     = 4'hc; // mem[rs ] = rd; |
| localparam [3:0] OP_ST_IA  = 4'hd; // mem[rs ] = rd; rs += rt; |
| localparam [3:0] OP_ST_ADD = 4'he; // mem[rs + rt] = rd; |
| localparam [3:0] OP_ST_IB  = 4'hf; // mem[rs + rt] = rd; rs += rt; |
| |
| // Minor opcodes (rt) |
| localparam [2:0] OP2_SRL = 3'h0; |
| localparam [2:0] OP2_SRA = 3'h1; |
| localparam [2:0] OP2_SLL = 3'h4; |
| |
| // Minor opcodes (rs) |
| localparam [2:0] OP2_IN  = 3'h0; |
| localparam [2:0] OP2_OUT = 3'h4; |
| |
| // ---------------------------------------------------------------------------- |
| // Main control state machine |
| |
| reg [W_INSTR-1:0] instr; |
| |
| wire [INSTR_OP_MSB  -INSTR_OP_LSB  :0] instr_op; |
| wire [INSTR_COND_MSB-INSTR_COND_LSB:0] instr_cond; |
| wire [INSTR_RT_MSB  -INSTR_RT_LSB  :0] instr_rt; |
| wire [INSTR_RS_MSB  -INSTR_RS_LSB  :0] instr_rs; |
| wire [INSTR_RD_MSB  -INSTR_RD_LSB  :0] instr_rd; |
| |
| assign {instr_rd, instr_rs, instr_rt, instr_cond, instr_op} = instr; |
| |
| wire instr_op_ls      = instr_op[3]; // Whether an instruction is a load/store |
| wire instr_op_st_nld  = instr_op[2]; // For a load/store: 1 = store, 0 = load |
| wire instr_op_ls_suma = instr_op[1]; // Whether sum is used for address |
| wire instr_op_ls_sumr = instr_op[0]; // Whether sum is written back to register |
| |
| reg [3:0] bit_ctr; |
| reg [2:0] state; |
| reg       instr_cond_true; |
| reg       instr_has_imm_operand; |
| |
| // Note there is a 2 cycle delay from issuing a bit on SDO to getting a bit |
| // back on SDI. This is handled with a 1-cycle gap after issuing a read |
| // address, so that e.g. S_FETCH always has the first instruction bit |
| // available on the first cycle. |
| |
| // Every state needs its own encoding: the case statement below and several |
| // datapath controls compare against individual states, so two states sharing |
| // a value (as S_LS_ADDR0/S_LS_ADDR1 previously did) makes the FSM spin forever |
| // in the shared state on any load/store. |
| localparam [2:0] S_FETCH      = 3'd0; // Sample 16 instr bits, increment PC |
| localparam [2:0] S_EXEC       = 3'd1; // Loop all GPRs, write one GPR |
| localparam [2:0] S_PC_NONSEQ0 = 3'd2; // Issue cmd, then issue 1 PC bit |
| localparam [2:0] S_PC_NONSEQ1 = 3'd3; // Issue rest of PC, then 1 cyc delay |
| localparam [2:0] S_LS_ADDR0   = 3'd4; // Deferred LS SPI cmd following immediate |
| localparam [2:0] S_LS_ADDR1   = 3'd5; // Issue addr then, if load, 1 cyc delay |
| localparam [2:0] S_LS_DATA    = 3'd6; // Issue store data, or sample load data |
| localparam [2:0] S_SKIP_IMM   = 3'd7; // Skip immediate following false condition |
| |
| reg [2:0] state_nxt_wrap; // State to enter when bit_ctr wraps |
| reg [2:0] state_nxt; |
| |
| always @ (*) begin |
|     state_nxt_wrap = state; |
|     case (state) |
|     S_FETCH: begin |
|         if (!instr_cond_true) begin |
|             if (instr_has_imm_operand) begin |
|                 state_nxt_wrap = S_SKIP_IMM; |
|             end else begin |
|                 state_nxt_wrap = S_FETCH; |
|             end |
|         end else begin |
|             state_nxt_wrap = S_EXEC; |
|         end |
|     end |
|     S_EXEC: begin |
|         if (instr_op_ls && instr_has_imm_operand) begin |
|             // Command was deferred due to immediate read keeping SPI busy |
|             state_nxt_wrap = S_LS_ADDR0; |
|         end else if (instr_op_ls) begin |
|             // Command was issued concurrently, skip straight to address issue |
|             state_nxt_wrap = S_LS_ADDR1; |
|         end else if (instr_rd == 3'd7) begin |
|             // Wrote the PC: refetch from the new address |
|             state_nxt_wrap = S_PC_NONSEQ0; |
|         end else begin |
|             state_nxt_wrap = S_FETCH; |
|         end |
|     end |
|     S_PC_NONSEQ0: begin |
|         state_nxt_wrap = S_PC_NONSEQ1; |
|     end |
|     S_PC_NONSEQ1: begin |
|         // Always continue with a fetch. (If instr_cond_true is clear here, we |
|         // have just been reset and instr is invalid, but the next state is |
|         // the same either way.) |
|         state_nxt_wrap = S_FETCH; |
|     end |
|     S_LS_ADDR0: begin |
|         state_nxt_wrap = S_LS_ADDR1; |
|     end |
|     S_LS_ADDR1: begin |
|         state_nxt_wrap = S_LS_DATA; |
|     end |
|     S_LS_DATA: begin |
|         // The data access broke the sequential read, so re-issue the PC |
|         state_nxt_wrap = S_PC_NONSEQ0; |
|     end |
|     S_SKIP_IMM: begin |
|         state_nxt_wrap = S_FETCH; |
|     end |
|     endcase |
|     // Only change state on the last bit of each 16-cycle pass |
|     state_nxt = &bit_ctr ? state_nxt_wrap : state; |
| end |
| |
| // Start of day: |
| // |
| // - The only resettable flops are state, bit_ctr, and instr_cond_true. |
| // |
| // - We reset state/bit_ctr to a nonsequential fetch, and reset |
| //   instr_cond_true=0 (usually unreachable) |
| // |
| // - instr_cond_true=0 masks the fetch address to 0, regardless of PC |
| // |
| // - The first instruction must be `add pc, zero, #4` to initialise PC |
| |
| always @ (posedge clk or negedge rst_n) begin |
|     if (!rst_n) begin |
|         state   <= S_PC_NONSEQ0; |
|         bit_ctr <= 4'h0; |
|     end else begin |
|         state   <= state_nxt; |
|         bit_ctr <= bit_ctr + 4'h1; |
|     end |
| end |
| |
| // ---------------------------------------------------------------------------- |
| // Instruction shifter and early decode |
| |
| // Instruction bits arrive LSB-first and are shifted in from the left. |
| always @ (posedge clk) begin |
|     if (state == S_FETCH) begin |
|         instr <= {mem_sdi_prev, instr[15:1]}; |
|     end |
| end |
| |
| // Decode condition and imm operand flags as the instruction comes in, so we |
| // can use them to steer the state machine at the end of S_FETCH. |
| |
| reg instr_has_imm_operand_nxt; |
| reg instr_cond_true_nxt; |
| |
| // From ALU: |
| wire [7:0] condition_vec8; |
| |
| always @ (*) begin |
|     instr_has_imm_operand_nxt = instr_has_imm_operand; |
|     instr_cond_true_nxt       = instr_cond_true; |
| |
|     if (instr_has_imm_operand && !instr_cond_true) begin |
|         // In this case we must be in S_FETCH. Hold instr_cond_true for an |
|         // additional fetch cycle so that the immediate operand is also |
|         // dumped, but clear the operand flag so we don't loop forever. |
|         if (&bit_ctr) begin |
|             instr_has_imm_operand_nxt = 1'b0; |
|         end |
|     end else if (state == S_FETCH) begin |
|         if (bit_ctr == (INSTR_RT_MSB + 1)) begin |
|             // Grab rt as it goes past (this is why rt is not the MSBs!) |
|             instr_has_imm_operand_nxt = instr[W_INSTR-1 -: 3] == 3'd6; |
|         end |
|         if (bit_ctr == (INSTR_COND_MSB + 1)) begin |
|             // Decode condition as it goes past |
|             instr_cond_true_nxt = condition_vec8[instr[W_INSTR-1 -: 3]]; |
|         end |
|     end |
| end |
| |
| // instr_cond_true must reset to 0, because we use it to recognise the first |
| // fetch after reset. We don't care about instr_has_imm_operand, because it |
| // is initialised during S_FETCH before first use. |
| |
| always @ (posedge clk or negedge rst_n) begin |
|     if (!rst_n) begin |
|         instr_cond_true <= 1'b0; |
|     end else begin |
|         instr_cond_true <= instr_cond_true_nxt; |
|     end |
| end |
| |
| always @ (posedge clk) begin |
|     instr_has_imm_operand <= instr_has_imm_operand_nxt; |
| end |
| |
| // ---------------------------------------------------------------------------- |
| // Register file |
| |
| wire reg_rd_qr; |
| wire reg_rs_qr, reg_rs_qr_next; |
| wire reg_rt_qr; |
| |
| wire alu_result; |
| |
| wire writeback_wen = |
|     state == S_EXEC     && !(instr_op_ls && !instr_op_ls_sumr) || |
|     state == S_LS_ADDR0 && instr_op_ls_sumr                    || |
|     state == S_LS_DATA  && !instr_op_st_nld; |
| |
| wire writeback_data = state == S_LS_DATA ? mem_sdi_prev : alu_result; |
| |
| // Load/stores write the address sum back to rs, except in the data phase |
| // where a load writes rd. |
| wire [INSTR_RD_MSB-INSTR_RD_LSB:0] writeback_reg = |
|     instr_op_ls && state != S_LS_DATA ? instr_rs : instr_rd; |
| |
| whisk_regfile #( |
|     .W (W_DATA), |
|     .N (N_REGS) |
| ) regfile_u ( |
|     .clk       (clk), |
| |
|     .rd        (writeback_reg), |
|     .rd_q      (reg_rd_qr), |
|     .rd_wen    (writeback_wen), |
|     .rd_d      (writeback_data), |
| |
|     .rs        (instr_rs), |
|     .rs_q      (reg_rs_qr), |
|     .rs_q_next (reg_rs_qr_next), |
| |
|     .rt        (instr_rt), |
|     .rt_q      (reg_rt_qr) |
| ); |
| |
| // ---------------------------------------------------------------------------- |
| // Program counter |
| |
| wire pc_dl; |
| wire pc_qr; |
| |
| wire [15:0] pc_q_all; |
| wire pc_qr_next = pc_q_all[1]; |
| |
| whisk_shiftreg_right #( |
|     .W (16) |
| ) pc_u ( |
|     .clk   (clk), |
|     .dl    (pc_dl), |
|     .q_all (pc_q_all), |
|     .qr    (pc_qr) |
| ); |
| |
| wire pc_increment = |
|     state == S_FETCH                         || |
|     state == S_EXEC && instr_has_imm_operand || |
|     state == S_SKIP_IMM; |
| |
| reg  pc_ci; |
| wire pc_co, pc_sum; |
| |
| // Serial increment by 2: nothing is added at bit 0, a 1 is injected at bit 1 |
| // when incrementing, and the registered carry ripples through bits 2..15. |
| assign {pc_co, pc_sum} = pc_qr + (~|bit_ctr[3:1] ? bit_ctr[0] && pc_increment : pc_ci); |
| |
| always @ (posedge clk) begin |
|     pc_ci <= pc_co; |
| end |
| |
| wire rd_is_pc = instr_rd == 3'd7; |
| |
| assign pc_dl = |
|     state == S_EXEC    && rd_is_pc                     ? alu_result   : |
|     state == S_LS_DATA && rd_is_pc && !instr_op_st_nld ? mem_sdi_prev : pc_sum; |
| |
| // ---------------------------------------------------------------------------- |
| // ALU |
| |
| // Register index 7 reads the PC; rt index 6 reads the immediate streaming in |
| // from memory. |
| wire alu_op_s = |
|     instr_rs == 3'd7 ? pc_qr : reg_rs_qr; |
| |
| wire alu_op_s_next = |
|     instr_rs == 3'd7 ? pc_qr_next : reg_rs_qr_next; |
| |
| wire alu_op_t = |
|     instr_rt == 3'd7 ? pc_qr        : |
|     instr_rt == 3'd6 ? mem_sdi_prev : reg_rt_qr; |
| |
| reg alu_ci; |
| wire [1:0] alu_add = alu_op_s +  alu_op_t + (~|bit_ctr ? 1'b0 : alu_ci); |
| wire [1:0] alu_sub = alu_op_s + !alu_op_t + (~|bit_ctr ? 1'b1 : alu_ci); |
| |
| // Left shift uses the carry flop as a 1-cycle delay, counter to the |
| // register's rightward rotation. Right shift looks ahead to advance its |
| // rotation. The final carry flag is the bit shifted "out of" the register. |
| |
| wire [1:0] alu_shift_l = { |
|     alu_op_s, |
|     alu_ci && |bit_ctr |
| }; |
| |
| wire [1:0] alu_shift_r = { |
|     |bit_ctr ? alu_ci                   : alu_op_s, |
|     &bit_ctr ? alu_op_s && instr_rt[0]  : alu_op_s_next |
| }; |
| |
| // Carry is an all-ones flag for bitwise ops |
| wire bit_co = alu_result && (alu_ci || ~|bit_ctr); |
| |
| wire alu_co; |
| assign {alu_co, alu_result} = |
|     instr_op_ls                          ? alu_add                          : |
|     instr_op == OP_ADD                   ? alu_add                          : |
|     instr_op == OP_SUB                   ? alu_sub                          : |
|     instr_op == OP_AND                   ? {bit_co,  alu_op_s && alu_op_t}  : |
|     instr_op == OP_ANDN                  ? {bit_co, !alu_op_s && alu_op_t}  : |
|     instr_op == OP_OR                    ? {bit_co,  alu_op_s || alu_op_t}  : |
|     instr_op == OP_SHIFT &&  instr_rt[2] ? alu_shift_l                      : |
|     instr_op == OP_SHIFT && !instr_rt[2] ? alu_shift_r                      : |
|     instr_op == OP_INOUT                 ? {1'b0, ioport_sdi_prev}          : alu_add; |
| |
| always @ (posedge clk) begin |
|     alu_ci <= alu_co; |
| end |
| |
| // ---------------------------------------------------------------------------- |
| // Flags |
| |
| reg flag_z; |
| reg flag_c; |
| reg flag_n; |
| |
| // Only instructions with condition code 0 update the flags. |
| wire update_flag_zn = (state == S_EXEC || state == S_LS_DATA) && ~|instr_cond; |
| wire update_flag_c  = update_flag_zn && state == S_EXEC; |
| |
| always @ (posedge clk) begin |
|     if (update_flag_zn) begin |
|         // Z accumulates "all bits so far were zero"; N ends up holding the |
|         // last (most significant) result bit. |
|         flag_z <= (flag_z || ~|bit_ctr) && !alu_result; |
|         flag_n <= alu_result; |
|     end |
|     if (update_flag_c) begin |
|         flag_c <= alu_co; |
|     end |
| end |
| |
| // Indexed by the 3-bit condition field; conditions 0 and 1 are always true. |
| assign condition_vec8 = { |
|     !flag_z, flag_z, |
|     !flag_c, flag_c, |
|     !flag_n, flag_n, |
|     1'b1,    1'b1 |
| }; |
| |
| // ---------------------------------------------------------------------------- |
| // Address register |
| |
| // Captures address calculations LSB-first and then replays them MSB-first. |
| |
| wire ar_l_nr; |
| wire ar_dl; |
| wire ar_dr; |
| wire ar_ql; |
| wire ar_qr; |
| |
| // Need to look ahead by one bit to get correct timing for read addresses: |
| wire [15:0] ar_q_all; |
| wire ar_ql_next = ar_q_all[14]; |
| |
| whisk_shiftreg_leftright #( |
|     .W (16) |
| ) ar_u ( |
|     .clk   (clk), |
|     .l_nr  (ar_l_nr), |
|     .dl    (ar_dl), |
|     .ql    (ar_ql), |
|     .dr    (ar_dr), |
|     .qr    (ar_qr), |
|     .q_all (ar_q_all) |
| ); |
| |
| // Shift left when replaying addresses. |
| assign ar_l_nr = state == S_LS_ADDR1 || state == S_PC_NONSEQ1; |
| |
| assign ar_dl = |
|     state == S_PC_NONSEQ0 ? pc_qr   : |
|     instr_op_ls_suma      ? alu_add : reg_rs_qr; |
| |
| // Bits shifted in from the right during replay can only show up on the |
| // don't-care final cycle of the replay, and the register is then refilled |
| // from the left by the next 16-cycle capture. Tie the input off rather than |
| // leave the net undriven. |
| assign ar_dr = 1'b0; |
| |
| // ---------------------------------------------------------------------------- |
| // SPI controls |
| |
| // Deassert CSn before issuing a nonsequential address. |
| |
| // Note LS_ADDR0 state is skipped if we are able to issue from EXEC: |
| wire issue_ls_addr_ph0 = |
|     state == S_LS_ADDR0 || |
|     state == S_EXEC && instr_op_ls && !instr_has_imm_operand; |
| |
| // Reads start one cycle earlier than writes, because the first address bit of |
| // a read is issued on the last cycle of the command state. |
| wire [3:0] spi_cmd_start_cycle = |
|     state == S_PC_NONSEQ0 ? 4'h7 : |
|     instr_op_st_nld       ? 4'h8 : 4'h7; |
| |
| assign mem_csn_next = bit_ctr < spi_cmd_start_cycle && ( |
|     state == S_PC_NONSEQ0 || issue_ls_addr_ph0 |
| ); |
| |
| // Pedal to the metal on SCK except when pulling CSn for a nonsequential |
| // access, or when executing an unskipped instruction with no immediate. |
| |
| assign mem_sck_en_next = !( |
|     mem_csn_next || |
|     state == (&bit_ctr[3:1] ? S_FETCH : S_EXEC) && !instr_has_imm_operand && instr_cond_true |
| ); |
| |
| // Store address replays entirely in LS_ADDR1, but load/fetch extend one cycle |
| // into previous state, so carefully pick what delay to observe the address |
| // with. (Also mask address to zero for very first fetch at start of day.) |
| |
| wire mem_spi_addr = |
|     !instr_cond_true                        ? 1'b0       : |
|     state == S_PC_NONSEQ1                   ? ar_ql_next : |
|     state == S_LS_ADDR1 &&  instr_op_st_nld ? ar_ql      : |
|     state == S_LS_ADDR1 && !instr_op_st_nld ? ar_ql_next : ar_dl; |
| |
| // Note: SPI commands are MSB-first (the commands here are 03h and 02h). |
| // |
| // The constants are indexed by bit_ctr, so the bit at index i is sent on |
| // cycle i. READ occupies cycles 7..14 (bits 13 and 14 set -> 03h), leaving |
| // cycle 15 for the first address bit. WRITE occupies cycles 8..15, so its |
| // single set bit must be at index 14 to give 02h; a set bit at index 15 would |
| // send 01h instead. |
| localparam [15:0] SPI_INSTR_READ  = 16'hc000 >> 1; |
| localparam [15:0] SPI_INSTR_WRITE = 16'h8000 >> 1; |
| |
| wire mem_sdo_ls_addr_ph0 = |
|     instr_op_st_nld ? SPI_INSTR_WRITE[bit_ctr] : |
|     &bit_ctr        ? mem_spi_addr             : SPI_INSTR_READ[bit_ctr]; |
| |
| // In S_PC_NONSEQ0 mem_spi_addr resolves to pc_qr (via ar_dl) whenever |
| // instr_cond_true is set, and to 0 on the very first fetch after reset, so |
| // the address MSB is masked along with the rest of the start-of-day address. |
| assign mem_sdo_next = |
|     state == S_PC_NONSEQ0 ? (&bit_ctr ? mem_spi_addr : SPI_INSTR_READ[bit_ctr]) : |
|     state == S_PC_NONSEQ1 ? mem_spi_addr                                        : |
|     issue_ls_addr_ph0     ? mem_sdo_ls_addr_ph0                                 : |
|     state == S_LS_ADDR1   ? mem_spi_addr                                        : |
|     state == S_LS_DATA    ? reg_rd_qr                                           : 1'b0; |
| |
| // ---------------------------------------------------------------------------- |
| // IO port |
| |
| // Expected hardware is a 1x 8-bit PISO, and 2x 8-bit SIPO shift registers: |
| // |
| // - OUT: Clock out 16 bits from rt[15:0]/imm[15:0], then pulse latch_o high. |
| // |
| // - IN: Clock 8 bits into rd[15:8], with latch_i low for the first clock. |
| // |
| // The IN interface is still driven when executing an OUT, with more clocks. |
| // Abusable for 6 extra inputs if a second PISO register is chained. |
| // |
| // rt[13:6] is actually clocked out on an IN, there's just no latch_o pulse. |
| // Abusable to drive longer SIPO chains using multiple INs and a final OUT. |
| |
| wire exec_io_instr = state == S_EXEC && instr_op == OP_INOUT; |
| wire io_instr_out  = (instr_rs & (OP2_OUT | OP2_IN)) == OP2_OUT; |
| |
| // The instruction is still valid on the first cycle of FETCH. This lets us |
| // latch outputs *after* the last clock pulse, without spending a flop. |
| assign ioport_latch_o_next = state == S_FETCH && ~|bit_ctr && |
|     instr_op == OP_INOUT && io_instr_out && instr_cond_true; |
| |
| // Active low: pulses low for the first of the eight IN clocks. |
| assign ioport_latch_i_next = !(exec_io_instr && bit_ctr == 4'h6); |
| |
| assign ioport_sdo_next = exec_io_instr && alu_op_t; |
| |
| // IN clocks only during cycles 6..13; OUT clocks for the whole pass. |
| assign ioport_sck_en_next = exec_io_instr && ( |
|     (bit_ctr >= 4'h6 && bit_ctr < 4'he) || |
|     io_instr_out |
| ); |
| |
| endmodule |
| |
| // ============================================================================ |
| // Module whisk_regfile: a register file of multiple shift registers, with 3 |
| // read ports (rd/rs/rt) and one write port (rd). |
| // ============================================================================ |
| |
| // All registers rotate right by one bit every cycle. No enable, so do things |
| // in multiples of 16 cycles. Registers not written to are recirculated. |
| // |
| // q is the value of the rightmost flop in each register. The rs port also has |
| // a q_next value, which taps in one flop from the end, and is required for |
| // performing right-shift-by-one in 16 cycles. |
| // |
| // Out-of-range indices read as 0, and ignore writes. |
| |
| // Parameters: |
| //   W  Width of each register in bits. |
| //   N  Number of implemented registers. Indices N and above (up to the next |
| //      power of two) read as zero and ignore writes. |
| // |
| // Ports: |
| //   rd, rd_wen, rd_d  Write port: when rd_wen is high, rd_d is shifted into |
| //                     the left end of register rd instead of recirculating. |
| //   rd_q              Rightmost bit of register rd. |
| //   rs, rs_q          Read port: rightmost bit of register rs. |
| //   rs_q_next         Bit one position in from the right end of register rs. |
| //   rt, rt_q          Read port: rightmost bit of register rt. |
| module whisk_regfile #( |
|     parameter W = 16, |
|     parameter N = 6 |
| ) ( |
|     input  wire                 clk, |
| |
|     input  wire [$clog2(N)-1:0] rd, |
|     output wire                 rd_q, |
|     input  wire                 rd_wen, |
|     input  wire                 rd_d, |
| |
|     input  wire [$clog2(N)-1:0] rs, |
|     output wire                 rs_q, |
|     output wire                 rs_q_next, |
| |
|     input  wire [$clog2(N)-1:0] rt, |
|     output wire                 rt_q |
| ); |
| |
| // Pad the register array to a power of two so every index value is legal. |
| localparam N_PADDED = 1 << $clog2(N); |
| |
| wire [N-1:0] d;                  // Per-register serial input |
| wire [W-1:0] q [N_PADDED-1:0];   // Full contents of each register |
| |
| assign rd_q      = q[rd][0]; |
| assign rs_q      = q[rs][0]; |
| assign rs_q_next = q[rs][1]; |
| assign rt_q      = q[rt][0]; |
| |
| genvar g; |
| generate |
| for (g = 0; g < N_PADDED; g = g + 1) begin: loop_gprs |
|     if (g >= N) begin: gpr_tieoff |
| |
|         // Unimplemented register: reads as zero |
|         assign q[g] = {W{1'b0}}; |
| |
|     end else begin: gpr_shifter |
| |
|         // Recirculate unless register is addressed as rd. |
|         wire qr; |
|         assign d[g] = rd_wen && rd == g ? rd_d : qr; |
| |
|         whisk_shiftreg_right #( |
|             .W (W) |
|         ) reg_u ( |
|             .clk   (clk), |
|             .dl    (d[g]), |
|             .qr    (qr), |
|             .q_all (q[g]) |
|         ); |
| |
|     end |
| end |
| endgenerate |
| |
| endmodule |
| |
| // ============================================================================ |
| // Module whisk_shiftreg_leftright: a shift register that always shifts left |
| // or right each cycle. |
| // ============================================================================ |
| |
| // Note there is no enable because the underlying scan flops do not have an |
| // enable (there is an enable version, but it's larger, and more routing |
| // required!). If you don't want to shift, just shift back and forth for an |
| // even number of cycles, or do a full loop :) |
| // |
| // dl and ql are the leftmost inputs and outputs. If l_nr is low (right), ql |
| // becomes dl on every posedge of clk. (Yes, it's confusing!) |
| // |
| // dr and qr are the rightmost inputs and outputs. If l_nr is high (left), qr |
| // becomes dr on every posedge of clk. |
| |
| // Parameters: |
| //   W      Number of bits in the register. |
| // |
| // Ports: |
| //   l_nr   Direction for the next clock edge: 1 = shift left, 0 = shift right. |
| //   dl/ql  Serial input / current value at the left (most significant) end. |
| //   dr/qr  Serial input / current value at the right (least significant) end. |
| //   q_all  Parallel view of all W bits. |
| module whisk_shiftreg_leftright #( |
|     parameter W = 16 |
| ) ( |
|     input  wire         clk, |
|     input  wire         l_nr, |
|     input  wire         dl, |
|     input  wire         dr, |
|     output wire         ql, |
|     output wire         qr, |
|     output wire [W-1:0] q_all |
| ); |
| |
| // The register bits sit at tap[W:1], with the two serial inputs as virtual |
| // neighbours beyond either end, so every stage can be wired identically. |
| wire [W+1:0] tap; |
| |
| assign tap[W + 1] = dl; |
| assign tap[0]     = dr; |
| |
| assign q_all = tap[W:1]; |
| assign ql    = tap[W]; |
| assign qr    = tap[1]; |
| |
| genvar g; |
| generate |
| for (g = 1; g < W + 1; g = g + 1) begin: shift_stage |
|     // Shift-to-left means select the input to your right, and vice versa. |
|     wire from_right = tap[g - 1]; |
|     wire from_left  = tap[g + 1]; |
| |
|     whisk_flop_scanmux flop_u ( |
|         .clk (clk), |
|         .sel (l_nr), |
|         .d   ({from_right, from_left}), |
|         .q   (tap[g]) |
|     ); |
| end |
| endgenerate |
| |
| endmodule |
| |
| // ============================================================================ |
| // Module whisk_shiftreg_right: register that only shifts right, like Zoolander |
| // ============================================================================ |
| |
| // Cost per bit is lower than whisk_shiftreg_leftright |
| |
| // Parameters: |
| //   W      Number of bits in the register. |
| // |
| // Ports: |
| //   dl     Serial input, enters at the left (most significant) end. |
| //   qr     Current rightmost (least significant) bit. |
| //   q_all  Parallel view of all W bits. |
| // |
| // There is no enable and no reset: the register shifts on every clock edge. |
| module whisk_shiftreg_right #( |
|     parameter W = 16 |
| ) ( |
|     input  wire         clk, |
|     input  wire         dl, |
|     output wire         qr, |
|     output reg  [W-1:0] q_all |
| ); |
| |
| // Next contents: everything moves one place right, dl fills the top bit. |
| wire [W-1:0] q_shifted = {dl, q_all[W-1:1]}; |
| |
| always @ (posedge clk) begin |
|     q_all <= q_shifted; |
| end |
| |
| assign qr = q_all[0]; |
| |
| endmodule |
| |
| // ============================================================================ |
| // Module whisk_flop_scanmux: a flop with a mux on its input. Usually reserved |
| // for DFT scan insertion, but we don't need that where we're going >:) |
| // ============================================================================ |
| |
| // Ports: |
| // clk  Clock; q updates on the rising edge. |
| // sel  Input select: the synthesisable model captures d[sel], i.e. d[1] when |
| //      sel is high and d[0] when sel is low. |
| // d    The two candidate data inputs. |
| // q    Registered output. No reset. |
| module whisk_flop_scanmux ( |
| input wire clk, |
| input wire sel, |
| input wire [1:0] d, |
| output wire q |
| ); |
| |
| `ifdef WHISK_CELLS_SKY130 |
| |
| // (scanchain in TT2 uses sky130_fd_sc_hd__sdfxtp, a simple flop with scan |
| // mux. An enable version, sky130_fd_sc_hd__sedfxtp, is also available, but |
| // this is significantly larger. Instantiate the unit-drive version because |
| // we have a ridiculously long clock period; not sure whether the backend is |
| // allowed to change the drive.) |
| |
| // d[0] drives the normal data pin, d[1] the scan data pin, and sel the scan |
| // enable. NOTE(review): presumably SCE high selects SCD, matching the d[sel] |
| // model below -- confirm against the cell documentation. |
| sky130_fd_sc_hd__sdfxtp_1 sdff_u ( |
| .CLK (clk), |
| .D (d[0]), |
| .SCD (d[1]), |
| .SCE (sel), |
| .Q (q), |
| .VPWR (1'b1), |
| .VGND (1'b0) |
| ); |
| |
| `else |
| |
| // Synthesisable model |
| |
| reg q_r; |
| always @ (posedge clk) begin |
| q_r <= d[sel]; |
| end |
| |
| assign q = q_r; |
| |
| `endif |
| |
| endmodule |
| |
| // ============================================================================ |
| // Module whisk_flop_en: a flop with an input enable (DFFE). For some reason |
| // these are not mapped automatically, so we get a DFF, a mux and two buffers |
| // ============================================================================ |
| |
| // Ports: |
| // clk  Clock; q updates on the rising edge. |
| // d    Data input. |
| // e    Enable: in the synthesisable model q captures d only when e is high, |
| //      and otherwise holds its value. |
| // q    Registered output. No reset. |
| // |
| // Nothing else in this file instantiates this module. |
| module whisk_flop_en ( |
| input wire clk, |
| input wire d, |
| input wire e, |
| output wire q |
| ); |
| |
| `ifdef WHISK_CELLS_SKY130 |
| |
| // Direct instantiation of the library enable flop, with e on its DE pin. |
| sky130_fd_sc_hd__edfxtp_1 dffe_u ( |
| .CLK (clk), |
| .D (d), |
| .DE (e), |
| .Q (q), |
| .VPWR (1'b1), |
| .VGND (1'b0) |
| ); |
| |
| `else |
| |
| // Synthesisable model |
| |
| reg q_r; |
| always @ (posedge clk) begin |
| if (e) begin |
| q_r <= d; |
| end |
| end |
| |
| assign q = q_r; |
| |
| `endif |
| |
| endmodule |
| // ============================================================================ |
| // Module whisk_spi_serdes: handle the timing of the SPI interface, and |
| // provide a slightly abstracted interface to the whisk core, with all |
| // signals on posedge of clk. |
| // ============================================================================ |
| |
| // Ports: |
| //   clk, rst_n   Clock and active-low asynchronous reset. |
| //   sdo, csn     Core-side values for the next cycle's SDO and CSn pads. |
| //   sck_en       Core-side enable for an SCK pulse in the next cycle. |
| //   sdi          Pad input as presented to the core. |
| //   padout_*     Pad outputs. SCK is the inverted clock, gated by the |
| //                registered enable. |
| //   padin_sdi    Pad input from the SPI memory. |
| // |
| // Out of reset the interface is idle: CSn high, SDO low, SCK gated off. |
| module whisk_spi_serdes( |
|     input  wire clk, |
|     input  wire rst_n, |
| |
|     // Core |
|     input  wire sdo, |
|     input  wire sck_en, |
|     input  wire csn, |
|     output wire sdi, |
| |
|     // IOs |
|     output wire padout_sck, |
|     output wire padout_csn, |
|     output wire padout_sdo, |
|     input  wire padin_sdi |
| ); |
| |
| // ---------------------------------------------------------------------------- |
| // Output paths |
| |
| // All three outputs are registered together, one cycle behind the core. |
| reg sdo_r; |
| reg csn_r; |
| reg sck_en_r; |
| |
| always @ (posedge clk or negedge rst_n) begin |
|     if (!rst_n) |
|         {sdo_r, csn_r, sck_en_r} <= 3'b010; |
|     else |
|         {sdo_r, csn_r, sck_en_r} <= {sdo, csn, sck_en}; |
| end |
| |
| assign padout_csn = csn_r; |
| assign padout_sdo = sdo_r; |
| |
| // Through-path for clock input to SCK output. TODO clock gating cell |
| // required? This is sampled by the scan flops at the tile output. |
| assign padout_sck = sck_en_r & ~clk; |
| |
| // ---------------------------------------------------------------------------- |
| // Input paths |
| |
| `ifdef WHISK_CELLS_SKY130 |
| |
| // ASIC version: level-sensitive latch, transparent while clk is high. |
| |
| // TODO find a suitable delay buffer cell for hold buffering, and decide how to |
| // dimension it against i[7:0] skew |
| |
| // TODO find a suitable latch cell (possibly sky130_fd_sc_hd__dlxtp) |
| |
| wire padin_sdi_delay = padin_sdi; |
| |
| reg sdi_latch; |
| |
| always @ (*) begin |
|     if (clk) begin |
|         sdi_latch <= padin_sdi_delay; |
|     end |
| end |
| |
| assign sdi = sdi_latch; |
| |
| `else |
| |
| // Dodgy sim-only version: the pad input is passed straight through. The |
| // negedge-sampled register is kept for reference but is not used. |
| |
| reg padin_sdi_reg; |
| always @ (negedge clk) begin |
|     padin_sdi_reg <= padin_sdi; |
| end |
| |
| // FIXME there is something I don't understand here with the CXXRTL delta cycles |
| // assign sdi = padin_sdi_reg; |
| assign sdi = padin_sdi; |
| |
| `endif |
| |
| endmodule |
| |
| // ============================================================================ |
| // Module whisk_ioport_serdes: similar to whisk_spi_serdes, but for the |
| // shift-register-based IO port. |
| // ============================================================================ |
| |
| // Ports: |
| //   clk, rst_n         Clock and active-low asynchronous reset. |
| //   sdo                Core-side value for the next cycle's SDO pad. |
| //   sck_en             Core-side enable for an SCK pulse in the next cycle. |
| //   latch_i, latch_o   Core-side values for the next cycle's latch pads. |
| //   sdi                Pad input as presented to the core. |
| //   padout_*           Pad outputs. SCK is the inverted clock, gated by the |
| //                      registered enable. |
| //   padin_sdi          Pad input from the external input shift register. |
| // |
| // All four registered outputs reset to 0. |
| module whisk_ioport_serdes( |
|     input  wire clk, |
|     input  wire rst_n, |
| |
|     // Core |
|     input  wire sdo, |
|     input  wire sck_en, |
|     input  wire latch_i, |
|     input  wire latch_o, |
|     output wire sdi, |
| |
|     // IOs |
|     output wire padout_sdo, |
|     output wire padout_sck, |
|     output wire padout_latch_i, |
|     output wire padout_latch_o, |
|     input  wire padin_sdi |
| ); |
| |
| // ---------------------------------------------------------------------------- |
| // Output paths |
| |
| // All outputs are registered together, one cycle behind the core. |
| reg sdo_r; |
| reg sck_en_r; |
| reg latch_i_r; |
| reg latch_o_r; |
| |
| always @ (posedge clk or negedge rst_n) begin |
|     if (!rst_n) |
|         {sdo_r, sck_en_r, latch_i_r, latch_o_r} <= 4'b0000; |
|     else |
|         {sdo_r, sck_en_r, latch_i_r, latch_o_r} <= {sdo, sck_en, latch_i, latch_o}; |
| end |
| |
| assign padout_latch_o = latch_o_r; |
| assign padout_latch_i = latch_i_r; |
| assign padout_sdo     = sdo_r; |
| |
| // TODO clock gating cell? |
| assign padout_sck = sck_en_r & ~clk; |
| |
| // ---------------------------------------------------------------------------- |
| // Input paths |
| |
| // FIXME this is actually different from SPI, right? Probably transitions on |
| // posedge? Need to find some actual datasheets for candidate shift |
| // registers. |
| |
| `ifdef WHISK_CELLS_SKY130 |
| |
| // ASIC version: level-sensitive latch, transparent while clk is high. |
| |
| // TODO find a suitable delay buffer cell for hold buffering, and decide how to |
| // dimension it against i[7:0] skew |
| |
| // TODO find a suitable latch cell (possibly sky130_fd_sc_hd__dlxtp) |
| |
| wire padin_sdi_delay = padin_sdi; |
| |
| reg sdi_latch; |
| |
| always @ (*) begin |
|     if (clk) begin |
|         sdi_latch <= padin_sdi_delay; |
|     end |
| end |
| |
| assign sdi = sdi_latch; |
| |
| `else |
| |
| // Dodgy sim-only version: sample the pad on the falling edge of clk. |
| |
| reg padin_sdi_reg; |
| always @ (negedge clk) begin |
|     padin_sdi_reg <= padin_sdi; |
| end |
| |
| assign sdi = padin_sdi_reg; |
| |
| `endif |
| |
| endmodule |
| |
| // ============================================================================ |
| // |
| // _ _ _ |
| // | | (_) | | |
| // __ _| |__ _ ___| | __ |
| // \ \ /\ / / '_ \| / __| |/ / |
| // \ V V /| | | | \__ \ < |
| // \_/\_/ |_| |_|_|___/_|\_\ |
| // |
| // |
| // When I was 16 I designed a 7400-series breadboard processor called Fork, |
| // with a language called Spoon. Now I'm 26 and I'm designing a processor |
| // called Whisk. I wonder what I'll do when I grow up. |
| // |
| // Many mistakes were made in this ISA. What did you think? My aim with this |
| // version of Whisk is to run enough software to discover exactly why my |
| // instruction set is bad. Hopefully Tiny Tapeout 3 will bring faster IOs, |
| // with 2D muxing instead of a scan chain, and then I can try getting some |
| // serious software running on Whisk v2, at a few MHz instead of 12 kHz. |