blob: 99fe60ad120ae32361e561797dab7d52eb76d27f [file] [log] [blame]
/**
* Arithmetic logic unit
*/
module brq_exu_alu #(
// Bit-manipulation feature level. With RV32BNone the bit-manipulation helper signals in this
// module are tied to constants.
parameter brq_pkg::rv32b_e RV32B = brq_pkg::RV32BNone
) (
// Operation selector; decoded by the case statements throughout the module.
input brq_pkg::alu_op_e operator_i,
// 32-bit source operands.
input logic [31:0] operand_a_i,
input logic [31:0] operand_b_i,
// Distinguishes the first cycle of a multicycle operation from the following one.
input logic instr_first_cycle_i,
// 33-bit adder operands that replace the ALU operands while multdiv_sel_i is set.
input logic [32:0] multdiv_operand_a_i,
input logic [32:0] multdiv_operand_b_i,
input logic multdiv_sel_i,
// Intermediate values read back in the second cycle of multicycle operations.
input logic [31:0] imd_val_q_i[2],
// Next intermediate values and their write enables.
// NOTE(review): presumably registered outside this module and fed back via imd_val_q_i -- confirm.
output logic [31:0] imd_val_d_o[2],
output logic [1:0] imd_val_we_o,
// Adder sum: bits [32:1] of adder_result_ext_o.
output logic [31:0] adder_result_o,
// Full 34-bit sum of the two 33-bit adder inputs.
output logic [33:0] adder_result_ext_o,
// NOTE(review): presumably the result of the operation selected by operator_i -- confirm.
output logic [31:0] result_o,
// Outcome of the comparison selected by operator_i.
output logic comparison_result_o,
// High when the 32-bit adder result is zero.
output logic is_equal_result_o
);
import brq_pkg::*;
// Bit-reversed copy of operand A: bit n of the copy is bit 31-n of operand_a_i.
// Used by the left-shift path and by the leading-zero bit counting.
logic [31:0] operand_a_rev;
// Bitwise complement of {operand_b_i, 1'b0}.
logic [32:0] operand_b_neg;
for (genvar bit_idx = 0; bit_idx < 32; bit_idx++) begin : gen_rev_operand_a
  assign operand_a_rev[bit_idx] = operand_a_i[31 - bit_idx];
end
///////////
// Adder //
///////////
logic [32:0] adder_in_a, adder_in_b;
logic [31:0] adder_result;
// High when the adder has to subtract, i.e. operand B is fed in complemented form.
logic adder_op_b_negate;
always_comb begin
  unique case (operator_i)
    // Subtraction
    ALU_SUB,
    // Comparisons
    ALU_EQ, ALU_NE,
    ALU_GE, ALU_GEU,
    ALU_LT, ALU_LTU,
    ALU_SLT, ALU_SLTU,
    // Min / max (RV32B)
    ALU_MIN, ALU_MINU,
    ALU_MAX, ALU_MAXU: adder_op_b_negate = 1'b1;
    // Everything else adds operand B unmodified.
    default: adder_op_b_negate = 1'b0;
  endcase
end
// Complemented operand B. Its appended LSB is 1'b1, which together with the 1'b1 appended to
// operand A produces the carry-in needed for a two's complement subtraction.
assign operand_b_neg = ~{operand_b_i, 1'b0};
// Operand A: the multiplier/divider operand when selected, otherwise operand A extended by a
// constant 1'b1 LSB.
assign adder_in_a = multdiv_sel_i ? multdiv_operand_a_i : {operand_a_i, 1'b1};
// Operand B: multiplier/divider operand, complemented operand B, or plain operand B.
always_comb begin
  unique case (1'b1)
    multdiv_sel_i:     adder_in_b = multdiv_operand_b_i;
    adder_op_b_negate: adder_in_b = operand_b_neg;
    default:           adder_in_b = {operand_b_i, 1'b0};
  endcase
end
// 34-bit sum of both 33-bit inputs; the 32-bit result drops the helper LSB.
assign adder_result_ext_o = $unsigned(adder_in_a) + $unsigned(adder_in_b);
assign adder_result = adder_result_ext_o[32:1];
assign adder_result_o = adder_result;
////////////////
// Comparison //
////////////////
logic is_equal;
logic is_greater_equal; // handles both signed and unsigned forms
// High for the operations that compare their operands as signed numbers.
logic cmp_signed;
always_comb begin
  cmp_signed = 1'b0;
  unique case (operator_i)
    ALU_GE, ALU_LT, ALU_SLT,
    // RV32B only
    ALU_MIN, ALU_MAX: cmp_signed = 1'b1;
    default: ;
  endcase
end
// Operands are equal when the adder result (a - b for comparisons) is all zeros.
assign is_equal = (adder_result == 32'b0);
assign is_equal_result_o = is_equal;
// Greater-or-equal detection for both signed and unsigned comparisons.
always_comb begin
  if (operand_a_i[31] == operand_b_i[31]) begin
    // Same MSB: a >= b exactly when the difference is non-negative.
    is_greater_equal = ~adder_result[31];
  end else begin
    // Different MSBs: the operand with the set MSB is the larger one when unsigned and the
    // smaller one when signed.
    is_greater_equal = operand_a_i[31] ^ cmp_signed;
  end
end
// GTE unsigned:
// (a[31] == 1 && b[31] == 1) => adder_result[31] == 0
// (a[31] == 0 && b[31] == 0) => adder_result[31] == 0
// (a[31] == 1 && b[31] == 0) => 1
// (a[31] == 0 && b[31] == 1) => 0
// GTE signed:
// (a[31] == 1 && b[31] == 1) => adder_result[31] == 0
// (a[31] == 0 && b[31] == 0) => adder_result[31] == 0
// (a[31] == 1 && b[31] == 0) => 0
// (a[31] == 0 && b[31] == 1) => 1
// Map the selected operation onto the equality / greater-equal flags.
logic cmp_result;
always_comb begin
  unique case (operator_i)
    ALU_NE: cmp_result = ~is_equal;
    // a >= b (max variants are RV32B only)
    ALU_GE, ALU_GEU,
    ALU_MAX, ALU_MAXU: cmp_result = is_greater_equal;
    // a < b (min variants are RV32B only)
    ALU_SLT, ALU_SLTU,
    ALU_LT, ALU_LTU,
    ALU_MIN, ALU_MINU: cmp_result = ~is_greater_equal;
    // ALU_EQ and every other operation report the equality flag.
    ALU_EQ: cmp_result = is_equal;
    default: cmp_result = is_equal;
  endcase
end
assign comparison_result_o = cmp_result;
///////////
// Shift //
///////////
// The shifter structure consists of a 33-bit shifter: 32-bit operand + 1 bit extension for
// arithmetic shifts and one-shift support.
// Rotations and funnel shifts are implemented as multi-cycle instructions.
// The shifter is also used for single-bit instructions and bit-field place as detailed below.
//
// Standard Shifts
// ===============
// For standard shift instructions, the direction of the shift is to the right by default. For
// left shifts, the signal shift_left is set. If so, the operand is initially reversed,
// shifted to the right by the specified amount and reversed back again. For arithmetic- and
// one-shifts the 33rd bit of the shifter operand is set accordingly.
//
// Multicycle Shifts
// =================
//
// Rotation
// --------
// For rotations, the operand signals operand_a_i and operand_b_i are kept constant to rs1 and
// rs2 respectively.
//
// Rotation pseudocode:
// shift_amt = rs2 & 31;
// multicycle_result = (rs1 >> shift_amt) | (rs1 << (32 - shift_amt));
// ^-- cycle 0 -----^ ^-- cycle 1 --------------^
//
// Funnel Shifts
// -------------
// For funnel shifts, operand_a_i is tied to rs1 in the first cycle and rs3 in the
// second cycle. operand_b_i is always tied to rs2. The order of applying the shift amount or
// its complement is determined by bit [5] of shift_amt.
//
// Funnel shift Pseudocode: (fsl)
// shift_amt = rs2 & 63;
// shift_amt_compl = 32 - shift_amt[4:0]
// if (shift_amt >=33):
// multicycle_result = (rs1 >> shift_amt_compl[4:0]) | (rs3 << shift_amt[4:0]);
// ^-- cycle 0 ----------------^ ^-- cycle 1 ------------^
// else if (shift_amt <= 31 && shift_amt > 0):
// multicycle_result = (rs1 << shift_amt[4:0]) | (rs3 >> shift_amt_compl[4:0]);
// ^-- cycle 0 ----------^ ^-- cycle 1 -------------------^
// For shift_amt == 0, 32, both shift_amt[4:0] and shift_amt_compl[4:0] == '0.
// these cases need to be handled separately outside the shifting structure:
// else if (shift_amt == 32):
// multicycle_result = rs3
// else if (shift_amt == 0):
// multicycle_result = rs1.
//
// Single-Bit Instructions
// =======================
// Single bit instructions operate on bit operand_b_i[4:0] of operand_a_i.
// The operations sbset, sbclr and sbinv are implemented by generation of a bit-mask using the
// shifter structure. This is done by left-shifting the operand 32'h1 by the required amount.
// The signal shift_sbmode multiplexes the shifter input and sets the signal shift_left.
// Further processing is taken care of by a separate structure.
//
// For sbext, the bit defined by operand_b_i[4:0] is to be returned. This is done by simply
// shifting operand_a_i to the right by the required amount and returning bit [0] of the result.
//
// Bit-Field Place
// ===============
// The shifter structure is shared to compute bfp_mask << bfp_off.
// Shifter control and data signals.
// shift_left: the operand is bit-reversed before and after the right shift.
logic shift_left;
// shift_ones: shift in ones (slo / sro).
logic shift_ones;
// shift_arith: arithmetic right shift (sra).
logic shift_arith;
// shift_funnel: funnel shift (fsl / fsr).
logic shift_funnel;
// shift_sbmode: single-bit mask generation (sbset / sbclr / sbinv).
logic shift_sbmode;
// shift_amt[4:0] is the applied shift amount, shift_amt[5] the funnel-shift word-swap bit.
logic [5:0] shift_amt;
logic [5:0] shift_amt_compl; // complementary shift amount (32 - shift_amt)
// Shifter input, 33-bit raw output, and the sink for its otherwise unused top bit.
logic [31:0] shift_operand;
logic [32:0] shift_result_ext;
logic unused_shift_result_ext;
// Final shifter output and its bit-reversed form.
logic [31:0] shift_result;
logic [31:0] shift_result_rev;
// zbf: bit-field place
logic bfp_op;
logic [4:0] bfp_len;
logic [4:0] bfp_off;
logic [31:0] bfp_mask;
logic [31:0] bfp_mask_rev;
logic [31:0] bfp_result;
// bfp is only decoded when the B extension is present.
assign bfp_op = (RV32B != RV32BNone) & (operator_i == ALU_BFP);
// Control word fields taken from operand B. A length field of 0 encodes a length of 16.
assign bfp_off = operand_b_i[20:16];
assign bfp_len = {~|operand_b_i[27:24], operand_b_i[27:24]};
// Mask with the bfp_len least significant bits set.
assign bfp_mask = (RV32B == RV32BNone) ? '0 : ~(32'hffff_ffff << bfp_len);
// The shifter works on the bit-reversed mask to compute bfp_mask << bfp_off.
for (genvar bit_idx = 0; bit_idx < 32; bit_idx++) begin : gen_rev_bfp_mask
  assign bfp_mask_rev[bit_idx] = bfp_mask[31 - bit_idx];
end
// Clear the field selected by shift_result in operand A, then insert the masked low bits of
// operand B at the offset.
assign bfp_result = (RV32B == RV32BNone) ? '0 :
    (~shift_result & operand_a_i) | ((operand_b_i & bfp_mask) << bfp_off);
// Complement of the 5-bit shift amount (32 - amount).
assign shift_amt_compl = 32 - operand_b_i[4:0];
// shift_amt[5] is the word-swap bit and is only honoured for fsl / fsr. When set, the
// operations of the first and second cycle are exchanged.
assign shift_amt[5] = operand_b_i[5] & shift_funnel;
always_comb begin
  if (bfp_op) begin
    // bfp shifts by the offset field of the control word
    shift_amt[4:0] = bfp_off;
  end else begin
    // Without word swap: amount in the first cycle, complement afterwards.
    // With word swap: complement in the first cycle, amount afterwards.
    shift_amt[4:0] = (operand_b_i[5] && shift_funnel) ?
        (instr_first_cycle_i ? shift_amt_compl[4:0] : operand_b_i[4:0]) :
        (instr_first_cycle_i ? operand_b_i[4:0] : shift_amt_compl[4:0]);
  end
end
// single-bit mode: the shifter builds a one-hot bit mask for sbset, sbclr and sbinv.
assign shift_sbmode = (RV32B != RV32BNone) ?
    (operator_i == ALU_SBSET) | (operator_i == ALU_SBCLR) | (operator_i == ALU_SBINV) : 1'b0;
// left shift if this is:
// * a standard left shift (slo, sll)
// * a rol in the first cycle
// * a ror in the second cycle
// * fsl: without word-swap bit: first cycle, else: second cycle
// * fsr: without word-swap bit: second cycle, else: first cycle
// * a single-bit instruction: sbclr, sbset, sbinv (excluding sbext)
// * bfp: bfp_mask << bfp_off
always_comb begin
  unique case (operator_i)
    ALU_SLL: shift_left = 1'b1;
    ALU_SLO,
    ALU_BFP: shift_left = (RV32B != RV32BNone) ? 1'b1 : 1'b0;
    // Fallbacks are sized 1'b0 literals so that the ternary stays one bit wide.
    ALU_ROL: shift_left = (RV32B != RV32BNone) ? instr_first_cycle_i : 1'b0;
    ALU_ROR: shift_left = (RV32B != RV32BNone) ? ~instr_first_cycle_i : 1'b0;
    ALU_FSL: shift_left = (RV32B != RV32BNone) ?
        (shift_amt[5] ? ~instr_first_cycle_i : instr_first_cycle_i) : 1'b0;
    ALU_FSR: shift_left = (RV32B != RV32BNone) ?
        (shift_amt[5] ? instr_first_cycle_i : ~instr_first_cycle_i) : 1'b0;
    default: shift_left = 1'b0;
  endcase
  // Single-bit mask generation always shifts left, whatever the case above selected.
  if (shift_sbmode) begin
    shift_left = 1'b1;
  end
end
// Arithmetic right shift.
assign shift_arith = (operator_i == ALU_SRA);
// Funnel shifts and one-shifts exist only when the B extension is present.
assign shift_funnel =
    (RV32B != RV32BNone) & ((operator_i == ALU_FSL) | (operator_i == ALU_FSR));
assign shift_ones =
    (RV32B != RV32BNone) & ((operator_i == ALU_SLO) | (operator_i == ALU_SRO));
// shifter structure.
// A single 33-bit arithmetic right shifter serves all shift flavours: left shifts reverse the
// operand before and after the shift, and the extra top bit supplies the fill value.
always_comb begin
// select shifter input
// for bfp, sbmode and shift_left the corresponding bit-reversed input is chosen.
if (RV32B == RV32BNone) begin
shift_operand = shift_left ? operand_a_rev : operand_a_i;
end else begin
unique case (1'b1)
bfp_op: shift_operand = bfp_mask_rev;
// bit-reversed 32'h1: after the final reversal this yields 32'h1 << shift_amt
shift_sbmode: shift_operand = 32'h8000_0000;
default: shift_operand = shift_left ? operand_a_rev : operand_a_i;
endcase
end
// Bit 32 is the fill bit: 1 for one-shifts, the operand's sign bit for arithmetic shifts,
// 0 otherwise. The signed >>> replicates it into the vacated positions.
shift_result_ext =
$unsigned($signed({shift_ones | (shift_arith & shift_operand[31]), shift_operand}) >>>
shift_amt[4:0]);
shift_result = shift_result_ext[31:0];
unused_shift_result_ext = shift_result_ext[32];
// Reverse the intermediate result; this must happen before shift_result is overwritten below.
for (int unsigned i=0; i<32; i++) begin
shift_result_rev[i] = shift_result[31-i];
end
// Left shifts return the reversed result, right shifts the direct one.
shift_result = shift_left ? shift_result_rev : shift_result;
end
///////////////////
// Bitwise Logic //
///////////////////
logic bwlogic_or;
logic bwlogic_and;
logic [31:0] bwlogic_operand_b;
logic [31:0] bwlogic_or_result;
logic [31:0] bwlogic_and_result;
logic [31:0] bwlogic_xor_result;
logic [31:0] bwlogic_result;
logic bwlogic_op_b_negate;
// Operand B is inverted for the logic-with-negate operations and for the second cycle of cmix.
// All of these are RV32B operations.
always_comb begin
  bwlogic_op_b_negate = 1'b0;
  unique case (operator_i)
    ALU_XNOR,
    ALU_ORN,
    ALU_ANDN: bwlogic_op_b_negate = (RV32B != RV32BNone);
    ALU_CMIX: bwlogic_op_b_negate = (RV32B != RV32BNone) & ~instr_first_cycle_i;
    default: ;
  endcase
end
// The inverted operand reuses the complement computed for the adder.
assign bwlogic_operand_b = bwlogic_op_b_negate ? operand_b_neg[32:1] : operand_b_i;
// All three candidate results are computed in parallel.
assign bwlogic_and_result = operand_a_i & bwlogic_operand_b;
assign bwlogic_or_result = operand_a_i | bwlogic_operand_b;
assign bwlogic_xor_result = operand_a_i ^ bwlogic_operand_b;
// Result selection; xor is the fallback.
assign bwlogic_and = (operator_i == ALU_AND) | (operator_i == ALU_ANDN);
assign bwlogic_or = (operator_i == ALU_OR) | (operator_i == ALU_ORN);
always_comb begin
  unique case (1'b1)
    bwlogic_or:  bwlogic_result = bwlogic_or_result;
    bwlogic_and: bwlogic_result = bwlogic_and_result;
    default:     bwlogic_result = bwlogic_xor_result;
  endcase
end
// Per-function results of the bit-manipulation units below.
// bitcnt_result is the final entry (index 31) of the prefix bit counter.
logic [5:0] bitcnt_result;
logic [31:0] minmax_result;
logic [31:0] pack_result;
logic [31:0] sext_result;
logic [31:0] singlebit_result;
logic [31:0] rev_result;
// shuffle, butterfly, invbutterfly and clmul results are tied to '0 unless RV32B == RV32BFull.
logic [31:0] shuffle_result;
logic [31:0] butterfly_result;
logic [31:0] invbutterfly_result;
logic [31:0] clmul_result;
// NOTE(review): presumably the combined result of the multicycle instructions -- confirm.
logic [31:0] multicycle_result;
if (RV32B != RV32BNone) begin : g_alu_rvb
/////////////////
// Bitcounting //
/////////////////
// The bit-counter structure computes the number of set bits in its operand. Partial results
// (from left to right) are needed to compute the control masks for computation of bext/bdep
// by the butterfly network, if implemented.
// For pcnt, clz and ctz, only the end result is used.
logic zbe_op;
logic bitcnt_ctz;
logic bitcnt_clz;
logic bitcnt_cz;
logic [31:0] bitcnt_bits;
logic [31:0] bitcnt_mask_op;
logic [31:0] bitcnt_bit_mask;
logic [ 5:0] bitcnt_partial [32];
logic [31:0] bitcnt_partial_lsb_d;
logic [31:0] bitcnt_partial_msb_d;
// Operation decoding.
assign bitcnt_clz = (operator_i == ALU_CLZ);
assign bitcnt_ctz = (operator_i == ALU_CTZ);
assign bitcnt_cz = bitcnt_clz | bitcnt_ctz;
assign zbe_op = (operator_i == ALU_BDEP) | (operator_i == ALU_BEXT);
// The overall count is the prefix sum at the top bit position.
assign bitcnt_result = bitcnt_partial[31];
// Bit-mask generation for clz and ctz:
// The lowest-order set bit of the operand is spread to all higher-order bits and the outcome is
// inverted, leaving ones exactly at the low-order zero positions. Leading zeros are counted by
// running the same scheme on the bit-reversed operand.
assign bitcnt_mask_op = bitcnt_clz ? operand_a_rev : operand_a_i;
always_comb begin
  bitcnt_bit_mask = bitcnt_mask_op;
  bitcnt_bit_mask = bitcnt_bit_mask | (bitcnt_bit_mask << 1);
  bitcnt_bit_mask = bitcnt_bit_mask | (bitcnt_bit_mask << 2);
  bitcnt_bit_mask = bitcnt_bit_mask | (bitcnt_bit_mask << 4);
  bitcnt_bit_mask = bitcnt_bit_mask | (bitcnt_bit_mask << 8);
  bitcnt_bit_mask = bitcnt_bit_mask | (bitcnt_bit_mask << 16);
  bitcnt_bit_mask = ~bitcnt_bit_mask;
end
// Select the word whose set bits are counted.
always_comb begin
  case (1'b1)
    zbe_op:    bitcnt_bits = operand_b_i;                        // bext / bdep
    bitcnt_cz: bitcnt_bits = bitcnt_bit_mask & ~bitcnt_mask_op;  // clz / ctz
    default:   bitcnt_bits = operand_a_i;                        // pcnt
  endcase
end
// The parallel prefix counter is of the structure of a Brent-Kung Adder. In the first
// log2(width) stages, the sum of the n preceding bit lines is computed for the bit lines at
// positions 2**n-1 (power-of-two positions) where n denotes the current stage.
// In stage n=log2(width), the count for position width-1 (the MSB) is finished.
// For the intermediate values, an inverse adder tree then computes the bit counts for the bit
// lines at positions
// m = 2**(n-1) + i*2**(n-2), where i = [1 ... width / 2**(n-1)-1] and n = [log2(width) ... 2].
// Thus, at every subsequent stage the result of two previously unconnected sub-trees is
// summed, starting at the node summing bits [width/2-1 : 0] and [3*width/4-1: width/2]
// and moving to iteratively sum up all the sub-trees.
// The inverse adder tree thus features log2(width) - 1 stages the first of these stages is a
// single addition at position 3*width/4 - 1. It does not interfere with the last
// stage of the primary adder tree. These stages can thus be folded together, resulting in a
// total of 2*log2(width)-2 stages.
// For more details refer to R. Brent, H. T. Kung, "A Regular Layout for Parallel Adders",
// (1982).
// For a bitline at position p, only bits
// bitcnt_partial[max(i, such that p % log2(i) == 0)-1 : 0] are needed for generation of the
// butterfly network control signals. The adders in the intermediate value adder tree thus need
// not be full 5-bit adders. We leave the optimization to the synthesis tools.
//
// Consider the following 8-bit example for illustration.
//
// let bitcnt_bits = 8'babcdefgh.
//
// a b c d e f g h
// | /: | /: | /: | /:
// |/ : |/ : |/ : |/ :
// stage 1: + : + : + : + :
// | : /: : | : /: :
// |,--+ : : |,--+ : :
// stage 2: + : : : + : : :
// | : | : /: : : :
// |,-----,--+ : : : : ^-primary adder tree
// stage 3: + : + : : : : : -------------------------
// : | /| /| /| /| /| : ,-intermediate adder tree
// : |/ |/ |/ |/ |/ : :
// stage 4 : + + + + + : :
// : : : : : : : :
// bitcnt_partial[i] 7 6 5 4 3 2 1 0
// Parallel prefix bit counter: once all stages have run, bitcnt_partial[p] holds the number of
// set bits in bitcnt_bits[p:0]. The stages read results written by earlier stages, so the
// statement order inside this block is significant.
always_comb begin
bitcnt_partial = '{default: '0};
// stage 1
// odd positions: sum of the bit pair (i, i-1)
for (int unsigned i=1; i<32; i+=2) begin
bitcnt_partial[i] = {5'h0, bitcnt_bits[i]} + {5'h0, bitcnt_bits[i-1]};
end
// stage 2
// positions 3, 7, 11, ...: combine two neighbouring pair sums
for (int unsigned i=3; i<32; i+=4) begin
bitcnt_partial[i] = bitcnt_partial[i-2] + bitcnt_partial[i];
end
// stage 3
for (int unsigned i=7; i<32; i+=8) begin
bitcnt_partial[i] = bitcnt_partial[i-4] + bitcnt_partial[i];
end
// stage 4
for (int unsigned i=15; i <32; i+=16) begin
bitcnt_partial[i] = bitcnt_partial[i-8] + bitcnt_partial[i];
end
// stage 5
// position 31 now holds the count over the whole word
bitcnt_partial[31] = bitcnt_partial[15] + bitcnt_partial[31];
// ^- primary adder tree
// -------------------------------
// ,-intermediate value adder tree
bitcnt_partial[23] = bitcnt_partial[15] + bitcnt_partial[23];
// stage 6
for (int unsigned i=11; i<32; i+=8) begin
bitcnt_partial[i] = bitcnt_partial[i-4] + bitcnt_partial[i];
end
// stage 7
for (int unsigned i=5; i<32; i+=4) begin
bitcnt_partial[i] = bitcnt_partial[i-2] + bitcnt_partial[i];
end
// stage 8
// even positions: prefix count of the odd position below plus the own bit
bitcnt_partial[0] = {5'h0, bitcnt_bits[0]};
for (int unsigned i=2; i<32; i+=2) begin
bitcnt_partial[i] = bitcnt_partial[i-1] + {5'h0, bitcnt_bits[i]};
end
end
///////////////
// Min / Max //
///////////////
// cmp_result is (a < b) for the min operations and (a >= b) for the max operations, so a set
// cmp_result always selects operand A.
assign minmax_result = cmp_result ? operand_a_i : operand_b_i;
//////////
// Pack //
//////////
// Flags for the two non-default packing variants.
logic packu;
logic packh;
assign packh = (operator_i == ALU_PACKH);
assign packu = (operator_i == ALU_PACKU);
always_comb begin
  unique case (1'b1)
    // packh: low bytes of both operands, zero-extended
    packh:   pack_result = {16'h0, operand_b_i[7:0], operand_a_i[7:0]};
    // packu: upper halfwords of both operands
    packu:   pack_result = {operand_b_i[31:16], operand_a_i[31:16]};
    // otherwise: lower halfwords of both operands
    default: pack_result = {operand_b_i[15:0], operand_a_i[15:0]};
  endcase
end
//////////
// Sext //
//////////
// Sign-extend the low byte for ALU_SEXTB, the low halfword for everything else.
assign sext_result = (operator_i != ALU_SEXTB) ?
    {{16{operand_a_i[15]}}, operand_a_i[15:0]} :
    {{24{operand_a_i[7]}}, operand_a_i[7:0]};
/////////////////////////////
// Single-bit Instructions //
/////////////////////////////
// For set / clear / invert the shifter output is the bit mask to apply to operand A.
// For the remaining case (sbext) the shifter output is operand A shifted right, and only its
// bit 0 is returned.
always_comb begin
  unique case (operator_i)
    ALU_SBINV: singlebit_result = operand_a_i ^ shift_result;
    ALU_SBCLR: singlebit_result = operand_a_i & ~shift_result;
    ALU_SBSET: singlebit_result = operand_a_i | shift_result;
    default:   singlebit_result = {31'h0, shift_result[0]}; // ALU_SBEXT
  endcase
end
////////////////////////////////////
// General Reverse and Or-combine //
////////////////////////////////////
// The balanced version of the B extension implements only a subset of the general reverse and
// or-combine instructions: rev, rev8 and orc.b. In that configuration the stage enables are
// grouped, so bits [2:0] and bits [4:3] of the shift amount only take effect when the whole
// group is set.
logic [4:0] zbp_shift_amt;
logic gorc_op;
assign gorc_op = (operator_i == ALU_GORC);
assign zbp_shift_amt[4:3] = (RV32B == RV32BFull) ? shift_amt[4:3] : {2{&shift_amt[4:3]}};
assign zbp_shift_amt[2:0] = (RV32B == RV32BFull) ? shift_amt[2:0] : {3{&shift_amt[2:0]}};
// Five swap stages (1, 2, 4, 8 and 16 bit groups). With gorc_op set, the stage input is OR-ed
// into the swapped value; the 8- and 16-bit stages do so only in the full configuration.
always_comb begin
  rev_result = operand_a_i;
  if (zbp_shift_amt[0]) begin
    rev_result = (gorc_op ? rev_result : 32'h0) |
                 ((rev_result << 1) & 32'haaaa_aaaa) |
                 ((rev_result >> 1) & 32'h5555_5555);
  end
  if (zbp_shift_amt[1]) begin
    rev_result = (gorc_op ? rev_result : 32'h0) |
                 ((rev_result << 2) & 32'hcccc_cccc) |
                 ((rev_result >> 2) & 32'h3333_3333);
  end
  if (zbp_shift_amt[2]) begin
    rev_result = (gorc_op ? rev_result : 32'h0) |
                 ((rev_result << 4) & 32'hf0f0_f0f0) |
                 ((rev_result >> 4) & 32'h0f0f_0f0f);
  end
  if (zbp_shift_amt[3]) begin
    rev_result = ((gorc_op & (RV32B == RV32BFull)) ? rev_result : 32'h0) |
                 ((rev_result << 8) & 32'hff00_ff00) |
                 ((rev_result >> 8) & 32'h00ff_00ff);
  end
  if (zbp_shift_amt[4]) begin
    rev_result = ((gorc_op & (RV32B == RV32BFull)) ? rev_result : 32'h0) |
                 ((rev_result << 16) & 32'hffff_0000) |
                 ((rev_result >> 16) & 32'h0000_ffff);
  end
end
// Declared outside the RV32BFull generate branches: both the full and the not-full branch
// drive these signals.
logic crc_hmode;
logic crc_bmode;
logic [31:0] clmul_result_rev;
if (RV32B == RV32BFull) begin : gen_alu_rvb_full
/////////////////////////
// Shuffle / Unshuffle //
/////////////////////////
// Per-stage masks selecting the bits that move left (_L) or right (_R) in a shuffle stage.
localparam logic [31:0] SHUFFLE_MASK_L [4] =
    '{32'h00ff_0000, 32'h0f00_0f00, 32'h3030_3030, 32'h4444_4444};
localparam logic [31:0] SHUFFLE_MASK_R [4] =
    '{32'h0000_ff00, 32'h00f0_00f0, 32'h0c0c_0c0c, 32'h2222_2222};
// Masks used by the flip permutation that is applied around the shuffle stages for unshfl.
localparam logic [31:0] FLIP_MASK_L [4] =
    '{32'h2200_1100, 32'h0044_0000, 32'h4411_0000, 32'h1100_0000};
localparam logic [31:0] FLIP_MASK_R [4] =
    '{32'h0088_0044, 32'h0000_2200, 32'h0000_8822, 32'h0000_0088};
// Bits that stay in place in each shuffle stage.
logic [31:0] SHUFFLE_MASK_NOT [4];
for (genvar stage = 0; stage < 4; stage++) begin : gen_shuffle_mask_not
  assign SHUFFLE_MASK_NOT[stage] = ~(SHUFFLE_MASK_L[stage] | SHUFFLE_MASK_R[stage]);
end
// shuffle_flip is set for unshfl; shfl and unshfl share the four shuffle stages below.
logic shuffle_flip;
assign shuffle_flip = operator_i == ALU_UNSHFL;
// Per-stage enables: bit 3 gates the 8-bit stage, bit 0 the 1-bit stage.
logic [3:0] shuffle_mode;
always_comb begin
shuffle_result = operand_a_i;
// For unshfl the four mode bits are taken from the shift amount in reversed order.
if (shuffle_flip) begin
shuffle_mode[3] = shift_amt[0];
shuffle_mode[2] = shift_amt[1];
shuffle_mode[1] = shift_amt[2];
shuffle_mode[0] = shift_amt[3];
end else begin
shuffle_mode = shift_amt[3:0];
end
// unshfl: apply the flip permutation before the shuffle stages ...
if (shuffle_flip) begin
shuffle_result = (shuffle_result & 32'h8822_4411) |
((shuffle_result << 6) & FLIP_MASK_L[0]) |
((shuffle_result >> 6) & FLIP_MASK_R[0]) |
((shuffle_result << 9) & FLIP_MASK_L[1]) |
((shuffle_result >> 9) & FLIP_MASK_R[1]) |
((shuffle_result << 15) & FLIP_MASK_L[2]) |
((shuffle_result >> 15) & FLIP_MASK_R[2]) |
((shuffle_result << 21) & FLIP_MASK_L[3]) |
((shuffle_result >> 21) & FLIP_MASK_R[3]);
end
// Shuffle stages with distances 8, 4, 2 and 1. Each stage keeps the bits in SHUFFLE_MASK_NOT
// and exchanges the bits selected by the L / R masks.
if (shuffle_mode[3]) begin
shuffle_result = (shuffle_result & SHUFFLE_MASK_NOT[0]) |
(((shuffle_result << 8) & SHUFFLE_MASK_L[0]) |
((shuffle_result >> 8) & SHUFFLE_MASK_R[0]));
end
if (shuffle_mode[2]) begin
shuffle_result = (shuffle_result & SHUFFLE_MASK_NOT[1]) |
(((shuffle_result << 4) & SHUFFLE_MASK_L[1]) |
((shuffle_result >> 4) & SHUFFLE_MASK_R[1]));
end
if (shuffle_mode[1]) begin
shuffle_result = (shuffle_result & SHUFFLE_MASK_NOT[2]) |
(((shuffle_result << 2) & SHUFFLE_MASK_L[2]) |
((shuffle_result >> 2) & SHUFFLE_MASK_R[2]));
end
if (shuffle_mode[0]) begin
shuffle_result = (shuffle_result & SHUFFLE_MASK_NOT[3]) |
(((shuffle_result << 1) & SHUFFLE_MASK_L[3]) |
((shuffle_result >> 1) & SHUFFLE_MASK_R[3]));
end
// ... and the same flip permutation again afterwards.
if (shuffle_flip) begin
shuffle_result = (shuffle_result & 32'h8822_4411) |
((shuffle_result << 6) & FLIP_MASK_L[0]) |
((shuffle_result >> 6) & FLIP_MASK_R[0]) |
((shuffle_result << 9) & FLIP_MASK_L[1]) |
((shuffle_result >> 9) & FLIP_MASK_R[1]) |
((shuffle_result << 15) & FLIP_MASK_L[2]) |
((shuffle_result >> 15) & FLIP_MASK_R[2]) |
((shuffle_result << 21) & FLIP_MASK_L[3]) |
((shuffle_result >> 21) & FLIP_MASK_R[3]);
end
end
///////////////
// Butterfly //
///////////////
// The butterfly / inverse butterfly network executing bext/bdep (zbe) instructions.
// For bdep, the control bits mask of a local left region is generated by
// the inverse of a n-bit left rotate and complement upon wrap (LROTC) operation by the number
// of ones in the deposit bitmask to the right of the segment. n hereby denotes the width
// of the according segment. The bitmask for a pertaining local right region is equal to the
// corresponding local left region. Bext uses an analogue inverse process.
// Consider the following 8-bit example. For details, see Hilewitz et al. "Fast Bit Gather,
// Bit Scatter and Bit Permutation Instructions for Commodity Microprocessors", (2008).
//
// The bext/bdep instructions are completed in 2 cycles. In the first cycle, the control
// bitmask is prepared by executing the parallel prefix bit count. In the second cycle,
// the bit swapping is executed according to the control masks.
// 8-bit example: (Hilewitz et al.)
// Consider the instruction bdep operand_a_i deposit_mask
// Let operand_a_i = 8'babcd_efgh
// deposit_mask = 8'b1010_1101
//
// control bitmask for stage 1:
// - number of ones in the right half of the deposit bitmask: 3
// - width of the segment: 4
// - control bitmask = ~LROTC(4'b0, 3)[3:0] = 4'b1000
//
// control bitmask: c3 c2 c1 c0 c3 c2 c1 c0
// 1 0 0 0 1 0 0 0
// <- L -----> <- R ----->
// operand_a_i a b c d e f g h
// :\ | | | /: | | |
// : +|---|--|-+ : | | |
// :/ | | | \: | | |
// stage 1 e b c d a f g h
// <L-> <R-> <L-> <R->
// control bitmask: c3 c2 c3 c2 c1 c0 c1 c0
// 1 1 1 1 1 0 1 0
// :\ :\ /: /: :\ | /: |
// : +:-+-:+ : : +|-+ : |
// :/ :/ \: \: :/ | \: |
// stage 2 c d e b g f a h
// L R L R L R L R
// control bitmask: c3 c3 c2 c2 c1 c1 c0 c0
// 1 1 0 0 1 1 0 0
// :\/: | | :\/: | |
// : : | | : : | |
// :/\: | | :/\: | |
// stage 3 d c e b f g a h
// & deposit bitmask: 1 0 1 0 1 1 0 1
// result: d 0 e 0 f g 0 h
logic [ 5:0] bitcnt_partial_q [32];
// First cycle: pack the bits of the partial bit counts that are needed later into two 32-bit
// words.
// Word 0: bit 0 of every partial count.
for (genvar pos = 0; pos < 32; pos++) begin : gen_bitcnt_reg_in_lsb
  assign bitcnt_partial_lsb_d[pos] = bitcnt_partial[pos][0];
end
// Word 1: bit k of the partial counts at positions 2**k * (n + 1) - 1, for k = 1 ... 5.
for (genvar n = 0; n < 16; n++) begin : gen_bitcnt_reg_in_b1
  assign bitcnt_partial_msb_d[n] = bitcnt_partial[2*n+1][1];
end
for (genvar n = 0; n < 8; n++) begin : gen_bitcnt_reg_in_b2
  assign bitcnt_partial_msb_d[16+n] = bitcnt_partial[4*n+3][2];
end
for (genvar n = 0; n < 4; n++) begin : gen_bitcnt_reg_in_b3
  assign bitcnt_partial_msb_d[24+n] = bitcnt_partial[8*n+7][3];
end
for (genvar n = 0; n < 2; n++) begin : gen_bitcnt_reg_in_b4
  assign bitcnt_partial_msb_d[28+n] = bitcnt_partial[16*n+15][4];
end
assign bitcnt_partial_msb_d[30] = bitcnt_partial[31][5];
assign bitcnt_partial_msb_d[31] = 1'b0; // unused
// Second cycle: unpack the stored bits from the intermediate value inputs back into the
// partial bit count array. Bits that were not stored stay zero.
always_comb begin
  bitcnt_partial_q = '{default: '0};
  // imd_val_q_i[0]: bit 0 of every partial count
  for (int unsigned pos = 0; pos < 32; pos++) begin : gen_bitcnt_reg_out_lsb
    bitcnt_partial_q[pos][0] = imd_val_q_i[0][pos];
  end
  // imd_val_q_i[1]: higher bits, laid out as packed in the first cycle
  for (int unsigned n = 0; n < 16; n++) begin : gen_bitcnt_reg_out_b1
    bitcnt_partial_q[2*n+1][1] = imd_val_q_i[1][n];
  end
  for (int unsigned n = 0; n < 8; n++) begin : gen_bitcnt_reg_out_b2
    bitcnt_partial_q[4*n+3][2] = imd_val_q_i[1][16+n];
  end
  for (int unsigned n = 0; n < 4; n++) begin : gen_bitcnt_reg_out_b3
    bitcnt_partial_q[8*n+7][3] = imd_val_q_i[1][24+n];
  end
  for (int unsigned n = 0; n < 2; n++) begin : gen_bitcnt_reg_out_b4
    bitcnt_partial_q[16*n+15][4] = imd_val_q_i[1][28+n];
  end
  bitcnt_partial_q[31][5] = imd_val_q_i[1][30];
end
// Per-stage control masks of the butterfly network: _l marks bits that move right (towards the
// LSB), _r marks bits that move left, _not marks bits that stay in place.
logic [31:0] butterfly_mask_l[5];
logic [31:0] butterfly_mask_r[5];
logic [31:0] butterfly_mask_not[5];
logic [31:0] lrotc_stage [5]; // left rotate and complement upon wrap
// number of bits in local r = 32 / 2**(stage + 1) = 16/2**stage
`define _N(stg) (16 >> stg)
// bext / bdep control bit generation
for (genvar stg=0; stg<5; stg++) begin : gen_butterfly_ctrl_stage
// number of segs: 2** stg
for (genvar seg=0; seg<2**stg; seg++) begin : gen_butterfly_ctrl
// Each segment is 2*_N bits wide. Its pattern is {_N zeros, _N ones} shifted left by the
// stored partial bit count at the top of the segment's lower half.
assign lrotc_stage[stg][2*`_N(stg)*(seg+1)-1 : 2*`_N(stg)*seg] =
{{`_N(stg){1'b0}},{`_N(stg){1'b1}}} <<
bitcnt_partial_q[`_N(stg)*(2*seg+1)-1][$clog2(`_N(stg)):0];
// The upper half of the segment gets the inverted upper half of the pattern as its left mask ...
assign butterfly_mask_l[stg][`_N(stg)*(2*seg+2)-1 : `_N(stg)*(2*seg+1)]
= ~lrotc_stage[stg][`_N(stg)*(2*seg+2)-1 : `_N(stg)*(2*seg+1)];
// ... and the lower half gets the very same bits as its right mask.
assign butterfly_mask_r[stg][`_N(stg)*(2*seg+1)-1 : `_N(stg)*(2*seg)]
= ~lrotc_stage[stg][`_N(stg)*(2*seg+2)-1 : `_N(stg)*(2*seg+1)];
// The remaining halves of both masks are zero.
assign butterfly_mask_l[stg][`_N(stg)*(2*seg+1)-1 : `_N(stg)*(2*seg)] = '0;
assign butterfly_mask_r[stg][`_N(stg)*(2*seg+2)-1 : `_N(stg)*(2*seg+1)] = '0;
end
end
`undef _N
for (genvar stg=0; stg<5; stg++) begin : gen_butterfly_not
assign butterfly_mask_not[stg] =
~(butterfly_mask_l[stg] | butterfly_mask_r[stg]);
end
// Butterfly network: five swap stages with distances 16, 8, 4, 2 and 1, followed by masking
// with operand B.
always_comb begin
  butterfly_result = operand_a_i;
  for (int unsigned stg = 0; stg < 5; stg++) begin
    // Keep the unselected bits, move the left-mask bits down and the right-mask bits up.
    butterfly_result = (butterfly_result & butterfly_mask_not[stg]) |
                       ((butterfly_result & butterfly_mask_l[stg]) >> (16 >> stg)) |
                       ((butterfly_result & butterfly_mask_r[stg]) << (16 >> stg));
  end
  butterfly_result = butterfly_result & operand_b_i;
end
// Inverse butterfly network: operand A is masked with operand B first, then the swap stages
// run in the opposite order (distances 1, 2, 4, 8 and 16).
always_comb begin
  invbutterfly_result = operand_a_i & operand_b_i;
  for (int stg = 4; stg >= 0; stg--) begin
    // Keep the unselected bits, move the left-mask bits down and the right-mask bits up.
    invbutterfly_result = (invbutterfly_result & butterfly_mask_not[stg]) |
                          ((invbutterfly_result & butterfly_mask_l[stg]) >> (16 >> stg)) |
                          ((invbutterfly_result & butterfly_mask_r[stg]) << (16 >> stg));
  end
end
///////////////////////////////////////////////////
// Carry-less Multiply + Cyclic Redundancy Check //
///////////////////////////////////////////////////
// Carry-less multiplication can be understood as multiplication based on
// the addition interpreted as the bit-wise xor operation.
//
// Example: 1101 X 1011 = 1111111:
//
// 1011 X 1101
// -----------
// 1101
// xor 1101
// ---------
// 10111
// xor 0000
// ----------
// 010111
// xor 1101
// -----------
// 1111111
//
// Architectural details:
// A 32 x 32-bit array
// [ operand_b[i] ? (operand_a << i) : '0 for i in 0 ... 31 ]
// is generated. The entries of the array are pairwise 'xor-ed'
// together in a 5-stage binary tree.
//
//
// Cyclic Redundancy Check:
//
// CRC-32 (CRC-32/ISO-HDLC) and CRC-32C (CRC-32/ISCSI) are directly implemented. For
// documentation of the crc configuration (crc-polynomials, initialization, reflection, etc.)
// see http://reveng.sourceforge.net/crc-catalogue/all.htm
// A useful guide to crc arithmetic and algorithms is given here:
// http://www.piclist.com/techref/method/math/crcguide.html.
//
// The CRC operation solves the following equation using binary polynomial arithmetic:
//
// rev(rd)(x) = rev(rs1)(x) * x**n mod {1, P}(x)
//
// where P denotes lower 32 bits of the corresponding CRC polynomial, rev(a) the bit reversal
// of a, n = 8,16, or 32 for .b, .h, .w -variants. {a, b} denotes bit concatenation.
//
// Using Barrett reduction, one can show that
//
// M(x) mod P(x) = R(x) =
// (M(x) * x**n) & {deg(P(x)'{1'b1}}) ^ (M(x) x**-(deg(P(x) - n)) cx mu(x) cx P(x),
//
// Where mu(x) = polydiv(x**64, {1,P}) & 0xffffffff. Here, 'cx' refers to carry-less
// multiplication. Substituting rev(rd)(x) for R(x) and rev(rs1)(x) for M(x) and solving for
// rd(x) with P(x) a crc32 polynomial (deg(P(x)) = 32), we get
//
// rd = rev( (rev(rs1) << n) ^ ((rev(rs1) >> (32-n)) cx mu cx P)
// = (rs1 >> n) ^ rev(rev( (rs1 << (32-n)) cx rev(mu)) cx P)
// ^-- cycle 0--------------------^
// ^- cycle 1 -------------------------------------------^
//
// In the last step we used the fact that carry-less multiplication is bit-order agnostic:
// rev(a cx b) = rev(a) cx rev(b).
// Mode flags for the reversed (clmulr) and high (clmulh) carry-less multiply variants.
logic clmul_rmode;
logic clmul_hmode;
// Multiplier inputs.
logic [31:0] clmul_op_a;
logic [31:0] clmul_op_b;
logic [31:0] operand_b_rev;
// Partial products and the xor reduction tree.
logic [31:0] clmul_and_stage[32];
logic [31:0] clmul_xor_stage1[16];
logic [31:0] clmul_xor_stage2[8];
logic [31:0] clmul_xor_stage3[4];
logic [31:0] clmul_xor_stage4[2];
logic [31:0] clmul_result_raw;
// Bit-reversed copy of operand B.
for (genvar bit_idx = 0; bit_idx < 32; bit_idx++) begin: gen_rev_operand_b
  assign operand_b_rev[bit_idx] = operand_b_i[31 - bit_idx];
end
assign clmul_hmode = (operator_i == ALU_CLMULH);
assign clmul_rmode = (operator_i == ALU_CLMULR);
// CRC
// Polynomial and bit-reversed mu constant for each of the two supported CRC variants.
localparam logic [31:0] CRC32_POLYNOMIAL = 32'h04c1_1db7;
localparam logic [31:0] CRC32_MU_REV = 32'hf701_1641;
localparam logic [31:0] CRC32C_POLYNOMIAL = 32'h1edc_6f41;
localparam logic [31:0] CRC32C_MU_REV = 32'hdea7_13f1;
logic crc_op;
logic crc_cpoly;
logic [31:0] crc_operand;
logic [31:0] crc_poly;
logic [31:0] crc_mu_rev;
// Operand width: byte and halfword variants; neither flag is set for the word variants.
assign crc_bmode = (operator_i == ALU_CRC32_B) | (operator_i == ALU_CRC32C_B);
assign crc_hmode = (operator_i == ALU_CRC32_H) | (operator_i == ALU_CRC32C_H);
// Variants using the CRC-32C constants.
assign crc_cpoly = (operator_i == ALU_CRC32C_B) |
                   (operator_i == ALU_CRC32C_H) |
                   (operator_i == ALU_CRC32C_W);
// Any CRC operation.
assign crc_op = (operator_i == ALU_CRC32_B) | (operator_i == ALU_CRC32C_B) |
                (operator_i == ALU_CRC32_H) | (operator_i == ALU_CRC32C_H) |
                (operator_i == ALU_CRC32_W) | (operator_i == ALU_CRC32C_W);
assign crc_mu_rev = crc_cpoly ? CRC32C_MU_REV : CRC32_MU_REV;
assign crc_poly = crc_cpoly ? CRC32C_POLYNOMIAL : CRC32_POLYNOMIAL;
// Left-align the byte or halfword operand in the 32-bit word.
always_comb begin
  unique case (1'b1)
    crc_bmode: crc_operand = {operand_a_i[7:0], 24'h0};
    crc_hmode: crc_operand = {operand_a_i[15:0], 16'h0};
    default:   crc_operand = operand_a_i;
  endcase
end
// Select clmul input
// CRC operations use the multiplier twice: in the first cycle the left-aligned CRC operand
// is multiplied by the bit-reversed mu constant; afterwards the value held in imd_val_q_i[0]
// is multiplied by the polynomial.
// For the carry-less multiply operations, clmulr / clmulh feed the bit-reversed operands and
// plain clmul feeds the operands unchanged.
always_comb begin
if (crc_op) begin
clmul_op_a = instr_first_cycle_i ? crc_operand : imd_val_q_i[0];
clmul_op_b = instr_first_cycle_i ? crc_mu_rev : crc_poly;
end else begin
clmul_op_a = clmul_rmode | clmul_hmode ? operand_a_rev : operand_a_i;
clmul_op_b = clmul_rmode | clmul_hmode ? operand_b_rev : operand_b_i;
end
end
// Carry-less multiplier core (low 32 bits of the product).
// Partial products: bit b of clmul_op_b gates a copy of clmul_op_a shifted left by b.
for (genvar b = 0; b < 32; b++) begin : gen_clmul_and_op
  assign clmul_and_stage[b] = {32{clmul_op_b[b]}} & (clmul_op_a << b);
end

// XOR reduction tree: every level combines neighbouring pairs of the level before.
for (genvar p = 0; p < 16; p++) begin : gen_clmul_xor_op_l1
  assign clmul_xor_stage1[p] = clmul_and_stage[2*p] ^ clmul_and_stage[2*p + 1];
end

for (genvar p = 0; p < 8; p++) begin : gen_clmul_xor_op_l2
  assign clmul_xor_stage2[p] = clmul_xor_stage1[2*p] ^ clmul_xor_stage1[2*p + 1];
end

for (genvar p = 0; p < 4; p++) begin : gen_clmul_xor_op_l3
  assign clmul_xor_stage3[p] = clmul_xor_stage2[2*p] ^ clmul_xor_stage2[2*p + 1];
end

for (genvar p = 0; p < 2; p++) begin : gen_clmul_xor_op_l4
  assign clmul_xor_stage4[p] = clmul_xor_stage3[2*p] ^ clmul_xor_stage3[2*p + 1];
end

// Last level: the two remaining terms give the raw product.
assign clmul_result_raw = clmul_xor_stage4[0] ^ clmul_xor_stage4[1];
// Bit-reversed view of the raw product (bit 0 <-> bit 31).
for (genvar b = 0; b < 32; b++) begin : gen_rev_clmul_result
  assign clmul_result_rev[b] = clmul_result_raw[31 - b];
end
// Final carry-less multiply result:
//   clmulr : rev(clmul(rev(a), rev(b)))
//   clmulh : clmulr result shifted right by one
//   clmul  : raw product
always_comb begin
  if (clmul_rmode) begin
    clmul_result = clmul_result_rev;
  end else if (clmul_hmode) begin
    clmul_result = {1'b0, clmul_result_rev[31:1]};
  end else begin
    clmul_result = clmul_result_raw;
  end
end
end else begin : gen_alu_rvb_notfull
// In this branch the shuffle, butterfly and carry-less multiply results are tied to zero.
assign shuffle_result      = '0;
assign butterfly_result    = '0;
assign invbutterfly_result = '0;
assign clmul_result        = '0;

// Support signals of the units above are tied to zero as well.
assign bitcnt_partial_lsb_d = '0;
assign bitcnt_partial_msb_d = '0;
assign clmul_result_rev     = '0;
assign crc_hmode            = '0;
assign crc_bmode            = '0;

// Sink for imd_val_q_i[1] (presumably there to keep lint quiet about an unread input).
logic [31:0] unused_imd_val_q_1;
assign unused_imd_val_q_1 = imd_val_q_i[1];
end
//////////////////////////////////////
// Multicycle Bitmanip Instructions //
//////////////////////////////////////
// Ternary instructions + Shift Rotations + Bit extract/deposit + CRC
// For ternary instructions (zbt), operand_a_i is tied to rs1 in the first cycle and rs3 in the
// second cycle. operand_b_i is always tied to rs2.
// Result selection and intermediate-register control for the multicycle operations.
// For every operator this block drives
//   multicycle_result : value handed to the result mux,
//   imd_val_d_o       : data offered to the two intermediate value registers,
//   imd_val_we_o      : per-register write enables, asserted in the first cycle only.
// The positional patterns '{x, y} drive imd_val_d_o[0] with x and imd_val_d_o[1] with y.
always_comb begin
unique case (operator_i)
// Conditional move: the first cycle saves operand_a_i in register 0. The result is the
// current operand_a_i when operand_b_i is zero, otherwise the saved value.
ALU_CMOV: begin
multicycle_result = (operand_b_i == 32'h0) ? operand_a_i : imd_val_q_i[0];
imd_val_d_o = '{operand_a_i, 32'h0};
if (instr_first_cycle_i) begin
imd_val_we_o = 2'b01;
end else begin
imd_val_we_o = 2'b00;
end
end
// Conditional mix: the first cycle saves bwlogic_and_result (computed outside this block)
// in register 0. The result ORs the saved value with the current bwlogic_and_result.
ALU_CMIX: begin
multicycle_result = imd_val_q_i[0] | bwlogic_and_result;
imd_val_d_o = '{bwlogic_and_result, 32'h0};
if (instr_first_cycle_i) begin
imd_val_we_o = 2'b01;
end else begin
imd_val_we_o = 2'b00;
end
end
// Funnel shifts and rotates: the first cycle saves shift_result in register 0.
ALU_FSR, ALU_FSL,
ALU_ROL, ALU_ROR: begin
// A shift amount with zero low five bits is special-cased: the result is taken unmodified
// from operand_a_i (shift_amt[5] set) or from the saved value (shift_amt[5] clear).
if (shift_amt[4:0] == 5'h0) begin
multicycle_result = shift_amt[5] ? operand_a_i : imd_val_q_i[0];
end else begin
// Otherwise the saved shift_result and the current shift_result are OR-ed together.
multicycle_result = imd_val_q_i[0] | shift_result;
end
imd_val_d_o = '{shift_result, 32'h0};
if (instr_first_cycle_i) begin
imd_val_we_o = 2'b01;
end else begin
imd_val_we_o = 2'b00;
end
end
// CRC: only implemented when RV32B == RV32BFull.
ALU_CRC32_W, ALU_CRC32C_W,
ALU_CRC32_H, ALU_CRC32C_H,
ALU_CRC32_B, ALU_CRC32C_B: begin
if (RV32B == RV32BFull) begin
// The result is the bit-reversed multiplier output; for the byte / halfword variants it is
// additionally XOR-ed with operand_a_i shifted right by 8 / 16.
unique case(1'b1)
crc_bmode: multicycle_result = clmul_result_rev ^ (operand_a_i >> 8);
crc_hmode: multicycle_result = clmul_result_rev ^ (operand_a_i >> 16);
default: multicycle_result = clmul_result_rev;
endcase
// The first cycle saves the bit-reversed multiplier output in register 0.
imd_val_d_o = '{clmul_result_rev, 32'h0};
if (instr_first_cycle_i) begin
imd_val_we_o = 2'b01;
end else begin
imd_val_we_o = 2'b00;
end
end else begin
// Not implemented in this configuration: no register write, zero result.
imd_val_d_o = '{operand_a_i, 32'h0};
imd_val_we_o = 2'b00;
multicycle_result = '0;
end
end
// Bit extract / deposit: only implemented when RV32B == RV32BFull.
ALU_BEXT, ALU_BDEP: begin
if (RV32B == RV32BFull) begin
// BDEP takes the butterfly network output, BEXT the inverse butterfly network output.
multicycle_result = (operator_i == ALU_BDEP) ? butterfly_result : invbutterfly_result;
// The first cycle writes both registers with the partial bit-count values.
imd_val_d_o = '{bitcnt_partial_lsb_d, bitcnt_partial_msb_d};
if (instr_first_cycle_i) begin
imd_val_we_o = 2'b11;
end else begin
imd_val_we_o = 2'b00;
end
end else begin
// Not implemented in this configuration: no register write, zero result.
imd_val_d_o = '{operand_a_i, 32'h0};
imd_val_we_o = 2'b00;
multicycle_result = '0;
end
end
// Any other operator: no register write, zero result.
default: begin
imd_val_d_o = '{operand_a_i, 32'h0};
imd_val_we_o = 2'b00;
multicycle_result = '0;
end
endcase
end
end else begin : g_no_alu_rvb
// Sinks for signals that this branch does not otherwise read.
logic [31:0] unused_imd_val_q[2];
logic [31:0] unused_butterfly_result;
logic [31:0] unused_invbutterfly_result;
assign unused_imd_val_q           = imd_val_q_i;
assign unused_butterfly_result    = butterfly_result;
assign unused_invbutterfly_result = invbutterfly_result;

// Intermediate value registers: data and write enables are held at zero.
assign imd_val_d_o  = '{default: '0};
assign imd_val_we_o = '{default: '0};

// Every RV32B result signal is constant zero in this branch.
assign multicycle_result   = '0;
assign clmul_result        = '0;
assign invbutterfly_result = '0;
assign butterfly_result    = '0;
assign shuffle_result      = '0;
assign rev_result          = '0;
assign singlebit_result    = '0;
assign sext_result         = '0;
assign pack_result         = '0;
assign minmax_result       = '0;
assign bitcnt_result       = '0;
end
////////////////
// Result mux //
////////////////
// Final ALU result selection: each listed operator forwards the output of exactly one
// functional unit to result_o; every operator that is not listed produces zero.
always_comb begin
  unique case (operator_i)
    // Adder
    ALU_ADD, ALU_SUB: result_o = adder_result;

    // Bitwise logic (the negated forms are RV32B)
    ALU_AND, ALU_ANDN,
    ALU_OR,  ALU_ORN,
    ALU_XOR, ALU_XNOR: result_o = bwlogic_result;

    // Comparisons: the one-bit outcome is zero-extended to 32 bits
    ALU_EQ,  ALU_NE,
    ALU_LT,  ALU_LTU,
    ALU_GE,  ALU_GEU,
    ALU_SLT, ALU_SLTU: result_o = {31'h0, cmp_result};

    // Shifts (SLO / SRO are RV32B)
    ALU_SLL, ALU_SRL, ALU_SRA,
    ALU_SLO, ALU_SRO: result_o = shift_result;

    // RV32B: min / max
    ALU_MIN, ALU_MINU,
    ALU_MAX, ALU_MAXU: result_o = minmax_result;

    // RV32B: bit counting, padded with 26 zero bits at the top
    ALU_CLZ, ALU_CTZ, ALU_PCNT: result_o = {26'h0, bitcnt_result};

    // RV32B: pack
    ALU_PACK, ALU_PACKU, ALU_PACKH: result_o = pack_result;

    // RV32B: sign extension
    ALU_SEXTB, ALU_SEXTH: result_o = sext_result;

    // RV32B: single-bit operations
    ALU_SBSET, ALU_SBCLR,
    ALU_SBINV, ALU_SBEXT: result_o = singlebit_result;

    // RV32B: general reverse / or-combine
    ALU_GREV, ALU_GORC: result_o = rev_result;

    // RV32B: shuffle / unshuffle
    ALU_SHFL, ALU_UNSHFL: result_o = shuffle_result;

    // RV32B: bit field place
    ALU_BFP: result_o = bfp_result;

    // RV32B: carry-less multiply
    ALU_CLMUL, ALU_CLMULR, ALU_CLMULH: result_o = clmul_result;

    // RV32B multicycle operations: ternary ops, rotates, CRC, bit extract / deposit
    ALU_CMOV, ALU_CMIX,
    ALU_FSL,  ALU_FSR,
    ALU_ROL,  ALU_ROR,
    ALU_CRC32_B, ALU_CRC32C_B,
    ALU_CRC32_H, ALU_CRC32C_H,
    ALU_CRC32_W, ALU_CRC32C_W,
    ALU_BEXT, ALU_BDEP: result_o = multicycle_result;

    // Anything else yields zero
    default: result_o = '0;
  endcase
end
// Routes bit 5 of shift_amt_compl to a signal that nothing else in this chunk reads.
// NOTE(review): presumably a lint sink for a bit the shifter does not consume - confirm.
logic unused_shift_amt_compl;
assign unused_shift_amt_compl = shift_amt_compl[5];
endmodule