// blob: 840b8897118b246fe4e80360985c2fae70207e18
// Copyright 2019 ETH Zurich and University of Bologna.
//
// Copyright and related rights are licensed under the Solderpad Hardware
// License, Version 0.51 (the "License"); you may not use this file except in
// compliance with the License. You may obtain a copy of the License at
// http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
// or agreed to in writing, software, hardware and materials distributed under
// this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
// Author: Stefan Mach <smach@iis.ee.ethz.ch>
// Multi-format floating-point fused multiply-add (FMA) unit.
// Operands 0 and 1 (the multiplicands) are interpreted in src_fmt_i; operand 2 (the addend) and
// the result use dst_fmt_i. op_i and op_mod_i select the operation variant, see the table above
// the op_select block further down. The NumPipeRegs register stages are split into input,
// internal and output groups (NUM_INP_REGS, NUM_MID_REGS, NUM_OUT_REGS) according to PipeConfig;
// pipeline stages advance under a valid/ready handshake.
module fpnew_fma_multi #(
// One enable bit per FP format; logic for disabled formats is tied to don't-care values
parameter fpnew_pkg::fmt_logic_t FpFmtConfig = '1,
// Total number of pipeline register stages in this unit
parameter int unsigned NumPipeRegs = 0,
// Placement of the register stages (BEFORE, INSIDE, AFTER or DISTRIBUTED)
parameter fpnew_pkg::pipe_config_t PipeConfig = fpnew_pkg::BEFORE,
// Types of the tag and auxiliary data that travel through the pipeline with each operation
parameter type TagType = logic,
parameter type AuxType = logic,
// Do not change
localparam int unsigned WIDTH = fpnew_pkg::max_fp_width(FpFmtConfig),
localparam int unsigned NUM_FORMATS = fpnew_pkg::NUM_FP_FORMATS
) (
input logic clk_i,
input logic rst_ni,
// Input signals
input logic [2:0][WIDTH-1:0] operands_i, // 3 operands
// Boxing information, indexed [format][operand]
input logic [NUM_FORMATS-1:0][2:0] is_boxed_i, // 3 operands
input fpnew_pkg::roundmode_e rnd_mode_i,
input fpnew_pkg::operation_e op_i,
// Operation modifier: inverts the sign of the addend (operand 2)
input logic op_mod_i,
input fpnew_pkg::fp_format_e src_fmt_i, // format of the multiplicands
input fpnew_pkg::fp_format_e dst_fmt_i, // format of the addend and result
input TagType tag_i,
input AuxType aux_i,
// Input Handshake
input logic in_valid_i,
output logic in_ready_o,
// Synchronous clear for the valid bits of the pipeline stages
input logic flush_i,
// Output signals
output logic [WIDTH-1:0] result_o,
output fpnew_pkg::status_t status_o,
output logic extension_bit_o,
output TagType tag_o,
output AuxType aux_o,
// Output handshake
output logic out_valid_o,
input logic out_ready_i,
// Indication of valid data in flight
output logic busy_o
);
// ----------
// Constants
// ----------
// The super-format that can hold all formats
localparam fpnew_pkg::fp_encoding_t SUPER_FORMAT = fpnew_pkg::super_format(FpFmtConfig);
localparam int unsigned SUPER_EXP_BITS = SUPER_FORMAT.exp_bits;
localparam int unsigned SUPER_MAN_BITS = SUPER_FORMAT.man_bits;
// Precision bits 'p' include the implicit bit
localparam int unsigned PRECISION_BITS = SUPER_MAN_BITS + 1;
// The lower 2p+3 bits of the internal FMA result will be needed for leading-zero detection
localparam int unsigned LOWER_SUM_WIDTH = 2 * PRECISION_BITS + 3;
localparam int unsigned LZC_RESULT_WIDTH = $clog2(LOWER_SUM_WIDTH);
// Internal exponent width of FMA must accommodate all meaningful exponent values in order to avoid
// datapath leakage. This is either given by the exponent bits or the width of the LZC result.
// In most reasonable FP formats the internal exponent will be wider than the LZC result.
localparam int unsigned EXP_WIDTH = fpnew_pkg::maximum(SUPER_EXP_BITS + 2, LZC_RESULT_WIDTH);
// Shift amount width: the largest shift distance requested in this module is the saturated addend
// shift of 3p+4, so the range 0..3p+4 (3p+5 distinct values) must be representable. Sizing the
// signal for only 3p+3 values is one bit too narrow whenever 3p+4 is a power of two (e.g. p = 4),
// in which case the saturated shift would wrap around to 0.
localparam int unsigned SHIFT_AMOUNT_WIDTH = $clog2(3 * PRECISION_BITS + 5);
// Pipelines: split NumPipeRegs into input / internal / output register counts. For DISTRIBUTED
// the three groups sum up to NumPipeRegs, with the internal group served first, then the input
// group, then the output group.
localparam int unsigned NUM_INP_REGS = PipeConfig == fpnew_pkg::BEFORE
    ? NumPipeRegs
    : (PipeConfig == fpnew_pkg::DISTRIBUTED
        ? ((NumPipeRegs + 1) / 3) // Second to get distributed regs
        : 0); // no regs here otherwise
localparam int unsigned NUM_MID_REGS = PipeConfig == fpnew_pkg::INSIDE
    ? NumPipeRegs
    : (PipeConfig == fpnew_pkg::DISTRIBUTED
        ? ((NumPipeRegs + 2) / 3) // First to get distributed regs
        : 0); // no regs here otherwise
localparam int unsigned NUM_OUT_REGS = PipeConfig == fpnew_pkg::AFTER
    ? NumPipeRegs
    : (PipeConfig == fpnew_pkg::DISTRIBUTED
        ? (NumPipeRegs / 3) // Last to get distributed regs
        : 0); // no regs here otherwise
// ----------------
// Type definition
// ----------------
// Sign / exponent / mantissa view of a value in the super-format. The mantissa field holds the
// stored fraction only; the implicit bit is prepended separately in the product data path.
typedef struct packed {
logic sign;
logic [SUPER_EXP_BITS-1:0] exponent;
logic [SUPER_MAN_BITS-1:0] mantissa;
} fp_t;
// ---------------
// Input pipeline
// ---------------
// NUM_INP_REGS register stages between the module inputs and the combinational input processing.
// Index 0 of every inp_pipe_* array is the unregistered module input, index NUM_INP_REGS is what
// the downstream logic consumes.
// Selected pipeline output signals as non-arrays
logic [2:0][WIDTH-1:0] operands_q;
fpnew_pkg::fp_format_e src_fmt_q;
fpnew_pkg::fp_format_e dst_fmt_q;
// Input pipeline signals, index i holds signal after i register stages
logic [0:NUM_INP_REGS][2:0][WIDTH-1:0] inp_pipe_operands_q;
logic [0:NUM_INP_REGS][NUM_FORMATS-1:0][2:0] inp_pipe_is_boxed_q;
fpnew_pkg::roundmode_e [0:NUM_INP_REGS] inp_pipe_rnd_mode_q;
fpnew_pkg::operation_e [0:NUM_INP_REGS] inp_pipe_op_q;
logic [0:NUM_INP_REGS] inp_pipe_op_mod_q;
fpnew_pkg::fp_format_e [0:NUM_INP_REGS] inp_pipe_src_fmt_q;
fpnew_pkg::fp_format_e [0:NUM_INP_REGS] inp_pipe_dst_fmt_q;
TagType [0:NUM_INP_REGS] inp_pipe_tag_q;
AuxType [0:NUM_INP_REGS] inp_pipe_aux_q;
logic [0:NUM_INP_REGS] inp_pipe_valid_q;
// Ready signal is combinatorial for all stages
logic [0:NUM_INP_REGS] inp_pipe_ready;
// Input stage: First element of pipeline is taken from inputs
assign inp_pipe_operands_q[0] = operands_i;
assign inp_pipe_is_boxed_q[0] = is_boxed_i;
assign inp_pipe_rnd_mode_q[0] = rnd_mode_i;
assign inp_pipe_op_q[0] = op_i;
assign inp_pipe_op_mod_q[0] = op_mod_i;
assign inp_pipe_src_fmt_q[0] = src_fmt_i;
assign inp_pipe_dst_fmt_q[0] = dst_fmt_i;
assign inp_pipe_tag_q[0] = tag_i;
assign inp_pipe_aux_q[0] = aux_i;
assign inp_pipe_valid_q[0] = in_valid_i;
// Input stage: Propagate pipeline ready signal to upstream circuitry
assign in_ready_o = inp_pipe_ready[0];
// Generate the register stages
for (genvar i = 0; i < NUM_INP_REGS; i++) begin : gen_input_pipeline
// Internal register enable for this stage
logic reg_ena;
// Determine the ready signal of the current stage - advance the pipeline:
// 1. if the next stage is ready for our data
// 2. if the next stage only holds a bubble (not valid) -> we can pop it
assign inp_pipe_ready[i] = inp_pipe_ready[i+1] | ~inp_pipe_valid_q[i+1];
// Valid: enabled by ready signal, synchronous clear with the flush signal
`FFLARNC(inp_pipe_valid_q[i+1], inp_pipe_valid_q[i], inp_pipe_ready[i], flush_i, 1'b0, clk_i, rst_ni)
// Enable register if pipeline ready and a valid data item is present
assign reg_ena = inp_pipe_ready[i] & inp_pipe_valid_q[i];
// Generate the pipeline registers within the stages, use enable-registers
// (last macro argument is the reset value of the register)
`FFL(inp_pipe_operands_q[i+1], inp_pipe_operands_q[i], reg_ena, '0)
`FFL(inp_pipe_is_boxed_q[i+1], inp_pipe_is_boxed_q[i], reg_ena, '0)
`FFL(inp_pipe_rnd_mode_q[i+1], inp_pipe_rnd_mode_q[i], reg_ena, fpnew_pkg::RNE)
`FFL(inp_pipe_op_q[i+1], inp_pipe_op_q[i], reg_ena, fpnew_pkg::FMADD)
`FFL(inp_pipe_op_mod_q[i+1], inp_pipe_op_mod_q[i], reg_ena, '0)
`FFL(inp_pipe_src_fmt_q[i+1], inp_pipe_src_fmt_q[i], reg_ena, fpnew_pkg::fp_format_e'(0))
`FFL(inp_pipe_dst_fmt_q[i+1], inp_pipe_dst_fmt_q[i], reg_ena, fpnew_pkg::fp_format_e'(0))
`FFL(inp_pipe_tag_q[i+1], inp_pipe_tag_q[i], reg_ena, TagType'('0))
`FFL(inp_pipe_aux_q[i+1], inp_pipe_aux_q[i], reg_ena, AuxType'('0))
end
// Output stage: assign selected pipe outputs to signals for later use
assign operands_q = inp_pipe_operands_q[NUM_INP_REGS];
assign src_fmt_q = inp_pipe_src_fmt_q[NUM_INP_REGS];
assign dst_fmt_q = inp_pipe_dst_fmt_q[NUM_INP_REGS];
// -----------------
// Input processing
// -----------------
// For every FP format, split the three operands into sign / exponent / mantissa fields sized for
// the super-format and classify them. The per-format results are selected by src_fmt_q / dst_fmt_q
// in the op_select block.
logic [NUM_FORMATS-1:0][2:0] fmt_sign;
logic signed [NUM_FORMATS-1:0][2:0][SUPER_EXP_BITS-1:0] fmt_exponent;
logic [NUM_FORMATS-1:0][2:0][SUPER_MAN_BITS-1:0] fmt_mantissa;
fpnew_pkg::fp_info_t [NUM_FORMATS-1:0][2:0] info_q;
// FP Input initialization
for (genvar fmt = 0; fmt < int'(NUM_FORMATS); fmt++) begin : fmt_init_inputs
// Set up some constants
localparam int unsigned FP_WIDTH = fpnew_pkg::fp_width(fpnew_pkg::fp_format_e'(fmt));
localparam int unsigned EXP_BITS = fpnew_pkg::exp_bits(fpnew_pkg::fp_format_e'(fmt));
localparam int unsigned MAN_BITS = fpnew_pkg::man_bits(fpnew_pkg::fp_format_e'(fmt));
if (FpFmtConfig[fmt]) begin : active_format
// Operands cut down to the width of this format (lower FP_WIDTH bits)
logic [2:0][FP_WIDTH-1:0] trimmed_ops;
// Classify input
fpnew_classifier #(
.FpFormat ( fpnew_pkg::fp_format_e'(fmt) ),
.NumOperands ( 3 )
) i_fpnew_classifier (
.operands_i ( trimmed_ops ),
.is_boxed_i ( inp_pipe_is_boxed_q[NUM_INP_REGS][fmt] ),
.info_o ( info_q[fmt] )
);
for (genvar op = 0; op < 3; op++) begin : gen_operands
assign trimmed_ops[op] = operands_q[op][FP_WIDTH-1:0];
assign fmt_sign[fmt][op] = operands_q[op][FP_WIDTH-1];
// Exponent is zero-extended into the super-format exponent width
assign fmt_exponent[fmt][op] = signed'({1'b0, operands_q[op][MAN_BITS+:EXP_BITS]});
// Fraction bits are left-aligned within the super-format mantissa.
// NOTE(review): assuming is_normal is a single bit, the shift places it one position above the
// SUPER_MAN_BITS-wide target, so it is not stored in fmt_mantissa - confirm this is intended.
assign fmt_mantissa[fmt][op] = {info_q[fmt][op].is_normal, operands_q[op][MAN_BITS-1:0]} <<
(SUPER_MAN_BITS - MAN_BITS); // move to left of mantissa
end
end else begin : inactive_format
assign info_q[fmt] = '{default: fpnew_pkg::DONT_CARE}; // format disabled
assign fmt_sign[fmt] = fpnew_pkg::DONT_CARE; // format disabled
assign fmt_exponent[fmt] = '{default: fpnew_pkg::DONT_CARE}; // format disabled
assign fmt_mantissa[fmt] = '{default: fpnew_pkg::DONT_CARE}; // format disabled
end
end
// Operands and their classification after format selection and operation-specific adjustment
fp_t operand_a, operand_b, operand_c;
fpnew_pkg::fp_info_t info_a, info_b, info_c;
// Operation selection and operand adjustment
// | \c op_q | \c op_mod_q | Operation Adjustment
// |:--------:|:-----------:|---------------------
// | FMADD | \c 0 | FMADD: none
// | FMADD | \c 1 | FMSUB: Invert sign of operand C
// | FNMSUB | \c 0 | FNMSUB: Invert sign of operand A
// | FNMSUB | \c 1 | FNMADD: Invert sign of operands A and C
// | ADD | \c 0 | ADD: Set operand A to +1.0
// | ADD | \c 1 | SUB: Set operand A to +1.0, invert sign of operand C
// | MUL | \c 0 | MUL: Set operand C to -0.0
// | *others* | \c - | *invalid*
// \note \c op_mod_q always inverts the sign of the addend.
always_comb begin : op_select
// Default assignments - packing-order-agnostic
// Multiplicands A and B are taken from the source format, addend C from the destination format
operand_a = {fmt_sign[src_fmt_q][0], fmt_exponent[src_fmt_q][0], fmt_mantissa[src_fmt_q][0]};
operand_b = {fmt_sign[src_fmt_q][1], fmt_exponent[src_fmt_q][1], fmt_mantissa[src_fmt_q][1]};
operand_c = {fmt_sign[dst_fmt_q][2], fmt_exponent[dst_fmt_q][2], fmt_mantissa[dst_fmt_q][2]};
info_a = info_q[src_fmt_q][0];
info_b = info_q[src_fmt_q][1];
info_c = info_q[dst_fmt_q][2];
// op_mod_q inverts sign of operand C
operand_c.sign = operand_c.sign ^ inp_pipe_op_mod_q[NUM_INP_REGS];
unique case (inp_pipe_op_q[NUM_INP_REGS])
fpnew_pkg::FMADD: ; // do nothing
fpnew_pkg::FNMSUB: operand_a.sign = ~operand_a.sign; // invert sign of product
fpnew_pkg::ADD: begin // Set multiplicand to +1
// +1.0 in the source format: exponent equals the bias, fraction is zero
operand_a = '{sign: 1'b0, exponent: fpnew_pkg::bias(src_fmt_q), mantissa: '0};
info_a = '{is_normal: 1'b1, is_boxed: 1'b1, default: 1'b0}; //normal, boxed value.
end
fpnew_pkg::MUL: begin // Set addend to -0 (for proper rounding with RDN)
// This overwrites operand C entirely, including the op_mod_q sign inversion applied above
operand_c = '{sign: 1'b1, exponent: '0, mantissa: '0};
info_c = '{is_zero: 1'b1, is_boxed: 1'b1, default: 1'b0}; //zero, boxed value.
end
default: begin // propagate don't cares
operand_a = '{default: fpnew_pkg::DONT_CARE};
operand_b = '{default: fpnew_pkg::DONT_CARE};
operand_c = '{default: fpnew_pkg::DONT_CARE};
info_a = '{default: fpnew_pkg::DONT_CARE};
info_b = '{default: fpnew_pkg::DONT_CARE};
info_c = '{default: fpnew_pkg::DONT_CARE};
end
endcase
end
// ---------------------
// Input classification
// ---------------------
logic any_operand_inf;
logic any_operand_nan;
logic signalling_nan;
logic effective_subtraction;
logic tentative_sign;
// Special-case flags: raised as soon as one of the three adjusted operands is of that class
assign any_operand_inf = info_a.is_inf || info_b.is_inf || info_c.is_inf;
assign any_operand_nan = info_a.is_nan || info_b.is_nan || info_c.is_nan;
assign signalling_nan = info_a.is_signalling || info_b.is_signalling || info_c.is_signalling;
// Sign of the product a*b, used as the first guess for the sign of the FMA result
assign tentative_sign = operand_a.sign ^ operand_b.sign;
// The adder effectively subtracts whenever the product sign and the addend sign differ
assign effective_subtraction = tentative_sign ^ operand_c.sign;
// ----------------------
// Special case handling
// ----------------------
// Results that bypass the arithmetic data path (NaN and infinity cases) are computed once per
// format; the entry of the destination format is selected afterwards.
logic [WIDTH-1:0] special_result;
fpnew_pkg::status_t special_status;
logic result_is_special;
logic [NUM_FORMATS-1:0][WIDTH-1:0] fmt_special_result;
fpnew_pkg::status_t [NUM_FORMATS-1:0] fmt_special_status;
logic [NUM_FORMATS-1:0] fmt_result_is_special;
for (genvar fmt = 0; fmt < int'(NUM_FORMATS); fmt++) begin : gen_special_results
// Set up some constants
localparam int unsigned FP_WIDTH = fpnew_pkg::fp_width(fpnew_pkg::fp_format_e'(fmt));
localparam int unsigned EXP_BITS = fpnew_pkg::exp_bits(fpnew_pkg::fp_format_e'(fmt));
localparam int unsigned MAN_BITS = fpnew_pkg::man_bits(fpnew_pkg::fp_format_e'(fmt));
// All-ones exponent is shared by the NaN and infinity encodings built below
localparam logic [EXP_BITS-1:0] QNAN_EXPONENT = '1;
// Only the mantissa MSB is set for the quiet NaN
localparam logic [MAN_BITS-1:0] QNAN_MANTISSA = 2**(MAN_BITS-1);
localparam logic [MAN_BITS-1:0] ZERO_MANTISSA = '0;
if (FpFmtConfig[fmt]) begin : active_format
always_comb begin : special_results
logic [FP_WIDTH-1:0] special_res;
// Default assignment
special_res = {1'b0, QNAN_EXPONENT, QNAN_MANTISSA}; // qNaN
fmt_special_status[fmt] = '0;
fmt_result_is_special[fmt] = 1'b0;
// Handle potentially mixed nan & infinity input => important for the case where infinity and
// zero are multiplied and added to a qnan.
// RISC-V mandates raising the NV exception in these cases:
// (inf * 0) + c or (0 * inf) + c INVALID, no matter c (even quiet NaNs)
if ((info_a.is_inf && info_b.is_zero) || (info_a.is_zero && info_b.is_inf)) begin
fmt_result_is_special[fmt] = 1'b1; // bypass FMA, output is the canonical qNaN
fmt_special_status[fmt].NV = 1'b1; // invalid operation
// NaN Inputs cause canonical quiet NaN at the output and maybe invalid OP
end else if (any_operand_nan) begin
fmt_result_is_special[fmt] = 1'b1; // bypass FMA, output is the canonical qNaN
fmt_special_status[fmt].NV = signalling_nan; // raise the invalid operation flag if signalling
// Special cases involving infinity
end else if (any_operand_inf) begin
fmt_result_is_special[fmt] = 1'b1; // bypass FMA
// Effective addition of opposite infinities (±inf - ±inf) is invalid!
// The result stays at the default qNaN in this case.
if ((info_a.is_inf || info_b.is_inf) && info_c.is_inf && effective_subtraction)
fmt_special_status[fmt].NV = 1'b1; // invalid operation
// Handle cases where output will be inf because of inf product input
else if (info_a.is_inf || info_b.is_inf) begin
// Result is infinity with the sign of the product
special_res = {operand_a.sign ^ operand_b.sign, QNAN_EXPONENT, ZERO_MANTISSA};
// Handle cases where the addend is inf
end else if (info_c.is_inf) begin
// Result is infinity with sign of the addend (= operand_c)
special_res = {operand_c.sign, QNAN_EXPONENT, ZERO_MANTISSA};
end
end
// Initialize special result with ones (NaN-box)
fmt_special_result[fmt] = '1;
fmt_special_result[fmt][FP_WIDTH-1:0] = special_res;
end
end else begin : inactive_format
assign fmt_special_result[fmt] = '{default: fpnew_pkg::DONT_CARE};
assign fmt_special_status[fmt] = '0;
assign fmt_result_is_special[fmt] = 1'b0;
end
end
// Special-case detection, selected by the destination format
assign result_is_special = fmt_result_is_special[dst_fmt_q]; // they're all the same
// Status flags of the special case (only NV is ever set above), selected by destination format
assign special_status = fmt_special_status[dst_fmt_q];
// Assemble result according to destination format
assign special_result = fmt_special_result[dst_fmt_q]; // destination format
// ---------------------------
// Initial exponent data path
// ---------------------------
// Computes the product and addend exponents (biased for the destination format), their difference
// and, from that, by how much the addend has to be shifted right before the addition.
logic signed [EXP_WIDTH-1:0] exponent_a, exponent_b, exponent_c;
logic signed [EXP_WIDTH-1:0] exponent_addend, exponent_product, exponent_difference;
logic signed [EXP_WIDTH-1:0] tentative_exponent;
// Zero-extend exponents into signed container - implicit width extension
assign exponent_a = signed'({1'b0, operand_a.exponent});
assign exponent_b = signed'({1'b0, operand_b.exponent});
assign exponent_c = signed'({1'b0, operand_c.exponent});
// Calculate internal exponents from encoded values. Real exponents are (ex = Ex - bias + 1 - nx)
// with Ex the encoded exponent and nx the implicit bit. Internal exponents are biased to dst fmt.
assign exponent_addend = signed'(exponent_c + $signed({1'b0, ~info_c.is_normal})); // 0 as subnorm
// Biased product exponent is the sum of encoded exponents minus the bias.
// Subnormal multiplicands contribute +1 each, matching the (1 - nx) term above.
assign exponent_product = (info_a.is_zero || info_b.is_zero) // in case the product is zero, set minimum exp.
? 2 - signed'(fpnew_pkg::bias(dst_fmt_q))
: signed'(exponent_a + info_a.is_subnormal
+ exponent_b + info_b.is_subnormal
- 2*signed'(fpnew_pkg::bias(src_fmt_q))
+ signed'(fpnew_pkg::bias(dst_fmt_q))); // rebias for dst fmt
// Exponent difference is the addend exponent minus the product exponent
assign exponent_difference = exponent_addend - exponent_product;
// The tentative exponent will be the larger of the product or addend exponent
assign tentative_exponent = (exponent_difference > 0) ? exponent_addend : exponent_product;
// Shift amount for addend based on exponents (unsigned as only right shifts)
logic [SHIFT_AMOUNT_WIDTH-1:0] addend_shamt;
always_comb begin : addend_shift_amount
// Product-anchored case, saturated shift (addend is only in the sticky bit)
// NOTE: 3p+4 is the largest value assigned here; addend_shamt must be wide enough to hold it.
if (exponent_difference <= signed'(-2 * PRECISION_BITS - 1))
addend_shamt = 3 * PRECISION_BITS + 4;
// Addend and product will have mutual bits to add
else if (exponent_difference <= signed'(PRECISION_BITS + 2))
addend_shamt = unsigned'(signed'(PRECISION_BITS) + 3 - exponent_difference);
// Addend-anchored case, saturated shift (product is only in the sticky bit)
else
addend_shamt = 0;
end
// ------------------
// Product data path
// ------------------
// Multiplies the two p-bit significands and places the 2p-bit product at a fixed position inside
// the 3p+4 bit adder operand.
logic [PRECISION_BITS-1:0] mantissa_a, mantissa_b, mantissa_c;
logic [2*PRECISION_BITS-1:0] product; // the p*p product is 2p bits wide
logic [3*PRECISION_BITS+3:0] product_shifted; // addends are 3p+4 bit wide (including G/R)
// Add implicit bits to mantissae
// (the implicit bit is 1 for normal operands, 0 otherwise)
assign mantissa_a = {info_a.is_normal, operand_a.mantissa};
assign mantissa_b = {info_b.is_normal, operand_b.mantissa};
assign mantissa_c = {info_c.is_normal, operand_c.mantissa};
// Mantissa multiplier (a*b)
assign product = mantissa_a * mantissa_b;
// Product is placed into a 3p+4 bit wide vector, padded with 2 bits for round and sticky:
// | 000...000 | product | RS |
// <- p+2 -> <- 2p -> < 2>
assign product_shifted = product << 2; // constant shift
// -----------------
// Addend data path
// -----------------
// Aligns the addend significand to the product, collects the bits shifted out at the bottom into
// a sticky bit and inverts the aligned addend for effective subtractions.
logic [3*PRECISION_BITS+3:0] addend_after_shift; // upper 3p+4 bits are needed to go on
logic [PRECISION_BITS-1:0] addend_sticky_bits; // up to p bit of shifted addend are sticky
logic sticky_before_add; // they are compressed into a single sticky bit
logic [3*PRECISION_BITS+3:0] addend_shifted; // addends are 3p+4 bit wide (including G/R)
logic inject_carry_in; // inject carry for subtractions if needed
// In parallel, the addend is right-shifted according to the exponent difference. Up to p bits are
// shifted out and compressed into a sticky bit.
// BEFORE THE SHIFT:
// | mantissa_c | 000..000 |
// <- p -> <- 3p+4 ->
// AFTER THE SHIFT:
// | 000..........000 | mantissa_c | 000...............0GR | sticky bits |
// <- addend_shamt -> <- p -> <- 2p+4-addend_shamt -> <- up to p ->
// The concatenation on the left-hand side is 4p+4 bits wide, which sets the width of the shifts
assign {addend_after_shift, addend_sticky_bits} =
(mantissa_c << (3 * PRECISION_BITS + 4)) >> addend_shamt;
assign sticky_before_add = (| addend_sticky_bits);
// In case of a subtraction, the addend is inverted
assign addend_shifted = (effective_subtraction) ? ~addend_after_shift : addend_after_shift;
// The +1 that completes the two's complement is only injected when no sticky bits were shifted out
assign inject_carry_in = effective_subtraction & ~sticky_before_add;
// ------
// Adder
// ------
// Adds the aligned product and addend, converts a negative difference back to a magnitude and
// derives the final result sign.
logic [3*PRECISION_BITS+4:0] sum_raw; // added one bit for the carry
logic sum_carry; // observe carry bit from sum for sign fixing
logic [3*PRECISION_BITS+3:0] sum; // discard carry as sum won't overflow
logic final_sign;
//Mantissa adder (ab+c). In normal addition, it cannot overflow.
assign sum_raw = product_shifted + addend_shifted + inject_carry_in;
assign sum_carry = sum_raw[3*PRECISION_BITS+4];
// Complement negative sum (can only happen in subtraction -> overflows for positive results)
assign sum = (effective_subtraction && ~sum_carry) ? -sum_raw : sum_raw;
// In case of a mispredicted subtraction result, do a sign flip
// For subtractions the result is negative exactly when sum_carry equals tentative_sign;
// for additions the product sign is kept.
assign final_sign = (effective_subtraction && (sum_carry == tentative_sign))
? 1'b1
: (effective_subtraction ? 1'b0 : tentative_sign);
// ---------------
// Internal pipeline
// ---------------
// NUM_MID_REGS register stages between the adder and the normalization logic. Index 0 of every
// mid_pipe_* array is the combinational upstream value, index NUM_MID_REGS feeds the logic below.
// Pipeline output signals as non-arrays
logic effective_subtraction_q;
logic signed [EXP_WIDTH-1:0] exponent_product_q;
logic signed [EXP_WIDTH-1:0] exponent_difference_q;
logic signed [EXP_WIDTH-1:0] tentative_exponent_q;
logic [SHIFT_AMOUNT_WIDTH-1:0] addend_shamt_q;
logic sticky_before_add_q;
logic [3*PRECISION_BITS+3:0] sum_q;
logic final_sign_q;
fpnew_pkg::fp_format_e dst_fmt_q2;
fpnew_pkg::roundmode_e rnd_mode_q;
logic result_is_special_q;
// NOTE: the WIDTH-bit special_result is carried in an fp_t-typed container here
fp_t special_result_q;
fpnew_pkg::status_t special_status_q;
// Internal pipeline signals, index i holds signal after i register stages
logic [0:NUM_MID_REGS] mid_pipe_eff_sub_q;
logic signed [0:NUM_MID_REGS][EXP_WIDTH-1:0] mid_pipe_exp_prod_q;
logic signed [0:NUM_MID_REGS][EXP_WIDTH-1:0] mid_pipe_exp_diff_q;
logic signed [0:NUM_MID_REGS][EXP_WIDTH-1:0] mid_pipe_tent_exp_q;
logic [0:NUM_MID_REGS][SHIFT_AMOUNT_WIDTH-1:0] mid_pipe_add_shamt_q;
logic [0:NUM_MID_REGS] mid_pipe_sticky_q;
logic [0:NUM_MID_REGS][3*PRECISION_BITS+3:0] mid_pipe_sum_q;
logic [0:NUM_MID_REGS] mid_pipe_final_sign_q;
fpnew_pkg::roundmode_e [0:NUM_MID_REGS] mid_pipe_rnd_mode_q;
fpnew_pkg::fp_format_e [0:NUM_MID_REGS] mid_pipe_dst_fmt_q;
logic [0:NUM_MID_REGS] mid_pipe_res_is_spec_q;
fp_t [0:NUM_MID_REGS] mid_pipe_spec_res_q;
fpnew_pkg::status_t [0:NUM_MID_REGS] mid_pipe_spec_stat_q;
TagType [0:NUM_MID_REGS] mid_pipe_tag_q;
AuxType [0:NUM_MID_REGS] mid_pipe_aux_q;
logic [0:NUM_MID_REGS] mid_pipe_valid_q;
// Ready signal is combinatorial for all stages
logic [0:NUM_MID_REGS] mid_pipe_ready;
// Input stage: First element of pipeline is taken from upstream logic
assign mid_pipe_eff_sub_q[0] = effective_subtraction;
assign mid_pipe_exp_prod_q[0] = exponent_product;
assign mid_pipe_exp_diff_q[0] = exponent_difference;
assign mid_pipe_tent_exp_q[0] = tentative_exponent;
assign mid_pipe_add_shamt_q[0] = addend_shamt;
assign mid_pipe_sticky_q[0] = sticky_before_add;
assign mid_pipe_sum_q[0] = sum;
assign mid_pipe_final_sign_q[0] = final_sign;
assign mid_pipe_rnd_mode_q[0] = inp_pipe_rnd_mode_q[NUM_INP_REGS];
assign mid_pipe_dst_fmt_q[0] = dst_fmt_q;
assign mid_pipe_res_is_spec_q[0] = result_is_special;
assign mid_pipe_spec_res_q[0] = special_result;
assign mid_pipe_spec_stat_q[0] = special_status;
assign mid_pipe_tag_q[0] = inp_pipe_tag_q[NUM_INP_REGS];
assign mid_pipe_aux_q[0] = inp_pipe_aux_q[NUM_INP_REGS];
assign mid_pipe_valid_q[0] = inp_pipe_valid_q[NUM_INP_REGS];
// Input stage: Propagate pipeline ready signal to input pipe
assign inp_pipe_ready[NUM_INP_REGS] = mid_pipe_ready[0];
// Generate the register stages
for (genvar i = 0; i < NUM_MID_REGS; i++) begin : gen_inside_pipeline
// Internal register enable for this stage
logic reg_ena;
// Determine the ready signal of the current stage - advance the pipeline:
// 1. if the next stage is ready for our data
// 2. if the next stage only holds a bubble (not valid) -> we can pop it
assign mid_pipe_ready[i] = mid_pipe_ready[i+1] | ~mid_pipe_valid_q[i+1];
// Valid: enabled by ready signal, synchronous clear with the flush signal
`FFLARNC(mid_pipe_valid_q[i+1], mid_pipe_valid_q[i], mid_pipe_ready[i], flush_i, 1'b0, clk_i, rst_ni)
// Enable register if pipeline ready and a valid data item is present
assign reg_ena = mid_pipe_ready[i] & mid_pipe_valid_q[i];
// Generate the pipeline registers within the stages, use enable-registers
// (last macro argument is the reset value of the register)
`FFL(mid_pipe_eff_sub_q[i+1], mid_pipe_eff_sub_q[i], reg_ena, '0)
`FFL(mid_pipe_exp_prod_q[i+1], mid_pipe_exp_prod_q[i], reg_ena, '0)
`FFL(mid_pipe_exp_diff_q[i+1], mid_pipe_exp_diff_q[i], reg_ena, '0)
`FFL(mid_pipe_tent_exp_q[i+1], mid_pipe_tent_exp_q[i], reg_ena, '0)
`FFL(mid_pipe_add_shamt_q[i+1], mid_pipe_add_shamt_q[i], reg_ena, '0)
`FFL(mid_pipe_sticky_q[i+1], mid_pipe_sticky_q[i], reg_ena, '0)
`FFL(mid_pipe_sum_q[i+1], mid_pipe_sum_q[i], reg_ena, '0)
`FFL(mid_pipe_final_sign_q[i+1], mid_pipe_final_sign_q[i], reg_ena, '0)
`FFL(mid_pipe_rnd_mode_q[i+1], mid_pipe_rnd_mode_q[i], reg_ena, fpnew_pkg::RNE)
`FFL(mid_pipe_dst_fmt_q[i+1], mid_pipe_dst_fmt_q[i], reg_ena, fpnew_pkg::fp_format_e'(0))
`FFL(mid_pipe_res_is_spec_q[i+1], mid_pipe_res_is_spec_q[i], reg_ena, '0)
`FFL(mid_pipe_spec_res_q[i+1], mid_pipe_spec_res_q[i], reg_ena, '0)
`FFL(mid_pipe_spec_stat_q[i+1], mid_pipe_spec_stat_q[i], reg_ena, '0)
`FFL(mid_pipe_tag_q[i+1], mid_pipe_tag_q[i], reg_ena, TagType'('0))
`FFL(mid_pipe_aux_q[i+1], mid_pipe_aux_q[i], reg_ena, AuxType'('0))
end
// Output stage: assign selected pipe outputs to signals for later use
assign effective_subtraction_q = mid_pipe_eff_sub_q[NUM_MID_REGS];
assign exponent_product_q = mid_pipe_exp_prod_q[NUM_MID_REGS];
assign exponent_difference_q = mid_pipe_exp_diff_q[NUM_MID_REGS];
assign tentative_exponent_q = mid_pipe_tent_exp_q[NUM_MID_REGS];
assign addend_shamt_q = mid_pipe_add_shamt_q[NUM_MID_REGS];
assign sticky_before_add_q = mid_pipe_sticky_q[NUM_MID_REGS];
assign sum_q = mid_pipe_sum_q[NUM_MID_REGS];
assign final_sign_q = mid_pipe_final_sign_q[NUM_MID_REGS];
assign rnd_mode_q = mid_pipe_rnd_mode_q[NUM_MID_REGS];
assign dst_fmt_q2 = mid_pipe_dst_fmt_q[NUM_MID_REGS];
assign result_is_special_q = mid_pipe_res_is_spec_q[NUM_MID_REGS];
assign special_result_q = mid_pipe_spec_res_q[NUM_MID_REGS];
assign special_status_q = mid_pipe_spec_stat_q[NUM_MID_REGS];
// --------------
// Normalization
// --------------
// Shifts the sum left so that the leading one ends up at the top of final_mantissa, adjusts the
// exponent accordingly and folds all bits below the kept mantissa into a sticky bit.
logic [LOWER_SUM_WIDTH-1:0] sum_lower; // lower 2p+3 bits of sum are searched
logic [LZC_RESULT_WIDTH-1:0] leading_zero_count; // the number of leading zeroes
logic signed [LZC_RESULT_WIDTH:0] leading_zero_count_sgn; // signed leading-zero count
logic lzc_zeroes; // in case only zeroes found
logic [SHIFT_AMOUNT_WIDTH-1:0] norm_shamt; // Normalization shift amount
logic signed [EXP_WIDTH-1:0] normalized_exponent;
logic [3*PRECISION_BITS+4:0] sum_shifted; // result after first normalization shift
logic [PRECISION_BITS:0] final_mantissa; // final mantissa before rounding with round bit
logic [2*PRECISION_BITS+2:0] sum_sticky_bits; // remaining 2p+3 sticky bits after normalization
logic sticky_after_norm; // sticky bit after normalization
logic signed [EXP_WIDTH-1:0] final_exponent;
assign sum_lower = sum_q[LOWER_SUM_WIDTH-1:0];
// Leading zero counter for cancellations
lzc #(
.WIDTH ( LOWER_SUM_WIDTH ),
.MODE ( 1 ) // MODE = 1 counts leading zeroes
) i_lzc (
.in_i ( sum_lower ),
.cnt_o ( leading_zero_count ),
.empty_o ( lzc_zeroes )
);
// Zero-extended copy so the count can take part in signed exponent arithmetic
assign leading_zero_count_sgn = signed'({1'b0, leading_zero_count});
// Normalization shift amount based on exponents and LZC (unsigned as only left shifts)
always_comb begin : norm_shift_amount
// Product-anchored case or cancellations require LZC
if ((exponent_difference_q <= 0) || (effective_subtraction_q && (exponent_difference_q <= 2))) begin
// Normal result (biased exponent > 0 and not a zero)
if ((exponent_product_q - leading_zero_count_sgn + 1 >= 0) && !lzc_zeroes) begin
// Undo initial product shift, remove the counted zeroes
norm_shamt = PRECISION_BITS + 2 + leading_zero_count;
normalized_exponent = exponent_product_q - leading_zero_count_sgn + 1; // account for shift
// Subnormal result
end else begin
// Cap the shift distance to align mantissa with minimum exponent
norm_shamt = unsigned'(signed'(PRECISION_BITS + 2 + exponent_product_q));
normalized_exponent = 0; // subnormals encoded as 0
end
// Addend-anchored case
end else begin
norm_shamt = addend_shamt_q; // Undo the initial shift
normalized_exponent = tentative_exponent_q;
end
end
// Do the large normalization shift
assign sum_shifted = sum_q << norm_shamt;
// The addend-anchored case needs a 1-bit normalization since the leading-one can be to the left
// or right of the (non-carry) MSB of the sum.
always_comb begin : small_norm
// Default assignment, discarding carry bit
{final_mantissa, sum_sticky_bits} = sum_shifted;
final_exponent = normalized_exponent;
// The normalized sum has overflown, align right and fix exponent
if (sum_shifted[3*PRECISION_BITS+4]) begin // check the carry bit
{final_mantissa, sum_sticky_bits} = sum_shifted >> 1;
final_exponent = normalized_exponent + 1;
// The normalized sum is normal, nothing to do
end else if (sum_shifted[3*PRECISION_BITS+3]) begin // check the sum MSB
// do nothing
// The normalized sum is still denormal, align left - unless the result is already subnormal
end else if (normalized_exponent > 1) begin
{final_mantissa, sum_sticky_bits} = sum_shifted << 1;
final_exponent = normalized_exponent - 1;
// Otherwise we're denormal
end else begin
final_exponent = '0;
end
end
// Update the sticky bit with the shifted-out bits
assign sticky_after_norm = (| {sum_sticky_bits}) | sticky_before_add_q;
// ----------------------------
// Rounding and classification
// ----------------------------
logic pre_round_sign;
logic [SUPER_EXP_BITS+SUPER_MAN_BITS-1:0] pre_round_abs; // absolute value of result before rounding
logic [1:0] round_sticky_bits;
logic of_before_round, of_after_round; // overflow
logic uf_before_round, uf_after_round; // underflow
logic [NUM_FORMATS-1:0][SUPER_EXP_BITS+SUPER_MAN_BITS-1:0] fmt_pre_round_abs; // per format
logic [NUM_FORMATS-1:0][1:0] fmt_round_sticky_bits;
logic [NUM_FORMATS-1:0] fmt_of_after_round;
logic [NUM_FORMATS-1:0] fmt_uf_after_round;
logic rounded_sign;
logic [SUPER_EXP_BITS+SUPER_MAN_BITS-1:0] rounded_abs; // absolute value of result after rounding
logic result_zero;
// Classification before round. RISC-V mandates checking underflow AFTER rounding!
assign of_before_round = final_exponent >= 2**(fpnew_pkg::exp_bits(dst_fmt_q2))-1; // infinity exponent is all ones
assign uf_before_round = final_exponent == 0; // exponent for subnormals capped to 0
// Pack exponent and mantissa into proper rounding form
for (genvar fmt = 0; fmt < int'(NUM_FORMATS); fmt++) begin : gen_res_assemble
// Set up some constants
localparam int unsigned EXP_BITS = fpnew_pkg::exp_bits(fpnew_pkg::fp_format_e'(fmt));
localparam int unsigned MAN_BITS = fpnew_pkg::man_bits(fpnew_pkg::fp_format_e'(fmt));
logic [EXP_BITS-1:0] pre_round_exponent;
logic [MAN_BITS-1:0] pre_round_mantissa;
if (FpFmtConfig[fmt]) begin : active_format
assign pre_round_exponent = (of_before_round) ? 2**EXP_BITS-2 : final_exponent[EXP_BITS-1:0];
assign pre_round_mantissa = (of_before_round) ? '1 : final_mantissa[SUPER_MAN_BITS-:MAN_BITS];
// Assemble result before rounding. In case of overflow, the largest normal value is set.
assign fmt_pre_round_abs[fmt] = {pre_round_exponent, pre_round_mantissa}; // 0-extend
// Round bit is after mantissa (1 in case of overflow for rounding)
assign fmt_round_sticky_bits[fmt][1] = final_mantissa[SUPER_MAN_BITS-MAN_BITS] |
of_before_round;
// remaining bits in mantissa to sticky (1 in case of overflow for rounding)
if (MAN_BITS < SUPER_MAN_BITS) begin : narrow_sticky
assign fmt_round_sticky_bits[fmt][0] = (| final_mantissa[SUPER_MAN_BITS-MAN_BITS-1:0]) |
sticky_after_norm | of_before_round;
end else begin : normal_sticky
assign fmt_round_sticky_bits[fmt][0] = sticky_after_norm | of_before_round;
end
end else begin : inactive_format
assign fmt_pre_round_abs[fmt] = '{default: fpnew_pkg::DONT_CARE};
assign fmt_round_sticky_bits[fmt] = '{default: fpnew_pkg::DONT_CARE};
end
end
// Assemble result before rounding. In case of overflow, the largest normal value is set.
assign pre_round_sign = final_sign_q;
assign pre_round_abs = fmt_pre_round_abs[dst_fmt_q2];
// In case of overflow, the round and sticky bits are set for proper rounding
assign round_sticky_bits = fmt_round_sticky_bits[dst_fmt_q2];
// Perform the rounding
fpnew_rounding #(
.AbsWidth ( SUPER_EXP_BITS + SUPER_MAN_BITS )
) i_fpnew_rounding (
.abs_value_i ( pre_round_abs ),
.sign_i ( pre_round_sign ),
.round_sticky_bits_i ( round_sticky_bits ),
.rnd_mode_i ( rnd_mode_q ),
.effective_subtraction_i ( effective_subtraction_q ),
.abs_rounded_o ( rounded_abs ),
.sign_o ( rounded_sign ),
.exact_zero_o ( result_zero )
);
// One candidate result per format; the destination format picks among them afterwards
logic [NUM_FORMATS-1:0][WIDTH-1:0] fmt_result;

// Per-format post-processing of the rounded value: over-/underflow detection on the exponent
// field and assembly of the sign-injected, NaN-boxed result
for (genvar fmt = 0; fmt < int'(NUM_FORMATS); fmt++) begin : gen_sign_inject
  // Geometry of the format handled by this iteration
  localparam int unsigned FP_WIDTH = fpnew_pkg::fp_width(fpnew_pkg::fp_format_e'(fmt));
  localparam int unsigned EXP_BITS = fpnew_pkg::exp_bits(fpnew_pkg::fp_format_e'(fmt));
  localparam int unsigned MAN_BITS = fpnew_pkg::man_bits(fpnew_pkg::fp_format_e'(fmt));

  if (FpFmtConfig[fmt]) begin : active_format
    // Exponent field of the rounded value as laid out for this format
    logic [EXP_BITS-1:0] rounded_exponent;
    assign rounded_exponent = rounded_abs[MAN_BITS+:EXP_BITS];

    always_comb begin : post_process
      // An all-zero exponent marks a denormal result, an all-one exponent the infinity encoding
      fmt_uf_after_round[fmt] = rounded_exponent == '0;
      fmt_of_after_round[fmt] = rounded_exponent == '1;
      // Start from all ones so that bits above FP_WIDTH NaN-box a narrow result, then drop
      // sign, exponent and mantissa into the low bits
      fmt_result[fmt] = '1;
      fmt_result[fmt][0+:FP_WIDTH] = {rounded_sign, rounded_abs[0+:EXP_BITS+MAN_BITS]};
    end
  end else begin : inactive_format
    // Disabled formats only drive don't-care values
    assign fmt_result[fmt] = '{default: fpnew_pkg::DONT_CARE};
    assign fmt_uf_after_round[fmt] = fpnew_pkg::DONT_CARE;
    assign fmt_of_after_round[fmt] = fpnew_pkg::DONT_CARE;
  end
end
// Classification after rounding, selected by destination format: pick the per-format
// underflow/overflow indications (derived from the exponent field of the rounded value) that
// belong to the format indexed by dst_fmt_q2.
assign uf_after_round = fmt_uf_after_round[dst_fmt_q2];
assign of_after_round = fmt_of_after_round[dst_fmt_q2];
// -----------------
// Result selection
// -----------------
// Result and status flags of the regular (non-special-case) datapath
logic [WIDTH-1:0] regular_result;
fpnew_pkg::status_t regular_status;
// Assemble regular result: take the NaN-boxed candidate of the destination format
assign regular_result = fmt_result[dst_fmt_q2];
// Status flags are driven member by member. Note that UF reads NX, so the two must stay
// separate assignments.
assign regular_status.NV = 1'b0; // only valid cases are handled in regular path
assign regular_status.DZ = 1'b0; // no divisions
assign regular_status.OF = of_before_round | of_after_round; // rounding can introduce overflow
assign regular_status.UF = uf_after_round & regular_status.NX; // only inexact results raise UF
// Inexact if any round/sticky bit is set or if an overflow occurred before or after rounding
assign regular_status.NX = (| round_sticky_bits) | of_before_round | of_after_round;
// Final results for output pipeline
logic [WIDTH-1:0] result_d;
fpnew_pkg::status_t status_d;
// Select output depending on special case detection: when result_is_special_q is set, the
// registered special result and status replace the regular ones.
assign result_d = result_is_special_q ? special_result_q : regular_result;
assign status_d = result_is_special_q ? special_status_q : regular_status;
// ----------------
// Output Pipeline
// ----------------
// Pipeline storage, one slot per stage boundary: slot 0 is the (unregistered) pipeline input,
// slot i carries the value after i register stages.
// Handshake
logic [0:NUM_OUT_REGS] out_pipe_valid_q;
// Ready is purely combinational across all stages
logic [0:NUM_OUT_REGS] out_pipe_ready;
// Payload
logic [0:NUM_OUT_REGS][WIDTH-1:0] out_pipe_result_q;
fpnew_pkg::status_t [0:NUM_OUT_REGS] out_pipe_status_q;
TagType [0:NUM_OUT_REGS] out_pipe_tag_q;
AuxType [0:NUM_OUT_REGS] out_pipe_aux_q;

// Slot 0, handshake: valid comes from the last middle-pipeline slot, ready is handed back to it
assign out_pipe_valid_q[0] = mid_pipe_valid_q[NUM_MID_REGS];
assign mid_pipe_ready[NUM_MID_REGS] = out_pipe_ready[0];
// Slot 0, payload: freshly selected result and status, tag and aux forwarded from the
// middle pipeline
assign out_pipe_result_q[0] = result_d;
assign out_pipe_status_q[0] = status_d;
assign out_pipe_tag_q[0] = mid_pipe_tag_q[NUM_MID_REGS];
assign out_pipe_aux_q[0] = mid_pipe_aux_q[NUM_MID_REGS];
// Register stages of the output pipeline, stage i feeds slot i+1
for (genvar i = 0; i < NUM_OUT_REGS; i++) begin : gen_output_pipeline
  // Load enable shared by the payload registers of this stage
  logic reg_ena;

  // The stage may advance when the following slot is empty (a bubble that can be overwritten)
  // or when the following slot is itself ready to hand its data on
  assign out_pipe_ready[i] = ~out_pipe_valid_q[i+1] | out_pipe_ready[i+1];
  // Payload is only captured when the stage advances and the incoming item is valid
  assign reg_ena = out_pipe_valid_q[i] & out_pipe_ready[i];

  // Valid bit: loaded whenever the stage is ready, flush_i acts as synchronous clear
  `FFLARNC(out_pipe_valid_q[i+1], out_pipe_valid_q[i], out_pipe_ready[i], flush_i, 1'b0, clk_i, rst_ni)

  // Payload registers, all gated by the common load enable
  `FFL(out_pipe_tag_q[i+1], out_pipe_tag_q[i], reg_ena, TagType'('0))
  `FFL(out_pipe_aux_q[i+1], out_pipe_aux_q[i], reg_ena, AuxType'('0))
  `FFL(out_pipe_result_q[i+1], out_pipe_result_q[i], reg_ena, '0)
  `FFL(out_pipe_status_q[i+1], out_pipe_status_q[i], reg_ena, '0)
end
// Output stage: downstream ready enters at the last slot and propagates backwards from there
assign out_pipe_ready[NUM_OUT_REGS] = out_ready_i;
// Output stage: the module outputs mirror the last pipeline slot
assign out_valid_o = out_pipe_valid_q[NUM_OUT_REGS];
assign tag_o = out_pipe_tag_q[NUM_OUT_REGS];
assign aux_o = out_pipe_aux_q[NUM_OUT_REGS];
assign result_o = out_pipe_result_q[NUM_OUT_REGS];
assign status_o = out_pipe_status_q[NUM_OUT_REGS];
assign extension_bit_o = 1'b1; // results are always NaN-boxed
// Busy as long as any slot of the input, middle or output pipeline holds a valid item
assign busy_o = (| inp_pipe_valid_q) | (| mid_pipe_valid_q) | (| out_pipe_valid_q);
endmodule