// verilog/rtl/fpnew_fma_multi.sv - third_party/shuttle/sky130/mpw-002/slot-018 - Git at Google

 // Copyright 2019 ETH Zurich and University of Bologna.
 //
 // Copyright and related rights are licensed under the Solderpad Hardware
 // License, Version 0.51 (the "License"); you may not use this file except in
 // compliance with the License. You may obtain a copy of the License at
 // http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
 // or agreed to in writing, software, hardware and materials distributed under
 // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 // Author: Stefan Mach <smach@iis.ee.ethz.ch>

 `include "registers.svh"
 // Multi-format fused multiply-add unit. The two multiplicands are interpreted in src_fmt_i; the
 // addend and the result use dst_fmt_i. op_i together with op_mod_i selects the operation.
 module fpnew_fma_multi #(
   // Per-format enable mask: datapath logic is only generated for formats whose bit is set
   parameter fpnew_pkg::fmt_logic_t   FpFmtConfig = '1,
   // Total number of pipeline register stages inserted into the unit
   parameter int unsigned             NumPipeRegs = 0,
   // Placement of the NumPipeRegs stages (BEFORE / INSIDE / AFTER the datapath, or DISTRIBUTED)
   parameter fpnew_pkg::pipe_config_t PipeConfig  = fpnew_pkg::BEFORE,
   // Types of the tag and auxiliary side-band data that are carried along the pipeline stages
   parameter type                     TagType     = logic,
   parameter type                     AuxType     = logic,
   // Do not change
   localparam int unsigned WIDTH       = fpnew_pkg::max_fp_width(FpFmtConfig),
   localparam int unsigned NUM_FORMATS = fpnew_pkg::NUM_FP_FORMATS
 ) (
   input  logic                        clk_i,
   input  logic                        rst_ni,
   // Input signals
   input  logic [2:0][WIDTH-1:0]       operands_i, // 3 operands
   input  logic [NUM_FORMATS-1:0][2:0] is_boxed_i, // 3 operands
   input  fpnew_pkg::roundmode_e       rnd_mode_i,
   input  fpnew_pkg::operation_e       op_i,
   input  logic                        op_mod_i,
   input  fpnew_pkg::fp_format_e       src_fmt_i, // format of the multiplicands
   input  fpnew_pkg::fp_format_e       dst_fmt_i, // format of the addend and result
   input  TagType                      tag_i,
   input  AuxType                      aux_i,
   // Input Handshake
   input  logic                        in_valid_i,
   output logic                        in_ready_o,
   input  logic                        flush_i,    // synchronously clears the pipeline valid bits
   // Output signals
   output logic [WIDTH-1:0]            result_o,
   output fpnew_pkg::status_t          status_o,
   output logic                        extension_bit_o,
   output TagType                      tag_o,
   output AuxType                      aux_o,
   // Output handshake
   output logic                        out_valid_o,
   input  logic                        out_ready_i,
   // Indication of valid data in flight
   output logic                        busy_o
 );

   // ----------
   // Constants
   // ----------
   // The super-format that can hold all formats
   localparam fpnew_pkg::fp_encoding_t SUPER_FORMAT = fpnew_pkg::super_format(FpFmtConfig);

   localparam int unsigned SUPER_EXP_BITS = SUPER_FORMAT.exp_bits;
   localparam int unsigned SUPER_MAN_BITS = SUPER_FORMAT.man_bits;

   // Precision bits 'p' include the implicit bit
   localparam int unsigned PRECISION_BITS = SUPER_MAN_BITS + 1;
   // The lower 2p+3 bits of the internal FMA result will be needed for leading-zero detection
   localparam int unsigned LOWER_SUM_WIDTH  = 2 * PRECISION_BITS + 3;
   localparam int unsigned LZC_RESULT_WIDTH = $clog2(LOWER_SUM_WIDTH);
   // Internal exponent width of FMA must accommodate all meaningful exponent values in order to
   // avoid datapath leakage. This is either given by the exponent bits or the width of the LZC
   // result. In most reasonable FP formats the internal exponent will be wider than the LZC result.
   localparam int unsigned EXP_WIDTH = fpnew_pkg::maximum(SUPER_EXP_BITS + 2, LZC_RESULT_WIDTH);
   // Shift amount width: the internal mantissa is 3p+4 bits wide and the saturated addend shift is
   // exactly 3p+4, so the shift amount must be able to represent the 3p+5 values 0..3p+4.
   // ($clog2(3p+3) is one bit short whenever 3p+4 is a power of two, e.g. p = 4 or p = 20: the
   // saturated shift would then wrap to 0 and leave the addend unshifted.)
   localparam int unsigned SHIFT_AMOUNT_WIDTH = $clog2(3 * PRECISION_BITS + 5);
   // Pipelines: split NumPipeRegs over the three register locations according to PipeConfig.
   // In DISTRIBUTED mode the middle location is filled first, then the input, then the output.
   localparam int unsigned NUM_INP_REGS = PipeConfig == fpnew_pkg::BEFORE
                                          ? NumPipeRegs
                                          : (PipeConfig == fpnew_pkg::DISTRIBUTED
                                             ? ((NumPipeRegs + 1) / 3) // Second to get distributed regs
                                             : 0); // no regs here otherwise
   localparam int unsigned NUM_MID_REGS = PipeConfig == fpnew_pkg::INSIDE
                                          ? NumPipeRegs
                                          : (PipeConfig == fpnew_pkg::DISTRIBUTED
                                             ? ((NumPipeRegs + 2) / 3) // First to get distributed regs
                                             : 0); // no regs here otherwise
   localparam int unsigned NUM_OUT_REGS = PipeConfig == fpnew_pkg::AFTER
                                          ? NumPipeRegs
                                          : (PipeConfig == fpnew_pkg::DISTRIBUTED
                                             ? (NumPipeRegs / 3) // Last to get distributed regs
                                             : 0); // no regs here otherwise

   // ----------------
   // Type definition
   // ----------------
   // One operand laid out in the super-format: sign, exponent, mantissa (MSB to LSB).
   // The mantissa field holds only the stored fraction bits; the implicit bit is prepended
   // separately where the PRECISION_BITS-wide mantissae are formed.
   typedef struct packed {
     logic                      sign;
     logic [SUPER_EXP_BITS-1:0] exponent;
     logic [SUPER_MAN_BITS-1:0] mantissa;
   } fp_t;

   // ---------------
   // Input pipeline
   // ---------------
   // Values at the end of the input pipeline that the datapath refers to by plain name
   logic [2:0][WIDTH-1:0] operands_q;
   fpnew_pkg::fp_format_e src_fmt_q;
   fpnew_pkg::fp_format_e dst_fmt_q;

   // Stage arrays: element k carries the signal after it has passed k register stages
   logic                  [0:NUM_INP_REGS][2:0][WIDTH-1:0]       inp_pipe_operands_q;
   logic                  [0:NUM_INP_REGS][NUM_FORMATS-1:0][2:0] inp_pipe_is_boxed_q;
   fpnew_pkg::roundmode_e [0:NUM_INP_REGS]                       inp_pipe_rnd_mode_q;
   fpnew_pkg::operation_e [0:NUM_INP_REGS]                       inp_pipe_op_q;
   logic                  [0:NUM_INP_REGS]                       inp_pipe_op_mod_q;
   fpnew_pkg::fp_format_e [0:NUM_INP_REGS]                       inp_pipe_src_fmt_q;
   fpnew_pkg::fp_format_e [0:NUM_INP_REGS]                       inp_pipe_dst_fmt_q;
   TagType                [0:NUM_INP_REGS]                       inp_pipe_tag_q;
   AuxType                [0:NUM_INP_REGS]                       inp_pipe_aux_q;
   logic                  [0:NUM_INP_REGS]                       inp_pipe_valid_q;
   // Back-pressure is purely combinational, one ready bit per stage boundary
   logic [0:NUM_INP_REGS] inp_pipe_ready;

   // Handshake at the module boundary: element 0 of the arrays is the unregistered input
   assign inp_pipe_valid_q[0] = in_valid_i;
   assign in_ready_o          = inp_pipe_ready[0];

   // Payload at the module boundary
   assign inp_pipe_operands_q[0] = operands_i;
   assign inp_pipe_is_boxed_q[0] = is_boxed_i;
   assign inp_pipe_rnd_mode_q[0] = rnd_mode_i;
   assign inp_pipe_op_q[0]       = op_i;
   assign inp_pipe_op_mod_q[0]   = op_mod_i;
   assign inp_pipe_src_fmt_q[0]  = src_fmt_i;
   assign inp_pipe_dst_fmt_q[0]  = dst_fmt_i;
   assign inp_pipe_tag_q[0]      = tag_i;
   assign inp_pipe_aux_q[0]      = aux_i;

   // One iteration per register stage
   for (genvar stage = 0; stage < NUM_INP_REGS; stage++) begin : gen_input_pipeline
     // Load enable of the payload registers of this stage
     logic reg_ena;

     // A stage may accept data when the following stage is empty (a bubble that can be
     // overwritten) or is itself able to hand its data on
     assign inp_pipe_ready[stage] = ~inp_pipe_valid_q[stage+1] | inp_pipe_ready[stage+1];
     // Payload is only captured when there actually is a valid item to move forward
     assign reg_ena = inp_pipe_valid_q[stage] & inp_pipe_ready[stage];

     // Valid bit: loaded whenever the stage is ready, synchronously cleared by flush_i
     `FFLARNC(inp_pipe_valid_q[stage+1], inp_pipe_valid_q[stage], inp_pipe_ready[stage], flush_i, 1'b0, clk_i, rst_ni)

     // Payload registers, all gated by reg_ena
     `FFL(inp_pipe_operands_q[stage+1], inp_pipe_operands_q[stage], reg_ena, '0)
     `FFL(inp_pipe_is_boxed_q[stage+1], inp_pipe_is_boxed_q[stage], reg_ena, '0)
     `FFL(inp_pipe_rnd_mode_q[stage+1], inp_pipe_rnd_mode_q[stage], reg_ena, fpnew_pkg::RNE)
     `FFL(inp_pipe_op_q[stage+1],       inp_pipe_op_q[stage],       reg_ena, fpnew_pkg::FMADD)
     `FFL(inp_pipe_op_mod_q[stage+1],   inp_pipe_op_mod_q[stage],   reg_ena, '0)
     `FFL(inp_pipe_src_fmt_q[stage+1],  inp_pipe_src_fmt_q[stage],  reg_ena, fpnew_pkg::fp_format_e'(0))
     `FFL(inp_pipe_dst_fmt_q[stage+1],  inp_pipe_dst_fmt_q[stage],  reg_ena, fpnew_pkg::fp_format_e'(0))
     `FFL(inp_pipe_tag_q[stage+1],      inp_pipe_tag_q[stage],      reg_ena, TagType'('0))
     `FFL(inp_pipe_aux_q[stage+1],      inp_pipe_aux_q[stage],      reg_ena, AuxType'('0))
   end

   // Tap the last stage for the signals the datapath uses under a short name
   assign dst_fmt_q  = inp_pipe_dst_fmt_q[NUM_INP_REGS];
   assign src_fmt_q  = inp_pipe_src_fmt_q[NUM_INP_REGS];
   assign operands_q = inp_pipe_operands_q[NUM_INP_REGS];

   // -----------------
   // Input processing
   // -----------------
   // Every enabled format decodes all three operands in parallel; the decoded fields are stored
   // in super-format-sized containers so that one of them can later be selected at run time.
   logic        [NUM_FORMATS-1:0][2:0]                     fmt_sign;
   logic signed [NUM_FORMATS-1:0][2:0][SUPER_EXP_BITS-1:0] fmt_exponent;
   logic        [NUM_FORMATS-1:0][2:0][SUPER_MAN_BITS-1:0] fmt_mantissa;

   // Classification of each operand under each format, as produced by fpnew_classifier
   fpnew_pkg::fp_info_t [NUM_FORMATS-1:0][2:0] info_q;

   // FP Input initialization
   for (genvar fmt = 0; fmt < int'(NUM_FORMATS); fmt++) begin : fmt_init_inputs
     // Set up some constants
     localparam int unsigned FP_WIDTH = fpnew_pkg::fp_width(fpnew_pkg::fp_format_e'(fmt));
     localparam int unsigned EXP_BITS = fpnew_pkg::exp_bits(fpnew_pkg::fp_format_e'(fmt));
     localparam int unsigned MAN_BITS = fpnew_pkg::man_bits(fpnew_pkg::fp_format_e'(fmt));

     if (FpFmtConfig[fmt]) begin : active_format
       // The low FP_WIDTH bits of each operand, i.e. the value as seen by this format
       logic [2:0][FP_WIDTH-1:0] trimmed_ops;

       // Classify input
       fpnew_classifier #(
         .FpFormat    ( fpnew_pkg::fp_format_e'(fmt) ),
         .NumOperands ( 3                            )
       ) i_fpnew_classifier (
         .operands_i ( trimmed_ops                            ),
         .is_boxed_i ( inp_pipe_is_boxed_q[NUM_INP_REGS][fmt] ),
         .info_o     ( info_q[fmt]                            )
       );
       for (genvar op = 0; op < 3; op++) begin : gen_operands
         assign trimmed_ops[op]       = operands_q[op][FP_WIDTH-1:0];
         // Sign is the top bit of the format, the exponent sits directly above the mantissa
         assign fmt_sign[fmt][op]     = operands_q[op][FP_WIDTH-1];
         assign fmt_exponent[fmt][op] = signed'({1'b0, operands_q[op][MAN_BITS+:EXP_BITS]});
         // Left-align the stored mantissa inside the super-format mantissa field. The prepended
         // is_normal bit lands at bit SUPER_MAN_BITS after the shift and is dropped by the
         // SUPER_MAN_BITS-wide assignment target.
         assign fmt_mantissa[fmt][op] = {info_q[fmt][op].is_normal, operands_q[op][MAN_BITS-1:0]} <<
                                        (SUPER_MAN_BITS - MAN_BITS); // move to left of mantissa
       end
     end else begin : inactive_format
       assign info_q[fmt]                 = '{default: fpnew_pkg::DONT_CARE}; // format disabled
       assign fmt_sign[fmt]               = fpnew_pkg::DONT_CARE;             // format disabled
       assign fmt_exponent[fmt]           = '{default: fpnew_pkg::DONT_CARE}; // format disabled
       assign fmt_mantissa[fmt]           = '{default: fpnew_pkg::DONT_CARE}; // format disabled
     end
   end

   // Operands and their classification after format selection and operation adjustment
   fp_t                 operand_a, operand_b, operand_c;
   fpnew_pkg::fp_info_t info_a,    info_b,    info_c;

   // Operation selection and operand adjustment
   // | \c op_q  | \c op_mod_q | Operation Adjustment
   // |:--------:|:-----------:|---------------------
   // | FMADD    | \c 0        | FMADD: none
   // | FMADD    | \c 1        | FMSUB: Invert sign of operand C
   // | FNMSUB   | \c 0        | FNMSUB: Invert sign of operand A
   // | FNMSUB   | \c 1        | FNMADD: Invert sign of operands A and C
   // | ADD      | \c 0        | ADD: Set operand A to +1.0
   // | ADD      | \c 1        | SUB: Set operand A to +1.0, invert sign of operand C
   // | MUL      | \c 0        | MUL: Set operand C to -0.0 (+0.0 if the rounding mode is RDN)
   // | *others* | \c -        | *invalid*
   // \note \c op_mod_q inverts the sign of the addend; MUL replaces the addend afterwards.
   always_comb begin : op_select

     // Default assignments - packing-order-agnostic
     // Multiplicands (A, B) are decoded in the source format, the addend (C) in the destination
     // format.
     operand_a = {fmt_sign[src_fmt_q][0], fmt_exponent[src_fmt_q][0], fmt_mantissa[src_fmt_q][0]};
     operand_b = {fmt_sign[src_fmt_q][1], fmt_exponent[src_fmt_q][1], fmt_mantissa[src_fmt_q][1]};
     operand_c = {fmt_sign[dst_fmt_q][2], fmt_exponent[dst_fmt_q][2], fmt_mantissa[dst_fmt_q][2]};
     info_a    = info_q[src_fmt_q][0];
     info_b    = info_q[src_fmt_q][1];
     info_c    = info_q[dst_fmt_q][2];

     // op_mod_q inverts sign of operand C
     operand_c.sign = operand_c.sign ^ inp_pipe_op_mod_q[NUM_INP_REGS];

     unique case (inp_pipe_op_q[NUM_INP_REGS])
       fpnew_pkg::FMADD:  ; // do nothing
       fpnew_pkg::FNMSUB: operand_a.sign = ~operand_a.sign; // invert sign of product
       fpnew_pkg::ADD: begin // Set multiplicand to +1
         operand_a = '{sign: 1'b0, exponent: fpnew_pkg::bias(src_fmt_q), mantissa: '0};
         info_a    = '{is_normal: 1'b1, is_boxed: 1'b1, default: 1'b0}; //normal, boxed value.
       end
       fpnew_pkg::MUL: begin
         // The addend must be a zero that never alters the sign of the product, including a zero
         // product. Adding -0 achieves that in every rounding mode except RDN, where
         // (+0) + (-0) yields -0; adding +0 is the sign-preserving choice for RDN instead
         // ((+0) + (+0) = +0 and (-0) + (+0) = -0 when rounding down).
         if (inp_pipe_rnd_mode_q[NUM_INP_REGS] == fpnew_pkg::RDN)
           operand_c = '{sign: 1'b0, exponent: '0, mantissa: '0};
         else
           operand_c = '{sign: 1'b1, exponent: '0, mantissa: '0};
         info_c    = '{is_zero: 1'b1, is_boxed: 1'b1, default: 1'b0}; //zero, boxed value.
       end
       default: begin // propagate don't cares
         operand_a  = '{default: fpnew_pkg::DONT_CARE};
         operand_b  = '{default: fpnew_pkg::DONT_CARE};
         operand_c  = '{default: fpnew_pkg::DONT_CARE};
         info_a     = '{default: fpnew_pkg::DONT_CARE};
         info_b     = '{default: fpnew_pkg::DONT_CARE};
         info_c     = '{default: fpnew_pkg::DONT_CARE};
       end
     endcase
   end

   // ---------------------
   // Input classification
   // ---------------------
   logic any_operand_inf;
   logic any_operand_nan;
   logic signalling_nan;
   logic effective_subtraction;
   logic tentative_sign;

   // Collapse the per-operand class flags: each summary bit is set if any of A, B, C has it
   assign any_operand_inf = info_a.is_inf        | info_b.is_inf        | info_c.is_inf;
   assign any_operand_nan = info_a.is_nan        | info_b.is_nan        | info_c.is_nan;
   assign signalling_nan  = info_a.is_signalling | info_b.is_signalling | info_c.is_signalling;

   // Until the addition says otherwise, the result carries the sign of the product a*b
   assign tentative_sign = operand_a.sign ^ operand_b.sign;
   // The addition is effectively a subtraction when the addend sign differs from the product sign
   assign effective_subtraction = tentative_sign ^ operand_c.sign;

   // ----------------------
   // Special case handling
   // ----------------------
   // Results that bypass the FMA datapath (NaN / infinity inputs), selected by destination format
   logic [WIDTH-1:0]   special_result;
   fpnew_pkg::status_t special_status;
   logic               result_is_special;

   // Per-format candidates; one entry is picked with dst_fmt_q below
   logic [NUM_FORMATS-1:0][WIDTH-1:0]    fmt_special_result;
   fpnew_pkg::status_t [NUM_FORMATS-1:0] fmt_special_status;
   logic [NUM_FORMATS-1:0]               fmt_result_is_special;


   for (genvar fmt = 0; fmt < int'(NUM_FORMATS); fmt++) begin : gen_special_results
     // Set up some constants
     localparam int unsigned FP_WIDTH = fpnew_pkg::fp_width(fpnew_pkg::fp_format_e'(fmt));
     localparam int unsigned EXP_BITS = fpnew_pkg::exp_bits(fpnew_pkg::fp_format_e'(fmt));
     localparam int unsigned MAN_BITS = fpnew_pkg::man_bits(fpnew_pkg::fp_format_e'(fmt));

     // Field encodings: all-ones exponent with MSB-only mantissa is the quiet NaN, all-ones
     // exponent with zero mantissa is infinity
     localparam logic [EXP_BITS-1:0] QNAN_EXPONENT = '1;
     localparam logic [MAN_BITS-1:0] QNAN_MANTISSA = 2**(MAN_BITS-1);
     localparam logic [MAN_BITS-1:0] ZERO_MANTISSA = '0;

     if (FpFmtConfig[fmt]) begin : active_format
       always_comb begin : special_results
         logic [FP_WIDTH-1:0] special_res;

         // Default assignment
         special_res                = {1'b0, QNAN_EXPONENT, QNAN_MANTISSA}; // qNaN
         fmt_special_status[fmt]    = '0;
         fmt_result_is_special[fmt] = 1'b0;

         // Handle potentially mixed nan & infinity input => important for the case where infinity and
         // zero are multiplied and added to a qnan.
         // RISC-V mandates raising the NV exception in these cases:
         // (inf * 0) + c or (0 * inf) + c INVALID, no matter c (even quiet NaNs)
         if ((info_a.is_inf && info_b.is_zero) || (info_a.is_zero && info_b.is_inf)) begin
           fmt_result_is_special[fmt] = 1'b1; // bypass FMA, output is the canonical qNaN
           fmt_special_status[fmt].NV = 1'b1; // invalid operation
         // NaN Inputs cause canonical quiet NaN at the output and maybe invalid OP
         end else if (any_operand_nan) begin
           fmt_result_is_special[fmt] = 1'b1;           // bypass FMA, output is the canonical qNaN
           fmt_special_status[fmt].NV = signalling_nan; // raise the invalid operation flag if signalling
         // Special cases involving infinity
         end else if (any_operand_inf) begin
           fmt_result_is_special[fmt] = 1'b1; // bypass FMA
           // Effective subtraction of infinities (inf product and inf addend of opposite sign) is
           // invalid; the result stays at the default canonical qNaN
           if ((info_a.is_inf || info_b.is_inf) && info_c.is_inf && effective_subtraction)
             fmt_special_status[fmt].NV = 1'b1; // invalid operation
           // Handle cases where output will be inf because of inf product input
           else if (info_a.is_inf || info_b.is_inf) begin
             // Result is infinity with the sign of the product
             special_res = {operand_a.sign ^ operand_b.sign, QNAN_EXPONENT, ZERO_MANTISSA};
           // Handle cases where the addend is inf
           end else if (info_c.is_inf) begin
             // Result is infinity with sign of the addend (= operand_c)
             special_res = {operand_c.sign, QNAN_EXPONENT, ZERO_MANTISSA};
           end
         end
         // Initialize special result with ones (NaN-box)
         fmt_special_result[fmt]               = '1;
         fmt_special_result[fmt][FP_WIDTH-1:0] = special_res;
       end
     end else begin : inactive_format
       assign fmt_special_result[fmt] = '{default: fpnew_pkg::DONT_CARE};
       assign fmt_special_status[fmt] = '0;
       assign fmt_result_is_special[fmt] = 1'b0;
     end
   end

   // Special-case detection. The flag is derived from the format-independent info_a/b/c signals,
   // so every active format computes the same value; the destination format's copy is used.
   assign result_is_special = fmt_result_is_special[dst_fmt_q]; // they're all the same
   // Status flags of the special case (only NV can be set here)
   assign special_status = fmt_special_status[dst_fmt_q];
   // Assemble result according to destination format
   assign special_result = fmt_special_result[dst_fmt_q]; // destination format

   // ---------------------------
   // Initial exponent data path
   // ---------------------------
   // All exponents in this section are signed EXP_WIDTH-bit values biased for the destination
   // format.
   logic signed [EXP_WIDTH-1:0] exponent_a, exponent_b, exponent_c;
   logic signed [EXP_WIDTH-1:0] exponent_addend, exponent_product, exponent_difference;
   logic signed [EXP_WIDTH-1:0] tentative_exponent;

   // Zero-extend exponents into signed container - implicit width extension
   assign exponent_a = signed'({1'b0, operand_a.exponent});
   assign exponent_b = signed'({1'b0, operand_b.exponent});
   assign exponent_c = signed'({1'b0, operand_c.exponent});

   // Calculate internal exponents from encoded values. Real exponents are (ex = Ex - bias + 1 - nx)
   // with Ex the encoded exponent and nx the implicit bit. Internal exponents are biased to dst fmt.
   assign exponent_addend = signed'(exponent_c + $signed({1'b0, ~info_c.is_normal})); // 0 as subnorm
   // Biased product exponent is the sum of encoded exponents minus the bias.
   // Subnormal multiplicands get +1 (their encoded exponent 0 stands for the minimum exponent).
   assign exponent_product = (info_a.is_zero || info_b.is_zero) // in case the product is zero, set minimum exp.
                             ? 2 - signed'(fpnew_pkg::bias(dst_fmt_q))
                             : signed'(exponent_a + info_a.is_subnormal
                                       + exponent_b + info_b.is_subnormal
                                       - 2*signed'(fpnew_pkg::bias(src_fmt_q))
                                       + signed'(fpnew_pkg::bias(dst_fmt_q))); // rebias for dst fmt
   // Exponent difference is the addend exponent minus the product exponent
   assign exponent_difference = exponent_addend - exponent_product;
   // The tentative exponent will be the larger of the product or addend exponent
   assign tentative_exponent = (exponent_difference > 0) ? exponent_addend : exponent_product;

   // Shift amount for addend based on exponents (unsigned as only right shifts)
   logic [SHIFT_AMOUNT_WIDTH-1:0] addend_shamt;

   // Three regimes, selected by how far the addend exponent lies from the product exponent
   always_comb begin : addend_shift_amount
     // Product-anchored case, saturated shift (addend is only in the sticky bit)
     if (exponent_difference <= signed'(-2 * PRECISION_BITS - 1))
       addend_shamt = 3 * PRECISION_BITS + 4;
     // Addend and product will have mutual bits to add
     else if (exponent_difference <= signed'(PRECISION_BITS + 2))
       addend_shamt = unsigned'(signed'(PRECISION_BITS) + 3 - exponent_difference);
     // Addend-anchored case, saturated shift (product is only in the sticky bit)
     else
       addend_shamt = 0;
   end

   // ------------------
   // Product data path
   // ------------------
   logic [PRECISION_BITS-1:0]   mantissa_a, mantissa_b, mantissa_c;
   logic [2*PRECISION_BITS-1:0] product;             // p x p multiplication yields 2p bits
   logic [3*PRECISION_BITS+3:0] product_shifted;     // 3p+4 bit adder operand

   // Full-precision mantissae: the implicit bit (set for normal numbers only) goes on top of the
   // stored fraction
   assign mantissa_c = {info_c.is_normal, operand_c.mantissa};
   assign mantissa_b = {info_b.is_normal, operand_b.mantissa};
   assign mantissa_a = {info_a.is_normal, operand_a.mantissa};

   // Mantissa multiplier (a*b)
   assign product = mantissa_a * mantissa_b;

   // Position the product inside the 3p+4 bit adder operand. Two zero bits (round and sticky
   // positions) are appended below it, the remaining upper p+2 bits are zero:
   // | 000...000 | product | RS |
   //  <-  p+2  -> <-  2p -> < 2>
   assign product_shifted = {product, 2'b00};

   // -----------------
   // Addend data path
   // -----------------
   logic [3*PRECISION_BITS+3:0] addend_after_shift;  // upper 3p+4 bits are needed to go on
   logic [PRECISION_BITS-1:0]   addend_sticky_bits;  // up to p bit of shifted addend are sticky
   logic                        sticky_before_add;   // they are compressed into a single sticky bit
   logic [3*PRECISION_BITS+3:0] addend_shifted;      // addends are 3p+4 bit wide (including G/R)
   logic                        inject_carry_in;     // inject carry for subtractions if needed

   // In parallel, the addend is right-shifted according to the exponent difference. Up to p bits are
   // shifted out and compressed into a sticky bit.
   // BEFORE THE SHIFT:
   // | mantissa_c | 000..000 |
   //  <-    p   -> <- 3p+4 ->
   // AFTER THE SHIFT:
   // | 000..........000 | mantissa_c | 000...............0GR |  sticky bits  |
   //  <- addend_shamt -> <-    p   -> <- 2p+4-addend_shamt -> <-  up to p  ->
   // The shift expression is evaluated at the width of the 4p+4 bit assignment target, so the
   // left shift by 3p+4 places mantissa_c in the topmost p bits without losing any of them.
   assign {addend_after_shift, addend_sticky_bits} =
       (mantissa_c << (3 * PRECISION_BITS + 4)) >> addend_shamt;

   assign sticky_before_add     = (| addend_sticky_bits);

   // In case of a subtraction, the addend is inverted
   assign addend_shifted = (effective_subtraction) ? ~addend_after_shift : addend_after_shift;
   // The +1 that completes the two's complement is only injected when no sticky bits were
   // shifted out
   assign inject_carry_in = effective_subtraction & ~sticky_before_add;

   // ------
   // Adder
   // ------
   logic [3*PRECISION_BITS+4:0] sum_raw;   // added one bit for the carry
   logic                        sum_carry; // observe carry bit from sum for sign fixing
   logic [3*PRECISION_BITS+3:0] sum;       // discard carry as sum won't overflow
   logic                        final_sign;

   //Mantissa adder (ab+c). In normal addition, it cannot overflow.
   assign sum_raw = product_shifted + addend_shifted + inject_carry_in;
   assign sum_carry = sum_raw[3*PRECISION_BITS+4];

   // Complement negative sum (can only happen in subtraction -> overflows for positive results)
   // The assignment to the narrower 'sum' drops the carry bit of sum_raw.
   assign sum        = (effective_subtraction && ~sum_carry) ? -sum_raw : sum_raw;

   // In case of a mispredicted subtraction result, do a sign flip
   // Subtraction: sign is 1 when the carry equals the tentative sign, 0 otherwise.
   // Addition: the tentative (product) sign is kept.
   assign final_sign = (effective_subtraction && (sum_carry == tentative_sign))
                       ? 1'b1
                       : (effective_subtraction ? 1'b0 : tentative_sign);

   // ------------------
   // Internal pipeline
   // ------------------
   // Values at the end of the internal pipeline, available under plain names
   logic                          effective_subtraction_q;
   logic signed [EXP_WIDTH-1:0]   exponent_product_q;
   logic signed [EXP_WIDTH-1:0]   exponent_difference_q;
   logic signed [EXP_WIDTH-1:0]   tentative_exponent_q;
   logic [SHIFT_AMOUNT_WIDTH-1:0] addend_shamt_q;
   logic                          sticky_before_add_q;
   logic [3*PRECISION_BITS+3:0]   sum_q;
   logic                          final_sign_q;
   fpnew_pkg::fp_format_e         dst_fmt_q2;
   fpnew_pkg::roundmode_e         rnd_mode_q;
   logic                          result_is_special_q;
   fp_t                           special_result_q;
   fpnew_pkg::status_t            special_status_q;
   // Stage arrays: element k carries the signal after it has passed k register stages
   logic                  [0:NUM_MID_REGS]                         mid_pipe_eff_sub_q;
   logic signed           [0:NUM_MID_REGS][EXP_WIDTH-1:0]          mid_pipe_exp_prod_q;
   logic signed           [0:NUM_MID_REGS][EXP_WIDTH-1:0]          mid_pipe_exp_diff_q;
   logic signed           [0:NUM_MID_REGS][EXP_WIDTH-1:0]          mid_pipe_tent_exp_q;
   logic                  [0:NUM_MID_REGS][SHIFT_AMOUNT_WIDTH-1:0] mid_pipe_add_shamt_q;
   logic                  [0:NUM_MID_REGS]                         mid_pipe_sticky_q;
   logic                  [0:NUM_MID_REGS][3*PRECISION_BITS+3:0]   mid_pipe_sum_q;
   logic                  [0:NUM_MID_REGS]                         mid_pipe_final_sign_q;
   fpnew_pkg::roundmode_e [0:NUM_MID_REGS]                         mid_pipe_rnd_mode_q;
   fpnew_pkg::fp_format_e [0:NUM_MID_REGS]                         mid_pipe_dst_fmt_q;
   logic                  [0:NUM_MID_REGS]                         mid_pipe_res_is_spec_q;
   fp_t                   [0:NUM_MID_REGS]                         mid_pipe_spec_res_q;
   fpnew_pkg::status_t    [0:NUM_MID_REGS]                         mid_pipe_spec_stat_q;
   TagType                [0:NUM_MID_REGS]                         mid_pipe_tag_q;
   AuxType                [0:NUM_MID_REGS]                         mid_pipe_aux_q;
   logic                  [0:NUM_MID_REGS]                         mid_pipe_valid_q;
   // Back-pressure is purely combinational, one ready bit per stage boundary
   logic [0:NUM_MID_REGS] mid_pipe_ready;

   // Handshake towards the input pipeline: its last stage drives element 0 of this pipeline
   assign mid_pipe_valid_q[0]          = inp_pipe_valid_q[NUM_INP_REGS];
   assign inp_pipe_ready[NUM_INP_REGS] = mid_pipe_ready[0];

   // Element 0, datapath results computed in front of the internal registers
   assign mid_pipe_sum_q[0]         = sum;
   assign mid_pipe_final_sign_q[0]  = final_sign;
   assign mid_pipe_sticky_q[0]      = sticky_before_add;
   assign mid_pipe_eff_sub_q[0]     = effective_subtraction;
   assign mid_pipe_add_shamt_q[0]   = addend_shamt;
   assign mid_pipe_exp_prod_q[0]    = exponent_product;
   assign mid_pipe_exp_diff_q[0]    = exponent_difference;
   assign mid_pipe_tent_exp_q[0]    = tentative_exponent;
   // Element 0, special-case bypass information
   // NOTE(review): special_result is WIDTH bits wide while the pipeline element is an fp_t;
   // the assignment relies on implicit width adaptation between the two packed types.
   assign mid_pipe_res_is_spec_q[0] = result_is_special;
   assign mid_pipe_spec_res_q[0]    = special_result;
   assign mid_pipe_spec_stat_q[0]   = special_status;
   // Element 0, control and side-band signals forwarded from the end of the input pipeline
   assign mid_pipe_dst_fmt_q[0]     = dst_fmt_q;
   assign mid_pipe_rnd_mode_q[0]    = inp_pipe_rnd_mode_q[NUM_INP_REGS];
   assign mid_pipe_tag_q[0]         = inp_pipe_tag_q[NUM_INP_REGS];
   assign mid_pipe_aux_q[0]         = inp_pipe_aux_q[NUM_INP_REGS];

   // One iteration per register stage
   for (genvar stage = 0; stage < NUM_MID_REGS; stage++) begin : gen_inside_pipeline
     // Load enable of the payload registers of this stage
     logic reg_ena;

     // A stage may accept data when the following stage is empty (a bubble that can be
     // overwritten) or is itself able to hand its data on
     assign mid_pipe_ready[stage] = ~mid_pipe_valid_q[stage+1] | mid_pipe_ready[stage+1];
     // Payload is only captured when there actually is a valid item to move forward
     assign reg_ena = mid_pipe_valid_q[stage] & mid_pipe_ready[stage];

     // Valid bit: loaded whenever the stage is ready, synchronously cleared by flush_i
     `FFLARNC(mid_pipe_valid_q[stage+1], mid_pipe_valid_q[stage], mid_pipe_ready[stage], flush_i, 1'b0, clk_i, rst_ni)

     // Payload registers, all gated by reg_ena
     `FFL(mid_pipe_eff_sub_q[stage+1],     mid_pipe_eff_sub_q[stage],     reg_ena, '0)
     `FFL(mid_pipe_exp_prod_q[stage+1],    mid_pipe_exp_prod_q[stage],    reg_ena, '0)
     `FFL(mid_pipe_exp_diff_q[stage+1],    mid_pipe_exp_diff_q[stage],    reg_ena, '0)
     `FFL(mid_pipe_tent_exp_q[stage+1],    mid_pipe_tent_exp_q[stage],    reg_ena, '0)
     `FFL(mid_pipe_add_shamt_q[stage+1],   mid_pipe_add_shamt_q[stage],   reg_ena, '0)
     `FFL(mid_pipe_sticky_q[stage+1],      mid_pipe_sticky_q[stage],      reg_ena, '0)
     `FFL(mid_pipe_sum_q[stage+1],         mid_pipe_sum_q[stage],         reg_ena, '0)
     `FFL(mid_pipe_final_sign_q[stage+1],  mid_pipe_final_sign_q[stage],  reg_ena, '0)
     `FFL(mid_pipe_rnd_mode_q[stage+1],    mid_pipe_rnd_mode_q[stage],    reg_ena, fpnew_pkg::RNE)
     `FFL(mid_pipe_dst_fmt_q[stage+1],     mid_pipe_dst_fmt_q[stage],     reg_ena, fpnew_pkg::fp_format_e'(0))
     `FFL(mid_pipe_res_is_spec_q[stage+1], mid_pipe_res_is_spec_q[stage], reg_ena, '0)
     `FFL(mid_pipe_spec_res_q[stage+1],    mid_pipe_spec_res_q[stage],    reg_ena, '0)
     `FFL(mid_pipe_spec_stat_q[stage+1],   mid_pipe_spec_stat_q[stage],   reg_ena, '0)
     `FFL(mid_pipe_tag_q[stage+1],         mid_pipe_tag_q[stage],         reg_ena, TagType'('0))
     `FFL(mid_pipe_aux_q[stage+1],         mid_pipe_aux_q[stage],         reg_ena, AuxType'('0))
   end

   // Tap the last stage for the signals the back end of the datapath uses
   assign sum_q                   = mid_pipe_sum_q[NUM_MID_REGS];
   assign final_sign_q            = mid_pipe_final_sign_q[NUM_MID_REGS];
   assign sticky_before_add_q     = mid_pipe_sticky_q[NUM_MID_REGS];
   assign effective_subtraction_q = mid_pipe_eff_sub_q[NUM_MID_REGS];
   assign addend_shamt_q          = mid_pipe_add_shamt_q[NUM_MID_REGS];
   assign exponent_product_q      = mid_pipe_exp_prod_q[NUM_MID_REGS];
   assign exponent_difference_q   = mid_pipe_exp_diff_q[NUM_MID_REGS];
   assign tentative_exponent_q    = mid_pipe_tent_exp_q[NUM_MID_REGS];
   assign rnd_mode_q              = mid_pipe_rnd_mode_q[NUM_MID_REGS];
   assign dst_fmt_q2              = mid_pipe_dst_fmt_q[NUM_MID_REGS];
   assign result_is_special_q     = mid_pipe_res_is_spec_q[NUM_MID_REGS];
   assign special_result_q        = mid_pipe_spec_res_q[NUM_MID_REGS];
   assign special_status_q        = mid_pipe_spec_stat_q[NUM_MID_REGS];

   // --------------
   // Normalization
   // --------------
   logic        [LOWER_SUM_WIDTH-1:0]  sum_lower;              // lower 2p+3 bits of sum are searched
   logic        [LZC_RESULT_WIDTH-1:0] leading_zero_count;     // the number of leading zeroes
   logic signed [LZC_RESULT_WIDTH:0]   leading_zero_count_sgn; // signed leading-zero count
   logic                               lzc_zeroes;             // in case only zeroes found

   logic        [SHIFT_AMOUNT_WIDTH-1:0] norm_shamt; // Normalization shift amount
   logic signed [EXP_WIDTH-1:0]          normalized_exponent;

   logic [3*PRECISION_BITS+4:0] sum_shifted;       // result after first normalization shift
   logic [PRECISION_BITS:0]     final_mantissa;    // final mantissa before rounding with round bit
   logic [2*PRECISION_BITS+2:0] sum_sticky_bits;   // remaining 2p+3 sticky bits after normalization
   logic                        sticky_after_norm; // sticky bit after normalization

   logic signed [EXP_WIDTH-1:0] final_exponent;

   assign sum_lower = sum_q[LOWER_SUM_WIDTH-1:0];

   // Count the leading zeroes of sum_lower; the count is needed when cancellation occurs
   lzc #(
     .MODE  ( 1               ), // MODE = 1 counts leading zeroes
     .WIDTH ( LOWER_SUM_WIDTH )
   ) i_lzc (
     .empty_o ( lzc_zeroes         ),
     .cnt_o   ( leading_zero_count ),
     .in_i    ( sum_lower          )
   );

   // Prepend a zero bit and reinterpret as signed for use in the exponent arithmetic below
   assign leading_zero_count_sgn = signed'({1'b0, leading_zero_count});

   // Pick the normalization shift amount and the exponent that goes with it.
   // The shift amount is unsigned because only left shifts are performed.
   always_comb begin : norm_shift_amount
     // Addend-anchored case is the default: undo the initial addend shift
     norm_shamt          = addend_shamt_q;
     normalized_exponent = tentative_exponent_q;

     // Product-anchored case or cancellation: the leading-zero count decides
     if ((exponent_difference_q <= 0) || (effective_subtraction_q && (exponent_difference_q <= 2))) begin
       if ((exponent_product_q - leading_zero_count_sgn + 1 >= 0) && !lzc_zeroes) begin
         // Exponent stays non-negative and the sum is not all zeroes:
         // undo the initial product shift and remove the counted zeroes
         norm_shamt          = PRECISION_BITS + 2 + leading_zero_count;
         normalized_exponent = exponent_product_q - leading_zero_count_sgn + 1; // account for shift
       end else begin
         // Subnormal result: cap the shift so the mantissa lines up with the minimum exponent
         norm_shamt          = unsigned'(signed'(PRECISION_BITS + 2 + exponent_product_q));
         normalized_exponent = 0; // subnormals encoded as 0
       end
     end
   end

   // Large normalization shift (left only)
   assign sum_shifted = sum_q << norm_shamt;

   // Fine normalization by at most one bit. After the large shift the leading one can sit in the
   // carry position, in the (non-carry) MSB of the sum, or below it. In every branch the carry
   // bit itself is dropped when the value is split into mantissa and sticky bits.
   always_comb begin : small_norm
     if (sum_shifted[3*PRECISION_BITS+4]) begin
       // Carry bit set: the sum has overflown, align right and increment the exponent
       {final_mantissa, sum_sticky_bits} = sum_shifted >> 1;
       final_exponent                    = normalized_exponent + 1;
     end else if (sum_shifted[3*PRECISION_BITS+3]) begin
       // Sum MSB set: already normal, take the bits as they are
       {final_mantissa, sum_sticky_bits} = sum_shifted;
       final_exponent                    = normalized_exponent;
     end else if (normalized_exponent > 1) begin
       // Neither bit set but the exponent still has headroom: align left and decrement it
       {final_mantissa, sum_sticky_bits} = sum_shifted << 1;
       final_exponent                    = normalized_exponent - 1;
     end else begin
       // No headroom: the result is denormal, keep the bits and force the exponent to zero
       {final_mantissa, sum_sticky_bits} = sum_shifted;
       final_exponent                    = '0;
     end
   end

   // Merge the bits left below the mantissa into the sticky bit carried over from before the add
   assign sticky_after_norm = sticky_before_add_q | (| sum_sticky_bits);

   // ----------------------------
   // Rounding and classification
   // ----------------------------
   // Inputs to the rounding unit (selected for the destination format further below)
   logic                                     pre_round_sign;
   logic [SUPER_EXP_BITS+SUPER_MAN_BITS-1:0] pre_round_abs; // absolute value of result before rounding
   logic [1:0]                               round_sticky_bits; // bit 1: round bit, bit 0: sticky bit

   logic of_before_round, of_after_round; // overflow
   logic uf_before_round, uf_after_round; // underflow

   // Per-format candidates; the entry for dst_fmt_q2 is picked afterwards
   logic [NUM_FORMATS-1:0][SUPER_EXP_BITS+SUPER_MAN_BITS-1:0] fmt_pre_round_abs; // per format
   logic [NUM_FORMATS-1:0][1:0]                               fmt_round_sticky_bits;

   logic [NUM_FORMATS-1:0]                                    fmt_of_after_round;
   logic [NUM_FORMATS-1:0]                                    fmt_uf_after_round;

   // Outputs of the rounding unit
   logic                                     rounded_sign;
   logic [SUPER_EXP_BITS+SUPER_MAN_BITS-1:0] rounded_abs; // absolute value of result after rounding
   logic                                     result_zero; // driven by exact_zero_o of the rounding unit

   // Classification before round. RISC-V mandates checking underflow AFTER rounding!
   // Overflow is flagged when the exponent reaches the all-ones value of the destination format.
   // NOTE(review): uf_before_round is not read by the status logic in the rest of this section,
   // which uses uf_after_round instead.
   assign of_before_round = final_exponent >= 2**(fpnew_pkg::exp_bits(dst_fmt_q2))-1; // infinity exponent is all ones
   assign uf_before_round = final_exponent == 0;               // exponent for subnormals capped to 0

   // Pack exponent and mantissa into proper rounding form
   // One generate iteration per format: every active format builds its own {exponent, mantissa}
   // magnitude and its {round, sticky} pair from final_exponent / final_mantissa. Inactive
   // formats are tied to DONT_CARE. The destination format's entry is selected after this loop.
   for (genvar fmt = 0; fmt < int'(NUM_FORMATS); fmt++) begin : gen_res_assemble
     // Set up some constants
     localparam int unsigned EXP_BITS = fpnew_pkg::exp_bits(fpnew_pkg::fp_format_e'(fmt));
     localparam int unsigned MAN_BITS = fpnew_pkg::man_bits(fpnew_pkg::fp_format_e'(fmt));

     logic [EXP_BITS-1:0] pre_round_exponent;
     logic [MAN_BITS-1:0] pre_round_mantissa;

     if (FpFmtConfig[fmt]) begin : active_format

       // On overflow: exponent 2**EXP_BITS-2 with an all-ones mantissa. Otherwise: the low EXP_BITS
       // bits of final_exponent and the MAN_BITS mantissa bits from index SUPER_MAN_BITS downwards.
       assign pre_round_exponent = (of_before_round) ? 2**EXP_BITS-2 : final_exponent[EXP_BITS-1:0];
       assign pre_round_mantissa = (of_before_round) ? '1 : final_mantissa[SUPER_MAN_BITS-:MAN_BITS];
       // Assemble result before rounding. In case of overflow, the largest normal value is set.
       // The concatenation is EXP_BITS+MAN_BITS wide; the upper bits of the wider target are zero.
       assign fmt_pre_round_abs[fmt] = {pre_round_exponent, pre_round_mantissa}; // 0-extend

       // Round bit is after mantissa (1 in case of overflow for rounding)
       assign fmt_round_sticky_bits[fmt][1] = final_mantissa[SUPER_MAN_BITS-MAN_BITS] |
                                              of_before_round;

       // remaining bits in mantissa to sticky (1 in case of overflow for rounding)
       // The generate-if keeps the part-select below from being elaborated with a negative upper
       // index when MAN_BITS equals SUPER_MAN_BITS.
       if (MAN_BITS < SUPER_MAN_BITS) begin : narrow_sticky
         assign fmt_round_sticky_bits[fmt][0] = (| final_mantissa[SUPER_MAN_BITS-MAN_BITS-1:0]) |
                                                sticky_after_norm | of_before_round;
       end else begin : normal_sticky
         assign fmt_round_sticky_bits[fmt][0] = sticky_after_norm | of_before_round;
       end
     end else begin : inactive_format
       // Format not enabled in FpFmtConfig: no logic is built, outputs are don't-care
       assign fmt_pre_round_abs[fmt] = '{default: fpnew_pkg::DONT_CARE};
       assign fmt_round_sticky_bits[fmt] = '{default: fpnew_pkg::DONT_CARE};
     end
   end

   // Select the pre-rounding magnitude and the round/sticky pair of the destination format.
   // In case of overflow the magnitude is the largest normal value and both the round and the
   // sticky bit are set for proper rounding.
   assign pre_round_abs      = fmt_pre_round_abs[dst_fmt_q2];
   assign round_sticky_bits  = fmt_round_sticky_bits[dst_fmt_q2];

   // The sign going into rounding is the final sign from the mid pipeline
   assign pre_round_sign     = final_sign_q;

   // Rounding unit, operating on the widest exponent/mantissa combination
   fpnew_rounding #(
     .AbsWidth ( SUPER_EXP_BITS + SUPER_MAN_BITS )
   ) i_fpnew_rounding (
     // operand
     .sign_i                  ( pre_round_sign          ),
     .abs_value_i             ( pre_round_abs           ),
     .round_sticky_bits_i     ( round_sticky_bits       ),
     // control
     .rnd_mode_i              ( rnd_mode_q              ),
     .effective_subtraction_i ( effective_subtraction_q ),
     // rounded result
     .sign_o                  ( rounded_sign            ),
     .abs_rounded_o           ( rounded_abs             ),
     .exact_zero_o            ( result_zero             )
   );

   // Final result candidate of every format, WIDTH bits wide each
   logic [NUM_FORMATS-1:0][WIDTH-1:0] fmt_result;

   for (genvar fmt = 0; fmt < int'(NUM_FORMATS); fmt++) begin : gen_sign_inject
     // Geometry of this format
     localparam int unsigned FP_WIDTH = fpnew_pkg::fp_width(fpnew_pkg::fp_format_e'(fmt));
     localparam int unsigned EXP_BITS = fpnew_pkg::exp_bits(fpnew_pkg::fp_format_e'(fmt));
     localparam int unsigned MAN_BITS = fpnew_pkg::man_bits(fpnew_pkg::fp_format_e'(fmt));

     if (FpFmtConfig[fmt]) begin : active_format
       always_comb begin : post_process
         // Fill the whole word with ones first (NaN-boxing for short formats), then drop
         // sign, exponent and mantissa into the low FP_WIDTH bits
         fmt_result[fmt]               = '1;
         fmt_result[fmt][FP_WIDTH-1:0] = {rounded_sign, rounded_abs[EXP_BITS+MAN_BITS-1:0]};

         // Classify the rounded exponent field of this format
         fmt_of_after_round[fmt] =   &rounded_abs[EXP_BITS+MAN_BITS-1:MAN_BITS];  // all ones: inf exp.
         fmt_uf_after_round[fmt] = ~(|rounded_abs[EXP_BITS+MAN_BITS-1:MAN_BITS]); // all zeroes: denormal
       end
     end else begin : inactive_format
       // Disabled format: everything is don't-care
       assign fmt_result[fmt]         = '{default: fpnew_pkg::DONT_CARE};
       assign fmt_of_after_round[fmt] = fpnew_pkg::DONT_CARE;
       assign fmt_uf_after_round[fmt] = fpnew_pkg::DONT_CARE;
     end
   end

   // Post-rounding classification of the destination format
   assign of_after_round = fmt_of_after_round[dst_fmt_q2];
   assign uf_after_round = fmt_uf_after_round[dst_fmt_q2];


   // -----------------
   // Result selection
   // -----------------
   // Result and flags of the regular (non-special) path
   logic [WIDTH-1:0]   regular_result;
   fpnew_pkg::status_t regular_status;

   assign regular_result = fmt_result[dst_fmt_q2];

   // Inexact: any round/sticky bit set, or an overflow before or after rounding
   assign regular_status.NX = of_before_round | of_after_round | (| round_sticky_bits);
   // Underflow is only raised for inexact results
   assign regular_status.UF = regular_status.NX & uf_after_round;
   // Overflow can be present before rounding or be introduced by it
   assign regular_status.OF = of_after_round | of_before_round;
   // The regular path only sees valid operations and never divides
   assign regular_status.DZ = 1'b0;
   assign regular_status.NV = 1'b0;

   // Values handed to the output pipeline
   logic [WIDTH-1:0]   result_d;
   fpnew_pkg::status_t status_d;

   // The special-case path overrides the regular one whenever it is flagged
   assign status_d = result_is_special_q ? special_status_q : regular_status;
   assign result_d = result_is_special_q ? special_result_q : regular_result;

   // ----------------
   // Output Pipeline
   // ----------------
   // Output pipeline storage: index i holds the signal after i register stages
   // Handshake
   logic               [0:NUM_OUT_REGS]            out_pipe_valid_q;
   logic               [0:NUM_OUT_REGS]            out_pipe_ready; // combinatorial for all stages
   // Payload
   logic               [0:NUM_OUT_REGS][WIDTH-1:0] out_pipe_result_q;
   fpnew_pkg::status_t [0:NUM_OUT_REGS]            out_pipe_status_q;
   TagType             [0:NUM_OUT_REGS]            out_pipe_tag_q;
   AuxType             [0:NUM_OUT_REGS]            out_pipe_aux_q;

   // Index 0 is not registered: it carries the selected result together with the tag, aux and
   // valid coming out of the mid pipeline
   assign out_pipe_valid_q[0]  = mid_pipe_valid_q[NUM_MID_REGS];
   assign out_pipe_result_q[0] = result_d;
   assign out_pipe_status_q[0] = status_d;
   assign out_pipe_tag_q[0]    = mid_pipe_tag_q[NUM_MID_REGS];
   assign out_pipe_aux_q[0]    = mid_pipe_aux_q[NUM_MID_REGS];
   // Ready travels the other way: hand the first output-stage ready back to the mid pipeline
   assign mid_pipe_ready[NUM_MID_REGS] = out_pipe_ready[0];
   // Generate the register stages
   // Iteration i registers index i of every out_pipe_* array into index i+1. With
   // NUM_OUT_REGS == 0 the loop body is never generated.
   for (genvar i = 0; i < NUM_OUT_REGS; i++) begin : gen_output_pipeline
     // Internal register enable for this stage
     logic reg_ena;
     // Determine the ready signal of the current stage - advance the pipeline:
     // 1. if the next stage is ready for our data
     // 2. if the next stage only holds a bubble (not valid) -> we can pop it
     assign out_pipe_ready[i] = out_pipe_ready[i+1] | ~out_pipe_valid_q[i+1];
     // Valid: enabled by ready signal, synchronous clear with the flush signal
     `FFLARNC(out_pipe_valid_q[i+1], out_pipe_valid_q[i], out_pipe_ready[i], flush_i, 1'b0, clk_i, rst_ni)
     // Enable register if pipeline ready and a valid data item is present
     assign reg_ena = out_pipe_ready[i] & out_pipe_valid_q[i];
     // Generate the pipeline registers within the stages, use enable-registers
     // NOTE(review): the last macro argument is presumably the reset value and clock/reset
     // presumably come from macro defaults - confirm against registers.svh.
     `FFL(out_pipe_result_q[i+1], out_pipe_result_q[i], reg_ena, '0)
     `FFL(out_pipe_status_q[i+1], out_pipe_status_q[i], reg_ena, '0)
     `FFL(out_pipe_tag_q[i+1],    out_pipe_tag_q[i],    reg_ena, TagType'('0))
     `FFL(out_pipe_aux_q[i+1],    out_pipe_aux_q[i],    reg_ena, AuxType'('0))
   end
   // Output stage: the ready of the last index is driven directly by the downstream circuitry
   assign out_pipe_ready[NUM_OUT_REGS] = out_ready_i;
   // Output stage: module outputs are taken from the last pipeline index
   assign out_valid_o     = out_pipe_valid_q[NUM_OUT_REGS];
   assign result_o        = out_pipe_result_q[NUM_OUT_REGS];
   assign status_o        = out_pipe_status_q[NUM_OUT_REGS];
   assign tag_o           = out_pipe_tag_q[NUM_OUT_REGS];
   assign aux_o           = out_pipe_aux_q[NUM_OUT_REGS];
   assign extension_bit_o = 1'b1; // always NaN-Box result
   // Busy while any index of the input, mid or output pipeline holds a valid item
   assign busy_o          = (| inp_pipe_valid_q) | (| mid_pipe_valid_q) | (| out_pipe_valid_q);
 endmodule