openlane/designs/BM64/src/BM64.v - third_party/shuttle/sky130/mpw-006/slot-033 - Git at Google

 ///////////////////////////////////////////////////////////////////////////////
 //
 //  Copyright 2010-2012 by Michael A. Morris, dba M. A. Morris & Associates
 //
 //  All rights reserved. The source code contained herein is publicly released
 //  under the terms and conditions of the GNU Lesser Public License. No part of
 //  this source code may be reproduced or transmitted in any form or by any
 //  means, electronic or mechanical, including photocopying, recording, or any
 //  information storage and retrieval system in violation of the license under
 //  which the source code is released.
 //
 //  The souce code contained herein is free; it may be redistributed and/or
 //  modified in accordance with the terms of the GNU Lesser General Public
 //  License as published by the Free Software Foundation; either version 2.1 of
 //  the GNU Lesser General Public License, or any later version.
 //
 //  The souce code contained herein is freely released WITHOUT ANY WARRANTY;
 //  without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
 //  PARTICULAR PURPOSE. (Refer to the GNU Lesser General Public License for
 //  more details.)
 //
 //  A copy of the GNU Lesser General Public License should have been received
 //  along with the source code contained herein; if not, a copy can be obtained
 //  by writing to:
 //
 //  Free Software Foundation, Inc.
 //  51 Franklin Street, Fifth Floor
 //  Boston, MA  02110-1301 USA
 //
 //  Further, no use of this source code is permitted in any form or means
 //  without inclusion of this banner prominently in any derived works.
 //
 //  Michael A. Morris
 //  Huntsville, AL
 //
 ///////////////////////////////////////////////////////////////////////////////

 `timescale 1ns / 1ps

 ////////////////////////////////////////////////////////////////////////////////
 // Company:         M. A. Morris & Associates
 // Engineer:        Michael A. Morris
 //
 // Create Date:     12:59:58 10/02/2010
 // Design Name:     Fast 4-bit Booth Multiplier
 // Module Name:     Booth_Multiplier_4xA.v
 // Project Name:    Booth_Multiplier
 // Target Devices:  Spartan-3AN
 // Tool versions:   Xilinx ISE 10.1 SP3
 //
 // Description:
 //
 //  This module implements a parameterized multiplier which uses a Modified
 //  Booth algorithm for its implementation. The implementation is based on the
 //  algorithm described in "Computer Organization", Hamacher et al, McGraw-
 //  Hill Book Company, New York, NY, 1978, ISBN: 0-07-025681-0.
 //
 //  Compared to the standard, 1-bit at a time Booth algorithm, this modified
 //  Booth multiplier algorithm shifts the multiplier 4 bits at a time. Thus,
 //  this algorithm will compute a 2's complement product four times as fast as
 //  the base algorithm.
 //
 //  This particular module attempts to optimize the synthesis and implementation
 //  results relative to those of the Booth_Multiplier_4x.v module. Examination
 //  of the synthesis results of that module indicate that 16 20-bit adders and
 //  16 20-bit subtractors are needed to implement the partial products. This
 //  module uses a different approach to eliminate the large number of adders
 //  and subtractors such that only two cascaded adders are required; When sub-
 //  traction is required, complementing the input and adding a carry into the
 //  sum is how the subtractions are implemented. 32:1 multiplexers are used to
 //  evaluated the Booth recoding value and determine the value and operation to
 //  be performed by the two cascaded adders.
 //
 // Dependencies:
 //
 // Revision:
 //
 //  0.01    10J02   MAM     File Created
 //
 //  1.0     12I02   MAM     Changed the implementation of the base module to
 //                          reduced the number of inferred adders and reduce
 //                          the number of multiplexers required.
 //
 // Additional Comments:
 //
 //  The basic operations follow those of the standard Booth multiplier except
 //  that the transitions are being tracked across 4 bits plus the guard bit.
 //  The result is that the operations required are 0, �1, �2, �3, �4, �5, �6,
 //  �7, and �8 times the multiplicand (M). However, it is possible to reduce
 //  the number of partial products required to implement the multiplication to
 //  two. That is, �3, �5, �6, and �7 can be written in terms of combinations of
 //  �1, �2, �4, and �8. For example, 3M = (2M + 1M), 5M = (4M + M), 6M = (4M
 //  + 2M), and 7M = (8M - M). Thus, the following 32 entry table defines the
 //  operations required for generating the partial products through each pass
 //  of the algorithm over the multiplier:
 //
 //  Prod[4:0]       Operation
 //    00000      Prod <= (Prod + 0*M + 0*M) >> 4;
 //    00001      Prod <= (Prod + 0*M + 1*M) >> 4;
 //    00010      Prod <= (Prod + 0*M + 1*M) >> 4;
 //    00011      Prod <= (Prod + 2*M + 0*M) >> 4;
 //    00100      Prod <= (Prod + 2*M + 0*M) >> 4;
 //    00101      Prod <= (Prod + 2*M + 1*M) >> 4;
 //    00110      Prod <= (Prod + 2*M + 1*M) >> 4;
 //    00111      Prod <= (Prod + 4*M + 0*M) >> 4;
 //    01000      Prod <= (Prod + 4*M + 0*M) >> 4;
 //    01001      Prod <= (Prod + 4*M + 1*M) >> 4;
 //    01010      Prod <= (Prod + 4*M + 1*M) >> 4;
 //    01011      Prod <= (Prod + 4*M + 2*M) >> 4;
 //    01100      Prod <= (Prod + 4*M + 2*M) >> 4;
 //    01101      Prod <= (Prod + 8*M - 1*M) >> 4;
 //    01110      Prod <= (Prod + 8*M - 1*M) >> 4;
 //    01111      Prod <= (Prod + 8*M + 0*M) >> 4;
 //    10000      Prod <= (Prod - 8*M - 0*M) >> 4;
 //    10001      Prod <= (Prod - 8*M + 1*M) >> 4;
 //    10010      Prod <= (Prod - 8*M + 1*M) >> 4;
 //    10011      Prod <= (Prod - 4*M - 2*M) >> 4;
 //    10100      Prod <= (Prod - 4*M - 2*M) >> 4;
 //    10101      Prod <= (Prod - 4*M - 1*M) >> 4;
 //    10110      Prod <= (Prod - 4*M - 1*M) >> 4;
 //    10111      Prod <= (Prod - 4*M - 0*M) >> 4;
 //    11000      Prod <= (Prod - 4*M - 0*M) >> 4;
 //    11001      Prod <= (Prod - 2*M - 1*M) >> 4;
 //    11010      Prod <= (Prod - 2*M - 1*M) >> 4;
 //    11011      Prod <= (Prod - 2*M - 0*M) >> 4;
 //    11100      Prod <= (Prod - 2*M - 0*M) >> 4;
 //    11101      Prod <= (Prod - 0*M - 1*M) >> 4;
 //    11110      Prod <= (Prod - 0*M - 1*M) >> 4;
 //    11111      Prod <= (Prod - 0*M - 0*M) >> 4;
 //
 //  One approach to implementing the recoding table is to use a 32:1 multiplexer
 //  and simply write out the necessary operations. This is the approach used in
 //  the first version of the 4 bits at a time Booth multiplier. The problem is
 //  that the implementation of the preceding recoding table in a first prin-
 //  ciples manner results in a large number of adders and subtractors being
 //  synthesized. Apparently, the structure and character of the RTL code is such
 //  that the synthesizer is unable to use multiplexers to determine the operands
 //  of the two adders which are required.
 //
 //  Examining the previous recoding table shows that seven multiples of the
 //  multiplicand (M) are represented in the first multiplicand column, 0x, �2x,
 //  �4x, and �8x, and five are represented in the second multiplicand column,
 //  0x, �1x, and �2. The synthesizer is able to identify the need for adders and
 //  subractors, but is unable to morph the structure from one embedded in a 32:1
 //  multiplexer into one where an 8:1 multiplexer feeds the first multiplicand
 //  products into one adder, and another 8:1 multiplexer feeds the second multi-
 //  plicand product into another adder cascaded with the first.
 //
 //  A review of the corresponding synthesis report shows that the synthesizer
 //  extracted 8 adders and 8 subtractors. Refering to the 32 line recoding table
 //  above, shows that there are 8 diffent combinations of the multiplicand pro-
 //  ducts that must be added/subtracted to the partial product, Prod, to deter-
 //  mine the final product. The resulting implementation is correct, as deter-
 //  mined by its testbench, but the implementation certainly uses more resources
 //  than would be expected when increasing the number of bits processed per
 //  stage/iteration from 2 to 4. A natural assumption is that the resources uti-
 //  lized would increase by a factor close to 2 as the number of bits processed
 //  is increased by powers of 2: 1, 2, 4, etc.
 //
 //  It is now clear that the synthesizer is unable to transform the multiplexed
 //  adder structure inherent in the current specification into a structure which
 //  is composed of multiplexers followed by two cascaded adders. Therefore, that
 //  simpler structure must be explcitly specified in this module's RTL. In order
 //  to minimize the multiplexers, the recoding table needs to be modified from
 //  its current definition to an equivalent definition that can be implemented
 //  using just two multiplicand products. The following recoding table can be
 //  compared to the one above to see the adjustments made. In essence, the first
 //  multiplicand column allows only five multiplicand product values, 0M, �4M,
 //  and �8M, and the second multiplicand column also allows only five values,
 //  0M, �1M, and �2M.
 //
 //  Prod[4:0]       Operation
 //    00000      Prod <= (Prod + 0*M + 0*M) >> 4;
 //    00001      Prod <= (Prod + 0*M + 1*M) >> 4;
 //    00010      Prod <= (Prod + 0*M + 1*M) >> 4;
 //    00011      Prod <= (Prod + 0*M + 2*M) >> 4;
 //    00100      Prod <= (Prod + 0*M + 2*M) >> 4;
 //    00101      Prod <= (Prod + 4*M - 1*M) >> 4;
 //    00110      Prod <= (Prod + 4*M - 1*M) >> 4;
 //    00111      Prod <= (Prod + 4*M + 0*M) >> 4;
 //    01000      Prod <= (Prod + 4*M + 0*M) >> 4;
 //    01001      Prod <= (Prod + 4*M + 1*M) >> 4;
 //    01010      Prod <= (Prod + 4*M + 1*M) >> 4;
 //    01011      Prod <= (Prod + 4*M + 2*M) >> 4;
 //    01100      Prod <= (Prod + 4*M + 2*M) >> 4;
 //    01101      Prod <= (Prod + 8*M - 1*M) >> 4
 //    01110      Prod <= (Prod + 8*M - 1*M) >> 4;
 //    01111      Prod <= (Prod + 8*M + 0*M) >> 4;
 //    10000      Prod <= (Prod - 8*M - 0*M) >> 4;
 //    10001      Prod <= (Prod - 8*M + 1*M) >> 4;
 //    10010      Prod <= (Prod - 8*M + 1*M) >> 4;
 //    10011      Prod <= (Prod - 4*M - 2*M) >> 4;
 //    10100      Prod <= (Prod - 4*M - 2*M) >> 4;
 //    10101      Prod <= (Prod - 4*M - 1*M) >> 4;
 //    10110      Prod <= (Prod - 4*M - 1*M) >> 4;
 //    10111      Prod <= (Prod - 4*M - 0*M) >> 4;
 //    11000      Prod <= (Prod - 4*M - 0*M) >> 4;
 //    11001      Prod <= (Prod - 4*M + 1*M) >> 4;
 //    11010      Prod <= (Prod - 4*M + 1*M) >> 4;
 //    11011      Prod <= (Prod - 0*M - 2*M) >> 4;
 //    11100      Prod <= (Prod - 0*M - 2*M) >> 4;
 //    11101      Prod <= (Prod - 0*M - 1*M) >> 4;
 //    11110      Prod <= (Prod - 0*M - 1*M) >> 4;
 //    11111      Prod <= (Prod - 0*M - 0*M) >> 4;
 //
 //  The zero terms and the subtractions will be implemented using logic: bus AND
 //  for 0, and bus XOR and carry input for subtraction, i.e. 2sC add. With this
 //  additional reduction of operands, the multiplexers at the input to the adder
 //  tree only multiplex two values each, {4M, 8M} or {1M, 2M}, respectively. The
 //  operations required of the multiplexer and adder for each multiplicand
 //  column can be defined by the triple: {PnM, M_Sel, En}. En is the control for
 //  the bus AND which forms 0*M. M_Sel is the control for the multiplexer that
 //  selects {4M, 8M} or {1M, 2M}, respectively. PnM is the input to the bus XOR
 //  and carry input of the adder, and if 0 an addition is performed with the
 //  operand at the output of the bus AND, and if 1, the adder is presented with
 //  the complement of that operand plus an input carry. The bus AND can be built
 //  explicitly after the multiplexer, or it can be included as the default case
 //  of the multiplexer itself.
 //
 //  In either case, the triples {PnM, M_Sel, En} can be constructed using a 32x6
 //  ROM. The first triple refers to the control signals for the first multipli-
 //  cand column and the second refers to the control signals for the second
 //  multiplicand column. To force the synthesizer to infer a ROM, a fully defin-
 //  ed case statement of 32 entries for each column is required:
 //
 //  For the first column - B
 //
 //  case(Booth)
 //      5'b00000 : {PnM_B, M_Sel_B, En_B} <= 3'b000; // (Prod + 0*M + 0*M) >> 4;
 //      5'b00001 : {PnM_B, M_Sel_B, En_B} <= 3'b000; // (Prod + 0*M + 1*M) >> 4;
 //      5'b00010 : {PnM_B, M_Sel_B, En_B} <= 3'b000; // (Prod + 0*M + 1*M) >> 4;
 //      5'b00011 : {PnM_B, M_Sel_B, En_B} <= 3'b000; // (Prod + 0*M + 2*M) >> 4;
 //      5'b00100 : {PnM_B, M_Sel_B, En_B} <= 3'b000; // (Prod + 0*M + 2*M) >> 4;
 //      5'b00101 : {PnM_B, M_Sel_B, En_B} <= 3'b001; // (Prod + 4*M - 1*M) >> 4;
 //      5'b00110 : {PnM_B, M_Sel_B, En_B} <= 3'b001; // (Prod + 4*M - 1*M) >> 4;
 //      5'b00111 : {PnM_B, M_Sel_B, En_B} <= 3'b001; // (Prod + 4*M + 0*M) >> 4;
 //      5'b01000 : {PnM_B, M_Sel_B, En_B} <= 3'b001; // (Prod + 4*M + 0*M) >> 4;
 //      5'b01001 : {PnM_B, M_Sel_B, En_B} <= 3'b001; // (Prod + 4*M + 1*M) >> 4;
 //      5'b01010 : {PnM_B, M_Sel_B, En_B} <= 3'b001; // (Prod + 4*M + 1*M) >> 4;
 //      5'b01011 : {PnM_B, M_Sel_B, En_B} <= 3'b001; // (Prod + 4*M + 2*M) >> 4;
 //      5'b01100 : {PnM_B, M_Sel_B, En_B} <= 3'b001; // (Prod + 4*M + 2*M) >> 4;
 //      5'b01101 : {PnM_B, M_Sel_B, En_B} <= 3'b011; // (Prod + 8*M - 1*M) >> 4
 //      5'b01110 : {PnM_B, M_Sel_B, En_B} <= 3'b011; // (Prod + 8*M - 1*M) >> 4;
 //      5'b01111 : {PnM_B, M_Sel_B, En_B} <= 3'b011; // (Prod + 8*M + 0*M) >> 4;
 //      5'b10000 : {PnM_B, M_Sel_B, En_B} <= 3'b111; // (Prod - 8*M - 0*M) >> 4;
 //      5'b10001 : {PnM_B, M_Sel_B, En_B} <= 3'b111; // (Prod - 8*M + 1*M) >> 4;
 //      5'b10010 : {PnM_B, M_Sel_B, En_B} <= 3'b111; // (Prod - 8*M + 1*M) >> 4;
 //      5'b10011 : {PnM_B, M_Sel_B, En_B} <= 3'b101; // (Prod - 4*M - 2*M) >> 4;
 //      5'b10100 : {PnM_B, M_Sel_B, En_B} <= 3'b101; // (Prod - 4*M - 2*M) >> 4;
 //      5'b10101 : {PnM_B, M_Sel_B, En_B} <= 3'b101; // (Prod - 4*M - 1*M) >> 4;
 //      5'b10110 : {PnM_B, M_Sel_B, En_B} <= 3'b101; // (Prod - 4*M - 1*M) >> 4;
 //      5'b10111 : {PnM_B, M_Sel_B, En_B} <= 3'b101; // (Prod - 4*M - 0*M) >> 4;
 //      5'b11000 : {PnM_B, M_Sel_B, En_B} <= 3'b101; // (Prod - 4*M - 0*M) >> 4;
 //      5'b11001 : {PnM_B, M_Sel_B, En_B} <= 3'b101; // (Prod - 4*M + 1*M) >> 4;
 //      5'b11010 : {PnM_B, M_Sel_B, En_B} <= 3'b101; // (Prod - 4*M + 1*M) >> 4;
 //      5'b11011 : {PnM_B, M_Sel_B, En_B} <= 3'b000; // (Prod - 0*M - 2*M) >> 4;
 //      5'b11100 : {PnM_B, M_Sel_B, En_B} <= 3'b000; // (Prod - 0*M - 2*M) >> 4;
 //      5'b11101 : {PnM_B, M_Sel_B, En_B} <= 3'b000; // (Prod - 0*M - 1*M) >> 4;
 //      5'b11110 : {PnM_B, M_Sel_B, En_B} <= 3'b000; // (Prod - 0*M - 1*M) >> 4;
 //      5'b11111 : {PnM_B, M_Sel_B, En_B} <= 3'b000; // (Prod - 0*M - 0*M) >> 4;
 //      default  : {PnM_B, M_Sel_B, En_B} <= 3'b000; // (Prod - 0*M - 0*M) >> 4;
 //  endcase
 //
 //  For the second column - C
 //
 //  case(Booth)
 //      5'b00000 : {PnM_C, M_Sel_C, En_C} <= 3'b000; // (Prod + 0*M + 0*M) >> 4;
 //      5'b00001 : {PnM_C, M_Sel_C, En_C} <= 3'b001; // (Prod + 0*M + 1*M) >> 4;
 //      5'b00010 : {PnM_C, M_Sel_C, En_C} <= 3'b001; // (Prod + 0*M + 1*M) >> 4;
 //      5'b00011 : {PnM_C, M_Sel_C, En_C} <= 3'b011; // (Prod + 0*M + 2*M) >> 4;
 //      5'b00100 : {PnM_C, M_Sel_C, En_C} <= 3'b011; // (Prod + 0*M + 2*M) >> 4;
 //      5'b00101 : {PnM_C, M_Sel_C, En_C} <= 3'b101; // (Prod + 4*M - 1*M) >> 4;
 //      5'b00110 : {PnM_C, M_Sel_C, En_C} <= 3'b101; // (Prod + 4*M - 1*M) >> 4;
 //      5'b00111 : {PnM_C, M_Sel_C, En_C} <= 3'b000; // (Prod + 4*M + 0*M) >> 4;
 //      5'b01000 : {PnM_C, M_Sel_C, En_C} <= 3'b000; // (Prod + 4*M + 0*M) >> 4;
 //      5'b01001 : {PnM_C, M_Sel_C, En_C} <= 3'b001; // (Prod + 4*M + 1*M) >> 4;
 //      5'b01010 : {PnM_C, M_Sel_C, En_C} <= 3'b001; // (Prod + 4*M + 1*M) >> 4;
 //      5'b01011 : {PnM_C, M_Sel_C, En_C} <= 3'b011; // (Prod + 4*M + 2*M) >> 4;
 //      5'b01100 : {PnM_C, M_Sel_C, En_C} <= 3'b011; // (Prod + 4*M + 2*M) >> 4;
 //      5'b01101 : {PnM_C, M_Sel_C, En_C} <= 3'b101; // (Prod + 8*M - 1*M) >> 4
 //      5'b01110 : {PnM_C, M_Sel_C, En_C} <= 3'b101; // (Prod + 8*M - 1*M) >> 4;
 //      5'b01111 : {PnM_C, M_Sel_C, En_C} <= 3'b000; // (Prod + 8*M + 0*M) >> 4;
 //      5'b10000 : {PnM_C, M_Sel_C, En_C} <= 3'b000; // (Prod - 8*M - 0*M) >> 4;
 //      5'b10001 : {PnM_C, M_Sel_C, En_C} <= 3'b001; // (Prod - 8*M + 1*M) >> 4;
 //      5'b10010 : {PnM_C, M_Sel_C, En_C} <= 3'b001; // (Prod - 8*M + 1*M) >> 4;
 //      5'b10011 : {PnM_C, M_Sel_C, En_C} <= 3'b111; // (Prod - 4*M - 2*M) >> 4;
 //      5'b10100 : {PnM_C, M_Sel_C, En_C} <= 3'b111; // (Prod - 4*M - 2*M) >> 4;
 //      5'b10101 : {PnM_C, M_Sel_C, En_C} <= 3'b101; // (Prod - 4*M - 1*M) >> 4;
 //      5'b10110 : {PnM_C, M_Sel_C, En_C} <= 3'b101; // (Prod - 4*M - 1*M) >> 4;
 //      5'b10111 : {PnM_C, M_Sel_C, En_C} <= 3'b000; // (Prod - 4*M - 0*M) >> 4;
 //      5'b11000 : {PnM_C, M_Sel_C, En_C} <= 3'b000; // (Prod - 4*M - 0*M) >> 4;
 //      5'b11001 : {PnM_C, M_Sel_C, En_C} <= 3'b001; // (Prod - 4*M + 1*M) >> 4;
 //      5'b11010 : {PnM_C, M_Sel_C, En_C} <= 3'b001; // (Prod - 4*M + 1*M) >> 4;
 //      5'b11011 : {PnM_C, M_Sel_C, En_C} <= 3'b111; // (Prod - 0*M - 2*M) >> 4;
 //      5'b11100 : {PnM_C, M_Sel_C, En_C} <= 3'b111; // (Prod - 0*M - 2*M) >> 4;
 //      5'b11101 : {PnM_C, M_Sel_C, En_C} <= 3'b101; // (Prod - 0*M - 1*M) >> 4;
 //      5'b11110 : {PnM_C, M_Sel_C, En_C} <= 3'b101; // (Prod - 0*M - 1*M) >> 4;
 //      5'b11111 : {PnM_C, M_Sel_C, En_C} <= 3'b000; // (Prod - 0*M - 0*M) >> 4;
 //      default  : {PnM_C, M_Sel_C, En_C} <= 3'b000; // (Prod - 0*M - 0*M) >> 4;
 //  endcase
 //
 ////////////////////////////////////////////////////////////////////////////////

 module BM64 #(
     parameter N = 256                // Width = N: multiplicand & multiplier
 )(
     input   Rst,                    // Reset
     input   Clk,                    // Clock

     input   Ld,                     // Load Registers and Start Multiplier
     input   [(N - 1):0] M,          // Multiplicand
     input   [(N - 1):0] R,          // Multiplier
     output  reg Valid,              // Product Valid
     output  reg [((2*N) - 1):0] P   // Product <= M * R
 );

 ////////////////////////////////////////////////////////////////////////////////
 //
 //  Local Parameters
 //

 localparam pNumCycles = ((N + 1)/4);    // No. of cycles required for product

 ////////////////////////////////////////////////////////////////////////////////
 //
 //  Declarations
 //

 reg     [4:0] Cntr;                     // Operation Counter
 reg     [4:0] Booth;                    // Booth Recoding Field
 reg     Guard;                          // Shift Bit for Booth Recoding
 reg     [(N + 3):0] A;                  // Multiplicand w/ guards
 wire    [(N + 3):0] Mx8, Mx4, Mx2, Mx1; // Multiplicand products w/ guards
 reg     PnM_B, M_Sel_B, En_B;           // Operand B Control Triple
 reg     PnM_C, M_Sel_C, En_C;           // Operand C Control Triple
 wire    [(N + 3):0] Hi;                 // Upper Half of Product w/ guards
 reg     [(N + 3):0] B, C;               // Adder tree Operand Inputs
 reg     Ci_B, Ci_C;                     // Adder tree Carry Inputs
 wire    [(N + 3):0] T, S;               // Adder Tree Outputs w/ guards
 reg     [((2*N) + 3):0] Prod;           // Double Length Product w/ guards

 ////////////////////////////////////////////////////////////////////////////////
 //
 //  Implementation
 //

 always @(posedge Clk)
 begin
     if(Rst)
         Cntr <= #1 0;
     else if(Ld)
         Cntr <= #1 pNumCycles;
     else if(|Cntr)
         Cntr <= #1 (Cntr - 1);
 end

 //  Multiplicand Register
 //      includes 4 bits to guard sign of multiplicand in the event the most
 //      negative value is provided as the input.

 always @(posedge Clk)
 begin
     if(Rst)
         A <= #1 0;
     else if(Ld)
         A <= #1 {{4{M[(N - 1)]}}, M};
 end

 assign Mx8 = {A, 3'b0};
 assign Mx4 = {A, 2'b0};
 assign Mx2 = {A, 1'b0};
 assign Mx1 = A;

 //  Compute Upper Partial Product: (N + 4) bits in width

 always @(*) Booth <= {Prod[3:0], Guard};    // Booth's Multiplier Recoding field

 assign Hi = Prod[((2*N) + 3):N];            // Upper Half of Product Register

 // Compute the Control Triples for the First and Second Multiplicand Columns

 //  For the first column - B

 always @(*)
 begin
     case(Booth)
         5'b00000 : {PnM_B, M_Sel_B, En_B} <= 3'b000; // (Prod + 0*M + 0*M) >> 4;
         5'b00001 : {PnM_B, M_Sel_B, En_B} <= 3'b000; // (Prod + 0*M + 1*M) >> 4;
         5'b00010 : {PnM_B, M_Sel_B, En_B} <= 3'b000; // (Prod + 0*M + 1*M) >> 4;
         5'b00011 : {PnM_B, M_Sel_B, En_B} <= 3'b000; // (Prod + 0*M + 2*M) >> 4;
         5'b00100 : {PnM_B, M_Sel_B, En_B} <= 3'b000; // (Prod + 0*M + 2*M) >> 4;
         5'b00101 : {PnM_B, M_Sel_B, En_B} <= 3'b001; // (Prod + 4*M - 1*M) >> 4;
         5'b00110 : {PnM_B, M_Sel_B, En_B} <= 3'b001; // (Prod + 4*M - 1*M) >> 4;
         5'b00111 : {PnM_B, M_Sel_B, En_B} <= 3'b001; // (Prod + 4*M + 0*M) >> 4;
         5'b01000 : {PnM_B, M_Sel_B, En_B} <= 3'b001; // (Prod + 4*M + 0*M) >> 4;
         5'b01001 : {PnM_B, M_Sel_B, En_B} <= 3'b001; // (Prod + 4*M + 1*M) >> 4;
         5'b01010 : {PnM_B, M_Sel_B, En_B} <= 3'b001; // (Prod + 4*M + 1*M) >> 4;
         5'b01011 : {PnM_B, M_Sel_B, En_B} <= 3'b001; // (Prod + 4*M + 2*M) >> 4;
         5'b01100 : {PnM_B, M_Sel_B, En_B} <= 3'b001; // (Prod + 4*M + 2*M) >> 4;
         5'b01101 : {PnM_B, M_Sel_B, En_B} <= 3'b011; // (Prod + 8*M - 1*M) >> 4;
         5'b01110 : {PnM_B, M_Sel_B, En_B} <= 3'b011; // (Prod + 8*M - 1*M) >> 4;
         5'b01111 : {PnM_B, M_Sel_B, En_B} <= 3'b011; // (Prod + 8*M + 0*M) >> 4;
         5'b10000 : {PnM_B, M_Sel_B, En_B} <= 3'b111; // (Prod - 8*M - 0*M) >> 4;
         5'b10001 : {PnM_B, M_Sel_B, En_B} <= 3'b111; // (Prod - 8*M + 1*M) >> 4;
         5'b10010 : {PnM_B, M_Sel_B, En_B} <= 3'b111; // (Prod - 8*M + 1*M) >> 4;
         5'b10011 : {PnM_B, M_Sel_B, En_B} <= 3'b101; // (Prod - 4*M - 2*M) >> 4;
         5'b10100 : {PnM_B, M_Sel_B, En_B} <= 3'b101; // (Prod - 4*M - 2*M) >> 4;
         5'b10101 : {PnM_B, M_Sel_B, En_B} <= 3'b101; // (Prod - 4*M - 1*M) >> 4;
         5'b10110 : {PnM_B, M_Sel_B, En_B} <= 3'b101; // (Prod - 4*M - 1*M) >> 4;
         5'b10111 : {PnM_B, M_Sel_B, En_B} <= 3'b101; // (Prod - 4*M - 0*M) >> 4;
         5'b11000 : {PnM_B, M_Sel_B, En_B} <= 3'b101; // (Prod - 4*M - 0*M) >> 4;
         5'b11001 : {PnM_B, M_Sel_B, En_B} <= 3'b101; // (Prod - 4*M + 1*M) >> 4;
         5'b11010 : {PnM_B, M_Sel_B, En_B} <= 3'b101; // (Prod - 4*M + 1*M) >> 4;
         5'b11011 : {PnM_B, M_Sel_B, En_B} <= 3'b000; // (Prod - 0*M - 2*M) >> 4;
         5'b11100 : {PnM_B, M_Sel_B, En_B} <= 3'b000; // (Prod - 0*M - 2*M) >> 4;
         5'b11101 : {PnM_B, M_Sel_B, En_B} <= 3'b000; // (Prod - 0*M - 1*M) >> 4;
         5'b11110 : {PnM_B, M_Sel_B, En_B} <= 3'b000; // (Prod - 0*M - 1*M) >> 4;
         5'b11111 : {PnM_B, M_Sel_B, En_B} <= 3'b000; // (Prod - 0*M - 0*M) >> 4;
         default  : {PnM_B, M_Sel_B, En_B} <= 3'b000; // (Prod - 0*M - 0*M) >> 4;
     endcase
 end

 //  For the second column - C

 always @(*)
 begin
     case(Booth)
         5'b00000 : {PnM_C, M_Sel_C, En_C} <= 3'b000; // (Prod + 0*M + 0*M) >> 4;
         5'b00001 : {PnM_C, M_Sel_C, En_C} <= 3'b001; // (Prod + 0*M + 1*M) >> 4;
         5'b00010 : {PnM_C, M_Sel_C, En_C} <= 3'b001; // (Prod + 0*M + 1*M) >> 4;
         5'b00011 : {PnM_C, M_Sel_C, En_C} <= 3'b011; // (Prod + 0*M + 2*M) >> 4;
         5'b00100 : {PnM_C, M_Sel_C, En_C} <= 3'b011; // (Prod + 0*M + 2*M) >> 4;
         5'b00101 : {PnM_C, M_Sel_C, En_C} <= 3'b101; // (Prod + 4*M - 1*M) >> 4;
         5'b00110 : {PnM_C, M_Sel_C, En_C} <= 3'b101; // (Prod + 4*M - 1*M) >> 4;
         5'b00111 : {PnM_C, M_Sel_C, En_C} <= 3'b000; // (Prod + 4*M + 0*M) >> 4;
         5'b01000 : {PnM_C, M_Sel_C, En_C} <= 3'b000; // (Prod + 4*M + 0*M) >> 4;
         5'b01001 : {PnM_C, M_Sel_C, En_C} <= 3'b001; // (Prod + 4*M + 1*M) >> 4;
         5'b01010 : {PnM_C, M_Sel_C, En_C} <= 3'b001; // (Prod + 4*M + 1*M) >> 4;
         5'b01011 : {PnM_C, M_Sel_C, En_C} <= 3'b011; // (Prod + 4*M + 2*M) >> 4;
         5'b01100 : {PnM_C, M_Sel_C, En_C} <= 3'b011; // (Prod + 4*M + 2*M) >> 4;
         5'b01101 : {PnM_C, M_Sel_C, En_C} <= 3'b101; // (Prod + 8*M - 1*M) >> 4;
         5'b01110 : {PnM_C, M_Sel_C, En_C} <= 3'b101; // (Prod + 8*M - 1*M) >> 4;
         5'b01111 : {PnM_C, M_Sel_C, En_C} <= 3'b000; // (Prod + 8*M + 0*M) >> 4;
         5'b10000 : {PnM_C, M_Sel_C, En_C} <= 3'b000; // (Prod - 8*M - 0*M) >> 4;
         5'b10001 : {PnM_C, M_Sel_C, En_C} <= 3'b001; // (Prod - 8*M + 1*M) >> 4;
         5'b10010 : {PnM_C, M_Sel_C, En_C} <= 3'b001; // (Prod - 8*M + 1*M) >> 4;
         5'b10011 : {PnM_C, M_Sel_C, En_C} <= 3'b111; // (Prod - 4*M - 2*M) >> 4;
         5'b10100 : {PnM_C, M_Sel_C, En_C} <= 3'b111; // (Prod - 4*M - 2*M) >> 4;
         5'b10101 : {PnM_C, M_Sel_C, En_C} <= 3'b101; // (Prod - 4*M - 1*M) >> 4;
         5'b10110 : {PnM_C, M_Sel_C, En_C} <= 3'b101; // (Prod - 4*M - 1*M) >> 4;
         5'b10111 : {PnM_C, M_Sel_C, En_C} <= 3'b000; // (Prod - 4*M - 0*M) >> 4;
         5'b11000 : {PnM_C, M_Sel_C, En_C} <= 3'b000; // (Prod - 4*M - 0*M) >> 4;
         5'b11001 : {PnM_C, M_Sel_C, En_C} <= 3'b001; // (Prod - 4*M + 1*M) >> 4;
         5'b11010 : {PnM_C, M_Sel_C, En_C} <= 3'b001; // (Prod - 4*M + 1*M) >> 4;
         5'b11011 : {PnM_C, M_Sel_C, En_C} <= 3'b111; // (Prod - 0*M - 2*M) >> 4;
         5'b11100 : {PnM_C, M_Sel_C, En_C} <= 3'b111; // (Prod - 0*M - 2*M) >> 4;
         5'b11101 : {PnM_C, M_Sel_C, En_C} <= 3'b101; // (Prod - 0*M - 1*M) >> 4;
         5'b11110 : {PnM_C, M_Sel_C, En_C} <= 3'b101; // (Prod - 0*M - 1*M) >> 4;
         5'b11111 : {PnM_C, M_Sel_C, En_C} <= 3'b000; // (Prod - 0*M - 0*M) >> 4;
         default  : {PnM_C, M_Sel_C, En_C} <= 3'b000; // (Prod - 0*M - 0*M) >> 4;
     endcase
 end

 //  Compute the first operand - B

 always @(*)
 begin
     case({PnM_B, M_Sel_B, En_B})
         3'b001  : {Ci_B, B} <= {1'b0,  Mx4};
         3'b011  : {Ci_B, B} <= {1'b0,  Mx8};
         3'b101  : {Ci_B, B} <= {1'b1, ~Mx4};
         3'b111  : {Ci_B, B} <= {1'b1, ~Mx8};
         default : {Ci_B, B} <= 0;
     endcase
 end

 //  Compute the second operand - C

 always @(*)
 begin
     case({PnM_C, M_Sel_C, En_C})
         3'b001  : {Ci_C, C} <= {1'b0,  Mx1};
         3'b011  : {Ci_C, C} <= {1'b0,  Mx2};
         3'b101  : {Ci_C, C} <= {1'b1, ~Mx1};
         3'b111  : {Ci_C, C} <= {1'b1, ~Mx2};
         default : {Ci_C, C} <= 0;
     endcase
 end

 //  Compute Partial Sum - Cascaded Adders

 assign T = Hi + B + Ci_B;
 assign S =  T + C + Ci_C;

 //  Double Length Product Register
 //      Multiplier, R, is loaded into the least significant half on load, Ld
 //      Shifted right four places as the product is computed iteratively.

 always @(posedge Clk)
 begin
     if(Rst)
         Prod <= #1 0;
     else if(Ld)
         Prod <= #1 R;
     else if(|Cntr)  // Shift right four bits
         Prod <= #1 {{4{S[(N + 3)]}}, S, Prod[(N - 1):4]};
 end

 always @(posedge Clk)
 begin
     if(Rst)
         Guard <= #1 0;
     else if(Ld)
         Guard <= #1 0;
     else if(|Cntr)
         Guard <= #1 Prod[3];
 end

 //  Assign the product less the four guard bits to the output port
 //      A 4-bit right shift is required since the output product is stored
 //      into a synchronous register on the last cycle of the multiply.

 always @(posedge Clk)
 begin
     if(Rst)
         P <= #1 0;
     else if(Cntr == 1)
         P <= #1 {S, Prod[(N - 1):4]};
 end

 //  Count the number of shifts
 //      This implementation does not use any optimizations to perform multiple
 //      bit shifts to skip over runs of 1s or 0s.

 always @(posedge Clk)
 begin
     if(Rst)
         Valid <= #1 0;
     else
         Valid <= #1 (Cntr == 1);
 end

 endmodule