blob: 5e2cd268c420386710262532335cfab5f30cddff [file] [log] [blame]
///////////////////////////////////////////////////////////////////////////////
//
// Copyright 2010-2012 by Michael A. Morris, dba M. A. Morris & Associates
//
// All rights reserved. The source code contained herein is publicly released
// under the terms and conditions of the GNU Lesser Public License. No part of
// this source code may be reproduced or transmitted in any form or by any
// means, electronic or mechanical, including photocopying, recording, or any
// information storage and retrieval system in violation of the license under
// which the source code is released.
//
// The souce code contained herein is free; it may be redistributed and/or
// modified in accordance with the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either version 2.1 of
// the GNU Lesser General Public License, or any later version.
//
// The souce code contained herein is freely released WITHOUT ANY WARRANTY;
// without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
// PARTICULAR PURPOSE. (Refer to the GNU Lesser General Public License for
// more details.)
//
// A copy of the GNU Lesser General Public License should have been received
// along with the source code contained herein; if not, a copy can be obtained
// by writing to:
//
// Free Software Foundation, Inc.
// 51 Franklin Street, Fifth Floor
// Boston, MA 02110-1301 USA
//
// Further, no use of this source code is permitted in any form or means
// without inclusion of this banner prominently in any derived works.
//
// Michael A. Morris
// Huntsville, AL
//
///////////////////////////////////////////////////////////////////////////////
`timescale 1ns / 1ps
////////////////////////////////////////////////////////////////////////////////
// Company: M. A. Morris & Associates
// Engineer: Michael A. Morris
//
// Create Date: 12:59:58 10/02/2010
// Design Name: Fast 4-bit Booth Multiplier
// Module Name: Booth_Multiplier_4xA.v
// Project Name: Booth_Multiplier
// Target Devices: Spartan-3AN
// Tool versions: Xilinx ISE 10.1 SP3
//
// Description:
//
// This module implements a parameterized multiplier which uses a Modified
// Booth algorithm for its implementation. The implementation is based on the
// algorithm described in "Computer Organization", Hamacher et al, McGraw-
// Hill Book Company, New York, NY, 1978, ISBN: 0-07-025681-0.
//
// Compared to the standard, 1-bit at a time Booth algorithm, this modified
// Booth multiplier algorithm shifts the multiplier 4 bits at a time. Thus,
// this algorithm will compute a 2's complement product four times as fast as
// the base algorithm.
//
// This particular module attempts to optimize the synthesis and implementation
// results relative to those of the Booth_Multiplier_4x.v module. Examination
// of the synthesis results of that module indicate that 16 20-bit adders and
// 16 20-bit subtractors are needed to implement the partial products. This
// module uses a different approach to eliminate the large number of adders
// and subtractors such that only two cascaded adders are required; When sub-
// traction is required, complementing the input and adding a carry into the
// sum is how the subtractions are implemented. 32:1 multiplexers are used to
// evaluated the Booth recoding value and determine the value and operation to
// be performed by the two cascaded adders.
//
// Dependencies:
//
// Revision:
//
// 0.01 10J02 MAM File Created
//
// 1.0 12I02 MAM Changed the implementation of the base module to
// reduced the number of inferred adders and reduce
// the number of multiplexers required.
//
// Additional Comments:
//
// The basic operations follow those of the standard Booth multiplier except
// that the transitions are being tracked across 4 bits plus the guard bit.
// The result is that the operations required are 0, �1, �2, �3, �4, �5, �6,
// �7, and �8 times the multiplicand (M). However, it is possible to reduce
// the number of partial products required to implement the multiplication to
// two. That is, �3, �5, �6, and �7 can be written in terms of combinations of
// �1, �2, �4, and �8. For example, 3M = (2M + 1M), 5M = (4M + M), 6M = (4M
// + 2M), and 7M = (8M - M). Thus, the following 32 entry table defines the
// operations required for generating the partial products through each pass
// of the algorithm over the multiplier:
//
// Prod[4:0] Operation
// 00000 Prod <= (Prod + 0*M + 0*M) >> 4;
// 00001 Prod <= (Prod + 0*M + 1*M) >> 4;
// 00010 Prod <= (Prod + 0*M + 1*M) >> 4;
// 00011 Prod <= (Prod + 2*M + 0*M) >> 4;
// 00100 Prod <= (Prod + 2*M + 0*M) >> 4;
// 00101 Prod <= (Prod + 2*M + 1*M) >> 4;
// 00110 Prod <= (Prod + 2*M + 1*M) >> 4;
// 00111 Prod <= (Prod + 4*M + 0*M) >> 4;
// 01000 Prod <= (Prod + 4*M + 0*M) >> 4;
// 01001 Prod <= (Prod + 4*M + 1*M) >> 4;
// 01010 Prod <= (Prod + 4*M + 1*M) >> 4;
// 01011 Prod <= (Prod + 4*M + 2*M) >> 4;
// 01100 Prod <= (Prod + 4*M + 2*M) >> 4;
// 01101 Prod <= (Prod + 8*M - 1*M) >> 4;
// 01110 Prod <= (Prod + 8*M - 1*M) >> 4;
// 01111 Prod <= (Prod + 8*M + 0*M) >> 4;
// 10000 Prod <= (Prod - 8*M - 0*M) >> 4;
// 10001 Prod <= (Prod - 8*M + 1*M) >> 4;
// 10010 Prod <= (Prod - 8*M + 1*M) >> 4;
// 10011 Prod <= (Prod - 4*M - 2*M) >> 4;
// 10100 Prod <= (Prod - 4*M - 2*M) >> 4;
// 10101 Prod <= (Prod - 4*M - 1*M) >> 4;
// 10110 Prod <= (Prod - 4*M - 1*M) >> 4;
// 10111 Prod <= (Prod - 4*M - 0*M) >> 4;
// 11000 Prod <= (Prod - 4*M - 0*M) >> 4;
// 11001 Prod <= (Prod - 2*M - 1*M) >> 4;
// 11010 Prod <= (Prod - 2*M - 1*M) >> 4;
// 11011 Prod <= (Prod - 2*M - 0*M) >> 4;
// 11100 Prod <= (Prod - 2*M - 0*M) >> 4;
// 11101 Prod <= (Prod - 0*M - 1*M) >> 4;
// 11110 Prod <= (Prod - 0*M - 1*M) >> 4;
// 11111 Prod <= (Prod - 0*M - 0*M) >> 4;
//
// One approach to implementing the recoding table is to use a 32:1 multiplexer
// and simply write out the necessary operations. This is the approach used in
// the first version of the 4 bits at a time Booth multiplier. The problem is
// that the implementation of the preceding recoding table in a first prin-
// ciples manner results in a large number of adders and subtractors being
// synthesized. Apparently, the structure and character of the RTL code is such
// that the synthesizer is unable to use multiplexers to determine the operands
// of the two adders which are required.
//
// Examining the previous recoding table shows that seven multiples of the
// multiplicand (M) are represented in the first multiplicand column, 0x, �2x,
// �4x, and �8x, and five are represented in the second multiplicand column,
// 0x, �1x, and �2. The synthesizer is able to identify the need for adders and
// subractors, but is unable to morph the structure from one embedded in a 32:1
// multiplexer into one where an 8:1 multiplexer feeds the first multiplicand
// products into one adder, and another 8:1 multiplexer feeds the second multi-
// plicand product into another adder cascaded with the first.
//
// A review of the corresponding synthesis report shows that the synthesizer
// extracted 8 adders and 8 subtractors. Refering to the 32 line recoding table
// above, shows that there are 8 diffent combinations of the multiplicand pro-
// ducts that must be added/subtracted to the partial product, Prod, to deter-
// mine the final product. The resulting implementation is correct, as deter-
// mined by its testbench, but the implementation certainly uses more resources
// than would be expected when increasing the number of bits processed per
// stage/iteration from 2 to 4. A natural assumption is that the resources uti-
// lized would increase by a factor close to 2 as the number of bits processed
// is increased by powers of 2: 1, 2, 4, etc.
//
// It is now clear that the synthesizer is unable to transform the multiplexed
// adder structure inherent in the current specification into a structure which
// is composed of multiplexers followed by two cascaded adders. Therefore, that
// simpler structure must be explcitly specified in this module's RTL. In order
// to minimize the multiplexers, the recoding table needs to be modified from
// its current definition to an equivalent definition that can be implemented
// using just two multiplicand products. The following recoding table can be
// compared to the one above to see the adjustments made. In essence, the first
// multiplicand column allows only five multiplicand product values, 0M, �4M,
// and �8M, and the second multiplicand column also allows only five values,
// 0M, �1M, and �2M.
//
// Prod[4:0] Operation
// 00000 Prod <= (Prod + 0*M + 0*M) >> 4;
// 00001 Prod <= (Prod + 0*M + 1*M) >> 4;
// 00010 Prod <= (Prod + 0*M + 1*M) >> 4;
// 00011 Prod <= (Prod + 0*M + 2*M) >> 4;
// 00100 Prod <= (Prod + 0*M + 2*M) >> 4;
// 00101 Prod <= (Prod + 4*M - 1*M) >> 4;
// 00110 Prod <= (Prod + 4*M - 1*M) >> 4;
// 00111 Prod <= (Prod + 4*M + 0*M) >> 4;
// 01000 Prod <= (Prod + 4*M + 0*M) >> 4;
// 01001 Prod <= (Prod + 4*M + 1*M) >> 4;
// 01010 Prod <= (Prod + 4*M + 1*M) >> 4;
// 01011 Prod <= (Prod + 4*M + 2*M) >> 4;
// 01100 Prod <= (Prod + 4*M + 2*M) >> 4;
// 01101 Prod <= (Prod + 8*M - 1*M) >> 4
// 01110 Prod <= (Prod + 8*M - 1*M) >> 4;
// 01111 Prod <= (Prod + 8*M + 0*M) >> 4;
// 10000 Prod <= (Prod - 8*M - 0*M) >> 4;
// 10001 Prod <= (Prod - 8*M + 1*M) >> 4;
// 10010 Prod <= (Prod - 8*M + 1*M) >> 4;
// 10011 Prod <= (Prod - 4*M - 2*M) >> 4;
// 10100 Prod <= (Prod - 4*M - 2*M) >> 4;
// 10101 Prod <= (Prod - 4*M - 1*M) >> 4;
// 10110 Prod <= (Prod - 4*M - 1*M) >> 4;
// 10111 Prod <= (Prod - 4*M - 0*M) >> 4;
// 11000 Prod <= (Prod - 4*M - 0*M) >> 4;
// 11001 Prod <= (Prod - 4*M + 1*M) >> 4;
// 11010 Prod <= (Prod - 4*M + 1*M) >> 4;
// 11011 Prod <= (Prod - 0*M - 2*M) >> 4;
// 11100 Prod <= (Prod - 0*M - 2*M) >> 4;
// 11101 Prod <= (Prod - 0*M - 1*M) >> 4;
// 11110 Prod <= (Prod - 0*M - 1*M) >> 4;
// 11111 Prod <= (Prod - 0*M - 0*M) >> 4;
//
// The zero terms and the subtractions will be implemented using logic: bus AND
// for 0, and bus XOR and carry input for subtraction, i.e. 2sC add. With this
// additional reduction of operands, the multiplexers at the input to the adder
// tree only multiplex two values each, {4M, 8M} or {1M, 2M}, respectively. The
// operations required of the multiplexer and adder for each multiplicand
// column can be defined by the triple: {PnM, M_Sel, En}. En is the control for
// the bus AND which forms 0*M. M_Sel is the control for the multiplexer that
// selects {4M, 8M} or {1M, 2M}, respectively. PnM is the input to the bus XOR
// and carry input of the adder, and if 0 an addition is performed with the
// operand at the output of the bus AND, and if 1, the adder is presented with
// the complement of that operand plus an input carry. The bus AND can be built
// explicitly after the multiplexer, or it can be included as the default case
// of the multiplexer itself.
//
// In either case, the triples {PnM, M_Sel, En} can be constructed using a 32x6
// ROM. The first triple refers to the control signals for the first multipli-
// cand column and the second refers to the control signals for the second
// multiplicand column. To force the synthesizer to infer a ROM, a fully defin-
// ed case statement of 32 entries for each column is required:
//
// For the first column - B
//
// case(Booth)
// 5'b00000 : {PnM_B, M_Sel_B, En_B} <= 3'b000; // (Prod + 0*M + 0*M) >> 4;
// 5'b00001 : {PnM_B, M_Sel_B, En_B} <= 3'b000; // (Prod + 0*M + 1*M) >> 4;
// 5'b00010 : {PnM_B, M_Sel_B, En_B} <= 3'b000; // (Prod + 0*M + 1*M) >> 4;
// 5'b00011 : {PnM_B, M_Sel_B, En_B} <= 3'b000; // (Prod + 0*M + 2*M) >> 4;
// 5'b00100 : {PnM_B, M_Sel_B, En_B} <= 3'b000; // (Prod + 0*M + 2*M) >> 4;
// 5'b00101 : {PnM_B, M_Sel_B, En_B} <= 3'b001; // (Prod + 4*M - 1*M) >> 4;
// 5'b00110 : {PnM_B, M_Sel_B, En_B} <= 3'b001; // (Prod + 4*M - 1*M) >> 4;
// 5'b00111 : {PnM_B, M_Sel_B, En_B} <= 3'b001; // (Prod + 4*M + 0*M) >> 4;
// 5'b01000 : {PnM_B, M_Sel_B, En_B} <= 3'b001; // (Prod + 4*M + 0*M) >> 4;
// 5'b01001 : {PnM_B, M_Sel_B, En_B} <= 3'b001; // (Prod + 4*M + 1*M) >> 4;
// 5'b01010 : {PnM_B, M_Sel_B, En_B} <= 3'b001; // (Prod + 4*M + 1*M) >> 4;
// 5'b01011 : {PnM_B, M_Sel_B, En_B} <= 3'b001; // (Prod + 4*M + 2*M) >> 4;
// 5'b01100 : {PnM_B, M_Sel_B, En_B} <= 3'b001; // (Prod + 4*M + 2*M) >> 4;
// 5'b01101 : {PnM_B, M_Sel_B, En_B} <= 3'b011; // (Prod + 8*M - 1*M) >> 4
// 5'b01110 : {PnM_B, M_Sel_B, En_B} <= 3'b011; // (Prod + 8*M - 1*M) >> 4;
// 5'b01111 : {PnM_B, M_Sel_B, En_B} <= 3'b011; // (Prod + 8*M + 0*M) >> 4;
// 5'b10000 : {PnM_B, M_Sel_B, En_B} <= 3'b111; // (Prod - 8*M - 0*M) >> 4;
// 5'b10001 : {PnM_B, M_Sel_B, En_B} <= 3'b111; // (Prod - 8*M + 1*M) >> 4;
// 5'b10010 : {PnM_B, M_Sel_B, En_B} <= 3'b111; // (Prod - 8*M + 1*M) >> 4;
// 5'b10011 : {PnM_B, M_Sel_B, En_B} <= 3'b101; // (Prod - 4*M - 2*M) >> 4;
// 5'b10100 : {PnM_B, M_Sel_B, En_B} <= 3'b101; // (Prod - 4*M - 2*M) >> 4;
// 5'b10101 : {PnM_B, M_Sel_B, En_B} <= 3'b101; // (Prod - 4*M - 1*M) >> 4;
// 5'b10110 : {PnM_B, M_Sel_B, En_B} <= 3'b101; // (Prod - 4*M - 1*M) >> 4;
// 5'b10111 : {PnM_B, M_Sel_B, En_B} <= 3'b101; // (Prod - 4*M - 0*M) >> 4;
// 5'b11000 : {PnM_B, M_Sel_B, En_B} <= 3'b101; // (Prod - 4*M - 0*M) >> 4;
// 5'b11001 : {PnM_B, M_Sel_B, En_B} <= 3'b101; // (Prod - 4*M + 1*M) >> 4;
// 5'b11010 : {PnM_B, M_Sel_B, En_B} <= 3'b101; // (Prod - 4*M + 1*M) >> 4;
// 5'b11011 : {PnM_B, M_Sel_B, En_B} <= 3'b000; // (Prod - 0*M - 2*M) >> 4;
// 5'b11100 : {PnM_B, M_Sel_B, En_B} <= 3'b000; // (Prod - 0*M - 2*M) >> 4;
// 5'b11101 : {PnM_B, M_Sel_B, En_B} <= 3'b000; // (Prod - 0*M - 1*M) >> 4;
// 5'b11110 : {PnM_B, M_Sel_B, En_B} <= 3'b000; // (Prod - 0*M - 1*M) >> 4;
// 5'b11111 : {PnM_B, M_Sel_B, En_B} <= 3'b000; // (Prod - 0*M - 0*M) >> 4;
// default : {PnM_B, M_Sel_B, En_B} <= 3'b000; // (Prod - 0*M - 0*M) >> 4;
// endcase
//
// For the second column - C
//
// case(Booth)
// 5'b00000 : {PnM_C, M_Sel_C, En_C} <= 3'b000; // (Prod + 0*M + 0*M) >> 4;
// 5'b00001 : {PnM_C, M_Sel_C, En_C} <= 3'b001; // (Prod + 0*M + 1*M) >> 4;
// 5'b00010 : {PnM_C, M_Sel_C, En_C} <= 3'b001; // (Prod + 0*M + 1*M) >> 4;
// 5'b00011 : {PnM_C, M_Sel_C, En_C} <= 3'b011; // (Prod + 0*M + 2*M) >> 4;
// 5'b00100 : {PnM_C, M_Sel_C, En_C} <= 3'b011; // (Prod + 0*M + 2*M) >> 4;
// 5'b00101 : {PnM_C, M_Sel_C, En_C} <= 3'b101; // (Prod + 4*M - 1*M) >> 4;
// 5'b00110 : {PnM_C, M_Sel_C, En_C} <= 3'b101; // (Prod + 4*M - 1*M) >> 4;
// 5'b00111 : {PnM_C, M_Sel_C, En_C} <= 3'b000; // (Prod + 4*M + 0*M) >> 4;
// 5'b01000 : {PnM_C, M_Sel_C, En_C} <= 3'b000; // (Prod + 4*M + 0*M) >> 4;
// 5'b01001 : {PnM_C, M_Sel_C, En_C} <= 3'b001; // (Prod + 4*M + 1*M) >> 4;
// 5'b01010 : {PnM_C, M_Sel_C, En_C} <= 3'b001; // (Prod + 4*M + 1*M) >> 4;
// 5'b01011 : {PnM_C, M_Sel_C, En_C} <= 3'b011; // (Prod + 4*M + 2*M) >> 4;
// 5'b01100 : {PnM_C, M_Sel_C, En_C} <= 3'b011; // (Prod + 4*M + 2*M) >> 4;
// 5'b01101 : {PnM_C, M_Sel_C, En_C} <= 3'b101; // (Prod + 8*M - 1*M) >> 4
// 5'b01110 : {PnM_C, M_Sel_C, En_C} <= 3'b101; // (Prod + 8*M - 1*M) >> 4;
// 5'b01111 : {PnM_C, M_Sel_C, En_C} <= 3'b000; // (Prod + 8*M + 0*M) >> 4;
// 5'b10000 : {PnM_C, M_Sel_C, En_C} <= 3'b000; // (Prod - 8*M - 0*M) >> 4;
// 5'b10001 : {PnM_C, M_Sel_C, En_C} <= 3'b001; // (Prod - 8*M + 1*M) >> 4;
// 5'b10010 : {PnM_C, M_Sel_C, En_C} <= 3'b001; // (Prod - 8*M + 1*M) >> 4;
// 5'b10011 : {PnM_C, M_Sel_C, En_C} <= 3'b111; // (Prod - 4*M - 2*M) >> 4;
// 5'b10100 : {PnM_C, M_Sel_C, En_C} <= 3'b111; // (Prod - 4*M - 2*M) >> 4;
// 5'b10101 : {PnM_C, M_Sel_C, En_C} <= 3'b101; // (Prod - 4*M - 1*M) >> 4;
// 5'b10110 : {PnM_C, M_Sel_C, En_C} <= 3'b101; // (Prod - 4*M - 1*M) >> 4;
// 5'b10111 : {PnM_C, M_Sel_C, En_C} <= 3'b000; // (Prod - 4*M - 0*M) >> 4;
// 5'b11000 : {PnM_C, M_Sel_C, En_C} <= 3'b000; // (Prod - 4*M - 0*M) >> 4;
// 5'b11001 : {PnM_C, M_Sel_C, En_C} <= 3'b001; // (Prod - 4*M + 1*M) >> 4;
// 5'b11010 : {PnM_C, M_Sel_C, En_C} <= 3'b001; // (Prod - 4*M + 1*M) >> 4;
// 5'b11011 : {PnM_C, M_Sel_C, En_C} <= 3'b111; // (Prod - 0*M - 2*M) >> 4;
// 5'b11100 : {PnM_C, M_Sel_C, En_C} <= 3'b111; // (Prod - 0*M - 2*M) >> 4;
// 5'b11101 : {PnM_C, M_Sel_C, En_C} <= 3'b101; // (Prod - 0*M - 1*M) >> 4;
// 5'b11110 : {PnM_C, M_Sel_C, En_C} <= 3'b101; // (Prod - 0*M - 1*M) >> 4;
// 5'b11111 : {PnM_C, M_Sel_C, En_C} <= 3'b000; // (Prod - 0*M - 0*M) >> 4;
// default : {PnM_C, M_Sel_C, En_C} <= 3'b000; // (Prod - 0*M - 0*M) >> 4;
// endcase
//
////////////////////////////////////////////////////////////////////////////////
module BM64 #(
parameter N = 256 // Width = N: multiplicand & multiplier
)(
input Rst, // Reset
input Clk, // Clock
input Ld, // Load Registers and Start Multiplier
input [(N - 1):0] M, // Multiplicand
input [(N - 1):0] R, // Multiplier
output reg Valid, // Product Valid
output reg [((2*N) - 1):0] P // Product <= M * R
);
////////////////////////////////////////////////////////////////////////////////
//
// Local Parameters
//
localparam pNumCycles = ((N + 1)/4); // No. of cycles required for product
////////////////////////////////////////////////////////////////////////////////
//
// Declarations
//
reg [4:0] Cntr; // Operation Counter
reg [4:0] Booth; // Booth Recoding Field
reg Guard; // Shift Bit for Booth Recoding
reg [(N + 3):0] A; // Multiplicand w/ guards
wire [(N + 3):0] Mx8, Mx4, Mx2, Mx1; // Multiplicand products w/ guards
reg PnM_B, M_Sel_B, En_B; // Operand B Control Triple
reg PnM_C, M_Sel_C, En_C; // Operand C Control Triple
wire [(N + 3):0] Hi; // Upper Half of Product w/ guards
reg [(N + 3):0] B, C; // Adder tree Operand Inputs
reg Ci_B, Ci_C; // Adder tree Carry Inputs
wire [(N + 3):0] T, S; // Adder Tree Outputs w/ guards
reg [((2*N) + 3):0] Prod; // Double Length Product w/ guards
////////////////////////////////////////////////////////////////////////////////
//
// Implementation
//
always @(posedge Clk)
begin
if(Rst)
Cntr <= #1 0;
else if(Ld)
Cntr <= #1 pNumCycles;
else if(|Cntr)
Cntr <= #1 (Cntr - 1);
end
// Multiplicand Register
// includes 4 bits to guard sign of multiplicand in the event the most
// negative value is provided as the input.
always @(posedge Clk)
begin
if(Rst)
A <= #1 0;
else if(Ld)
A <= #1 {{4{M[(N - 1)]}}, M};
end
assign Mx8 = {A, 3'b0};
assign Mx4 = {A, 2'b0};
assign Mx2 = {A, 1'b0};
assign Mx1 = A;
// Compute Upper Partial Product: (N + 4) bits in width
always @(*) Booth <= {Prod[3:0], Guard}; // Booth's Multiplier Recoding field
assign Hi = Prod[((2*N) + 3):N]; // Upper Half of Product Register
// Compute the Control Triples for the First and Second Multiplicand Columns
// For the first column - B
always @(*)
begin
case(Booth)
5'b00000 : {PnM_B, M_Sel_B, En_B} <= 3'b000; // (Prod + 0*M + 0*M) >> 4;
5'b00001 : {PnM_B, M_Sel_B, En_B} <= 3'b000; // (Prod + 0*M + 1*M) >> 4;
5'b00010 : {PnM_B, M_Sel_B, En_B} <= 3'b000; // (Prod + 0*M + 1*M) >> 4;
5'b00011 : {PnM_B, M_Sel_B, En_B} <= 3'b000; // (Prod + 0*M + 2*M) >> 4;
5'b00100 : {PnM_B, M_Sel_B, En_B} <= 3'b000; // (Prod + 0*M + 2*M) >> 4;
5'b00101 : {PnM_B, M_Sel_B, En_B} <= 3'b001; // (Prod + 4*M - 1*M) >> 4;
5'b00110 : {PnM_B, M_Sel_B, En_B} <= 3'b001; // (Prod + 4*M - 1*M) >> 4;
5'b00111 : {PnM_B, M_Sel_B, En_B} <= 3'b001; // (Prod + 4*M + 0*M) >> 4;
5'b01000 : {PnM_B, M_Sel_B, En_B} <= 3'b001; // (Prod + 4*M + 0*M) >> 4;
5'b01001 : {PnM_B, M_Sel_B, En_B} <= 3'b001; // (Prod + 4*M + 1*M) >> 4;
5'b01010 : {PnM_B, M_Sel_B, En_B} <= 3'b001; // (Prod + 4*M + 1*M) >> 4;
5'b01011 : {PnM_B, M_Sel_B, En_B} <= 3'b001; // (Prod + 4*M + 2*M) >> 4;
5'b01100 : {PnM_B, M_Sel_B, En_B} <= 3'b001; // (Prod + 4*M + 2*M) >> 4;
5'b01101 : {PnM_B, M_Sel_B, En_B} <= 3'b011; // (Prod + 8*M - 1*M) >> 4;
5'b01110 : {PnM_B, M_Sel_B, En_B} <= 3'b011; // (Prod + 8*M - 1*M) >> 4;
5'b01111 : {PnM_B, M_Sel_B, En_B} <= 3'b011; // (Prod + 8*M + 0*M) >> 4;
5'b10000 : {PnM_B, M_Sel_B, En_B} <= 3'b111; // (Prod - 8*M - 0*M) >> 4;
5'b10001 : {PnM_B, M_Sel_B, En_B} <= 3'b111; // (Prod - 8*M + 1*M) >> 4;
5'b10010 : {PnM_B, M_Sel_B, En_B} <= 3'b111; // (Prod - 8*M + 1*M) >> 4;
5'b10011 : {PnM_B, M_Sel_B, En_B} <= 3'b101; // (Prod - 4*M - 2*M) >> 4;
5'b10100 : {PnM_B, M_Sel_B, En_B} <= 3'b101; // (Prod - 4*M - 2*M) >> 4;
5'b10101 : {PnM_B, M_Sel_B, En_B} <= 3'b101; // (Prod - 4*M - 1*M) >> 4;
5'b10110 : {PnM_B, M_Sel_B, En_B} <= 3'b101; // (Prod - 4*M - 1*M) >> 4;
5'b10111 : {PnM_B, M_Sel_B, En_B} <= 3'b101; // (Prod - 4*M - 0*M) >> 4;
5'b11000 : {PnM_B, M_Sel_B, En_B} <= 3'b101; // (Prod - 4*M - 0*M) >> 4;
5'b11001 : {PnM_B, M_Sel_B, En_B} <= 3'b101; // (Prod - 4*M + 1*M) >> 4;
5'b11010 : {PnM_B, M_Sel_B, En_B} <= 3'b101; // (Prod - 4*M + 1*M) >> 4;
5'b11011 : {PnM_B, M_Sel_B, En_B} <= 3'b000; // (Prod - 0*M - 2*M) >> 4;
5'b11100 : {PnM_B, M_Sel_B, En_B} <= 3'b000; // (Prod - 0*M - 2*M) >> 4;
5'b11101 : {PnM_B, M_Sel_B, En_B} <= 3'b000; // (Prod - 0*M - 1*M) >> 4;
5'b11110 : {PnM_B, M_Sel_B, En_B} <= 3'b000; // (Prod - 0*M - 1*M) >> 4;
5'b11111 : {PnM_B, M_Sel_B, En_B} <= 3'b000; // (Prod - 0*M - 0*M) >> 4;
default : {PnM_B, M_Sel_B, En_B} <= 3'b000; // (Prod - 0*M - 0*M) >> 4;
endcase
end
// For the second column - C
always @(*)
begin
case(Booth)
5'b00000 : {PnM_C, M_Sel_C, En_C} <= 3'b000; // (Prod + 0*M + 0*M) >> 4;
5'b00001 : {PnM_C, M_Sel_C, En_C} <= 3'b001; // (Prod + 0*M + 1*M) >> 4;
5'b00010 : {PnM_C, M_Sel_C, En_C} <= 3'b001; // (Prod + 0*M + 1*M) >> 4;
5'b00011 : {PnM_C, M_Sel_C, En_C} <= 3'b011; // (Prod + 0*M + 2*M) >> 4;
5'b00100 : {PnM_C, M_Sel_C, En_C} <= 3'b011; // (Prod + 0*M + 2*M) >> 4;
5'b00101 : {PnM_C, M_Sel_C, En_C} <= 3'b101; // (Prod + 4*M - 1*M) >> 4;
5'b00110 : {PnM_C, M_Sel_C, En_C} <= 3'b101; // (Prod + 4*M - 1*M) >> 4;
5'b00111 : {PnM_C, M_Sel_C, En_C} <= 3'b000; // (Prod + 4*M + 0*M) >> 4;
5'b01000 : {PnM_C, M_Sel_C, En_C} <= 3'b000; // (Prod + 4*M + 0*M) >> 4;
5'b01001 : {PnM_C, M_Sel_C, En_C} <= 3'b001; // (Prod + 4*M + 1*M) >> 4;
5'b01010 : {PnM_C, M_Sel_C, En_C} <= 3'b001; // (Prod + 4*M + 1*M) >> 4;
5'b01011 : {PnM_C, M_Sel_C, En_C} <= 3'b011; // (Prod + 4*M + 2*M) >> 4;
5'b01100 : {PnM_C, M_Sel_C, En_C} <= 3'b011; // (Prod + 4*M + 2*M) >> 4;
5'b01101 : {PnM_C, M_Sel_C, En_C} <= 3'b101; // (Prod + 8*M - 1*M) >> 4;
5'b01110 : {PnM_C, M_Sel_C, En_C} <= 3'b101; // (Prod + 8*M - 1*M) >> 4;
5'b01111 : {PnM_C, M_Sel_C, En_C} <= 3'b000; // (Prod + 8*M + 0*M) >> 4;
5'b10000 : {PnM_C, M_Sel_C, En_C} <= 3'b000; // (Prod - 8*M - 0*M) >> 4;
5'b10001 : {PnM_C, M_Sel_C, En_C} <= 3'b001; // (Prod - 8*M + 1*M) >> 4;
5'b10010 : {PnM_C, M_Sel_C, En_C} <= 3'b001; // (Prod - 8*M + 1*M) >> 4;
5'b10011 : {PnM_C, M_Sel_C, En_C} <= 3'b111; // (Prod - 4*M - 2*M) >> 4;
5'b10100 : {PnM_C, M_Sel_C, En_C} <= 3'b111; // (Prod - 4*M - 2*M) >> 4;
5'b10101 : {PnM_C, M_Sel_C, En_C} <= 3'b101; // (Prod - 4*M - 1*M) >> 4;
5'b10110 : {PnM_C, M_Sel_C, En_C} <= 3'b101; // (Prod - 4*M - 1*M) >> 4;
5'b10111 : {PnM_C, M_Sel_C, En_C} <= 3'b000; // (Prod - 4*M - 0*M) >> 4;
5'b11000 : {PnM_C, M_Sel_C, En_C} <= 3'b000; // (Prod - 4*M - 0*M) >> 4;
5'b11001 : {PnM_C, M_Sel_C, En_C} <= 3'b001; // (Prod - 4*M + 1*M) >> 4;
5'b11010 : {PnM_C, M_Sel_C, En_C} <= 3'b001; // (Prod - 4*M + 1*M) >> 4;
5'b11011 : {PnM_C, M_Sel_C, En_C} <= 3'b111; // (Prod - 0*M - 2*M) >> 4;
5'b11100 : {PnM_C, M_Sel_C, En_C} <= 3'b111; // (Prod - 0*M - 2*M) >> 4;
5'b11101 : {PnM_C, M_Sel_C, En_C} <= 3'b101; // (Prod - 0*M - 1*M) >> 4;
5'b11110 : {PnM_C, M_Sel_C, En_C} <= 3'b101; // (Prod - 0*M - 1*M) >> 4;
5'b11111 : {PnM_C, M_Sel_C, En_C} <= 3'b000; // (Prod - 0*M - 0*M) >> 4;
default : {PnM_C, M_Sel_C, En_C} <= 3'b000; // (Prod - 0*M - 0*M) >> 4;
endcase
end
// Compute the first operand - B
always @(*)
begin
case({PnM_B, M_Sel_B, En_B})
3'b001 : {Ci_B, B} <= {1'b0, Mx4};
3'b011 : {Ci_B, B} <= {1'b0, Mx8};
3'b101 : {Ci_B, B} <= {1'b1, ~Mx4};
3'b111 : {Ci_B, B} <= {1'b1, ~Mx8};
default : {Ci_B, B} <= 0;
endcase
end
// Compute the second operand - C
always @(*)
begin
case({PnM_C, M_Sel_C, En_C})
3'b001 : {Ci_C, C} <= {1'b0, Mx1};
3'b011 : {Ci_C, C} <= {1'b0, Mx2};
3'b101 : {Ci_C, C} <= {1'b1, ~Mx1};
3'b111 : {Ci_C, C} <= {1'b1, ~Mx2};
default : {Ci_C, C} <= 0;
endcase
end
// Compute Partial Sum - Cascaded Adders
assign T = Hi + B + Ci_B;
assign S = T + C + Ci_C;
// Double Length Product Register
// Multiplier, R, is loaded into the least significant half on load, Ld
// Shifted right four places as the product is computed iteratively.
always @(posedge Clk)
begin
if(Rst)
Prod <= #1 0;
else if(Ld)
Prod <= #1 R;
else if(|Cntr) // Shift right four bits
Prod <= #1 {{4{S[(N + 3)]}}, S, Prod[(N - 1):4]};
end
always @(posedge Clk)
begin
if(Rst)
Guard <= #1 0;
else if(Ld)
Guard <= #1 0;
else if(|Cntr)
Guard <= #1 Prod[3];
end
// Assign the product less the four guard bits to the output port
// A 4-bit right shift is required since the output product is stored
// into a synchronous register on the last cycle of the multiply.
always @(posedge Clk)
begin
if(Rst)
P <= #1 0;
else if(Cntr == 1)
P <= #1 {S, Prod[(N - 1):4]};
end
// Count the number of shifts
// This implementation does not use any optimizations to perform multiple
// bit shifts to skip over runs of 1s or 0s.
always @(posedge Clk)
begin
if(Rst)
Valid <= #1 0;
else
Valid <= #1 (Cntr == 1);
end
endmodule