团队编号:CICC1230
团队名称:少吃米饭多吃肉
1.传统Booth算法+Wallace树加法器
以下为32位宽乘法实现时的对比结果,供参考:
相同条件下,阵列乘法器面积最小,Wallace树乘法器面积最大,Booth乘法器和Booth-WT乘法器面积相差不大且介于阵列乘法器和Wallace树乘法器之间;Wallace树乘法器速度最快,Booth-WT混合乘法器速度也很可观,阵列乘法器速度最慢;阵列乘法器功耗最低,Wallace树乘法器功耗最高,Booth-WT混合乘法器功耗与Wallace树乘法器差不多;Wallace树乘法器面积最大、功耗最高但速度最快。在乘法器的实际应用中,只考虑速度方面的性能可以选择Wallace树乘法器或Booth-WT混合乘法器结构,只考虑面积方面的性能可以选择阵列乘法器或Booth乘法器,只考虑功耗方面的性能可以选择阵列乘法器结构,需要考虑速度、面积和功耗性能的折中可以选择Booth-WT混合乘法器结构。
这里贴出部分Booth+Wallace的代码实现
这是基4Booth编码模块
`timescale 1ns/1ps
// Radix-4 Booth partial-product selector.
//
// Chooses one of {0, +A, +2A, -2A, -A} from the multiplicand a_i according to
// the 3-bit selector b_i and drives it, in two's complement, on a
// (length+1)-bit output.
//
// Parameters:
//   length  - bit width of the multiplicand a_i (default 32).
// Ports:
//   a_i     - multiplicand; its MSB is replicated as the sign bit for +/-A.
//   b_i     - 3-bit Booth selector (three bits of the other operand).
//   booth_o - selected partial product, length+1 bits wide.
//
// NOTE(review): for b_i == 3'b100 and a_i equal to the most negative value
// (1 followed by zeros), +2^length is not representable in length+1 signed
// bits, so the negation wraps. Confirm that the caller tolerates or avoids
// this corner case.
module booth4code (
    a_i,b_i,booth_o
);
parameter length = 32;
input      [length-1 : 0] a_i;     // full-width multiplicand
input      [2:0]          b_i;     // 3-bit Booth selector
output reg [length : 0]   booth_o; // Booth partial product

// Blocking assignments are used because this is purely combinational logic;
// nonblocking assignments here can cause simulation/synthesis mismatches.
always @(*) begin
    case (b_i)
        3'b000  : booth_o = 0;                        //  0
        3'b001  : booth_o = { a_i[length-1], a_i };   // +A (sign-extended)
        3'b010  : booth_o = { a_i[length-1], a_i };   // +A (sign-extended)
        3'b011  : booth_o = { a_i, 1'b0 };            // +2A
        3'b100  : booth_o = -{ a_i, 1'b0 };           // -2A
        3'b101  : booth_o = -{ a_i[length-1], a_i };  // -A
        3'b110  : booth_o = -{ a_i[length-1], a_i };  // -A
        3'b111  : booth_o = 0;                        //  0
        default : booth_o = 0;                        // X/Z selector bits
    endcase
end
endmodule
这是4:2压缩器优化模块
`timescale 1ns/1ps
// Bit-parallel 4:2 compressor.
//
// Reduces four operands plus a carry-in bit to two vectors. out2 is the sum
// vector and out1 is the carry vector; the instantiations in this file shift
// out1 left by one bit before feeding it to the next stage.
//
// Parameters:
//   length - operands are 2*length bits wide (default 128).
// Ports:
//   in1..in4 - operands, 2*length bits each.
//   cin      - carry into bit position 0.
//   out1     - carry vector, 2*length+1 bits.
//   out2     - sum vector, 2*length+1 bits.
//   cout     - top bit of the intermediate carry chain.
module compressor42 #(
    parameter length = 128
) (
    input  wire [length*2-1 : 0] in1,
    input  wire [length*2-1 : 0] in2,
    input  wire [length*2-1 : 0] in3,
    input  wire [length*2-1 : 0] in4,
    input  wire                  cin,
    output wire [length*2 : 0]   out1,
    output wire [length*2 : 0]   out2,
    output wire                  cout
);
    localparam MSB = length*2 - 1;

    // Per-bit parity of the four operands.
    wire [MSB:0] parity   = in1 ^ in2 ^ in3 ^ in4;
    // Carry candidate used when the parity bit is 0.
    wire [MSB:0] pair_and = (in1 & in2) | (in3 & in4);
    // Intermediate carry handed to the next-higher bit position.
    wire [MSB:0] pair_or  = (in1 | in2) & (in3 | in4);

    // One-bit-wider views: MSB replicated for parity/pair_and, and the
    // intermediate carry moved up one position with cin entering at bit 0.
    wire [MSB+1:0] parity_ext   = { parity[MSB],   parity   };
    wire [MSB+1:0] pair_and_ext = { pair_and[MSB], pair_and };
    wire [MSB+1:0] carry_in_vec = { pair_or, cin };

    assign cout = pair_or[MSB];
    assign out2 = parity_ext ^ carry_in_vec; // sum
    // Per-bit mux: parity selects the incoming carry, otherwise pair_and.
    assign out1 = (carry_in_vec & parity_ext) | (pair_and_ext & ~parity_ext);
endmodule
部分顶层架构模块
// Level-1 reduction: eight 4:2 compressors, each taking four consecutive
// partial products (pp0..pp31) with carry-in tied to 0. Each instance drives
// an even-numbered cpr_o_l1_* net from out1 and an odd-numbered one from out2.
// NOTE(review): no parameter override is given, so compressor42 uses its
// default `length`; confirm this matches the width of the pp* nets.
compressor42 compressor42_1_0 (
    .in1 (pp0 ), .in2 (pp1 ), .in3 (pp2 ), .in4 (pp3 ), .cin (1'b0),
    .out1(cpr_o_l1_0 ), .out2(cpr_o_l1_1 ), .cout(cout_l1_0));
compressor42 compressor42_1_1 (
    .in1 (pp4 ), .in2 (pp5 ), .in3 (pp6 ), .in4 (pp7 ), .cin (1'b0),
    .out1(cpr_o_l1_2 ), .out2(cpr_o_l1_3 ), .cout(cout_l1_1));
compressor42 compressor42_1_2 (
    .in1 (pp8 ), .in2 (pp9 ), .in3 (pp10), .in4 (pp11), .cin (1'b0),
    .out1(cpr_o_l1_4 ), .out2(cpr_o_l1_5 ), .cout(cout_l1_2));
compressor42 compressor42_1_3 (
    .in1 (pp12), .in2 (pp13), .in3 (pp14), .in4 (pp15), .cin (1'b0),
    .out1(cpr_o_l1_6 ), .out2(cpr_o_l1_7 ), .cout(cout_l1_3));
compressor42 compressor42_1_4 (
    .in1 (pp16), .in2 (pp17), .in3 (pp18), .in4 (pp19), .cin (1'b0),
    .out1(cpr_o_l1_8 ), .out2(cpr_o_l1_9 ), .cout(cout_l1_4));
compressor42 compressor42_1_5 (
    .in1 (pp20), .in2 (pp21), .in3 (pp22), .in4 (pp23), .cin (1'b0),
    .out1(cpr_o_l1_10), .out2(cpr_o_l1_11), .cout(cout_l1_5));
compressor42 compressor42_1_6 (
    .in1 (pp24), .in2 (pp25), .in3 (pp26), .in4 (pp27), .cin (1'b0),
    .out1(cpr_o_l1_12), .out2(cpr_o_l1_13), .cout(cout_l1_6));
compressor42 compressor42_1_7 (
    .in1 (pp28), .in2 (pp29), .in3 (pp30), .in4 (pp31), .cin (1'b0),
    .out1(cpr_o_l1_14), .out2(cpr_o_l1_15), .cout(cout_l1_7));
对于这种方法,在压缩到最后一级时,若是32位乘法,则需要做一次两个64位数相加的计算;该模块可以优化的方案也很多,下面给出一个超前进位加法器的例子作为参考。
// Level-5 (last) 4:2 compression stage. The out1 vectors of the previous
// level (cpr_o_l4_0, cpr_o_l4_2) are shifted left by one bit before being
// combined with the out2 vectors (cpr_o_l4_1, cpr_o_l4_3); carry-in is 0.
compressor42 compressor42_5_0 (cpr_o_l4_0 [length*2-1:0]<<1 ,cpr_o_l4_1 [length*2-1:0] ,cpr_o_l4_2 [length*2-1:0]<<1 ,cpr_o_l4_3 [length*2-1:0] , 1'b0 ,cpr_o_l5_0 ,cpr_o_l5_1 ,cout_l5_0
);
// Original note: cpr_o_l5_0 is "final result 1", cout_l5_0 is "final result 2".
// NOTE(review): the adder below actually consumes cpr_o_l5_0 and cpr_o_l5_1;
// confirm which signal the note meant as the second result.
// Final two-operand addition producing the product mul_o.
// NOTE(review): cla is instantiated with its default `width`; confirm that it
// matches the length*2-bit operands or add a parameter override.
cla cla_0 (cpr_o_l5_0[length*2-1:0]<<1 ,cpr_o_l5_1[length*2-1:0] ,mul_o ,cout);
`timescale 1ns/1ps
// Parameterised adder assembled from 4-bit cla_4bit slices.
//
// The slices are chained through the carry vector c: slice k receives c[k]
// as carry-in and drives c[k+1] as carry-out. The number of slices is derived
// from `width`, so every bit select stays inside the declared port range.
//
// Parameters:
//   width - operand width in bits (default 32). Must be a multiple of 4;
//           any remainder bits would be left undriven.
// Ports:
//   op1, op2 - operands, `width` bits each.
//   sum      - op1 + op2, `width` bits.
//   cout     - carry out of the most significant slice.
//
// Depends on module cla_4bit (ports op1, op2, cin, sum, cout), defined
// elsewhere.
module cla (
    op1,op2,sum,cout
);
parameter width = 32 ;
input  [width-1:0] op1;
input  [width-1:0] op2;
output [width-1:0] sum;
output             cout;

// Number of 4-bit slices needed to cover the operands.
localparam NUM_SLICES = width >> 2;

// c[k] is the carry into slice k; c[NUM_SLICES] is the final carry out.
wire [NUM_SLICES:0] c;
assign c[0] = 1'b0;
assign cout = c[NUM_SLICES];

genvar k;
generate
    for (k = 0; k < NUM_SLICES; k = k + 1) begin : g_slice
        cla_4bit u_cla_4bit (
            .op1  ( op1[k*4+3 : k*4] ),
            .op2  ( op2[k*4+3 : k*4] ),
            .cin  ( c[k]             ),
            .sum  ( sum[k*4+3 : k*4] ),
            .cout ( c[k+1]           )
        );
    end
endgenerate
endmodule