输入图片(32*32*16)和滤波器(5*5*6*16),输出为28*28*16
卷积部分无非就是数据的相乘和相加,设计好乘法器和加法器并保存得到的结果(16)。考虑到数据精度采用半精度浮点数乘法器和半精度浮点数加法器。半精度浮点数详解
举例一个平常的数,这次反过来,如-1.5625×10^(-1)
即-0.15625 = -0.00101(十进制转二进制)= -1.01×2^(-3)
所以符号位为1,指数为-3+15=12,所以指数位为01100,尾数位为0100000000。所以-1.5625×10^(-1)用半精度浮点数表示就为1 01100 0100000000。注:小数部分二进制每次乘以2取整。
// floatAdd: combinational adder for 16-bit half-precision style operands.
//
// Operand layout: [15] sign, [14:10] biased exponent (bias 15), [9:0] mantissa.
// Every non-zero operand is treated as normalised (a hidden leading 1 is
// prepended to the mantissa), i.e. subnormals are not interpreted specially.
// The result is truncated, not rounded.
//
// Ports:
//   floatA, floatB : operands
//   sum            : floatA + floatB
//
// Special cases:
//   * an operand whose magnitude bits [14:0] are all zero (+0 or -0) returns
//     the other operand unchanged;
//   * operands of equal magnitude and opposite sign return +0;
//   * a result exponent below 0 is flushed to +0;
//   * a result exponent of 31 or more saturates to the signed infinity
//     encoding {sign, 5'b11111, 10'b0}.
module floatAdd (
    input  wire [15:0] floatA,
    input  wire [15:0] floatB,
    output reg  [15:0] sum
);
    reg               sign;                          // sign of the result
    reg signed [6:0]  exponent;                      // result exponent; 7 signed bits so that
                                                     // underflow (<0) and overflow (>31) are distinguishable
    reg        [4:0]  exponentA, exponentB;          // biased exponents of the operands
    reg        [10:0] fractionA, fractionB, fraction; // {hidden 1, mantissa} working values
    reg        [4:0]  shiftAmount;                   // alignment shift (difference of exponents)
    reg               cout;                          // carry out of the add / borrow out of the subtract
    integer           k;                             // normalisation loop counter

    always @ (*) begin
        // Defaults for every intermediate so no path leaves a value
        // unassigned (which would infer latches).
        exponentA   = floatA[14:10];
        exponentB   = floatB[14:10];
        fractionA   = {1'b1, floatA[9:0]};
        fractionB   = {1'b1, floatB[9:0]};
        exponent    = {2'b00, exponentA};
        sign        = 1'b0;
        cout        = 1'b0;
        fraction    = 11'd0;
        shiftAmount = 5'd0;
        sum         = 16'h0000;

        if (floatA[14:0] == 15'd0) begin
            // A is +0 or -0
            sum = floatB;
        end else if (floatB[14:0] == 15'd0) begin
            // B is +0 or -0
            sum = floatA;
        end else if ((floatA[14:0] == floatB[14:0]) && (floatA[15] != floatB[15])) begin
            // x + (-x)
            sum = 16'h0000;
        end else begin
            // Align the operand with the smaller exponent to the larger one.
            if (exponentB > exponentA) begin
                shiftAmount = exponentB - exponentA;
                fractionA   = fractionA >> shiftAmount;
                exponent    = {2'b00, exponentB};
            end else if (exponentA > exponentB) begin
                shiftAmount = exponentA - exponentB;
                fractionB   = fractionB >> shiftAmount;
            end

            if (floatA[15] == floatB[15]) begin
                // Same sign: add magnitudes; a carry means the sum reached
                // 2.0, so shift right once and bump the exponent.
                sign = floatA[15];
                {cout, fraction} = fractionA + fractionB;
                if (cout == 1'b1) begin
                    {cout, fraction} = {cout, fraction} >> 1;
                    exponent = exponent + 1;
                end
            end else begin
                // Opposite signs: positive operand minus the magnitude of
                // the negative one. The borrow bit is the sign of the result.
                if (floatA[15] == 1'b1)
                    {cout, fraction} = fractionB - fractionA;
                else
                    {cout, fraction} = fractionA - fractionB;
                sign = cout;
                if (cout == 1'b1)
                    fraction = -fraction; // two's complement -> magnitude

                // Normalise: shift left until the hidden bit (bit 10) is set,
                // at most 10 positions, decrementing the exponent each time.
                for (k = 0; k < 10; k = k + 1) begin
                    if ((fraction != 11'd0) && (fraction[10] == 1'b0)) begin
                        fraction = fraction << 1;
                        exponent = exponent - 1;
                    end
                end
            end

            if (exponent[6] == 1'b1) begin
                // Negative exponent: too small to represent, flush to zero.
                sum = 16'h0000;
            end else if (exponent >= 7'sd31) begin
                // Too large to represent: saturate to signed infinity.
                sum = {sign, 5'b11111, 10'd0};
            end else begin
                sum = {sign, exponent[4:0], fraction[9:0]};
            end
        end
    end
endmodule
测试代码
`timescale 100 ns / 10 ps
// Stimulus-only testbench for floatAdd. It drives two operand pairs ten
// time units apart and then stops the simulation; results are inspected
// manually (nothing is checked automatically).
module floatAdd_TB ();
    reg  [15:0] floatA;
    reg  [15:0] floatB;
    wire [15:0] sum;

    // Device under test
    floatAdd FADD (
        .sum    (sum),
        .floatB (floatB),
        .floatA (floatA)
    );

    initial begin
        #0  {floatA, floatB} = {16'h34CD, 16'h3266}; // 0.3 + 0.2
        #10 {floatA, floatB} = {16'h34CD, 16'h0000}; // 0.3 + 0
        #10 $stop;
    end
endmodule
这里选择0.3+0.2,与0.3+0,对应二进制可以运算,结果无误
// floatMult: combinational multiplier for 16-bit half-precision style operands.
//
// Operand layout: [15] sign, [14:10] biased exponent (bias 15), [9:0] mantissa.
// Every non-zero operand is treated as normalised (hidden leading 1), and the
// product mantissa is truncated, not rounded.
//
// Ports:
//   floatA, floatB : operands
//   product        : floatA * floatB
//
// Special cases:
//   * if either operand has all-zero magnitude bits (+0 or -0) the product is +0;
//   * a result exponent below 0 is flushed to +0;
//   * a result exponent of 31 or more saturates to {sign, 5'b11111, 10'b0}.
//
// NOTE: this module was originally declared as "floatMuilt" while every
// instantiation in the design uses "floatMult". The definition now carries
// the name the instantiations expect; a thin wrapper with the old spelling
// follows it so any existing reference to the old name keeps working.
module floatMult
(
    input  wire [15:0] floatA,
    input  wire [15:0] floatB,
    output reg  [15:0] product
);
    reg               sign;                 // sign of the product
    reg signed [6:0]  exponent;             // result exponent; 7 signed bits so that
                                            // underflow (<0) and overflow (>31) are distinguishable
    reg        [10:0] fractionA, fractionB; // {hidden 1, mantissa}
    reg        [21:0] fraction;             // 11 x 11 bit product

    always @ (*) begin
        // All intermediates are assigned unconditionally so no latches are inferred.
        sign      = floatA[15] ^ floatB[15];
        fractionA = {1'b1, floatA[9:0]};
        fractionB = {1'b1, floatB[9:0]};
        fraction  = fractionA * fractionB;

        // eA + eB - 15 removes one bias; +2 pre-compensates for the left
        // shift below that discards the hidden bit of the product.
        exponent = floatA[14:10] + floatB[14:10];
        exponent = exponent - 7'sd13;

        // Both factors have bit 10 set, so the product lies in [2^20, 2^22):
        // its leading 1 is at bit 21 or bit 20. Shift it out so that
        // fraction[21:12] holds the 10 mantissa bits.
        if (fraction[21] == 1'b1) begin
            fraction = fraction << 1;
            exponent = exponent - 1;
        end else begin
            fraction = fraction << 2;
            exponent = exponent - 2;
        end

        if ((floatA[14:0] == 15'd0) || (floatB[14:0] == 15'd0)) begin
            // At least one factor is zero.
            product = 16'h0000;
        end else if (exponent[6] == 1'b1) begin
            // Negative exponent: too small to represent, flush to zero.
            product = 16'h0000;
        end else if (exponent >= 7'sd31) begin
            // Too large to represent: saturate to signed infinity.
            product = {sign, 5'b11111, 10'd0};
        end else begin
            product = {sign, exponent[4:0], fraction[21:12]};
        end
    end
endmodule

// Compatibility wrapper keeping the original (misspelled) module name usable.
module floatMuilt
(
    input  wire [15:0] floatA,
    input  wire [15:0] floatB,
    output wire [15:0] product
);
    floatMult impl (
        .floatA  (floatA),
        .floatB  (floatB),
        .product (product)
    );
endmodule
测试代码
`timescale 100 ns / 10 ps
// Stimulus-only testbench for floatMult. It drives two operand pairs ten
// time units apart and then stops the simulation; results are inspected
// manually (nothing is checked automatically).
module floatMult_TB ();
    reg  [15:0] floatA;
    reg  [15:0] floatB;
    wire [15:0] product;

    // Device under test
    floatMult FM (
        .product (product),
        .floatB  (floatB),
        .floatA  (floatA)
    );

    initial begin
        #0  {floatA, floatB} = {16'h4400, 16'h4500}; // 4 * 5
        #10 {floatA, floatB} = {16'h0EC2, 16'h0000}; // 0.0004125 * 0
        #10 $stop;
    end
endmodule
这里采用4*5,与一个很小的数与0相乘,结果无误。
将计算一次得到的结果存储下来,用于后面的卷积,这里不做演示。
`timescale 100 ns / 10 ps
// processingElement: multiply-accumulate cell.
//
// On every rising clock edge the registered `result` is replaced by
// result + floatA * floatB, computed combinationally by floatMult and
// floatAdd. An asynchronous active-high reset clears the accumulator.
//
// Parameters:
//   DATA_WIDTH : width of the operand and result buses. floatMult and
//                floatAdd are instantiated without parameters, so this is
//                expected to stay at 16 -- TODO confirm if ever overridden.
// Ports:
//   clk, reset     : clock and asynchronous active-high reset
//   floatA, floatB : factors for the current cycle
//   result         : running accumulated sum
module processingElement(clk,reset,floatA,floatB,result);
parameter DATA_WIDTH = 16;
input clk, reset;
input [DATA_WIDTH-1:0] floatA, floatB;
output reg [DATA_WIDTH-1:0] result;

wire [DATA_WIDTH-1:0] multResult; // floatA * floatB
wire [DATA_WIDTH-1:0] addResult;  // multResult + result

floatMult FM (floatA,floatB,multResult);
floatAdd FADD (multResult,result,addResult);

// Nonblocking assignments are used for the register so that its update does
// not race with other processes evaluated on the same clock edge.
always @ (posedge clk or posedge reset) begin
    if (reset == 1'b1) begin
        result <= 0;
    end else begin
        result <= addResult;
    end
end
endmodule
最后通过循环完成一整个卷积。
`timescale 100 ns / 10 ps
// convUnit: sequential dot product of one receptive field with one filter.
//
// `image` and `filter` each carry D*F*F elements of DATA_WIDTH bits, element
// 0 in the left-most bits. After reset, one element pair per clock is handed
// to a single processingElement, which multiplies and accumulates them. Once
// all D*F*F pairs have been issued the unit feeds zeros, so the accumulated
// result is held.
//
// Parameters:
//   DATA_WIDTH : element width in bits
//   D          : depth of the filter
//   F          : side length of the (square) filter
// Ports:
//   clk, reset : clock and asynchronous active-high reset
//   image      : flattened receptive field
//   filter     : flattened filter coefficients
//   result     : accumulator output of the processing element
module convUnit(clk,reset,image,filter,result);
parameter DATA_WIDTH = 16;
parameter D = 1; //depth of the filter
parameter F = 5; //size of the filter
input clk, reset;
input [0:D*F*F*DATA_WIDTH-1] image, filter;
output [0:DATA_WIDTH-1] result;

reg [DATA_WIDTH-1:0] selectedInput1, selectedInput2; // element pair currently presented to the PE
integer i;                                           // index of the next element pair to issue

processingElement
#(
    .DATA_WIDTH(DATA_WIDTH)
) PE
(
    .clk(clk),
    .reset(reset),
    .floatA(selectedInput1),
    .floatB(selectedInput2),
    .result(result)
);

// The convolution is calculated in a sequential process to save hardware.
// Original author's note: the element-wise multiplication is finished after
// (F*F+2) cycles (2 cycles to reset the processing element and F*F cycles to
// accumulate the F*F products).
//
// Nonblocking assignments make the processing element sample the operand
// pair issued on the previous edge deterministically, instead of depending
// on the simulator's ordering of the two clocked blocks.
always @ (posedge clk, posedge reset) begin
    if (reset == 1'b1) begin // reset
        i <= 0;
        selectedInput1 <= 0;
        selectedInput2 <= 0;
    end else if (i > D*F*F-1) begin
        // All pairs issued: feed zeros so the accumulator keeps its value
        // while waiting for other blocks to finish.
        selectedInput1 <= 0;
        selectedInput2 <= 0;
    end else begin
        // Issue element i of the receptive field and of the filter.
        selectedInput1 <= image[DATA_WIDTH*i+:DATA_WIDTH];
        selectedInput2 <= filter[DATA_WIDTH*i+:DATA_WIDTH];
        i <= i + 1;
    end
end
endmodule
在进行卷积之前需要将处理的数据提取出来,比如5*5的矩阵,元素按1-25依次排列,过滤器选择2*2,就需要将1,2,6,7提取出来。
`timescale 100 ns / 10 ps
// RFselector: receptive-field extractor (combinational).
//
// Given the flattened input image, an output row number and a half-row
// selector, it fills `receptiveField` with (W-F+1)/2 consecutive D x F x F
// windows of the image, one per conv unit.
//
// Parameters:
//   DATA_WIDTH : element width in bits
//   D          : depth of the filter / image
//   H, W       : height and width of the image
//   F          : side length of the (square) filter
// Ports:
//   image          : flattened image, element 0 in the left-most bits
//   rowNumber      : output row whose windows are extracted
//   column         : 0 selects the windows for the first half of the output
//                    row, any other value selects the second half
//   receptiveField : the extracted windows, stored back to back
module RFselector(image,rowNumber, column,receptiveField);
parameter DATA_WIDTH = 16;
parameter D = 1; //Depth of the filter
parameter H = 32; //Height of the image
parameter W = 32; //Width of the image
parameter F = 5; //Size of the filter
input [0:D*H*W*DATA_WIDTH-1] image;
input [5:0] rowNumber, column;
output reg [0:(((W-F+1)/2)*D*F*F*DATA_WIDTH)-1] receptiveField; //windows to be sent to the conv units

//slot:     index of the next F-element row to write into receptiveField
//win:      window counter within the selected half of the output row
//plane:    depth index into the image
//line:     row index inside the window
//firstCol: image column of the first window in the selected half
integer slot, win, plane, line, firstCol;

always @ (image or rowNumber or column) begin
    slot = 0;
    // First half starts at column 0, second half at column (W-F+1)/2.
    if (column == 0)
        firstCol = 0;
    else
        firstCol = (W-F+1)/2;

    for (win = 0; win < (W-F+1)/2; win = win + 1) begin
        for (plane = 0; plane < D; plane = plane + 1) begin
            for (line = 0; line < F; line = line + 1) begin
                // Copy F adjacent pixels of image row (rowNumber + line),
                // starting at column (firstCol + win), from depth plane `plane`.
                receptiveField[slot*F*DATA_WIDTH+:F*DATA_WIDTH] =
                    image[rowNumber*W*DATA_WIDTH+(firstCol+win)*DATA_WIDTH+plane*H*W*DATA_WIDTH+line*W*DATA_WIDTH+:F*DATA_WIDTH];
                slot = slot + 1;
            end
        end
    end
end
endmodule
接下来就需要选择过滤器的移动来完成一整个卷积层。
`timescale 100 ns / 10 ps
// convLayerSingle: one convolution layer for a single filter.
//
// The layer owns (W-F+1)/2 conv units, i.e. half of one output row. For
// each output row it runs the units twice (first half, then second half of
// the row): RFselector extracts the receptive fields, the units accumulate
// for D*F*F+2 clock cycles, and their results are mapped into `outputConv`.
//
// Parameters:
//   DATA_WIDTH : element width in bits
//   D          : depth of the filter / image
//   H, W       : height and width of the image
//   F          : side length of the (square) filter
// Ports:
//   clk, reset : clock and asynchronous active-high reset
//   image      : flattened input image
//   filter     : flattened filter coefficients
//   outputConv : flattened (H-F+1) x (W-F+1) output feature map
module convLayerSingle(clk,reset,image,filter,outputConv);
parameter DATA_WIDTH = 16;
parameter D = 1; //Depth of the filter
parameter H = 32; //Height of the image
parameter W = 32; //Width of the image
parameter F = 5; //Size of the filter
input clk, reset;
input [0:D*H*W*DATA_WIDTH-1] image;
input [0:D*F*F*DATA_WIDTH-1] filter;
output reg [0:(H-F+1)*(W-F+1)*DATA_WIDTH-1] outputConv; // output of the module

wire [0:((W-F+1)/2)*DATA_WIDTH-1] outputConvUnits; // outputs of the conv units (half an output row)
reg internalReset;                                 // reset applied to all conv units between passes
wire [0:(((W-F+1)/2)*D*F*F*DATA_WIDTH)-1] receptiveField; // windows sent to the conv units

integer counter, outputCounter;
//counter: clock cycles elapsed in the current pass of the conv units
//outputCounter: index of the half-row slice of outputConv currently being written

reg [5:0] rowNumber, column;
//rowNumber: output row currently being calculated
//column: 0 for the first half of the output row, non-zero for the second half

RFselector
#(
    .DATA_WIDTH(DATA_WIDTH),
    .D(D),
    .H(H),
    .W(W),
    .F(F)
) RF
(
    .image(image),
    .rowNumber(rowNumber),
    .column(column),
    .receptiveField(receptiveField)
);

genvar n;
generate
    // One conv unit per pixel of half an output row. The bound uses the
    // output WIDTH so it agrees with the widths of receptiveField and
    // outputConvUnits (it previously used the height, which only matches
    // when H == W).
    for (n = 0; n < (W-F+1)/2; n = n + 1)
    begin: convLayerSingle
        convUnit
        #(
            .DATA_WIDTH(DATA_WIDTH),
            .D(D),
            .F(F)
        ) CU
        (
            .clk(clk),
            .reset(internalReset),
            .image(receptiveField[n*D*F*F*DATA_WIDTH+:D*F*F*DATA_WIDTH]),
            .filter(filter),
            .result(outputConvUnits[n*DATA_WIDTH+:DATA_WIDTH])
        );
    end
endgenerate

// Pass sequencer. Nonblocking assignments keep the values seen by the conv
// units on a given clock edge independent of simulator process ordering.
always @ (posedge clk or posedge reset) begin
    if (reset == 1'b1) begin
        internalReset <= 1'b1;
        rowNumber <= 0;
        column <= 0;
        counter <= 0;
        outputCounter <= 0;
    end else if (rowNumber < H-F+1) begin
        if (counter == D*F*F+2) begin
            // The conv units have finished this pass: move to the next
            // output slice, reset the units and advance to the next half row.
            outputCounter <= outputCounter + 1;
            counter <= 0;
            internalReset <= 1'b1;
            if (column == 0) begin
                column <= (W-F+1)/2; // any non-zero value selects the second half
            end else begin
                rowNumber <= rowNumber + 1;
                column <= 0;
            end
        end else begin
            internalReset <= 0;
            counter <= counter + 1;
        end
    end
end

// Maps the conv unit outputs onto the slice of outputConv selected by
// outputCounter. Only that slice is assigned here, so all other slices keep
// the last value written to them (level-sensitive storage).
always @ (*) begin
    outputConv[outputCounter*((W-F+1)/2)*DATA_WIDTH+:((W-F+1)/2)*DATA_WIDTH] = outputConvUnits;
end
endmodule