Hi everyone. Currently Im designing systolic array using Verilog HDL. Systolic array is consist of an array of Processing Element (PE). The design is synthesise usng Xilinx ISE. Below is my previous design for 2 PEs.

1 | module SystolicArray(Clk,Rst,SubSec,QueSec,Gap,Score); |

2 | parameter ComputeDataWidth = 15; |

3 | parameter PE = 2; //LENGTH |

4 | localparam |

5 | N_A = 3'b000, //nucleotide "A" |

6 | N_C = 3'b001, //nucleotide "C" |

7 | N_G = 3'b010, //nucleotide "G" |

8 | N_T = 3'b011, //nucleotide "T" |

9 | G_P = 3'b100; //gap '-' |

10 | |

11 | input Clk,Rst; |

12 | input [2:0] SubSec,Gap; |

13 | input wire [PE*3-1:0] QueSec; |

14 | output signed [ComputeDataWidth-1:0] Score; |

15 | |

16 | wire [ComputeDataWidth-1:0] d [PE-1:0]; |

17 | wire [ComputeDataWidth-1:0] t [PE-1:0]; |

18 | wire [ComputeDataWidth-1:0] l [PE-1:0]; |

19 | wire [ComputeDataWidth-1:0] dl [PE-1:0]; |

20 | wire [ComputeDataWidth-1:0] score [PE-1:0]; |

21 | wire [PE*3-1:0] SSout,GAPout; |

22 | |

23 | assign Score = score[PE-1] ; |

24 | genvar i; |

25 | generate |

26 | for (i=0; i < PE; i = i + 1) |

27 | begin : pe_block |

28 | if (i == 0) //first processing element in auto-generated chain |

29 | begin:pe |

30 | ProcessingElement pe0 |

31 | ( .Clk (Clk), |

32 | .Rst (Rst), |

33 | .SS (SubSec[2:0]), |

34 | .QC (QueSec[2:0]), |

35 | .Gap (Gap[2:0]), |

36 | .DDiag (15'b0), |

37 | .DTop (15'b0), |

38 | .DLeft (15'b0), |

39 | .LDiag (15'b0), |

40 | .LLeft (15'b0), |

41 | .Zero (15'b0), |

42 | .SCORE_PE_Previous (15'b0), |

43 | .DiagOut (d[i]), |

44 | .LeftOut (l[i]), |

45 | .TopOut (t[i]), |

46 | .SCORE_Left_D (dl[i]), |

47 | .SS_Out (SSout[2:0]), |

48 | .Gap_Out (GAPout[2:0]), |

49 | .Best_So_Far_PE_SCORE (score[i]) |

50 | ); |

51 | end |

52 | else //processing elements other than first one |

53 | begin:pe |

54 | ProcessingElement pe1 |

55 | ( .Clk (Clk), |

56 | .Rst (Rst), |

57 | .SS (SSout[(3*i)-1:(3*i)-3]), |

58 | .QC (QueSec[(3*i)+2:3*i]), |

59 | .Gap (GAPout[(3*i)-1:(3*i)-3]), |

60 | .DDiag (d[i-1]), |

61 | .DTop (t[i-1]), |

62 | .DLeft (dl[i-1]), |

63 | .LDiag (d[i-1]), |

64 | .LLeft (l[i-1]), |

65 | .Zero (15'b0), |

66 | .SCORE_PE_Previous (score[i-1]), |

67 | .DiagOut (d[i]), |

68 | .LeftOut (l[i]), |

69 | .TopOut (t[i]), |

70 | .SCORE_Left_D (dl[i]), |

71 | .SS_Out (SSout[(3*i)+2:3*i]), |

72 | .Gap_Out (GAPout[(3*i)+2:3*i]), |

73 | .Best_So_Far_PE_SCORE (score[i]) |

74 | ); |

75 | end |

76 | end |

77 | endgenerate |

78 | endmodule |

The problem with this design is that the Query input is large (3*no of PE). If i want to generate 100 PEs, it is impossible as it can only fit up to 75 PEs for the available I/O of the choosen devices (shown in the attached image). So if come with another solution shown below.

1 | module SA_Test1(Clk,Rst,SubSec,Gap,Score); |

2 | |

3 | parameter ComputeDataWidth = 8; |

4 | parameter PE = 8; //LENGTH |

5 | |

6 | ```
localparam
``` |

7 | N_A = 3'b000, //nucleotide "A" |

8 | N_C = 3'b001, //nucleotide "C" |

9 | N_G = 3'b010, //nucleotide "G" |

10 | N_T = 3'b011, //nucleotide "T" |

11 | G_P = 3'b100; //gap '-' |

12 | |

13 | input Clk,Rst; |

14 | input [2:0] SubSec,Gap; |

15 | output signed [ComputeDataWidth-1:0] Score; |

16 | |

17 | |

18 | wire [ComputeDataWidth-1:0] d [PE-1:0]; |

19 | wire [ComputeDataWidth-1:0] t [PE-1:0]; |

20 | wire [ComputeDataWidth-1:0] l [PE-1:0]; |

21 | wire [ComputeDataWidth-1:0] score [PE-1:0]; |

22 | wire [PE*3-1:0] SSout,GAPout; |

23 | |

24 | assign Score=score[PE-1]; |

25 | |

26 | genvar i; |

27 | |

28 | ```
generate
``` |

29 | for (i=0; i < PE; i = i + 1) |

30 | begin : pe_block |

31 | if (i == 0) //first processing element in auto-generated chain |

32 | begin:pe |

33 | ProcessingElement pe0 |

34 | ( .Clk (Clk), |

35 | .Rst (Rst), |

36 | .SS (SubSec), |

37 | .QC ({N_T}), |

38 | .Gap (Gap), |

39 | .DDiag (8'b0), |

40 | .DTop (8'b0), |

41 | .DLeft (8'b0), |

42 | .LDiag (8'b0), |

43 | .LLeft (8'b0), |

44 | .Zero (8'b0), |

45 | .SCORE_PE_Previous (8'b0), |

46 | .DiagOut (d[i]), |

47 | .LeftOut (l[i]), |

48 | .TopOut (t[i]), |

49 | .SS_Out (SSout[2:0]), |

50 | .Gap_Out (GAPout[2:0]), |

51 | .Max_PE_Score (score[0]) |

52 | ```
);
``` |

53 | ```
end
``` |

54 | if (i == 1) |

55 | begin:pe |

56 | ProcessingElement pe1 |

57 | ( .Clk (Clk), |

58 | .Rst (Rst), |

59 | .SS (SSout[(3*i)-1:(3*i)-3]), |

60 | .QC ({N_G}), |

61 | .Gap (GAPout[(3*i)-1:(3*i)-3]), |

62 | .DDiag (d[i-1]), |

63 | .DTop (t[i-1]), |

64 | .DLeft (l[i-1]), |

65 | .LDiag (d[i-1]), |

66 | .LLeft (l[i-1]), |

67 | .Zero (8'b0), |

68 | .SCORE_PE_Previous (score[i-1]), |

69 | .DiagOut (d[i]), |

70 | .LeftOut (l[i]), |

71 | .TopOut (t[i]), |

72 | .SS_Out (SSout[(3*i)+2:3*i]), |

73 | .Gap_Out (GAPout[(3*i)+2:3*i]), |

74 | .Max_PE_Score (score[i]) |

75 | ```
);
``` |

76 | ```
end
``` |

77 | if (i == 2) |

78 | begin:pe |

79 | ProcessingElement pe2 |

80 | ( .Clk (Clk), |

81 | .Rst (Rst), |

82 | .SS (SSout[(3*i)-1:(3*i)-3]), |

83 | .QC ({N_C}), |

84 | .Gap (GAPout[(3*i)-1:(3*i)-3]), |

85 | .DDiag (d[i-1]), |

86 | .DTop (t[i-1]), |

87 | .DLeft (l[i-1]), |

88 | .LDiag (d[i-1]), |

89 | .LLeft (l[i-1]), |

90 | .Zero (8'b0), |

91 | .SCORE_PE_Previous (score[i-1]), |

92 | .DiagOut (d[i]), |

93 | .LeftOut (l[i]), |

94 | .TopOut (t[i]), |

95 | .SS_Out (SSout[(3*i)+2:3*i]), |

96 | .Gap_Out (GAPout[(3*i)+2:3*i]), |

97 | .Max_PE_Score (score[i]) |

98 | ```
);
``` |

99 | ```
end
``` |

100 | if (i == 3) |

101 | begin:pe |

102 | ProcessingElement pe3 |

103 | ( .Clk (Clk), |

104 | .Rst (Rst), |

105 | .SS (SSout[(3*i)-1:(3*i)-3]), |

106 | .QC ({N_T}), |

107 | .Gap (GAPout[(3*i)-1:(3*i)-3]), |

108 | .DDiag (d[i-1]), |

109 | .DTop (t[i-1]), |

110 | .DLeft (l[i-1]), |

111 | .LDiag (d[i-1]), |

112 | .LLeft (l[i-1]), |

113 | .Zero (8'b0), |

114 | .SCORE_PE_Previous (score[i-1]), |

115 | .DiagOut (d[i]), |

116 | .LeftOut (l[i]), |

117 | .TopOut (t[i]), |

118 | .SS_Out (SSout[(3*i)+2:3*i]), |

119 | .Gap_Out (GAPout[(3*i)+2:3*i]), |

120 | .Max_PE_Score (score[i]) |

121 | ```
);
``` |

122 | ```
end
``` |

123 | if (i == 4) |

124 | begin:pe |

125 | ProcessingElement pe4 |

126 | ( .Clk (Clk), |

127 | .Rst (Rst), |

128 | .SS (SSout[(3*i)-1:(3*i)-3]), |

129 | .QC ({N_C}), |

130 | .Gap (GAPout[(3*i)-1:(3*i)-3]), |

131 | .DDiag (d[i-1]), |

132 | .DTop (t[i-1]), |

133 | .DLeft (l[i-1]), |

134 | .LDiag (d[i-1]), |

135 | .LLeft (l[i-1]), |

136 | .Zero (8'b0), |

137 | .SCORE_PE_Previous (score[i-1]), |

138 | .DiagOut (d[i]), |

139 | .LeftOut (l[i]), |

140 | .TopOut (t[i]), |

141 | .SS_Out (SSout[(3*i)+2:3*i]), |

142 | .Gap_Out (GAPout[(3*i)+2:3*i]), |

143 | .Max_PE_Score (score[i]) |

144 | ```
);
``` |

145 | ```
end
``` |

146 | if (i == 5) |

147 | begin:pe |

148 | ProcessingElement pe5 |

149 | ( .Clk (Clk), |

150 | .Rst (Rst), |

151 | .SS (SSout[(3*i)-1:(3*i)-3]), |

152 | .QC ({N_G}), |

153 | .Gap (GAPout[(3*i)-1:(3*i)-3]), |

154 | .DDiag (d[i-1]), |

155 | .DTop (t[i-1]), |

156 | .DLeft (l[i-1]), |

157 | .LDiag (d[i-1]), |

158 | .LLeft (l[i-1]), |

159 | .Zero (8'b0), |

160 | .SCORE_PE_Previous (score[i-1]), |

161 | .DiagOut (d[i]), |

162 | .LeftOut (l[i]), |

163 | .TopOut (t[i]), |

164 | .SS_Out (SSout[(3*i)+2:3*i]), |

165 | .Gap_Out (GAPout[(3*i)+2:3*i]), |

166 | .Max_PE_Score (score[i]) |

167 | ```
);
``` |

168 | ```
end
``` |

169 | if (i == 6) |

170 | begin:pe |

171 | ProcessingElement pe6 |

172 | ( .Clk (Clk), |

173 | .Rst (Rst), |

174 | .SS (SSout[(3*i)-1:(3*i)-3]), |

175 | .QC ({N_T}), |

176 | .Gap (GAPout[(3*i)-1:(3*i)-3]), |

177 | .DDiag (d[i-1]), |

178 | .DTop (t[i-1]), |

179 | .DLeft (l[i-1]), |

180 | .LDiag (d[i-1]), |

181 | .LLeft (l[i-1]), |

182 | .Zero (8'b0), |

183 | .SCORE_PE_Previous (score[i-1]), |

184 | .DiagOut (d[i]), |

185 | .LeftOut (l[i]), |

186 | .TopOut (t[i]), |

187 | .SS_Out (SSout[(3*i)+2:3*i]), |

188 | .Gap_Out (GAPout[(3*i)+2:3*i]), |

189 | .Max_PE_Score (score[i]) |

190 | ```
);
``` |

191 | ```
end
``` |

192 | if (i == 7) |

193 | begin:pe |

194 | ProcessingElement pe7 |

195 | ( .Clk (Clk), |

196 | .Rst (Rst), |

197 | .SS (SSout[(3*i)-1:(3*i)-3]), |

198 | .QC ({N_A}), |

199 | .Gap (GAPout[(3*i)-1:(3*i)-3]), |

200 | .DDiag (d[i-1]), |

201 | .DTop (t[i-1]), |

202 | .DLeft (l[i-1]), |

203 | .LDiag (d[i-1]), |

204 | .LLeft (l[i-1]), |

205 | .Zero (8'b0), |

206 | .SCORE_PE_Previous (score[i-1]), |

207 | .DiagOut (d[i]), |

208 | .LeftOut (l[i]), |

209 | .TopOut (t[i]), |

210 | .SS_Out (SSout[(3*i)+2:3*i]), |

211 | .Gap_Out (GAPout[(3*i)+2:3*i]), |

212 | .Max_PE_Score (score[i]) |

213 | ```
);
``` |

214 | ```
end
``` |

215 | ```
end
``` |

216 | ```
endgenerate
``` |

217 | ```
endmodule
``` |

However, I found it hard to copy all this foe 100 PEs and change the DNA character at QC input. Can anyone give me some idea hoe to improve this design?