forked from Shikunliu/2D_Convolution_Vivado_HLS
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcore.cpp
213 lines (191 loc) · 6.73 KB
/
core.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
#include "core.h"
/**
 * Streaming image-processing core (HLS top-level function).
 *
 * The loop runs IMG_WIDTH_OR_COLS * IMG_HEIGHT_OR_ROWS times. On every iteration it:
 *   1. reads one beat from inStream and one beat from inStream2;
 *   2. pushes the inStream pixel into a KERNEL_DIM-row line buffer;
 *   3. fills a KERNEL_DIM x KERNEL_DIM window with (kernel coefficient * buffered pixel);
 *   4. reduces the window according to `operation` (only once enough rows and
 *      columns have been seen, otherwise the intermediate result is 0);
 *   5. adds the inStream2 pixel and writes one beat to outStream.
 *
 * Exactly one output beat is produced per input beat. There is no compensation
 * for the line-buffer latency: the window used for the beat at column idxCol
 * covers columns idxCol-(KERNEL_DIM-1) .. idxCol.
 *
 * @param inStream   Pixel input (axis). Its data feeds the line buffer; its
 *                   keep/strb/user/last/id/dest fields are copied to the output beat.
 * @param inStream2  Second pixel input (axis). Only its data field is used; it is
 *                   added to the processed value of the same iteration.
 * @param outStream  Result output (axis), one beat per loop iteration.
 * @param kernel     KERNEL_DIM*KERNEL_DIM coefficients, indexed row*KERNEL_DIM + col
 *                   (s_axilite, bundle KERNEL_BUS).
 * @param operation  0 = sum of the window divided by 8, negative results set to 0;
 *                   1 = minWindow (erode); 2 = maxWindow (dilate);
 *                   any other value leaves the intermediate result at 0
 *                   (s_axilite, bundle CRTL_BUS).
 */
void doImgproc(hls::stream<uint_8_side_channel> &inStream, hls::stream<uint_8_side_channel> &inStream2, hls::stream<int_8_side_channel> &outStream, char kernel[KERNEL_DIM*KERNEL_DIM],int operation)
{
// NOTE: the bundle names (including the "CRTL_BUS" spelling) are part of the
// generated register interface, so they are kept exactly as they were.
#pragma HLS INTERFACE s_axilite port=kernel bundle=KERNEL_BUS
#pragma HLS INTERFACE s_axilite port=operation bundle=CRTL_BUS
#pragma HLS INTERFACE s_axilite port=return bundle=CRTL_BUS
#pragma HLS INTERFACE axis port=inStream
#pragma HLS INTERFACE axis port=inStream2
#pragma HLS INTERFACE axis port=outStream
    // Line buffer: KERNEL_DIM rows, each IMG_WIDTH_OR_COLS pixels wide.
    hls::LineBuffer<KERNEL_DIM, IMG_WIDTH_OR_COLS, unsigned char> lineBuff;
    // Window holding the kernel-weighted samples of the current neighbourhood.
    hls::Window<KERNEL_DIM,KERNEL_DIM,short> window;

    // Position of the pixel currently being consumed.
    int idxCol = 0;
    int idxRow = 0;
    // Number of windows already evaluated on the current row; used as the
    // left-most line-buffer column of the next window.
    int pixConvolved = 0;

    int_8_side_channel dataOutSideChannel;
    uint_8_side_channel currPixelSideChannel;
    uint_8_side_channel currPixelSideChannel2;

    // Sweep every pixel of the frame.
    for(int idxPixel = 0; idxPixel < (IMG_WIDTH_OR_COLS*IMG_HEIGHT_OR_ROWS); idxPixel++)
    {
#pragma HLS PIPELINE
        // Read one beat from each input (primary first, then secondary).
        currPixelSideChannel = inStream.read();
        unsigned char pixelIn = currPixelSideChannel.data;
        currPixelSideChannel2 = inStream2.read();
        unsigned char pixelIn2 = currPixelSideChannel2.data;

        // Store the new pixel in the current column of the line buffer.
        lineBuff.shift_up(idxCol);
        lineBuff.insert_top(pixelIn,idxCol);

        // Fill the window with kernel[row][col] * buffered pixel. The column
        // offset pixConvolved slides the window along the row.
        for(int idxWinRow = 0; idxWinRow < KERNEL_DIM; idxWinRow++)
        {
            for (int idxWinCol = 0; idxWinCol < KERNEL_DIM; idxWinCol++)
            {
                short val = static_cast<short>(lineBuff.getval(idxWinRow, idxWinCol+pixConvolved));
                // The product is computed as int and narrowed back to short on assignment.
                val = static_cast<short>(kernel[(idxWinRow*KERNEL_DIM) + idxWinCol])*val;
                window.insert(val, idxWinRow, idxWinCol);
            }
        }

        // Only reduce the window once KERNEL_DIM-1 full rows and KERNEL_DIM-1
        // columns of the current row precede this pixel; otherwise stay at 0.
        short valOutput = 0;
        if ((idxRow >= KERNEL_DIM-1) && (idxCol >= KERNEL_DIM-1))
        {
            switch (operation)
            {
                case 0:
                {
                    // Convolution: sum of the weighted samples, scaled by 1/8.
                    valOutput = sumWindow(&window);
                    valOutput = valOutput / 8;
                    // Negative results are replaced by 0.
                    if (valOutput < 0)
                        valOutput = 0;
                    break;
                }
                case 1:
                {
                    // Erode
                    valOutput = minWindow(&window);
                    break;
                }
                case 2:
                {
                    // Dilate
                    valOutput = maxWindow(&window);
                    break;
                }
                default:
                {
                    // Unknown operation: keep the intermediate result at 0.
                    break;
                }
            }
            pixConvolved++;
        }

        // Advance the row/column position.
        if (idxCol < IMG_WIDTH_OR_COLS-1)
        {
            // Next pixel on the same line.
            idxCol++;
        }
        else
        {
            // Start of a new line: the window slides from the left again.
            idxCol = 0;
            idxRow++;
            pixConvolved = 0;
        }

        // Combine with the pixel from the second input stream.
        valOutput = valOutput + static_cast<short>(pixelIn2);

        // Build the output beat; all side-channel fields (including last) are
        // forwarded from the primary input beat of this iteration.
        dataOutSideChannel.data = valOutput;
        dataOutSideChannel.keep = currPixelSideChannel.keep;
        dataOutSideChannel.strb = currPixelSideChannel.strb;
        dataOutSideChannel.user = currPixelSideChannel.user;
        dataOutSideChannel.last = currPixelSideChannel.last;
        dataOutSideChannel.id = currPixelSideChannel.id;
        dataOutSideChannel.dest = currPixelSideChannel.dest;

        // Write the beat to the output stream.
        outStream.write(dataOutSideChannel);
    }
}
/**
 * Erode helper: returns the smallest entry of the window.
 *
 * Every entry is narrowed to unsigned char before it is compared, and the
 * search starts from 255, so the result is always in 0..255.
 * NOTE(review): entries outside 0..255 wrap when narrowed - confirm callers
 * only store values in that range.
 *
 * @param window  KERNEL_DIM x KERNEL_DIM window to scan.
 * @return        Minimum of the narrowed entries.
 */
short minWindow(hls::Window<KERNEL_DIM, KERNEL_DIM, short>*window)
{
    unsigned char smallest = 255;
    for (int r = 0; r < KERNEL_DIM; ++r)
    {
        for (int c = 0; c < KERNEL_DIM; ++c)
        {
            const unsigned char candidate = static_cast<unsigned char>(window->getval(r, c));
            // Keep the lower of the running minimum and the current entry.
            smallest = (candidate < smallest) ? candidate : smallest;
        }
    }
    return smallest;
}
/**
 * Dilate helper: returns the largest entry of the window.
 *
 * Every entry is narrowed to unsigned char before it is compared, and the
 * search starts from 0, so the result is always in 0..255.
 * NOTE(review): the original comment says the kernel is expected to be
 * rectangular and filled with ones; entries outside 0..255 would wrap when
 * narrowed - confirm against the caller.
 *
 * @param window  KERNEL_DIM x KERNEL_DIM window to scan.
 * @return        Maximum of the narrowed entries.
 */
short maxWindow(hls::Window<KERNEL_DIM, KERNEL_DIM, short>*window)
{
    unsigned char largest = 0;
    for (int r = 0; r < KERNEL_DIM; ++r)
    {
        for (int c = 0; c < KERNEL_DIM; ++c)
        {
            const unsigned char candidate = static_cast<unsigned char>(window->getval(r, c));
            // Keep the higher of the running maximum and the current entry.
            largest = (candidate > largest) ? candidate : largest;
        }
    }
    return largest;
}
/**
 * Adds up every entry of the window.
 *
 * The caller stores kernel-weighted samples in the window, so this is the
 * accumulation step of the convolution. The running total is kept in a short;
 * each addition is evaluated as int and narrowed back to short.
 *
 * @param window  KERNEL_DIM x KERNEL_DIM window to accumulate.
 * @return        Sum of all entries, as a short.
 */
short sumWindow(hls::Window<KERNEL_DIM, KERNEL_DIM,short>*window)
{
    short total = 0;
    for (int r = 0; r < KERNEL_DIM; ++r)
    {
        for (int c = 0; c < KERNEL_DIM; ++c)
        {
            total += static_cast<short>(window->getval(r, c));
        }
    }
    return total;
}