|
cudaRecursiveGaussianFilter.w
|
1 \documentclass[a4paper,10pt,titlepage]{cweb}
2
3 \usepackage{amsmath,amssymb}
4 \usepackage{graphicx}
5 \usepackage{anysize}
6
7
8 \marginsize{3cm}{3cm}{3cm}{3cm}
9 \newcommand{\im}{\textfrak{i}}
10 \newcommand{\imag}{\im}
11 \newcommand{\absolut}[1]{{\ensuremath \mid \! #1\! \mid}}
12 \newcommand{\mat}[1]{\ensuremath\mathbf{#1}}
13
14 \graphicspath{{img/}}
15 \setkeys{Gin}{width=0.3\textwidth}
16
17 \def\CEE/{C\spacefactor 1000 }
18 \def\cweb{{\tt CWEB\/}}
19 \def\UNIX/{{\small UNIX\/}}
20 \def\WEB{{\tt WEB\/}}
21 \def\ml{{\textit{MathLink}}}
22 \def\mma{{\textit{Mathematica}}}
23 \def\cuda{{\sc Cuda}}
24 \def\Jxy{J_{xy}}
25 \def\nx{{n_x}}
26 \def\ny{{n_y}}
27 \def\mx{{m_x}}
28 \def\my{{m_y}}
29
30 @s alpha TeX
31
32 \begin{document}
33 \thispagestyle{empty}
34 \title{{\it Recursive Gaussian Filter for Mathematica}\\using CUDA}
35 \author{Patrick Scheibe}
36 \date{December 2008}
37 \maketitle
38 \cleardoublepage
39 \pagestyle{plain}
40
41 \maketitle
42 \tableofcontents
43
44 @* Introduction.
45
46
47 This implementation bases on~\cite{young1995rig}. It demonstrates how to combine \mma\ and
48 \cuda\ and it is written in \cweb. Three parts are important
49 \begin{itemize}
50 \item I need a \ml -template to call the C-function from \mma. This connects parameters and
51 function names of the corresponding parts in C and in \mma.
52 \item A C-function is required which allocates stores for arrays, does some simple precomputations
53 of the gaussian parameters
54 and calls then the \cuda -kernel. After the computation is finished the results are send to \mma.
55 I will call this the {\em host-code} since all this does not run on the gpu.
56 \item The most important part is the \cuda -kernel which define what is done on the gpu in
57 parallel. This part is called the {\em device-code}.
58 \end{itemize}
59
60 @d NUM_OF_THREADS 32
61
62 @c
63 @<Headers@>@;
64 @<CUDA Kernel for the Gaussian Filter@>@;
65 @<Mathematica MathLink Template for the Gaussian Filter@>@;
66 @<Gaussian Filter@>@;
67 @<Main Loop@>@;
68
69 @ The includes.
70 @<Headers@>=
71 #include <mathlink.h>
72 #include <stdio.h>
73 #include <stdlib.h>
74 #include <string.h>
75 #include <cutil.h>
76 #include <math.h>
77
78
79 @ The main-loop for the {\ml}-program. Note that this is only for the linux OS (maybe OSX too).
80 @<Main Loop@>=
81 int main(int argc, char *argv[]) {
82 CUT_DEVICE_INIT(argc, argv);
83 return MLMain(argc,argv);
84 }
85
86
87
88 @* {\ml}-template code.
89 The Mathematica Template connects the C-function, |cudaGaussianFilter(...)|, and
90 the Mathematica
91 function, |CudaGaussianFilter[...]|, which is a lowlevel function since it gets not an image but
92 a list of the pixel-values and the dimensions. Later I will wrap this lowlevel function with
93 a highlevel version which is called with an image. In the line starting with {\em Pattern} you can
94 see that
95 I get the graylevel-bitmap |bm| as one-dimensional list from {\mma}. Since I only test whether
96 |bm| is a list and not whether it is one-dimensional, I flatten it in the |Arguments| line.
97 Furthermore, I ensure that the sigma is evaluated to a numerical value.
98 @<Mathematica MathLink...@>=
99 @=:Begin:@>@#
100 @=:Function:cudaGaussianFilter@>@#
101 @=:Pattern:CudaGaussianFilter[bm_List,nx_Integer,ny_Integer,sigma_?NumericQ]@>@#
102 @=:Arguments:{Flatten[bm],nx,ny,N[sigma]}@>@#
103 @=:ArgumentTypes:{RealList,Integer,Integer,Real}@>@#
104 @=:ReturnType:Manual@>@#
105 @=:End:@>
106
107
108 @* The Host-Code.
109 Before calling the \cuda -kernel I have to allocate and initialise the arrays. In this step
110 I transfer the image into the memory of the graphics-card. After that the whole filter consists
111 of only 4 sequential steps where every step runs in parallel on the gpu:
112 \begin{enumerate}
113 \item All rows are processed separate with the method described in~\cite{young1995rig}. This
114 consists basically of a forward run followed by a backward run through the row.
115 \item Now all columns need to be processed. Therefore, the bitmap is transposed and
116 \item the first step is called again, working now on the columns (Transposing a matrix means
117 first row becomes first column, second row becomes second column and so on).
118 \item A final transpose rotates the bitmap to its original form.
119 \end{enumerate}
120 I end with returning the result to {\mma} and freeing all allocated memory.
121 @<Gaussian Filter@>=
122 void cudaGaussianFilter(double *h_bm, long n, int nx, int ny, double sigma){
123 double *d_bm,
124 *d_bm_transposed,
125 *h_res; /* The array where the result is stored */
126 double q; /* An adapted version of the sigma */
127 size_t stride,stride_tr; /* The stride I have to use instead of ny */
128 int blocksize, /* How many thread per block */
129 gridsize; /* How many blocks in all */
130
131 cudaError_t err;
132
133 blocksize = NUM_OF_THREADS;
134 gridsize = (ny%blocksize==0)?ny/blocksize:ny/blocksize+1; /* make enough blocks if
135 ny is not divisible by the choosen |blocksize| which is usually 32*/
136
137 @<Calculate the parameter for the gaussian filter@>@;
138 CUDA_SAFE_CALL(cudaMallocPitch((void **) &d_bm,&stride,nx*sizeof(double),ny) );
139 CUDA_SAFE_CALL(cudaMallocPitch((void **) &d_bm_transposed,&stride_tr,ny*sizeof(double),nx));
140 CUDA_SAFE_CALL(
141 cudaMemcpy2D((void*)d_bm,stride,(void*)h_bm, nx*sizeof(double),
142 nx*sizeof(double),ny,cudaMemcpyHostToDevice)
143 );
144 h_res = (double *) calloc(sizeof(double), nx*ny);
145
146 cudaGaussKernel<<<gridsize,blocksize>>>(d_bm,stride,nx,ny,b0,b1,b2,b3,b4);
147 cudaTransposeKernel<<<gridsize,blocksize>>>(d_bm,stride,nx,ny,d_bm_transposed,stride_tr);
148 cudaGaussKernel<<<gridsize,blocksize>>>(d_bm_transposed,stride_tr,
149 ny,nx,b0,b1,b2,b3,b4);
150 cudaTransposeKernel<<<gridsize,blocksize>>>(d_bm_transposed,stride_tr,ny,nx,d_bm,stride);
151
152 err = cudaGetLastError();
153 if( cudaSuccess != err) {
154 fprintf(stderr, "Cuda error: %s.\n", cudaGetErrorString( err) );
155 exit(EXIT_FAILURE);
156 }
157
158 CUDA_SAFE_CALL(
159 cudaMemcpy2D((void*) h_res, nx*sizeof(double),(void*) d_bm, stride,
160 nx*sizeof(double), ny, cudaMemcpyDeviceToHost)
161 );
162
163 MLPutRealList(stdlink, h_res, nx*ny);
164 cudaFree(d_bm);
165 cudaFree(d_bm_transposed);
166 free(h_res);
167 }
168
169
170 @* The Device Code.
171 This part consists of two {\em Kernels} (functions running on the gpu), one for the transposition
172 of a matrix and one for the gaussian filter. Note that a kernel is (here) a chunk of code which
173 processes exactly one row (column) of the bitmap. {\cuda} will run many instances of the kernel
174 in parallel. You may ask how {\cuda} know which kernel should process which row. This is done
175 through the |blockIdx|, |threadIdx| and |blockDim| variables which "enumerate" every
176 gpu-processor uniquely (this is very unprecise in terms of {\cuda}s Processor, Thread, Block
177 and Grid vocabulary).
178
179 @ I start with the calculation of the |q| of (8c). Further details can be found in the section
180 {\em Choosing q} in section 4 of~\cite{young1995rig}.
181 @<Calculate the parameter for the gaussian filter@>=
182 if(sigma < 0.5){ /* The quotient q would be 0, so we can stop. */
183 MLPutSymbol(stdlink,"$Failed");
184 return;
185 }else if(sigma >= 0.5 && sigma <= 2.5){
186 q=3.97156 - 4.14554*sqrt(1.0-0.26891*sigma);
187 } else if(sigma > 2.5){
188 q=-0.9633 + 0.98711*sigma;
189 } else q=sigma;
190
191 double b0 = 1.57825 + q*(2.44413 + (1.4281 + 0.422205*q)*q);
192 double b1 = q*(2.44423 + q*(2.85619 + 1.26661*q));
193 double b2 = (-1.4281 - 1.26661*q)*q*q;
194 double b3 = 0.422205*q*q*q;
195 double b4 = 1.0-(b1 + b2 + b3)/b0;
196
197
198
199 @ The algorithm for one row is straight forward. Since the formula for a pixel needs the
200 (already calculated) last three pixel values, the boundary must be handled separately. I
201 assumed missing pixel to be of the same value like the boundary pixel and I process these
202 three by hand.
203 @<CUDA Kernel ...@>=
204 __global__ void cudaGaussKernel(
205 double *d_bm,
206 size_t stride,
207 int nx,
208 int ny,
209 double b0, double b1, double b2, double b3, double B){
210 int pos = blockIdx.x * blockDim.x + threadIdx.x;
211 if(pos >= ny || pos < 0) return;
212
213 double pV; /* The value which is used for padding */
214
215 /* Forward iteration. Calculating the boundary-elements by hand. */
216 double* in = (double*) ((char*)d_bm+pos*stride);
217 pV = *in;
218 in[0] = B*pV+(b1*pV+b2*pV+b3*pV)/b0;
219 in[1] = B*in[1]+(b1*in[0]+b2*pV+b3*pV)/b0;
220 in[2] = B*in[2]+(b1*in[1]+b2*in[0]+b3*pV)/b0;
221 for(int i=3; i<nx; ++i)
222 in[i] = B*in[i]+(b1*in[i-1]+b2*in[i-2]+b3*in[i-3])/b0;
223
224 /* Backward iteration. Calculating the boundary-elements by hand. */
225 int r = nx-1;
226 pV = in[r];
227 in[r] = B*pV+(b1*pV+b2*pV+b3*pV)/b0;
228 in[r-1] = B*in[r-1]+(b1*in[r]+b2*pV+b3*pV)/b0;
229 in[r-2] = B*in[r-2]+(b1*in[r-1]+b2*in[r]+b3*pV)/b0;
230
231 for(int i=r-3; i>=0; --i)
232 in[i] = B*in[i]+(b1*in[i+1]+b2*in[i+2]+b3*in[i+3])/b0;
233
234 }
235
236
237 @ The last part is the transposition of the bitmap. Here I just allocated another array and every
238 row is copied into the appropriate column. If you don't know why the |strides| are needed,
239 check the documentation of |cudaMallocPitch|.
240 @<CUDA Kernel ...@>+=
241 __global__ void cudaTransposeKernel(double *in, size_t stride1, int nx, int ny,
242 double *out, size_t stride2){
243 int row = blockIdx.x * blockDim.x + threadIdx.x;
244 if(row >= ny) return;
245
246 double* r = (double*) ((char*)in + stride1*row);
247 for(int i=0; i<nx; ++i){
248 double *outRow = (double*) ((char*)out + i*stride2);
249 outRow[row] = r[i];
250 }
251
252
253 }
254
255 @* Appendix.
256 @ The compilation of the {\cweb} file into an executable constists of several steps. First you need
257 to {\em tangle} you .w file
258 \begin{verbatim}
259 ctangle -bh cudaRecursiveGaussianFilter.w - cudaRecursiveGaussianFilter.tm
260 \end{verbatim}
261 This step produces a {\mma}-template file which needs to be processed by |mprep|
262 \begin{verbatim}
263 mprep -o cudaRecursiveGaussianFilter.cu cudaRecursiveGaussianFilter.tm
264 \end{verbatim}
265 |mprep| can be found in the {\mma}-install path under (on my 64 bit Linux box)\\
266 {\tt SystemFiles/Links/MathLink/DeveloperKit/Linux-x86-64/CompilerAdditions}.
267 This step is followed by the call of the {\cuda}-compiler. I added several options
268 \begin{itemize}
269 \item {\tt -arch compute\_13} and {\tt -gpu-code sm\_13}
270 forces {\cuda} to take the version 1.3. In older versions and on other
271 graphic cards the |double| data type
272 is not supported. Therefore, this option seems essential
273 \item {\tt --optimize 3} is for optimized code.
274 \item {\tt --machine 64} to build a 64-bit program.
275 \end{itemize}
276 The {\cuda}-compiler is called by
277 \begin{verbatim}
278 nvcc \
279 -arch compute_13 \
280 --optimize 3 \
281 --machine 64 \
282 --gpu-code sm_13 \
283 -o fastgauss.exe \
284 -I/usr/local/Wolfram/Mathematica/6.0/SystemFiles/\
285 Links/MathLink/DeveloperKit/Linux-x86-64/CompilerAdditions \
286 -L/usr/local/Wolfram/Mathematica/6.0/SystemFiles/Links/\
287 MathLink/DeveloperKit/Linux-x86-64/CompilerAdditions \
288 -lm \
289 -lpthread \
290 -lML64i3 \
291 -I ~/NVIDIA_CUDA_SDK/common/inc/ \
292 -L ~/NVIDIA_CUDA_SDK/lib \
293 -lcutil \
294 cudaRecursiveGaussianFilter.cu
295 \end{verbatim}
296
297
298 @
299 \bibliographystyle{alpha}
300 \bibliography{literature}
301
302
303
304 \end{document}
305
306