cudaRecursiveGaussianFilter.w

1    \documentclass[a4paper,10pt,titlepage]{cweb} 
2     
3    \usepackage{amsmath,amssymb} 
4    \usepackage{graphicx} 
5    \usepackage{anysize} 
6     
7     
8    \marginsize{3cm}{3cm}{3cm}{3cm} 
9    \newcommand{\im}{\textfrak{i}} 
10   \newcommand{\imag}{\im} 
11   \newcommand{\absolut}[1]{{\ensuremath \mid \! #1\! \mid}} 
12   \newcommand{\mat}[1]{\ensuremath\mathbf{#1}} 
13    
14   \graphicspath{{img/}} 
15   \setkeys{Gin}{width=0.3\textwidth} 
16                             
17   \def\CEE/{C\spacefactor 1000 } 
18   \def\cweb{{\tt CWEB\/}} 
19   \def\UNIX/{{\small UNIX\/}} 
20   \def\WEB{{\tt WEB\/}} 
21   \def\ml{{\textit{MathLink}}} 
22   \def\mma{{\textit{Mathematica}}} 
23   \def\cuda{{\sc Cuda}} 
24   \def\Jxy{J_{xy}} 
25   \def\nx{{n_x}} 
26   \def\ny{{n_y}} 
27   \def\mx{{m_x}} 
28   \def\my{{m_y}} 
29           
30   @s alpha TeX 
31    
32   \begin{document} 
33   \thispagestyle{empty} 
34   \title{{\it Recursive Gaussian Filter for Mathematica}\\using CUDA} 
35   \author{Patrick Scheibe} 
36   \date{December 2008} 
37   \maketitle 
38   \cleardoublepage 
39   \pagestyle{plain} 
40    
41   \maketitle 
42   \tableofcontents 
43    
44   @* Introduction. 
45    
46    
47   This implementation bases on~\cite{young1995rig}. It demonstrates how to combine \mma\ and 
48   \cuda\ and it is written in \cweb. Three parts are important 
49   \begin{itemize} 
50   \item I need a \ml -template to call the C-function from \mma. This connects parameters and 
51   function names of the corresponding parts in C and in \mma. 
52   \item A C-function is required which allocates stores for arrays, does some simple precomputations 
53   of the gaussian parameters 
54   and calls then the \cuda -kernel. After the computation is finished the results are send to \mma. 
55   I will call this the {\em host-code} since all this does not run on the gpu. 
56   \item The most important part is the \cuda -kernel which define what is done on the gpu in 
57   parallel. This part is called the {\em device-code}. 
58   \end{itemize} 
59    
60   @d NUM_OF_THREADS 32 
61    
62   @c 
63   @<Headers@>@; 
64   @<CUDA Kernel for the Gaussian Filter@>@; 
65   @<Mathematica MathLink Template for the Gaussian Filter@>@; 
66   @<Gaussian Filter@>@; 
67   @<Main Loop@>@; 
68    
69   @ The includes. 
70   @<Headers@>= 
71   #include <mathlink.h> 
72   #include <stdio.h> 
73   #include <stdlib.h> 
74   #include <string.h> 
75   #include <cutil.h> 
76   #include <math.h> 
77    
78    
79   @ The main-loop for the {\ml}-program. Note that this is only for the linux OS (maybe OSX too). 
80   @<Main Loop@>= 
81   int main(int argc, char *argv[]) { 
82     CUT_DEVICE_INIT(argc, argv); 
83     return MLMain(argc,argv); 
84   } 
85    
86    
87    
88   @* {\ml}-template code. 
89   The Mathematica Template connects the C-function, |cudaGaussianFilter(...)|, and 
90   the Mathematica 
91   function, |CudaGaussianFilter[...]|, which is a lowlevel function since it gets not an image but 
92   a list of the pixel-values and the dimensions. Later I will wrap this lowlevel function with 
93   a highlevel version which is called with an image. In the line starting with {\em Pattern} you can 
94   see that 
95   I get the graylevel-bitmap |bm| as one-dimensional list from {\mma}. Since I only test whether 
96   |bm| is a list and not whether it is one-dimensional, I flatten it in the |Arguments| line. 
97   Furthermore, I ensure that the sigma is evaluated to a numerical value. 
98   @<Mathematica MathLink...@>= 
99   @=:Begin:@>@# 
100  @=:Function:cudaGaussianFilter@>@# 
101  @=:Pattern:CudaGaussianFilter[bm_List,nx_Integer,ny_Integer,sigma_?NumericQ]@>@# 
102  @=:Arguments:{Flatten[bm],nx,ny,N[sigma]}@>@# 
103  @=:ArgumentTypes:{RealList,Integer,Integer,Real}@>@# 
104  @=:ReturnType:Manual@>@# 
105  @=:End:@> 
106   
107   
108  @* The Host-Code. 
109  Before calling the \cuda -kernel I have to allocate and initialise the arrays. In this step 
110  I transfer the image into the memory of the graphics-card. After that the whole filter consists 
111  of only 4 sequential steps where every step runs in parallel on the gpu: 
112  \begin{enumerate} 
113  \item All rows are processed separate with the method described in~\cite{young1995rig}. This 
114  consists basically of a forward run followed by a backward run through the row. 
115  \item Now all columns need to be processed. Therefore, the bitmap is transposed and 
116  \item the first step is called again, working now on the columns (Transposing a matrix means 
117  first row becomes first column, second row becomes second column and so on).  
118  \item A final transpose rotates the bitmap to its original form. 
119  \end{enumerate} 
120  I end with returning the result to {\mma} and freeing all allocated memory. 
121  @<Gaussian Filter@>= 
122  void cudaGaussianFilter(double *h_bm, long n, int nx, int ny, double sigma){ 
123    double *d_bm, 
124           *d_bm_transposed, 
125           *h_res; /* The array where the result is stored */ 
126    double q;      /* An adapted version of the sigma */ 
127    size_t stride,stride_tr; /* The stride I have to use instead of ny */ 
128    int blocksize, /* How many thread per block */ 
129      gridsize;    /* How many blocks in all */ 
130   
131    cudaError_t err; 
132   
133    blocksize = NUM_OF_THREADS; 
134    gridsize = (ny%blocksize==0)?ny/blocksize:ny/blocksize+1; /* make enough blocks if 
135    ny is not divisible by the choosen |blocksize| which is usually 32*/ 
136   
137    @<Calculate the parameter for the gaussian filter@>@; 
138    CUDA_SAFE_CALL(cudaMallocPitch((void **) &d_bm,&stride,nx*sizeof(double),ny)  ); 
139    CUDA_SAFE_CALL(cudaMallocPitch((void **) &d_bm_transposed,&stride_tr,ny*sizeof(double),nx)); 
140    CUDA_SAFE_CALL( 
141      cudaMemcpy2D((void*)d_bm,stride,(void*)h_bm, nx*sizeof(double), 
142                   nx*sizeof(double),ny,cudaMemcpyHostToDevice) 
143    ); 
144    h_res = (double *) calloc(sizeof(double), nx*ny); 
145   
146    cudaGaussKernel<<<gridsize,blocksize>>>(d_bm,stride,nx,ny,b0,b1,b2,b3,b4); 
147    cudaTransposeKernel<<<gridsize,blocksize>>>(d_bm,stride,nx,ny,d_bm_transposed,stride_tr); 
148    cudaGaussKernel<<<gridsize,blocksize>>>(d_bm_transposed,stride_tr, 
149                                              ny,nx,b0,b1,b2,b3,b4); 
150    cudaTransposeKernel<<<gridsize,blocksize>>>(d_bm_transposed,stride_tr,ny,nx,d_bm,stride); 
151   
152    err = cudaGetLastError(); 
153    if( cudaSuccess !=  err) { 
154      fprintf(stderr, "Cuda error: %s.\n", cudaGetErrorString( err) ); 
155      exit(EXIT_FAILURE); 
156      } 
157    
158    CUDA_SAFE_CALL( 
159      cudaMemcpy2D((void*) h_res, nx*sizeof(double),(void*) d_bm, stride, 
160      nx*sizeof(double), ny, cudaMemcpyDeviceToHost) 
161    ); 
162   
163    MLPutRealList(stdlink, h_res, nx*ny); 
164    cudaFree(d_bm); 
165    cudaFree(d_bm_transposed); 
166    free(h_res); 
167  } 
168   
169   
170  @* The Device Code. 
171  This part consists of two {\em Kernels} (functions running on the gpu), one for the transposition 
172  of a matrix and one for the gaussian filter. Note that a kernel is (here) a chunk of code which 
173  processes exactly one row (column) of the bitmap. {\cuda} will run many instances of the kernel 
174  in parallel. You may ask how {\cuda} know which kernel should process which row. This is done 
175  through the |blockIdx|, |threadIdx| and |blockDim| variables which "enumerate" every 
176  gpu-processor uniquely (this is very unprecise in terms of {\cuda}s Processor, Thread, Block 
177  and Grid vocabulary). 
178   
179  @ I start with the calculation of the |q| of (8c). Further details can be found in the section 
180  {\em Choosing q} in section 4 of~\cite{young1995rig}. 
181  @<Calculate the parameter for the gaussian filter@>= 
182    if(sigma < 0.5){ /* The quotient q would be 0, so we can stop. */ 
183      MLPutSymbol(stdlink,"$Failed"); 
184      return; 
185    }else if(sigma >= 0.5 && sigma <= 2.5){ 
186      q=3.97156 - 4.14554*sqrt(1.0-0.26891*sigma); 
187    } else if(sigma > 2.5){ 
188      q=-0.9633 + 0.98711*sigma; 
189    } else q=sigma; 
190   
191    double b0 = 1.57825 + q*(2.44413 + (1.4281 + 0.422205*q)*q); 
192    double b1 = q*(2.44423 + q*(2.85619 + 1.26661*q)); 
193    double b2 = (-1.4281 - 1.26661*q)*q*q; 
194    double b3 = 0.422205*q*q*q; 
195    double b4 = 1.0-(b1 + b2 + b3)/b0; 
196   
197   
198   
199  @ The algorithm for one row is straight forward. Since the formula for a pixel needs the 
200  (already calculated) last three pixel values, the boundary must be handled separately. I 
201  assumed missing pixel to be of the same value like the boundary pixel and I process these 
202  three by hand. 
203  @<CUDA Kernel ...@>= 
204  __global__ void cudaGaussKernel( 
205    double *d_bm, 
206    size_t stride, 
207    int nx, 
208    int ny, 
209    double b0, double b1, double b2, double b3, double B){ 
210      int pos = blockIdx.x * blockDim.x + threadIdx.x; 
211      if(pos >= ny || pos < 0) return; 
212   
213      double pV; /* The value which is used for padding */ 
214   
215      /* Forward iteration. Calculating the boundary-elements by hand. */ 
216      double* in = (double*) ((char*)d_bm+pos*stride); 
217      pV = *in; 
218      in[0] = B*pV+(b1*pV+b2*pV+b3*pV)/b0; 
219      in[1] = B*in[1]+(b1*in[0]+b2*pV+b3*pV)/b0; 
220      in[2] = B*in[2]+(b1*in[1]+b2*in[0]+b3*pV)/b0; 
221      for(int i=3; i<nx; ++i) 
222       in[i] = B*in[i]+(b1*in[i-1]+b2*in[i-2]+b3*in[i-3])/b0; 
223   
224      /* Backward iteration. Calculating the boundary-elements by hand. */ 
225      int  r = nx-1; 
226      pV = in[r]; 
227      in[r] = B*pV+(b1*pV+b2*pV+b3*pV)/b0; 
228      in[r-1] = B*in[r-1]+(b1*in[r]+b2*pV+b3*pV)/b0; 
229      in[r-2] = B*in[r-2]+(b1*in[r-1]+b2*in[r]+b3*pV)/b0; 
230   
231     for(int i=r-3; i>=0; --i) 
232       in[i] = B*in[i]+(b1*in[i+1]+b2*in[i+2]+b3*in[i+3])/b0; 
233   
234  } 
235   
236   
237  @ The last part is the transposition of the bitmap. Here I just allocated another array and every 
238  row is copied into the appropriate column. If you don't know why the |strides| are needed, 
239   check the documentation of |cudaMallocPitch|. 
240  @<CUDA Kernel ...@>+= 
241  __global__ void cudaTransposeKernel(double *in, size_t stride1, int nx, int ny, 
242                                      double *out, size_t stride2){ 
243    int row = blockIdx.x * blockDim.x + threadIdx.x; 
244    if(row >= ny) return; 
245   
246    double* r = (double*) ((char*)in + stride1*row); 
247    for(int i=0; i<nx; ++i){ 
248      double *outRow = (double*) ((char*)out + i*stride2); 
249      outRow[row] = r[i]; 
250    } 
251   
252   
253  } 
254   
255  @* Appendix. 
256  @ The compilation of the {\cweb} file into an executable constists of several steps. First you need 
257  to {\em tangle} you .w file 
258  \begin{verbatim} 
259  ctangle -bh cudaRecursiveGaussianFilter.w - cudaRecursiveGaussianFilter.tm 
260  \end{verbatim} 
261  This step produces a {\mma}-template file which needs to be processed by |mprep| 
262  \begin{verbatim} 
263  mprep -o cudaRecursiveGaussianFilter.cu cudaRecursiveGaussianFilter.tm 
264  \end{verbatim} 
265  |mprep| can be found in the {\mma}-install path under (on my 64 bit Linux box)\\ 
266  {\tt SystemFiles/Links/MathLink/DeveloperKit/Linux-x86-64/CompilerAdditions}. 
267  This step is followed by the call of the {\cuda}-compiler. I added several options 
268  \begin{itemize} 
269  \item {\tt -arch compute\_13} and {\tt -gpu-code sm\_13} 
270  forces {\cuda} to take the version 1.3. In older versions and on other 
271   graphic cards the |double| data type 
272  is not supported. Therefore, this option seems essential 
273  \item {\tt --optimize 3} is for optimized code. 
274  \item {\tt --machine 64} to build a 64-bit program. 
275  \end{itemize} 
276  The {\cuda}-compiler is called by 
277  \begin{verbatim} 
278  nvcc \ 
279  -arch compute_13 \ 
280  --optimize 3 \ 
281  --machine 64 \ 
282  --gpu-code sm_13 \ 
283  -o fastgauss.exe \ 
284  -I/usr/local/Wolfram/Mathematica/6.0/SystemFiles/\ 
285  Links/MathLink/DeveloperKit/Linux-x86-64/CompilerAdditions \ 
286  -L/usr/local/Wolfram/Mathematica/6.0/SystemFiles/Links/\ 
287    MathLink/DeveloperKit/Linux-x86-64/CompilerAdditions \ 
288    -lm \ 
289    -lpthread \ 
290    -lML64i3 \ 
291    -I ~/NVIDIA_CUDA_SDK/common/inc/ \ 
292    -L ~/NVIDIA_CUDA_SDK/lib \ 
293    -lcutil \ 
294    cudaRecursiveGaussianFilter.cu 
295  \end{verbatim} 
296   
297   
298  @ 
299  \bibliographystyle{alpha} 
300  \bibliography{literature} 
301   
302   
303   
304  \end{document} 
305   
306