\documentclass[a4paper,10pt,titlepage]{cweb} \usepackage{amsmath,amssymb} \usepackage{graphicx} \usepackage{anysize} \marginsize{3cm}{3cm}{3cm}{3cm} \newcommand{\im}{\textfrak{i}} \newcommand{\imag}{\im} \newcommand{\absolut}[1]{{\ensuremath \mid \! #1\! \mid}} \newcommand{\mat}[1]{\ensuremath\mathbf{#1}} \graphicspath{{img/}} \setkeys{Gin}{width=0.3\textwidth} \def\CEE/{C\spacefactor 1000 } \def\cweb{{\tt CWEB\/}} \def\UNIX/{{\small UNIX\/}} \def\WEB{{\tt WEB\/}} \def\ml{{\textit{MathLink}}} \def\mma{{\textit{Mathematica}}} \def\cuda{{\sc Cuda}} \def\Jxy{J_{xy}} \def\nx{{n_x}} \def\ny{{n_y}} \def\mx{{m_x}} \def\my{{m_y}} @s alpha TeX \begin{document} \thispagestyle{empty} \title{{\it Recursive Gaussian Filter for Mathematica}\\using CUDA} \author{Patrick Scheibe} \date{December 2008} \maketitle \cleardoublepage \pagestyle{plain} \maketitle \tableofcontents @* Introduction. This implementation bases on~\cite{young1995rig}. It demonstrates how to combine \mma\ and \cuda\ and it is written in \cweb. Three parts are important \begin{itemize} \item I need a \ml -template to call the C-function from \mma. This connects parameters and function names of the corresponding parts in C and in \mma. \item A C-function is required which allocates stores for arrays, does some simple precomputations of the gaussian parameters and calls then the \cuda -kernel. After the computation is finished the results are send to \mma. I will call this the {\em host-code} since all this does not run on the gpu. \item The most important part is the \cuda -kernel which define what is done on the gpu in parallel. This part is called the {\em device-code}. \end{itemize} @d NUM_OF_THREADS 32 @c @@; @@; @@; @@; @
@; @ The includes. @= #include #include #include #include #include #include @ The main-loop for the {\ml}-program. Note that this is only for the linux OS (maybe OSX too). @
= int main(int argc, char *argv[]) { CUT_DEVICE_INIT(argc, argv); return MLMain(argc,argv); } @* {\ml}-template code. The Mathematica Template connects the C-function, |cudaGaussianFilter(...)|, and the Mathematica function, |CudaGaussianFilter[...]|, which is a lowlevel function since it gets not an image but a list of the pixel-values and the dimensions. Later I will wrap this lowlevel function with a highlevel version which is called with an image. In the line starting with {\em Pattern} you can see that I get the graylevel-bitmap |bm| as one-dimensional list from {\mma}. Since I only test whether |bm| is a list and not whether it is one-dimensional, I flatten it in the |Arguments| line. Furthermore, I ensure that the sigma is evaluated to a numerical value. @= @=:Begin:@>@# @=:Function:cudaGaussianFilter@>@# @=:Pattern:CudaGaussianFilter[bm_List,nx_Integer,ny_Integer,sigma_?NumericQ]@>@# @=:Arguments:{Flatten[bm],nx,ny,N[sigma]}@>@# @=:ArgumentTypes:{RealList,Integer,Integer,Real}@>@# @=:ReturnType:Manual@>@# @=:End:@> @* The Host-Code. Before calling the \cuda -kernel I have to allocate and initialise the arrays. In this step I transfer the image into the memory of the graphics-card. After that the whole filter consists of only 4 sequential steps where every step runs in parallel on the gpu: \begin{enumerate} \item All rows are processed separate with the method described in~\cite{young1995rig}. This consists basically of a forward run followed by a backward run through the row. \item Now all columns need to be processed. Therefore, the bitmap is transposed and \item the first step is called again, working now on the columns (Transposing a matrix means first row becomes first column, second row becomes second column and so on). \item A final transpose rotates the bitmap to its original form. \end{enumerate} I end with returning the result to {\mma} and freeing all allocated memory. @= void cudaGaussianFilter(double *h_bm, long n, int nx, int ny, double sigma){ double *d_bm, *d_bm_transposed, *h_res; /* The array where the result is stored */ double q; /* An adapted version of the sigma */ size_t stride,stride_tr; /* The stride I have to use instead of ny */ int blocksize, /* How many thread per block */ gridsize; /* How many blocks in all */ cudaError_t err; blocksize = NUM_OF_THREADS; gridsize = (ny%blocksize==0)?ny/blocksize:ny/blocksize+1; /* make enough blocks if ny is not divisible by the choosen |blocksize| which is usually 32*/ @@; CUDA_SAFE_CALL(cudaMallocPitch((void **) &d_bm,&stride,nx*sizeof(double),ny) ); CUDA_SAFE_CALL(cudaMallocPitch((void **) &d_bm_transposed,&stride_tr,ny*sizeof(double),nx)); CUDA_SAFE_CALL( cudaMemcpy2D((void*)d_bm,stride,(void*)h_bm, nx*sizeof(double), nx*sizeof(double),ny,cudaMemcpyHostToDevice) ); h_res = (double *) calloc(sizeof(double), nx*ny); cudaGaussKernel<<>>(d_bm,stride,nx,ny,b0,b1,b2,b3,b4); cudaTransposeKernel<<>>(d_bm,stride,nx,ny,d_bm_transposed,stride_tr); cudaGaussKernel<<>>(d_bm_transposed,stride_tr, ny,nx,b0,b1,b2,b3,b4); cudaTransposeKernel<<>>(d_bm_transposed,stride_tr,ny,nx,d_bm,stride); err = cudaGetLastError(); if( cudaSuccess != err) { fprintf(stderr, "Cuda error: %s.\n", cudaGetErrorString( err) ); exit(EXIT_FAILURE); } CUDA_SAFE_CALL( cudaMemcpy2D((void*) h_res, nx*sizeof(double),(void*) d_bm, stride, nx*sizeof(double), ny, cudaMemcpyDeviceToHost) ); MLPutRealList(stdlink, h_res, nx*ny); cudaFree(d_bm); cudaFree(d_bm_transposed); free(h_res); } @* The Device Code. This part consists of two {\em Kernels} (functions running on the gpu), one for the transposition of a matrix and one for the gaussian filter. Note that a kernel is (here) a chunk of code which processes exactly one row (column) of the bitmap. {\cuda} will run many instances of the kernel in parallel. You may ask how {\cuda} know which kernel should process which row. This is done through the |blockIdx|, |threadIdx| and |blockDim| variables which "enumerate" every gpu-processor uniquely (this is very unprecise in terms of {\cuda}s Processor, Thread, Block and Grid vocabulary). @ I start with the calculation of the |q| of (8c). Further details can be found in the section {\em Choosing q} in section 4 of~\cite{young1995rig}. @= if(sigma < 0.5){ /* The quotient q would be 0, so we can stop. */ MLPutSymbol(stdlink,"$Failed"); return; }else if(sigma >= 0.5 && sigma <= 2.5){ q=3.97156 - 4.14554*sqrt(1.0-0.26891*sigma); } else if(sigma > 2.5){ q=-0.9633 + 0.98711*sigma; } else q=sigma; double b0 = 1.57825 + q*(2.44413 + (1.4281 + 0.422205*q)*q); double b1 = q*(2.44423 + q*(2.85619 + 1.26661*q)); double b2 = (-1.4281 - 1.26661*q)*q*q; double b3 = 0.422205*q*q*q; double b4 = 1.0-(b1 + b2 + b3)/b0; @ The algorithm for one row is straight forward. Since the formula for a pixel needs the (already calculated) last three pixel values, the boundary must be handled separately. I assumed missing pixel to be of the same value like the boundary pixel and I process these three by hand. @= __global__ void cudaGaussKernel( double *d_bm, size_t stride, int nx, int ny, double b0, double b1, double b2, double b3, double B){ int pos = blockIdx.x * blockDim.x + threadIdx.x; if(pos >= ny || pos < 0) return; double pV; /* The value which is used for padding */ /* Forward iteration. Calculating the boundary-elements by hand. */ double* in = (double*) ((char*)d_bm+pos*stride); pV = *in; in[0] = B*pV+(b1*pV+b2*pV+b3*pV)/b0; in[1] = B*in[1]+(b1*in[0]+b2*pV+b3*pV)/b0; in[2] = B*in[2]+(b1*in[1]+b2*in[0]+b3*pV)/b0; for(int i=3; i=0; --i) in[i] = B*in[i]+(b1*in[i+1]+b2*in[i+2]+b3*in[i+3])/b0; } @ The last part is the transposition of the bitmap. Here I just allocated another array and every row is copied into the appropriate column. If you don't know why the |strides| are needed, check the documentation of |cudaMallocPitch|. @+= __global__ void cudaTransposeKernel(double *in, size_t stride1, int nx, int ny, double *out, size_t stride2){ int row = blockIdx.x * blockDim.x + threadIdx.x; if(row >= ny) return; double* r = (double*) ((char*)in + stride1*row); for(int i=0; i