Advanced Image Processing with CUDA | The Supercomputing Blog - Part 2

Advanced Image Processing with CUDA

Pages: 1 2

CUDA code

// This code originates from supercomputingblog.com
// This code is provided as is without any warranty of any kind
// Feel free to use this code however you'd like, as long as 
// due credit is given, and this header remains intact.

// Oil-paint style filter: for every pixel, histogram the intensities of the
// surrounding (2*radius+1) x (2*radius+1) window, pick the most populated
// intensity bin, and output the average colour of the pixels in that bin.
//
// Launch layout: 2D grid of 2D blocks, one thread per output pixel
//   (x dimension -> column j, y dimension -> row i). Threads that fall outside
//   the width x height image return immediately, so the grid may overshoot.
//
// Parameters:
//   width, height   image size in pixels
//   stride          row pitch; rows are addressed as row*stride/4 on a 32-bit
//                   pixel pointer, so this is presumably the pitch in bytes
//   pRawBitmapOrig  OUTPUT image (written, never read)
//   pBitmapCopy     INPUT image (read, never written); pixels are decoded as
//                   0x??RRGGBB
//   radius          window half-size; negative values are treated as 0
//   nBins           highest intensity bin index; clamped to
//                   [0, C_MAX_INTENSITIES - 1] so that the per-thread
//                   histograms can never be indexed out of bounds
//
// The output pixel always has its top byte forced to 0xff.
void __global__ PaintCu(int width, int height, int stride, unsigned int *pRawBitmapOrig, unsigned int *pBitmapCopy, int radius, int nBins)
{
	// This CUDA kernel effectively transforms an image into an image that looks like it's been painted.
	// Each thread will calculate exactly 1 final pixel
	// The processing of each pixel requires many computations, and the use of 4 kilobytes of memory.
	// This code will work well on Fermi architecture and beyond due to the cache structure of the GPU

	int i = blockIdx.y * blockDim.y + threadIdx.y;
	int j = blockIdx.x * blockDim.x + threadIdx.x;
	// Test to see if we're testing a valid pixel
	if (i >= height || j >= width) return;	// Don't bother doing the calculation. We're not in a valid pixel location

	#define C_MAX_INTENSITIES 256 // 8 bits per color, 256 intensities is fine.

	// Bin indices run from 0 to nBins inclusive, so nBins must stay below the
	// array size; anything outside that range would corrupt per-thread memory.
	if (nBins < 0) nBins = 0;
	if (nBins > C_MAX_INTENSITIES - 1) nBins = C_MAX_INTENSITIES - 1;
	// A negative radius would leave the window empty and make the final
	// division divide by zero. With radius >= 0 the centre pixel is always
	// inside the window, so the winning bin holds at least one pixel.
	if (radius < 0) radius = 0;

	int intensityCount[C_MAX_INTENSITIES];
	int avgR[C_MAX_INTENSITIES];
	int avgG[C_MAX_INTENSITIES];
	int avgB[C_MAX_INTENSITIES];
	// Only bins 0..nBins can ever be touched, so only those are cleared.
	for (int k=0; k <= nBins; k++)
	{
		intensityCount[k] = 0;
		avgR[k] = 0;
		avgG[k] = 0;
		avgB[k] = 0;
	}
	// we have a radius r
	int maxIntensityCount = 0;
	int maxIntensityCountIndex = 0;
	for (int k=i-radius; k <= i+radius;k++)
	{
		if (k < 0 || k >= height) continue;	// window row falls off the image
		for (int l=j-radius; l <= j+radius; l++)
		{
			if (l < 0 || l >= width) continue;	// window column falls off the image
			unsigned int curPixel = pBitmapCopy[k*stride/4 + l];
			int r = (int)((curPixel & 0x00ff0000u) >> 16);
			int g = (int)((curPixel & 0x0000ff00u) >> 8);
			int b = (int)((curPixel & 0x000000ffu) >> 0);
			// Mean of the three channels (0..255) scaled into 0..nBins.
			int curIntensity = (int)((float)((r+g+b)/3*nBins)/255.0f);
			intensityCount[curIntensity]++;
			// Strict '>' keeps the first bin that reached the current maximum on ties.
			if (intensityCount[curIntensity] > maxIntensityCount)
			{
				maxIntensityCount = intensityCount[curIntensity];
				maxIntensityCountIndex = curIntensity;
			}
			// The avg* arrays hold running sums; they are divided only once, at the end.
			avgR[curIntensity] += r;
			avgG[curIntensity] += g;
			avgB[curIntensity] += b;
		}
	}
	int finalR = avgR[maxIntensityCountIndex] / maxIntensityCount;
	int finalG = avgG[maxIntensityCountIndex] / maxIntensityCount;
	int finalB = avgB[maxIntensityCountIndex] / maxIntensityCount;
	pRawBitmapOrig[i*stride/4 + j] = 0xff000000 | ((finalR << 16) + (finalG << 8) + finalB);
}

CPU code

// Multithreaded CPU version of the oil-paint filter: for every pixel,
// histogram the intensities of the surrounding (2*radius+1) x (2*radius+1)
// window, pick the most populated bin, and write the average colour of the
// pixels in that bin. Reads pBitmapCopy, writes pRawBitmapOrig.
//
// 'parallel for' (not bare 'parallel') is required here: it divides the rows
// among the threads of the team. A bare 'parallel' region would make every
// thread execute the complete loop, repeating all the work with no speed-up.
#pragma omp parallel for
for (int i=0; i < height; i++)
{
	#define C_MAX_INTENSITIES 256 // 8 bits per color, 256 intensities is fine.
	// Declared inside the loop body so that each thread gets its own histograms.
	int intensityCount[C_MAX_INTENSITIES];
	int avgR[C_MAX_INTENSITIES];
	int avgG[C_MAX_INTENSITIES];
	int avgB[C_MAX_INTENSITIES];

	// Bin indices run from 0 to the bin limit inclusive, so the limit must stay
	// below the array size; clamp a local copy instead of trusting the caller.
	const int binLimit = (nBins < 0) ? 0 : ((nBins > C_MAX_INTENSITIES - 1) ? (C_MAX_INTENSITIES - 1) : nBins);
	// A negative radius would leave the window empty and make the final
	// division divide by zero. With a radius >= 0 the centre pixel is always counted.
	const int reach = (radius < 0) ? 0 : radius;

	for (int j=0; j < width; j++)
	{
		// reset to zero (only bins 0..binLimit can ever be touched)
		for (int k=0; k <= binLimit; k++)
		{
			intensityCount[k] = 0;
			avgR[k] = 0;
			avgG[k] = 0;
			avgB[k] = 0;
		}
		// we have a radius r
		int maxIntensityCount = 0;
		int maxIntensityCountIndex = 0;
		for (int k=i-reach; k <= i+reach;k++)
		{
			if (k < 0 || k >= height) continue;	// window row falls off the image
			for (int l=j-reach; l <= j+reach; l++)
			{
				if (l < 0 || l >= width) continue;	// window column falls off the image
				int curPixel = pBitmapCopy[k*bitmapData.Stride/4 + l];
				int r = ((curPixel & 0x00ff0000) >> 16);
				int g = ((curPixel & 0x0000ff00) >> 8);
				int b = ((curPixel & 0x000000ff) >> 0);
				// Mean of the three channels (0..255) scaled into 0..binLimit.
				int curIntensity = (int)((float)((r+g+b)/3*binLimit)/255.0f);
				intensityCount[curIntensity]++;
				// Strict '>' keeps the first bin that reached the current maximum on ties.
				if (intensityCount[curIntensity] > maxIntensityCount)
				{
					maxIntensityCount = intensityCount[curIntensity];
					maxIntensityCountIndex = curIntensity;
				}
				// The avg* arrays hold running sums; they are divided only once, at the end.
				avgR[curIntensity] += r;
				avgG[curIntensity] += g;
				avgB[curIntensity] += b;
			}
		}
		int finalR = avgR[maxIntensityCountIndex] / maxIntensityCount;
		int finalG = avgG[maxIntensityCountIndex] / maxIntensityCount;
		int finalB = avgB[maxIntensityCountIndex] / maxIntensityCount;
		pRawBitmapOrig[i*bitmapData.Stride/4 + j] = 0xff000000 | ((finalR << 16) + (finalG << 8) + finalB);
	}
}

Pages: 1 2

This entry was posted by admin on September 21, 2011 at 12:17 am under CUDA, Graphics. Tagged Algorithm, Benchmark, Cache, CUDA, Image, Image processing, Local memory, Oil Painting, Paintl, Performance. Both comments and pings are currently closed.