Advanced Image Processing with SSE

Pages: 1 2

SSE code

Below is code that performs a simple image convolution using 128-bit SSE vectors. Each pixel is treated as one vector of four floats (alpha, red, green, blue), so a single SSE instruction performs the same math operation on all four components simultaneously. The SSE code gives a huge speedup compared to ordinary scalar x86 code.

// Blurs pBitmap in place with a (2*radius+1) x (2*radius+1) kernel, radius = 5,
// using SSE to process the four channels of a pixel (A, R, G, B) at once.
//
// The weight of a neighbour is 1 / (|dy| + |dx|); the centre pixel has weight 1.
// Neighbours that fall outside the image are skipped, and the sum is normalised
// by the total weight actually used, so border pixels are averaged over fewer
// samples.
//
// pBitmap: bitmap to filter; it is locked as 32bpp ARGB for the duration of the
//          call. If the bitmap is empty or cannot be locked, it is left untouched.
void ConvolveSSE(Bitmap * pBitmap)
{
	const int radius = 5;
	const int width = (int)pBitmap->GetWidth();
	const int height = (int)pBitmap->GetHeight();
	if (width <= 0 || height <= 0) return;

	// The pixels are read before they are overwritten, so both lock modes are required.
	// A named Rect is used because taking the address of a temporary is not standard C++.
	Rect lockRect(0, 0, width, height);
	BitmapData bitmapData;
	if (pBitmap->LockBits(&lockRect, ImageLockModeRead | ImageLockModeWrite, PixelFormat32bppARGB, &bitmapData) != Ok)
		return;

	// Rows are addressed through a byte pointer plus Stride so that padded rows and
	// bottom-up bitmaps (negative Stride) are handled correctly.
	unsigned char *pScan0 = (unsigned char*)bitmapData.Scan0;
	const int stride = bitmapData.Stride;

	// Tightly packed float copy of the image: four floats (A, R, G, B) per pixel.
	const size_t floatsPerRow = (size_t)width * 4;
	float *pBitmapCopy = new float[floatsPerRow * (size_t)height];

	// "parallel for" splits the rows between the threads; a bare "parallel" would
	// make every thread convert the whole image.
	#pragma omp parallel for
	for (int y = 0; y < height; y++)
	{
		const unsigned int *pSrcRow = (const unsigned int*)(pScan0 + y * stride);
		float *pCopyRow = pBitmapCopy + (size_t)y * floatsPerRow;
		for (int x = 0; x < width; x++)
		{
			const unsigned int curPixel = pSrcRow[x];
			pCopyRow[x*4]   = (float)(curPixel >> 24);
			pCopyRow[x*4+1] = (float)((curPixel >> 16) & 0xff);
			pCopyRow[x*4+2] = (float)((curPixel >> 8) & 0xff);
			pCopyRow[x*4+3] = (float)(curPixel & 0xff);
		}
	}

	const __m128 roundVector = _mm_set_ps1(.5f); // added before truncation so the cast rounds to nearest

	for (int i = 0; i < height; i++)
	{
		unsigned int *pDstRow = (unsigned int*)(pScan0 + i * stride);
		// Clamp the kernel window to the image instead of testing every sample.
		const int kBegin = (i - radius < 0) ? 0 : i - radius;
		const int kEnd = (i + radius >= height) ? height - 1 : i + radius;

		for (int j = 0; j < width; j++)
		{
			const int lBegin = (j - radius < 0) ? 0 : j - radius;
			const int lEnd = (j + radius >= width) ? width - 1 : j + radius;

			float totalWeight = 0;
			__m128 totalColor = _mm_setzero_ps();
			for (int k = kBegin; k <= kEnd; k++)
			{
				const float *pCopyRow = pBitmapCopy + (size_t)k * floatsPerRow;
				for (int l = lBegin; l <= lEnd; l++)
				{
					const int distance = abs(k-i) + abs(l-j);
					const float diff = (distance == 0) ? 1.0f : 1.0f/distance;
					// new[] does not promise 16-byte alignment, hence the unaligned load.
					__m128 curColor = _mm_loadu_ps(&pCopyRow[l*4]);
					__m128 diffVector = _mm_set_ps1(diff); // broadcast the kernel weight to all four lanes
					totalColor = _mm_add_ps(totalColor, _mm_mul_ps(curColor, diffVector));
					totalWeight += diff;
				}
			}

			// Weighted average of all four channels, then +0.5 for round-to-nearest.
			totalColor = _mm_div_ps(totalColor, _mm_set_ps1(totalWeight));
			totalColor = _mm_add_ps(totalColor, roundVector);

			// Copy the vector to ordinary floats so the channels can be read individually.
			float localResult[4];
			_mm_storeu_ps(localResult, totalColor);
			// Pack with unsigned arithmetic: shifting a signed int with alpha >= 128 by 24 overflows.
			const unsigned int a = (unsigned int)localResult[0];
			const unsigned int r = (unsigned int)localResult[1];
			const unsigned int g = (unsigned int)localResult[2];
			const unsigned int b = (unsigned int)localResult[3];
			pDstRow[j] = (a << 24) | (r << 16) | (g << 8) | b;
		}
	}

	pBitmap->UnlockBits(&bitmapData);
	delete[] pBitmapCopy;
}

Non-SSE code

The next example is the non-SSE version. It converts the entire image to a floating-point format and stores it before the convolution begins. It turns out that doing this format conversion once, up front, gives a healthy speedup.

// Blurs pBitmap in place with a (2*radius+1) x (2*radius+1) kernel, radius = 5,
// using scalar floating-point math. The whole image is converted to floats once,
// before the convolution, so the inner loop does no bit unpacking.
//
// The weight of a neighbour is 1 / (|dy| + |dx|); the centre pixel has weight 1.
// Neighbours that fall outside the image are skipped, and the sum is normalised
// by the total weight actually used.
//
// pBitmap: bitmap to filter; it is locked as 32bpp ARGB for the duration of the
//          call. If the bitmap is empty or cannot be locked, it is left untouched.
void Convolve2(Bitmap * pBitmap)
{
	const int radius = 5;
	const int width = (int)pBitmap->GetWidth();
	const int height = (int)pBitmap->GetHeight();
	if (width <= 0 || height <= 0) return;

	// The pixels are read before they are overwritten, so both lock modes are required.
	// A named Rect is used because taking the address of a temporary is not standard C++.
	Rect lockRect(0, 0, width, height);
	BitmapData bitmapData;
	if (pBitmap->LockBits(&lockRect, ImageLockModeRead | ImageLockModeWrite, PixelFormat32bppARGB, &bitmapData) != Ok)
		return;

	// Rows are addressed through a byte pointer plus Stride so that padded rows and
	// bottom-up bitmaps (negative Stride) are handled correctly.
	unsigned char *pScan0 = (unsigned char*)bitmapData.Scan0;
	const int stride = bitmapData.Stride;

	// Tightly packed float copy of the image: four floats (A, R, G, B) per pixel.
	const size_t floatsPerRow = (size_t)width * 4;
	float *pBitmapCopy = new float[floatsPerRow * (size_t)height];

	// "parallel for" splits the rows between the threads; a bare "parallel" would
	// make every thread convert the whole image.
	#pragma omp parallel for
	for (int y = 0; y < height; y++)
	{
		const unsigned int *pSrcRow = (const unsigned int*)(pScan0 + y * stride);
		float *pCopyRow = pBitmapCopy + (size_t)y * floatsPerRow;
		for (int x = 0; x < width; x++)
		{
			const unsigned int curPixel = pSrcRow[x];
			pCopyRow[x*4]   = (float)(curPixel >> 24);
			pCopyRow[x*4+1] = (float)((curPixel >> 16) & 0xff);
			pCopyRow[x*4+2] = (float)((curPixel >> 8) & 0xff);
			pCopyRow[x*4+3] = (float)(curPixel & 0xff);
		}
	}

	for (int i = 0; i < height; i++)
	{
		unsigned int *pDstRow = (unsigned int*)(pScan0 + i * stride);
		// Clamp the kernel window to the image instead of testing every sample.
		const int kBegin = (i - radius < 0) ? 0 : i - radius;
		const int kEnd = (i + radius >= height) ? height - 1 : i + radius;

		for (int j = 0; j < width; j++)
		{
			const int lBegin = (j - radius < 0) ? 0 : j - radius;
			const int lEnd = (j + radius >= width) ? width - 1 : j + radius;

			float totalA = 0;
			float totalR = 0;
			float totalG = 0;
			float totalB = 0;
			float totalWeight = 0;
			for (int k = kBegin; k <= kEnd; k++)
			{
				const float *pCopyRow = pBitmapCopy + (size_t)k * floatsPerRow;
				for (int l = lBegin; l <= lEnd; l++)
				{
					const float *pPixel = &pCopyRow[l*4];
					const int distance = abs(k-i) + abs(l-j);
					const float diff = (distance == 0) ? 1.0f : 1.0f/distance;
					totalWeight += diff;
					totalA += diff*pPixel[0];
					totalR += diff*pPixel[1];
					totalG += diff*pPixel[2];
					totalB += diff*pPixel[3];
				}
			}

			// Weighted average, plus 0.5 so that the truncating cast rounds to nearest.
			// Pack with unsigned arithmetic: shifting a signed int with alpha >= 128 by 24 overflows.
			const unsigned int a = (unsigned int)(totalA/totalWeight + .5f);
			const unsigned int r = (unsigned int)(totalR/totalWeight + .5f);
			const unsigned int g = (unsigned int)(totalG/totalWeight + .5f);
			const unsigned int b = (unsigned int)(totalB/totalWeight + .5f);
			pDstRow[j] = (a << 24) | (r << 16) | (g << 8) | b;
		}
	}

	pBitmap->UnlockBits(&bitmapData);
	delete[] pBitmapCopy;
}

Naive non-SSE code

Finally, we have some naïve code below which simply does an image convolution without any special instruction sets or other tricks.

// Blurs pBitmap in place with a (2*radius+1) x (2*radius+1) kernel, radius = 5.
// This is the straightforward version: pixels stay packed as 32-bit ARGB and are
// unpacked to floats inside the innermost loop, for every sample.
//
// The weight of a neighbour is 1 / (|dy| + |dx|); the centre pixel has weight 1.
// Neighbours that fall outside the image are skipped, and the sum is normalised
// by the total weight actually used.
//
// pBitmap: bitmap to filter; it is locked as 32bpp ARGB for the duration of the
//          call. If the bitmap is empty or cannot be locked, it is left untouched.
void CImageProcessor::Convolve(Bitmap * pBitmap)
{
	const int radius = 5;
	const int width = (int)pBitmap->GetWidth();
	const int height = (int)pBitmap->GetHeight();
	if (width <= 0 || height <= 0) return;

	// The pixels are read before they are overwritten, so both lock modes are required.
	// A named Rect is used because taking the address of a temporary is not standard C++.
	Rect lockRect(0, 0, width, height);
	BitmapData bitmapData;
	if (pBitmap->LockBits(&lockRect, ImageLockModeRead | ImageLockModeWrite, PixelFormat32bppARGB, &bitmapData) != Ok)
		return;

	// Rows are addressed through a byte pointer plus Stride so that padded rows and
	// bottom-up bitmaps (negative Stride) are handled correctly.
	unsigned char *pScan0 = (unsigned char*)bitmapData.Scan0;
	const int stride = bitmapData.Stride;

	// Snapshot of the source pixels, tightly packed, because the bitmap is overwritten
	// while neighbouring source pixels are still needed.
	unsigned int *pBitmapCopy = new unsigned int[(size_t)width * (size_t)height];
	for (int y = 0; y < height; y++)
		memcpy(pBitmapCopy + (size_t)y * width, pScan0 + y * stride, (size_t)width * sizeof(unsigned int));

	for (int i = 0; i < height; i++)
	{
		unsigned int *pDstRow = (unsigned int*)(pScan0 + i * stride);
		// Clamp the kernel window to the image instead of testing every sample.
		const int kBegin = (i - radius < 0) ? 0 : i - radius;
		const int kEnd = (i + radius >= height) ? height - 1 : i + radius;

		for (int j = 0; j < width; j++)
		{
			const int lBegin = (j - radius < 0) ? 0 : j - radius;
			const int lEnd = (j + radius >= width) ? width - 1 : j + radius;

			float totalA = 0;
			float totalR = 0;
			float totalG = 0;
			float totalB = 0;
			float totalWeight = 0;
			for (int k = kBegin; k <= kEnd; k++)
			{
				const unsigned int *pCopyRow = pBitmapCopy + (size_t)k * width;
				for (int l = lBegin; l <= lEnd; l++)
				{
					const unsigned int curPixel = pCopyRow[l];
					const float alpha = (float)(curPixel >> 24);
					const float red = (float)((curPixel >> 16) & 0xff);
					const float green = (float)((curPixel >> 8) & 0xff);
					const float blue = (float)(curPixel & 0xff);
					const int distance = abs(k-i) + abs(l-j);
					const float diff = (distance == 0) ? 1.0f : 1.0f/distance;
					totalWeight += diff;
					totalA += diff*alpha;
					totalR += diff*red;
					totalG += diff*green;
					totalB += diff*blue;
				}
			}

			// Weighted average, plus 0.5 so that the truncating cast rounds to nearest.
			// Pack with unsigned arithmetic: shifting a signed int with alpha >= 128 by 24 overflows.
			const unsigned int a = (unsigned int)(totalA/totalWeight + .5f);
			const unsigned int r = (unsigned int)(totalR/totalWeight + .5f);
			const unsigned int g = (unsigned int)(totalG/totalWeight + .5f);
			const unsigned int b = (unsigned int)(totalB/totalWeight + .5f);
			pDstRow[j] = (a << 24) | (r << 16) | (g << 8) | b;
		}
	}

	pBitmap->UnlockBits(&bitmapData);
	delete[] pBitmapCopy;
}

Pages: 1 2

This entry was posted by admin on September 27, 2011 at 9:29 pm under C++, Graphics, Optimization, Windows. Tagged 128-bit, Algorithm, C++, Code, Example, Floating Point, Guide, Image manipulation, Image processing, Intrinsic, Optimization, Performance, SSE, SSE2, Tutorial, Vector. Both comments and pings are currently closed.

The Supercomputing Blog