Pages: 1 2
SSE code
Below is code that performs a simple image convolution using 128-bit SSE vectors. Each pixel is treated as one vector, so a single math operation can be applied to all four color components (alpha, red, green and blue) simultaneously. The SSE code gives a huge speedup compared to ordinary x86 code.
/**
 * Blurs a GDI+ bitmap in place, using SSE to process all four channels of a
 * pixel at once.
 *
 * The bitmap is locked as 32-bit ARGB and every pixel is first unpacked into
 * four consecutive floats (A, R, G, B) in a tightly packed working copy, so a
 * whole pixel can be loaded into one 128-bit register.  Each output pixel is
 * the weighted average of the input pixels in the surrounding
 * (2*radius+1) x (2*radius+1) window, with radius = 5.  The weight of a tap is
 * 1 for the centre pixel and 1 / (|dy| + |dx|) otherwise.  Taps that fall
 * outside the image are skipped, and the sum is normalised by the total weight
 * of the taps that were actually used.
 *
 * @param pBitmap  Bitmap to filter; its pixels are overwritten with the result.
 *                 If the pointer is null, the image is empty, or the bitmap
 *                 cannot be locked, the function returns without modifying it.
 */
void ConvolveSSE(Bitmap * pBitmap)
{
    if (!pBitmap)
        return;

    const int width = pBitmap->GetWidth();
    const int height = pBitmap->GetHeight();
    if (width <= 0 || height <= 0)
        return;

    const int radius = 5;
    const int rowFloats = width * 4; // four float channels (A, R, G, B) per pixel

    // Allocate the working copy before locking, so that an allocation failure
    // cannot leave the bitmap locked.
    float *pBitmapCopy = new float[static_cast<size_t>(rowFloats) * height];

    // The pixels are read as well as written, so both lock modes are required.
    // A named Rect is used because taking the address of a temporary is a
    // compiler extension.
    Rect lockRect(0, 0, width, height);
    BitmapData bitmapData;
    if (pBitmap->LockBits(&lockRect, ImageLockModeRead | ImageLockModeWrite,
                          PixelFormat32bppARGB, &bitmapData) != Ok)
    {
        delete[] pBitmapCopy;
        return;
    }

    // Rows are addressed through a byte pointer and the signed stride, which
    // also works when the stride is negative (bottom-up bitmap).
    unsigned char *pScan0 = static_cast<unsigned char*>(bitmapData.Scan0);
    const int stride = bitmapData.Stride;

    // Unpack every pixel into four floats.  Rows are independent of each other.
    #pragma omp parallel for
    for (int row = 0; row < height; row++)
    {
        const unsigned int *pSrcRow = reinterpret_cast<const unsigned int*>(pScan0 + row * stride);
        float *pDstRow = pBitmapCopy + static_cast<size_t>(row) * rowFloats;
        for (int col = 0; col < width; col++)
        {
            const unsigned int curPixel = pSrcRow[col];
            pDstRow[col*4]     = (float)((curPixel & 0xff000000) >> 24); // alpha
            pDstRow[col*4 + 1] = (float)((curPixel & 0x00ff0000) >> 16); // red
            pDstRow[col*4 + 2] = (float)((curPixel & 0x0000ff00) >> 8);  // green
            pDstRow[col*4 + 3] = (float)(curPixel & 0x000000ff);         // blue
        }
    }

    for (int i = 0; i < height; i++)
    {
        unsigned int *pOutRow = reinterpret_cast<unsigned int*>(pScan0 + i * stride);
        for (int j = 0; j < width; j++)
        {
            float totalWeight = 0;
            __m128 totalColor = _mm_setzero_ps(); // running weighted sum of A, R, G, B

            for (int k = i - radius; k <= i + radius; k++)
            {
                if (k < 0 || k >= height)
                    continue;
                for (int l = j - radius; l <= j + radius; l++)
                {
                    if (l < 0 || l >= width)
                        continue;

                    // The centre tap gets weight 1; this also avoids dividing by zero.
                    float diff;
                    if (i == k && j == l)
                        diff = 1.0f;
                    else
                        diff = 1.0f / (abs(k - i) + abs(l - j));

                    const float *pTap = pBitmapCopy + (static_cast<size_t>(k) * rowFloats + l * 4);
                    __m128 curColor = _mm_loadu_ps(pTap);      // load the pixel's four channels
                    __m128 diffVector = _mm_set_ps1(diff);     // broadcast the kernel weight
                    totalColor = _mm_add_ps(totalColor, _mm_mul_ps(curColor, diffVector));
                    totalWeight += diff;
                }
            }

            // Normalise by the total weight, then add 0.5 so that the truncating
            // float-to-int conversion below rounds to nearest.
            totalColor = _mm_div_ps(totalColor, _mm_set_ps1(totalWeight));
            totalColor = _mm_add_ps(totalColor, _mm_set_ps1(.5f));

            // Spill the vector to memory to get at the individual channels.
            float localResult[4];
            _mm_storeu_ps(localResult, totalColor);

            // Pack using unsigned arithmetic: shifting a signed int holding up
            // to 255 left by 24 would overflow into the sign bit.
            const unsigned int totalA = static_cast<unsigned int>(static_cast<int>(localResult[0]));
            const unsigned int totalR = static_cast<unsigned int>(static_cast<int>(localResult[1]));
            const unsigned int totalG = static_cast<unsigned int>(static_cast<int>(localResult[2]));
            const unsigned int totalB = static_cast<unsigned int>(static_cast<int>(localResult[3]));
            pOutRow[j] = (totalA << 24) | (totalR << 16) | (totalG << 8) | totalB;
        }
    }

    pBitmap->UnlockBits(&bitmapData);
    delete[] pBitmapCopy;
}
Non-SSE code
The next code example is the non-SSE version, which converts the entire image to a floating-point format and stores it before the convolution begins. It turns out that this pre-convolution format switch gives us a healthy speedup.
/**
 * Blurs a GDI+ bitmap in place using scalar floating-point arithmetic on a
 * pre-converted float copy of the image.
 *
 * The bitmap is locked as 32-bit ARGB and every pixel is unpacked once, up
 * front, into four consecutive floats (A, R, G, B) in a tightly packed working
 * copy.  Each output pixel is the weighted average of the input pixels in the
 * surrounding (2*radius+1) x (2*radius+1) window, with radius = 5.  The weight
 * of a tap is 1 for the centre pixel and 1 / (|dy| + |dx|) otherwise.  Taps
 * that fall outside the image are skipped, and the sum is normalised by the
 * total weight of the taps that were actually used.
 *
 * @param pBitmap  Bitmap to filter; its pixels are overwritten with the result.
 *                 If the pointer is null, the image is empty, or the bitmap
 *                 cannot be locked, the function returns without modifying it.
 */
void Convolve2(Bitmap * pBitmap)
{
    if (!pBitmap)
        return;

    const int width = pBitmap->GetWidth();
    const int height = pBitmap->GetHeight();
    if (width <= 0 || height <= 0)
        return;

    const int radius = 5;
    const int rowFloats = width * 4; // four float channels (A, R, G, B) per pixel

    // Allocate the working copy before locking, so that an allocation failure
    // cannot leave the bitmap locked.
    float *pBitmapCopy = new float[static_cast<size_t>(rowFloats) * height];

    // The pixels are read as well as written, so both lock modes are required.
    // A named Rect is used because taking the address of a temporary is a
    // compiler extension.
    Rect lockRect(0, 0, width, height);
    BitmapData bitmapData;
    if (pBitmap->LockBits(&lockRect, ImageLockModeRead | ImageLockModeWrite,
                          PixelFormat32bppARGB, &bitmapData) != Ok)
    {
        delete[] pBitmapCopy;
        return;
    }

    // Rows are addressed through a byte pointer and the signed stride, which
    // also works when the stride is negative (bottom-up bitmap).
    unsigned char *pScan0 = static_cast<unsigned char*>(bitmapData.Scan0);
    const int stride = bitmapData.Stride;

    // Unpack every pixel into four floats.  Rows are independent of each other.
    #pragma omp parallel for
    for (int row = 0; row < height; row++)
    {
        const unsigned int *pSrcRow = reinterpret_cast<const unsigned int*>(pScan0 + row * stride);
        float *pDstRow = pBitmapCopy + static_cast<size_t>(row) * rowFloats;
        for (int col = 0; col < width; col++)
        {
            const unsigned int curPixel = pSrcRow[col];
            pDstRow[col*4]     = (float)((curPixel & 0xff000000) >> 24); // alpha
            pDstRow[col*4 + 1] = (float)((curPixel & 0x00ff0000) >> 16); // red
            pDstRow[col*4 + 2] = (float)((curPixel & 0x0000ff00) >> 8);  // green
            pDstRow[col*4 + 3] = (float)(curPixel & 0x000000ff);         // blue
        }
    }

    for (int i = 0; i < height; i++)
    {
        unsigned int *pOutRow = reinterpret_cast<unsigned int*>(pScan0 + i * stride);
        for (int j = 0; j < width; j++)
        {
            float totalA = 0;
            float totalR = 0;
            float totalG = 0;
            float totalB = 0;
            float totalWeight = 0;

            for (int k = i - radius; k <= i + radius; k++)
            {
                if (k < 0 || k >= height)
                    continue;
                for (int l = j - radius; l <= j + radius; l++)
                {
                    if (l < 0 || l >= width)
                        continue;

                    const float *pTap = pBitmapCopy + (static_cast<size_t>(k) * rowFloats + l * 4);

                    // The centre tap gets weight 1; this also avoids dividing by zero.
                    float diff;
                    if (i == k && j == l)
                        diff = 1.0f;
                    else
                        diff = 1.0f / (abs(k - i) + abs(l - j));

                    totalWeight += diff;
                    totalA += diff * pTap[0];
                    totalR += diff * pTap[1];
                    totalG += diff * pTap[2];
                    totalB += diff * pTap[3];
                }
            }

            // Normalise by the total weight, then add 0.5 so that the truncating
            // float-to-int conversion below rounds to nearest.
            totalA = totalA / totalWeight + .5f;
            totalR = totalR / totalWeight + .5f;
            totalG = totalG / totalWeight + .5f;
            totalB = totalB / totalWeight + .5f;

            // Pack using unsigned arithmetic: shifting a signed int holding up
            // to 255 left by 24 would overflow into the sign bit.
            const unsigned int outA = static_cast<unsigned int>(static_cast<int>(totalA));
            const unsigned int outR = static_cast<unsigned int>(static_cast<int>(totalR));
            const unsigned int outG = static_cast<unsigned int>(static_cast<int>(totalG));
            const unsigned int outB = static_cast<unsigned int>(static_cast<int>(totalB));
            pOutRow[j] = (outA << 24) | (outR << 16) | (outG << 8) | outB;
        }
    }

    pBitmap->UnlockBits(&bitmapData);
    delete[] pBitmapCopy;
}
Naive non-SSE code
Finally, we have some naïve code below which simply does an image convolution without any special instruction sets or other tricks.
/**
 * Blurs a GDI+ bitmap in place with a straightforward scalar implementation.
 *
 * The bitmap is locked as 32-bit ARGB and its packed pixels are copied into a
 * working buffer; the colour channels of each tap are unpacked to float inside
 * the innermost loop.  Each output pixel is the weighted average of the input
 * pixels in the surrounding (2*radius+1) x (2*radius+1) window, with
 * radius = 5.  The weight of a tap is 1 for the centre pixel and
 * 1 / (|dy| + |dx|) otherwise.  Taps that fall outside the image are skipped,
 * and the sum is normalised by the total weight of the taps that were actually
 * used.
 *
 * @param pBitmap  Bitmap to filter; its pixels are overwritten with the result.
 *                 If the pointer is null, the image is empty, or the bitmap
 *                 cannot be locked, the function returns without modifying it.
 */
void CImageProcessor::Convolve(Bitmap * pBitmap)
{
    if (!pBitmap)
        return;

    const int width = pBitmap->GetWidth();
    const int height = pBitmap->GetHeight();
    if (width <= 0 || height <= 0)
        return;

    const int radius = 5;

    // Allocate the working copy before locking, so that an allocation failure
    // cannot leave the bitmap locked.
    unsigned int *pBitmapCopy = new unsigned int[static_cast<size_t>(width) * height];

    // The pixels are read as well as written, so both lock modes are required.
    // A named Rect is used because taking the address of a temporary is a
    // compiler extension.
    Rect lockRect(0, 0, width, height);
    BitmapData bitmapData;
    if (pBitmap->LockBits(&lockRect, ImageLockModeRead | ImageLockModeWrite,
                          PixelFormat32bppARGB, &bitmapData) != Ok)
    {
        delete[] pBitmapCopy;
        return;
    }

    // Rows are addressed through a byte pointer and the signed stride, which
    // also works when the stride is negative (bottom-up bitmap).
    unsigned char *pScan0 = static_cast<unsigned char*>(bitmapData.Scan0);
    const int stride = bitmapData.Stride;

    // Snapshot the source pixels row by row, so the filter reads unmodified
    // input while the locked buffer is being overwritten.
    for (int row = 0; row < height; row++)
    {
        memcpy(pBitmapCopy + static_cast<size_t>(row) * width,
               pScan0 + row * stride,
               width * sizeof(unsigned int));
    }

    for (int i = 0; i < height; i++)
    {
        unsigned int *pOutRow = reinterpret_cast<unsigned int*>(pScan0 + i * stride);
        for (int j = 0; j < width; j++)
        {
            float totalA = 0;
            float totalR = 0;
            float totalG = 0;
            float totalB = 0;
            float totalWeight = 0;

            for (int k = i - radius; k <= i + radius; k++)
            {
                if (k < 0 || k >= height)
                    continue;
                for (int l = j - radius; l <= j + radius; l++)
                {
                    if (l < 0 || l >= width)
                        continue;

                    // Unpack the tap's channels from the packed ARGB value.
                    const unsigned int curPixel = pBitmapCopy[static_cast<size_t>(k) * width + l];
                    const float alpha = (float)((curPixel & 0xff000000) >> 24);
                    const float red   = (float)((curPixel & 0x00ff0000) >> 16);
                    const float green = (float)((curPixel & 0x0000ff00) >> 8);
                    const float blue  = (float)(curPixel & 0x000000ff);

                    // The centre tap gets weight 1; this also avoids dividing by zero.
                    float diff;
                    if (i == k && j == l)
                        diff = 1.0f;
                    else
                        diff = 1.0f / (abs(k - i) + abs(l - j));

                    totalWeight += diff;
                    totalA += diff * alpha;
                    totalR += diff * red;
                    totalG += diff * green;
                    totalB += diff * blue;
                }
            }

            // Normalise by the total weight, then add 0.5 so that the truncating
            // float-to-int conversion below rounds to nearest.
            totalA = totalA / totalWeight + .5f;
            totalR = totalR / totalWeight + .5f;
            totalG = totalG / totalWeight + .5f;
            totalB = totalB / totalWeight + .5f;

            // Pack using unsigned arithmetic: shifting a signed int holding up
            // to 255 left by 24 would overflow into the sign bit.
            const unsigned int outA = static_cast<unsigned int>(static_cast<int>(totalA));
            const unsigned int outR = static_cast<unsigned int>(static_cast<int>(totalR));
            const unsigned int outG = static_cast<unsigned int>(static_cast<int>(totalG));
            const unsigned int outB = static_cast<unsigned int>(static_cast<int>(totalB));
            pOutRow[j] = (outA << 24) | (outR << 16) | (outG << 8) | outB;
        }
    }

    pBitmap->UnlockBits(&bitmapData);
    delete[] pBitmapCopy;
}
Pages: 1 2