#include "StdAfx.h"
#include "ImageProcessor.h"

// NOTE(review): the angle-bracket header names were missing from the copy of
// this file that was reviewed; the list below is reconstructed from what the
// code actually uses. Confirm against the project's original include list.
#include <emmintrin.h> // Need this for SSE compiler intrinsics (SSE2: __m128i, _mm_*_si128)
#include <malloc.h>    // _aligned_malloc / _aligned_free
#include <cstddef>     // size_t, ptrdiff_t
#include <cstdint>     // uintptr_t
#include <cstring>     // std::memcpy

/*
This file is brought to you by supercomputingblog.com
This code may be freely copied and distributed given that this comment section is intact
This code may be used in commercial products, but this source code itself may not be sold.
*/

namespace
{
    // Keeps the upper three bytes of a 32-bit pixel word and zeroes the low byte.
    // The pixels are locked as PixelFormat32bppARGB and read as 32-bit words:
    //   blue  =  word & 0x000000ff
    //   green = (word & 0x0000ff00) >> 8
    //   red   = (word & 0x00ff0000) >> 16
    //   alpha = (word & 0xff000000) >> 24
    const unsigned int kNoBlueMask = 0xffffff00u;

    // Number of 32-bit pixels that fit in one 128-bit SSE2 register.
    const size_t kPixelsPerVector = 4;

    // Byte alignment required by _mm_load_si128 / _mm_store_si128.
    const size_t kSseAlignment = 16;

    // A locked bitmap viewed as one flat run of 32-bit words.
    struct PixelSpan
    {
        unsigned int *pFirst; // lowest-addressed word of the locked buffer
        size_t        count;  // number of 32-bit words: rows * |Stride| / 4
    };

    // Locks the entire bitmap as 32bpp ARGB for reading AND writing.
    // Every routine in this file does a read-modify-write on the pixels, so a
    // write-only lock is not sufficient. Returns false (nothing locked) when
    // pBitmap is null or GDI+ reports a failure.
    bool LockWholeBitmap(Bitmap *pBitmap, BitmapData &bitmapData)
    {
        if (!pBitmap)
            return false;

        // Named local: taking the address of a temporary Rect is not standard C++.
        const Rect lockArea(0, 0, pBitmap->GetWidth(), pBitmap->GetHeight());
        return pBitmap->LockBits(&lockArea,
                                 ImageLockModeRead | ImageLockModeWrite,
                                 PixelFormat32bppARGB,
                                 &bitmapData) == Gdiplus::Ok;
    }

    // Describes the locked buffer as a flat array of 32-bit words.
    // A negative Stride means the bitmap is bottom-up: Scan0 then addresses the
    // row that sits highest in memory, so the buffer really starts at the last row.
    PixelSpan FlattenLockedBits(const BitmapData &bitmapData)
    {
        const ptrdiff_t strideBytes = bitmapData.Stride;
        const size_t rows = bitmapData.Height;
        const size_t wordsPerRow =
            static_cast<size_t>(strideBytes < 0 ? -strideBytes : strideBytes) / sizeof(unsigned int);

        unsigned char *pStart = static_cast<unsigned char*>(bitmapData.Scan0);
        if (strideBytes < 0 && rows > 0)
            pStart += static_cast<ptrdiff_t>(rows - 1) * strideBytes;

        PixelSpan span;
        span.pFirst = reinterpret_cast<unsigned int*>(pStart);
        span.count = rows * wordsPerRow;
        return span;
    }

    // Largest multiple of kPixelsPerVector that is <= count; the SSE loops stop
    // here so they never touch memory beyond the end of the buffer.
    size_t VectorizableCount(size_t count)
    {
        return count - (count % kPixelsPerVector);
    }

    // Clears the blue byte of pPixels[first .. last) one pixel at a time.
    void ClearBlueScalar(unsigned int *pPixels, size_t first, size_t last)
    {
        for (size_t i = first; i < last; ++i)
            pPixels[i] &= kNoBlueMask;
    }
}

CImageProcessor::CImageProcessor(void)
{
}

CImageProcessor::~CImageProcessor(void)
{
}

// Baseline version: nested loops with x outermost, so consecutive accesses are a
// full row apart in memory. The traversal order is kept as-is on purpose; it is
// the slow reference that RemoveBlue2 improves on.
// Does nothing if pBitmap is null or the bitmap cannot be locked.
void CImageProcessor::RemoveBlue(Bitmap * pBitmap)
{
    BitmapData bitmapData;
    if (!LockWholeBitmap(pBitmap, bitmapData))
        return;

    unsigned int *pRawBitmapOrig = static_cast<unsigned int*>(bitmapData.Scan0); // for easy access and indexing
    const ptrdiff_t width = static_cast<ptrdiff_t>(bitmapData.Width);
    const ptrdiff_t height = static_cast<ptrdiff_t>(bitmapData.Height);
    const ptrdiff_t wordsPerRow = bitmapData.Stride / 4; // signed: also valid for bottom-up bitmaps

    for (ptrdiff_t x = 0; x < width; x++)
    {
        for (ptrdiff_t y = 0; y < height; y++)
        {
            pRawBitmapOrig[y * wordsPerRow + x] &= kNoBlueMask;
        }
    }

    pBitmap->UnlockBits(&bitmapData);
}

// This version does away with the nested for loops, and instead does only one for loop.
// This way, memory is walked linearly, and a speedup will be achieved
// since there will be greater cache coherency.
// Does nothing if pBitmap is null or the bitmap cannot be locked.
void CImageProcessor::RemoveBlue2(Bitmap * pBitmap)
{
    BitmapData bitmapData;
    if (!LockWholeBitmap(pBitmap, bitmapData))
        return;

    const PixelSpan pixels = FlattenLockedBits(bitmapData);
    ClearBlueScalar(pixels.pFirst, 0, pixels.count); // for loop reduced to one call

    pBitmap->UnlockBits(&bitmapData);
}

// First SSE2 version: copies the pixels into a 16-byte aligned scratch buffer,
// masks them there with aligned loads/stores, then copies the result back.
// Falls back to the scalar in-place loop if the scratch buffer cannot be allocated.
// Does nothing if pBitmap is null or the bitmap cannot be locked.
void CImageProcessor::RemoveBlueSSE(Bitmap * pBitmap)
{
    BitmapData bitmapData;
    if (!LockWholeBitmap(pBitmap, bitmapData))
        return;

    const PixelSpan pixels = FlattenLockedBits(bitmapData);
    const size_t byteCount = pixels.count * sizeof(unsigned int);

    // Aligned SSE2 loads/stores require a 16 byte boundary, so copy the memory over to an aligned space
    unsigned int *pSSEArray = static_cast<unsigned int*>(_aligned_malloc(byteCount, kSseAlignment));
    if (!pSSEArray)
    {
        ClearBlueScalar(pixels.pFirst, 0, pixels.count);
        pBitmap->UnlockBits(&bitmapData);
        return;
    }
    std::memcpy(pSSEArray, pixels.pFirst, byteCount);

    const size_t vectorEnd = VectorizableCount(pixels.count);
    for (size_t i = 0; i < vectorEnd; i += kPixelsPerVector)
    {
        __m128i curPixelGroup = _mm_load_si128(reinterpret_cast<const __m128i*>(&pSSEArray[i])); // This is a group of four pixels
        __m128i noBlueMask = _mm_set1_epi32(static_cast<int>(kNoBlueMask)); // rebuilt each pass on purpose; hoisted in RemoveBlueSSE4
        __m128i newPixelGroup = _mm_and_si128(curPixelGroup, noBlueMask);
        _mm_store_si128(reinterpret_cast<__m128i*>(&pSSEArray[i]), newPixelGroup);
    }
    // Up to three leftover pixels when the count is not a multiple of four.
    ClearBlueScalar(pSSEArray, vectorEnd, pixels.count);

    // Copy the aligned memory back into the original
    std::memcpy(pixels.pFirst, pSSEArray, byteCount);

    pBitmap->UnlockBits(&bitmapData);
    _aligned_free(pSSEArray);
}

// Second SSE2 version: still loads from an aligned scratch copy, but stores each
// result straight into the bitmap with the unaligned store, which removes the
// copy-back step. Falls back to the scalar in-place loop if allocation fails.
// Does nothing if pBitmap is null or the bitmap cannot be locked.
void CImageProcessor::RemoveBlueSSE2(Bitmap * pBitmap)
{
    BitmapData bitmapData;
    if (!LockWholeBitmap(pBitmap, bitmapData))
        return;

    const PixelSpan pixels = FlattenLockedBits(bitmapData);
    const size_t byteCount = pixels.count * sizeof(unsigned int);

    // Aligned SSE2 loads require a 16 byte boundary, so copy the memory over to an aligned space
    unsigned int *pSSEArray = static_cast<unsigned int*>(_aligned_malloc(byteCount, kSseAlignment));
    if (!pSSEArray)
    {
        ClearBlueScalar(pixels.pFirst, 0, pixels.count);
        pBitmap->UnlockBits(&bitmapData);
        return;
    }
    std::memcpy(pSSEArray, pixels.pFirst, byteCount);

    const size_t vectorEnd = VectorizableCount(pixels.count);
    for (size_t i = 0; i < vectorEnd; i += kPixelsPerVector)
    {
        __m128i curPixelGroup = _mm_load_si128(reinterpret_cast<const __m128i*>(&pSSEArray[i])); // This is a group of four pixels
        __m128i noBlueMask = _mm_set1_epi32(static_cast<int>(kNoBlueMask)); // rebuilt each pass on purpose; hoisted in RemoveBlueSSE4
        __m128i newPixelGroup = _mm_and_si128(curPixelGroup, noBlueMask);
        _mm_storeu_si128(reinterpret_cast<__m128i*>(&pixels.pFirst[i]), newPixelGroup); // store with unaligned instruction
    }
    // The tail of the bitmap was never overwritten above, so mask it in place.
    ClearBlueScalar(pixels.pFirst, vectorEnd, pixels.count);

    // No need to copy back, since we stored with unaligned instruction

    pBitmap->UnlockBits(&bitmapData);
    _aligned_free(pSSEArray);
}

// This function loads and stores to sse2 registers with unaligned-ok intrinsics.
// No need for memory copies!
// Does nothing if pBitmap is null or the bitmap cannot be locked.
void CImageProcessor::RemoveBlueSSE3(Bitmap * pBitmap)
{
    BitmapData bitmapData;
    if (!LockWholeBitmap(pBitmap, bitmapData))
        return;

    const PixelSpan pixels = FlattenLockedBits(bitmapData);
    const size_t vectorEnd = VectorizableCount(pixels.count);

    for (size_t i = 0; i < vectorEnd; i += kPixelsPerVector)
    {
        __m128i curPixelGroup = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&pixels.pFirst[i])); // This is a group of four pixels
        __m128i noBlueMask = _mm_set1_epi32(static_cast<int>(kNoBlueMask)); // rebuilt each pass on purpose; hoisted in RemoveBlueSSE4
        __m128i newPixelGroup = _mm_and_si128(curPixelGroup, noBlueMask);
        _mm_storeu_si128(reinterpret_cast<__m128i*>(&pixels.pFirst[i]), newPixelGroup); // store with unaligned instruction
    }
    ClearBlueScalar(pixels.pFirst, vectorEnd, pixels.count);

    pBitmap->UnlockBits(&bitmapData);
}

// This function moves noBlueMask out of the for loop.
// Does nothing if pBitmap is null or the bitmap cannot be locked.
void CImageProcessor::RemoveBlueSSE4(Bitmap * pBitmap)
{
    BitmapData bitmapData;
    if (!LockWholeBitmap(pBitmap, bitmapData))
        return;

    const PixelSpan pixels = FlattenLockedBits(bitmapData);
    const size_t vectorEnd = VectorizableCount(pixels.count);
    const __m128i noBlueMask = _mm_set1_epi32(static_cast<int>(kNoBlueMask));

    for (size_t i = 0; i < vectorEnd; i += kPixelsPerVector)
    {
        __m128i curPixelGroup = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&pixels.pFirst[i])); // This is a group of four pixels
        __m128i newPixelGroup = _mm_and_si128(curPixelGroup, noBlueMask);
        _mm_storeu_si128(reinterpret_cast<__m128i*>(&pixels.pFirst[i]), newPixelGroup); // store with unaligned instruction
    }
    ClearBlueScalar(pixels.pFirst, vectorEnd, pixels.count);

    pBitmap->UnlockBits(&bitmapData);
}

// Almost always, the bitmap is going to be 16-byte aligned.
// We can take advantage of that and plan for the most common case: use the
// aligned load/store intrinsics in place, and hand off to RemoveBlueSSE4 when
// the buffer turns out not to be aligned.
// Does nothing if pBitmap is null or the bitmap cannot be locked.
void CImageProcessor::RemoveBlueSSE5(Bitmap * pBitmap)
{
    BitmapData bitmapData;
    if (!LockWholeBitmap(pBitmap, bitmapData))
        return;

    const PixelSpan pixels = FlattenLockedBits(bitmapData);

    // uintptr_t holds a full pointer on both 32- and 64-bit builds.
    const uintptr_t address = reinterpret_cast<uintptr_t>(pixels.pFirst);
    if (address & (kSseAlignment - 1)) // if any of the lower four bits are 1, we aren't 16-byte aligned
    {
        pBitmap->UnlockBits(&bitmapData);
        RemoveBlueSSE4(pBitmap); // use unaligned version
        return;
    }

    const size_t vectorEnd = VectorizableCount(pixels.count);
    const __m128i noBlueMask = _mm_set1_epi32(static_cast<int>(kNoBlueMask));

    for (size_t i = 0; i < vectorEnd; i += kPixelsPerVector)
    {
        __m128i curPixelGroup = _mm_load_si128(reinterpret_cast<const __m128i*>(&pixels.pFirst[i])); // This is a group of four pixels
        __m128i newPixelGroup = _mm_and_si128(curPixelGroup, noBlueMask);
        _mm_store_si128(reinterpret_cast<__m128i*>(&pixels.pFirst[i]), newPixelGroup); // store with aligned instruction
    }
    ClearBlueScalar(pixels.pFirst, vectorEnd, pixels.count);

    pBitmap->UnlockBits(&bitmapData);
}