#include <stdio.h>
#include <stdlib.h>
#include <cutil.h>   // from the old CUDA SDK samples; provides the CUT_*/cut* helpers used below

/*
    This file can be downloaded from supercomputingblog.com.
    This is part of a series of tutorials that demonstrate how to use CUDA.
    The tutorials will also demonstrate the speed of using CUDA.
*/

// IMPORTANT NOTE: for this data size, your graphics card should have at least 256 megabytes of memory.
// If your GPU has less memory, then you will need to decrease this data size.

#define MAX_DATA_SIZE       1024*1024*8     // about 8 million elements
// The max data size must be an integer multiple of 128*256, because each block will have 256 threads,
// and the block grid width will be 128. These are arbitrary numbers I chose.
#define THREADS_PER_BLOCK   256

__global__ void NaiveKernel(int *pA, int *pB, int *pC, int *pD, int *pMaxResult)
{
    // We are assuming 256 threads per block.
    // This function is very simple!
    // Too many atomic operations on the same address will cause a massive slowdown.
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int myResult = pA[index] * pB[index] / pC[index] + pD[index];
    atomicMax(pMaxResult, myResult);
}

__global__ void BetterKernel(int *pA, int *pB, int *pC, int *pD, int *pMaxResult)
{
    // We are assuming 256 threads per block.
    // This function is still fairly simple.
    // We try to cull out some unnecessary atomic operations.
    __shared__ int curGlobalMax;

    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int myResult = pA[index] * pB[index] / pC[index] + pD[index];

    if (threadIdx.x == 0)
    {
        // Remember, *pMaxResult will be changing all the time.
        // We're just taking a snapshot and putting it into shared memory.
        curGlobalMax = *pMaxResult;
    }
    __syncthreads();

    if (myResult > curGlobalMax)
    {
        // Only do an atomic operation if there is a chance we have a greater value.
        atomicMax(pMaxResult, myResult);
    }
}

__global__ void SmartKernel(int *pA, int *pB, int *pC, int *pD, int *pMaxResult)
{
    // We are assuming 256 threads per block.
    __shared__ int results[256];

    // This function is slightly more complicated, but has higher performance.
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int myResult = pA[index] * pB[index] / pC[index] + pD[index];
    results[threadIdx.x] = myResult;
    __syncthreads();

    // Do the reduction in shared memory.
    for (unsigned int i = blockDim.x / 2; i > 0; i >>= 1)
    {
        if (threadIdx.x < i)
        {
            if (results[threadIdx.x + i] > myResult)
            {
                myResult = results[threadIdx.x + i];
                results[threadIdx.x] = myResult;
            }
        }
        __syncthreads();
    }

    // We now have this block's maximum value stored in results[0];
    // for thread 0, that happens to be identical to its local variable myResult.
    if (threadIdx.x == 0)
    {
        atomicMax(pMaxResult, myResult);
    }
}

__global__ void SmarterKernel(int *pA, int *pB, int *pC, int *pD, int *pMaxResult)
{
    // We are assuming 256 threads per block.
    __shared__ int results[256];

    // This function is slightly more complicated, but has the highest performance.
    // This function requires CUDA compute capability 1.2 or higher.
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int myResult = pA[index] * pB[index] / pC[index] + pD[index];
    results[threadIdx.x] = myResult;
    __syncthreads();

    // Do the reduction in shared memory, using shared-memory atomics.
    for (unsigned int i = blockDim.x / 2; i > 0; i >>= 1)
    {
        if (threadIdx.x < i)
        {
            atomicMax(&(results[threadIdx.x]), results[threadIdx.x + i]);
        }
        __syncthreads();
    }

    // We now have this block's maximum value stored in results[0].
    if (threadIdx.x == 0)
    {
        atomicMax(pMaxResult, results[0]);
    }
}
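// ----------------------------------------------------------------------------
// Illustrative sketch, not part of the original tutorial: on newer hardware
// (compute capability 3.0+ with CUDA 9.0 or later) the per-block reduction can
// be done with warp shuffle instructions instead of shared memory. The name
// ShuffleKernel is ours; it still assumes 256 threads per block and is not
// launched by main() below.
// ----------------------------------------------------------------------------
__global__ void ShuffleKernel(int *pA, int *pB, int *pC, int *pD, int *pMaxResult)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int myResult = pA[index] * pB[index] / pC[index] + pD[index];

    // Reduce within each warp of 32 threads; no shared memory or
    // __syncthreads() is needed for this step.
    for (int offset = 16; offset > 0; offset >>= 1)
    {
        int other = __shfl_down_sync(0xffffffff, myResult, offset);
        if (other > myResult) myResult = other;
    }

    // Lane 0 of each warp now holds that warp's maximum, so only one atomic
    // per 32 elements reaches global memory.
    if ((threadIdx.x & 31) == 0)
        atomicMax(pMaxResult, myResult);
}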
void getMaxCPU(int *h_result, int *h_A, int *h_B, int *h_C, int *h_D, int nElems)
{
    int max = 0;
    for (int i = 0; i < nElems; i++)
    {
        int result = h_A[i] * h_B[i] / h_C[i] + h_D[i];
        if (result > max) max = result;
    }
    *h_result = max;
}

////////////////////////////////////////////////////////////////////////////////
// Main program
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    int *h_A, *h_B, *h_C, *h_D, *h_result;
    int *d_A, *d_B, *d_C, *d_D, *d_result;
    double gpuTime;
    int i;
    unsigned int hTimer;

    CUT_DEVICE_INIT(argc, argv);
    CUT_SAFE_CALL(cutCreateTimer(&hTimer));

    printf("Initializing data...\n");
    h_A = (int *)malloc(sizeof(int) * MAX_DATA_SIZE);
    h_B = (int *)malloc(sizeof(int) * MAX_DATA_SIZE);
    h_C = (int *)malloc(sizeof(int) * MAX_DATA_SIZE);
    h_D = (int *)malloc(sizeof(int) * MAX_DATA_SIZE);
    h_result = (int *)malloc(sizeof(int));

    CUDA_SAFE_CALL( cudaMalloc( (void **)&d_A, sizeof(int) * MAX_DATA_SIZE) );
    CUDA_SAFE_CALL( cudaMalloc( (void **)&d_B, sizeof(int) * MAX_DATA_SIZE) );
    CUDA_SAFE_CALL( cudaMalloc( (void **)&d_C, sizeof(int) * MAX_DATA_SIZE) );
    CUDA_SAFE_CALL( cudaMalloc( (void **)&d_D, sizeof(int) * MAX_DATA_SIZE) );
    CUDA_SAFE_CALL( cudaMalloc( (void **)&d_result, sizeof(int)) );

    srand(123);
    for (i = 0; i < MAX_DATA_SIZE; i++)
    {
        h_A[i] = rand();
        h_B[i] = rand();
        h_C[i] = rand() + 1;    // +1 so we don't have to worry about dividing by zero
        h_D[i] = rand();
    }

    int firstRun = 1;       // Indicates if it's the first execution of the for loop
    const int useGPU = 1;   // When 0, only the CPU is used. When 1, only the GPU is used.

    for (int dataAmount = MAX_DATA_SIZE; dataAmount > 128*THREADS_PER_BLOCK; dataAmount /= 2)
    {
        int blockGridWidth = dataAmount / THREADS_PER_BLOCK;
        int blockGridHeight = 1;

        dim3 blockGridRows(blockGridWidth, blockGridHeight);
        dim3 threadBlockRows(THREADS_PER_BLOCK, 1);

        // Start the timer.
        // We want to measure copying data, running the kernel, and copying the results back to the host.
        CUT_SAFE_CALL( cutResetTimer(hTimer) );
        CUT_SAFE_CALL( cutStartTimer(hTimer) );

        if (useGPU == 1)
        {
            // Copy the data to the device.
            *h_result = 0;
            CUDA_SAFE_CALL( cudaMemcpy(d_A, h_A, sizeof(int) * dataAmount, cudaMemcpyHostToDevice) );
            CUDA_SAFE_CALL( cudaMemcpy(d_B, h_B, sizeof(int) * dataAmount, cudaMemcpyHostToDevice) );
            CUDA_SAFE_CALL( cudaMemcpy(d_C, h_C, sizeof(int) * dataAmount, cudaMemcpyHostToDevice) );
            CUDA_SAFE_CALL( cudaMemcpy(d_D, h_D, sizeof(int) * dataAmount, cudaMemcpyHostToDevice) );
            CUDA_SAFE_CALL( cudaMemcpy(d_result, h_result, sizeof(int) * 1, cudaMemcpyHostToDevice) );

            // Do the calculations and find the max on the GPU.
            //NaiveKernel<<<blockGridRows, threadBlockRows>>>(d_A, d_B, d_C, d_D, d_result);
            //BetterKernel<<<blockGridRows, threadBlockRows>>>(d_A, d_B, d_C, d_D, d_result);
            //SmartKernel<<<blockGridRows, threadBlockRows>>>(d_A, d_B, d_C, d_D, d_result);
            SmarterKernel<<<blockGridRows, threadBlockRows>>>(d_A, d_B, d_C, d_D, d_result);
            CUT_CHECK_ERROR("The kernel execution failed\n");
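            // Note (not in the original tutorial): cudaThreadSynchronize() below is
            // the pre-CUDA-4.0 name for this call; newer toolkits spell it
            // cudaDeviceSynchronize(), with the same behavior here.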
            CUDA_SAFE_CALL( cudaThreadSynchronize() );

            // Copy the data back to the host. It's just 1 int.
            CUDA_SAFE_CALL( cudaMemcpy(h_result, d_result, sizeof(int) * 1, cudaMemcpyDeviceToHost) );
            printf("Result is: %d\n", *h_result);
        }
        else
        {
            // We're using the CPU only.
            getMaxCPU(h_result, h_A, h_B, h_C, h_D, dataAmount);
        }

        // Stop the timer, print the total round-trip execution time.
        CUT_SAFE_CALL(cutStopTimer(hTimer));
        gpuTime = cutGetTimerValue(hTimer);
        if (!firstRun || !useGPU)
        {
            printf("Elements: %d - Calculation time: %f msec - %f elements/sec\n",
                dataAmount, gpuTime, dataAmount / (gpuTime * 0.001));
        }
        else
        {
            firstRun = 0;
            // We discard the results of the first run because of the extra overhead
            // incurred the first time a kernel is ever executed.
            dataAmount *= 2;    // reset to the first-run value
        }
    }

    printf("Cleaning up...\n");
    CUDA_SAFE_CALL( cudaFree(d_A) );
    CUDA_SAFE_CALL( cudaFree(d_B) );
    CUDA_SAFE_CALL( cudaFree(d_C) );
    CUDA_SAFE_CALL( cudaFree(d_D) );
    CUDA_SAFE_CALL( cudaFree(d_result) );
    free(h_A);
    free(h_B);
    free(h_C);
    free(h_D);
    free(h_result);

    CUT_SAFE_CALL(cutDeleteTimer(hTimer));
    CUT_EXIT(argc, argv);
}
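// ----------------------------------------------------------------------------
// Illustrative sketch, not part of the original tutorial: the cutil timer used
// above ships with the old CUDA SDK samples. On a plain toolkit, a similar
// measurement can be taken with CUDA events. This helper is ours (hypothetical)
// and is never called by main(); it reuses SmarterKernel and THREADS_PER_BLOCK
// from above and times only the kernel launch, not the host/device copies.
// ----------------------------------------------------------------------------
float timeSmarterKernelWithEvents(int *d_A, int *d_B, int *d_C, int *d_D,
                                  int *d_result, int dataAmount)
{
    cudaEvent_t start, stop;
    float elapsedMs = 0.0f;

    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start, 0);
    SmarterKernel<<<dataAmount / THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(
        d_A, d_B, d_C, d_D, d_result);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);         // wait for the kernel to finish

    cudaEventElapsedTime(&elapsedMs, start, stop);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return elapsedMs;                   // elapsed time in milliseconds
}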