About

Hello, and welcome to the Supercomputing Blog! The purpose of this blog is to teach you how to create your own high-performance, multi-threaded applications. There are many technologies to choose from, including MPI, OpenMP, CUDA, and others. By using proper high-performance programming techniques, you can squeeze an incredible amount of performance out of your applications, whether they run on just one computer or on a farm of several hundred computers. Feedback and suggestions are always welcome.

6 Comments

  1. VINCENT C. FULCO says:

    Dear Admin- Kudos on a phenomenal site providing easily digestible teach-ins on CUDA specifically. While trying to climb the steep curve of C++ and CUDA at the same time, your excellent illustrations make the journey that much more surmountable and rewarding.

    Best regards, Vince

  2. arief nur andono says:

    Dear Admin…
    I love the detail of the tutorials, but it would be great if you could also add a PyCUDA tutorial starting from the basics. I hope you can blog about it soon.

    regards, arief

  3. Francesca Barcelona says:

    Hi, your site really helped us complete our thesis. Would it be possible to ask for your contact information so that we can formally cite you in our dissertation?

  4. Vivek says:

    In Tutorial 3, can you also save the index of the maximum and minimum elements in the array? If so, any suggestions?

  5. Vivek says:

    Can you modify Tutorial 3 to also save the indexes where the max and min values occur?

  6. Vivek says:

    got it to work:

    //getmax kernel
    __global__ void getmax(float *pArray, float *pMaxResults, int *index)
    {
        // Declare arrays to be in shared memory.
        __shared__ float max[16];

        int nTotalThreads = blockDim.x; // Total number of active threads
        float temp;
        float max_val;
        int max_index;

        // Calculate which element this thread reads from memory
        int arrayIndex = gridDim.x*blockDim.x*blockIdx.y + blockDim.x*blockIdx.x + threadIdx.x;
        max[threadIdx.x] = pArray[arrayIndex];
        max_val = max[threadIdx.x];
        max_index = arrayIndex;
        __syncthreads();

        while(nTotalThreads > 1)
        {
            int halfPoint = (nTotalThreads >> 1);
            // Only the first half of the threads remain active.
            if (threadIdx.x < halfPoint)
            {
                // Compare against the value held by the partner thread in the upper half.
                temp = max[threadIdx.x + halfPoint];
                if (temp > max[threadIdx.x])
                {
                    max[threadIdx.x] = temp;
                    max_val = max[threadIdx.x];
                    // max_index = arrayIndex;
                }
            }
            __syncthreads();

            nTotalThreads = (nTotalThreads >> 1); // divide by two.
        }

        // Thread 0 writes this block's maximum value.
        if (threadIdx.x == 0)
        {
            pMaxResults[25*blockIdx.y + blockIdx.x] = max[threadIdx.x];
        }

        // The thread whose value matches the block maximum records its original index.
        if(max[0] == max_val)
        {
            index[25*blockIdx.y + blockIdx.x] = max_index;
        }
    }

    __global__ void getmax_dynamic(float *pArray, float *pMaxResults, int *index)
    {
        // Declare arrays to be in shared memory.
        __shared__ float max[16];

        int nTotalThreads = blockDim.x; // Total number of active threads
        float temp;
        float max_val;
        int max_index;

        // Calculate which element this thread reads from memory
        int arrayIndex = gridDim.x*blockDim.x*blockIdx.y + blockDim.x*blockIdx.x + threadIdx.x;
        max[threadIdx.x] = pArray[arrayIndex];
        max_val = max[threadIdx.x];
        max_index = arrayIndex;
        __syncthreads();

        while(nTotalThreads > 1)
        {
            int halfPoint = (nTotalThreads >> 1);
            // Only the first half of the threads remain active.
            if (threadIdx.x < halfPoint)
            {
                // Compare against the value held by the partner thread in the upper half.
                temp = max[threadIdx.x + halfPoint];
                if (temp > max[threadIdx.x])
                {
                    max[threadIdx.x] = temp;
                    max_val = max[threadIdx.x];
                    // max_index = arrayIndex;
                }
            }
            __syncthreads();

            nTotalThreads = (nTotalThreads >> 1); // divide by two.
        }

        // Thread 0 writes the reduced maximum value for this block.
        if (threadIdx.x == 0)
        {
            pMaxResults[blockIdx.x] = max[threadIdx.x];
        }

        // The thread holding the maximum resolves the original index.
        // (num_blocks is assumed to be defined elsewhere, e.g. as a constant.)
        if(max[0] == max_val)
        {
            if(max_index >= 2*num_blocks)
            {
                index[0] = max_index;
            }
            else
            {
                int indx = index[max_index];
                index[0] = indx;
            }
        }
    }
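
    For reference, here is a minimal host-side sketch of how the first kernel above might be driven. It is not part of the original comment: the grid shape (25 x 2 blocks of 16 threads, matching the shared array size and the 25*blockIdx.y indexing), the buffer names, and the choice to finish the reduction on the CPU instead of calling getmax_dynamic are all assumptions for illustration. It assumes the getmax kernel is defined earlier in the same file.

    #include <cstdio>
    #include <cstdlib>
    #include <cuda_runtime.h>

    int main()
    {
        const int threadsPerBlock = 16;      // must match __shared__ float max[16]
        const dim3 grid(25, 2);              // gridDim.x = 25 matches the 25*blockIdx.y indexing
        const int numBlocks = grid.x * grid.y;
        const int N = numBlocks * threadsPerBlock;

        // Fill a host array with random values.
        float *h_data = (float*)malloc(N * sizeof(float));
        for (int i = 0; i < N; i++) h_data[i] = (float)rand() / RAND_MAX;

        // Device buffers: input, one partial max per block, one index per block.
        float *d_data, *d_blockMax;
        int *d_blockIdx;
        cudaMalloc((void**)&d_data, N * sizeof(float));
        cudaMalloc((void**)&d_blockMax, numBlocks * sizeof(float));
        cudaMalloc((void**)&d_blockIdx, numBlocks * sizeof(int));
        cudaMemcpy(d_data, h_data, N * sizeof(float), cudaMemcpyHostToDevice);

        // First pass on the GPU: per-block maxima and the indices where they occur.
        getmax<<<grid, threadsPerBlock>>>(d_data, d_blockMax, d_blockIdx);

        // Finish the reduction on the host for simplicity.
        float *h_blockMax = (float*)malloc(numBlocks * sizeof(float));
        int *h_blockIdx = (int*)malloc(numBlocks * sizeof(int));
        cudaMemcpy(h_blockMax, d_blockMax, numBlocks * sizeof(float), cudaMemcpyDeviceToHost);
        cudaMemcpy(h_blockIdx, d_blockIdx, numBlocks * sizeof(int), cudaMemcpyDeviceToHost);

        float best = h_blockMax[0];
        int bestIdx = h_blockIdx[0];
        for (int b = 1; b < numBlocks; b++)
        {
            if (h_blockMax[b] > best) { best = h_blockMax[b]; bestIdx = h_blockIdx[b]; }
        }
        printf("max = %f at index %d\n", best, bestIdx);

        cudaFree(d_data); cudaFree(d_blockMax); cudaFree(d_blockIdx);
        free(h_data); free(h_blockMax); free(h_blockIdx);
        return 0;
    }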