CUDA: Shared memory over a large-ish 2D array

Tags:

cuda

I had a simple CUDA problem for a class assignment, but the professor added an optional task to implement the same algorithm using shared memory instead. I was unable to finish it before the deadline (as in, the turn-in date was a week ago) but I'm still curious so now I'm going to ask the internet ;).

The basic assignment was to implement a bastardized version of red-black successive over-relaxation both sequentially and in CUDA, make sure you got the same result in both, and then compare the speedup. Like I said, doing it with shared memory was an optional +10% add-on.

I'm going to post my working version and pseudocode for what I've attempted, since I don't have the code in my hands at the moment, but I can update this later with the actual code if someone needs it.

Before anyone says it: Yes, I know using CUtil is lame, but it made the comparison and timers easier.

Working global memory version:

#include <stdlib.h>
#include <stdio.h>
#include <cutil_inline.h>

#define N 1024

__global__ void kernel(int *d_A, int *d_B) {
    unsigned int index_x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int index_y = blockIdx.y * blockDim.y + threadIdx.y;

    // map the two 2D indices to a single linear, 1D index
    unsigned int grid_width = gridDim.x * blockDim.x;
    unsigned int index = index_y * grid_width + index_x;

    // check for boundaries and write out the result
    if((index_x > 0) && (index_y > 0) && (index_x < N-1) && (index_y < N-1))
        d_B[index] = (d_A[index-1]+d_A[index+1]+d_A[index+N]+d_A[index-N])/4;

}

int main(int argc, char **argv) {

    int A[N][N], B[N][N];
    int *d_A, *d_B; // These are the copies of A and B on the GPU
    int *h_B; // This is a host copy of the output of B from the GPU
    int i, j;
    int num_bytes = N * N * sizeof(int);

    // Input is randomly generated
    for(i=0;i<N;i++) {
        for(j=0;j<N;j++) {
            A[i][j] = rand()/1795831;
            //printf("%d\n",A[i][j]);
        }
    }

    cudaEvent_t start_event0, stop_event0;
    float elapsed_time0;
    CUDA_SAFE_CALL( cudaEventCreate(&start_event0) );
    CUDA_SAFE_CALL( cudaEventCreate(&stop_event0) );
    cudaEventRecord(start_event0, 0);
    // sequential implementation of main computation
    for(i=1;i<N-1;i++) {
        for(j=1;j<N-1;j++) {
            B[i][j] = (A[i-1][j]+A[i+1][j]+A[i][j-1]+A[i][j+1])/4;
        }
    }
    cudaEventRecord(stop_event0, 0);
    cudaEventSynchronize(stop_event0);
    CUDA_SAFE_CALL( cudaEventElapsedTime(&elapsed_time0,start_event0, stop_event0) );



    h_B = (int *)malloc(num_bytes);
    memset(h_B, 0, num_bytes);
    //ALLOCATE MEMORY FOR GPU COPIES OF A AND B
    cudaMalloc((void**)&d_A, num_bytes);
    cudaMalloc((void**)&d_B, num_bytes);
    cudaMemset(d_A, 0, num_bytes);
    cudaMemset(d_B, 0, num_bytes);

    //COPY A TO GPU
    cudaMemcpy(d_A, A, num_bytes, cudaMemcpyHostToDevice);

    // create CUDA event handles for timing purposes
    cudaEvent_t start_event, stop_event;
    float elapsed_time;
    CUDA_SAFE_CALL( cudaEventCreate(&start_event) );
    CUDA_SAFE_CALL( cudaEventCreate(&stop_event) );
    cudaEventRecord(start_event, 0);

// TODO: CREATE BLOCKS AND THREADS AND INVOKE GPU KERNEL
    dim3 block_size(256,1,1); //values experimentally determined to be fastest

    dim3 grid_size;
    grid_size.x = N / block_size.x;
    grid_size.y = N / block_size.y;

    kernel<<<grid_size,block_size>>>(d_A,d_B);

    cudaEventRecord(stop_event, 0);
    cudaEventSynchronize(stop_event);
    CUDA_SAFE_CALL( cudaEventElapsedTime(&elapsed_time,start_event, stop_event) );

    //COPY B BACK FROM GPU
    cudaMemcpy(h_B, d_B, num_bytes, cudaMemcpyDeviceToHost);

    // Verify result is correct
    CUTBoolean res = cutComparei( (int *)B, (int *)h_B, N*N);
    printf("Test %s\n",(1 == res)?"Passed":"Failed");
    printf("Elapsed Time for Sequential: \t%.2f ms\n", elapsed_time0);
    printf("Elapsed Time for CUDA:\t%.2f ms\n", elapsed_time);
    printf("CUDA Speedup:\t%.2fx\n",(elapsed_time0/elapsed_time));

    cudaFree(d_A);
    cudaFree(d_B);
    free(h_B);

    cutilDeviceReset();
}

For the shared memory version, this is what I've tried so far:

#define N 1024

__global__ void kernel(int *d_A, int *d_B, int width) {
    //assuming width is 64 because that's the biggest number I can make it
    //each MP has 48KB of shared mem, which is 12K ints, 32 threads/warp, so max 375 ints/thread?
    __shared__ int A_sh[3][66];

    //get x and y index and turn it into linear index

    for(i=0; i < width+2; i++)  //have to load 2 extra values due to the -1 and +1 in algo
          A_sh[index_y%3][i] = d_A[index+i-1]; //so A_sh[index_y%3][0] is actually d_A[index-1]

    __syncthreads(); //and hope that previous and next row have been loaded by other threads in the block?

    //ignore boundary conditions because it's pseudocode
    for(i=0; i < width; i++)
        d_B[index+i] = A_sh[index_y%3][i] + A_sh[index_y%3][i+2] + A_sh[index_y%3-1][i+1] + A_sh[index_y%3+1][i+1];

}

int main() {
   //same init as above until threads/grid init

   dim3 threadsperblk(32,16);
   dim3 numblks(32,64);

   kernel<<<numblks,threadsperblk>>>(d_A,d_B,64);

   //rest is the same
}

This shared mem code crashes ("launch failed due to unspecified error") since I haven't caught all the boundary conditions yet, but I'm not worried about that as much as finding the correct way to get things going. I feel that my code is way too complicated to be the correct path (especially compared to the SDK examples), but I also can't see another way to do it since my array doesn't fit into shared mem like all the examples I can find.

And frankly, I'm not sure it would be that much faster on my hardware (GTX 560 Ti - runs the global memory version in 0.121ms), but I need to prove it to myself first :P

Edit 2: For anyone who runs across this in the future, the code in the answer is a good starting point if you want to do some shared memory.

asked Apr 26 '11 by a5ehren


1 Answer

The key to getting the most out of this sort of stencil operator in CUDA is data reuse. I have found that the best approach is usually to have each block "walk" through one dimension of the grid. After the block has loaded an initial tile of data into shared memory, only a single row (in a row-major 2D problem) needs to be read from global memory to have the necessary data in shared memory for the second and subsequent row calculations. The rest of the data can simply be reused. To visualise how the shared memory buffer looks through the first four steps of this sort of algorithm:

  1. Three "rows" (a,b,c) of the input grid are loaded to shared memory, and the stencil computed for row (b) and written to global memory

    aaaaaaaaaaaaaaaa bbbbbbbbbbbbbbbb cccccccccccccccc

  2. Another row (d) is loaded into the shared memory buffer, replacing row (a), and the calculations made for row (c) using a different stencil, reflecting where the row data is in shared memory

    dddddddddddddddd bbbbbbbbbbbbbbbb cccccccccccccccc

  3. Another row (e) is loaded into the shared memory buffer, replacing row (b), and the calculations made for row (d), using a different stencil from either step 1 or 2.

    dddddddddddddddd eeeeeeeeeeeeeeee cccccccccccccccc

  4. Another row (f) is loaded into the shared memory buffer, replacing row (c), and the calculations made for row (e). Now the data is back to the same layout as used in step 1, and the same stencil used in step 1 can be used.

    dddddddddddddddd eeeeeeeeeeeeeeee ffffffffffffffff

The whole cycle repeats until the block has traversed the full column length of the input grid. The reason for using different stencils rather than shifting the data in the shared memory buffer comes down to performance: shared memory has only about 1000 GB/s of bandwidth on Fermi, and shifting the data would become a bottleneck in fully optimal code. You should try different buffer sizes, because you might find that smaller buffers allow higher occupancy and improved kernel throughput.

EDIT: To give a concrete example of how that might be implemented:

template<int width>
__device__ void rowfetch(int *in, int *out, int col)
{
    *out = *in;
    if (col == 1) *(out-1) = *(in-1);   
    if (col == width) *(out+1) = *(in+1);   
}

template<int width>
__global__ void stencilkernel(int *in, int *out, int nrows, unsigned int lda)
{
    // shared buffer holds three rows x (width+2) cols(threads)
    __shared__ volatile int buffer [3][2+width]; 

    int colid = threadIdx.x + blockIdx.x * blockDim.x;
    int tid = threadIdx.x + 1;

    int * rowpos = &in[colid], * outpos = &out[colid];

    // load the first three rows (compiler will unroll loop)
    for(int i=0; i<3; i++, rowpos+=lda) {
        rowfetch<width>(rowpos, &buffer[i][tid], tid);
    }

    __syncthreads(); // shared memory loaded and all threads ready

    int brow = 0; // brow is the next buffer row to load data onto
    for(int i=0; i<nrows; i++, rowpos+=lda, outpos+=lda) {

        // Do stencil calculations - use the value of brow to determine which
        // stencil to use (see the sketch after this code for one possibility)
        int result = 0; // placeholder for the actual stencil computation

        // write result to outpos
        *outpos = result;

        // Fetch another row
        __syncthreads(); // Wait until all threads are done calculating
        rowfetch<width>(rowpos, &buffer[brow][tid], tid);
        brow = (brow < 2) ? (brow+1) : 0; // Increment or roll brow over
        __syncthreads(); // Wait until all threads have updated the buffer
    }
}
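The placeholder above deliberately leaves the stencil arithmetic out. As a minimal sketch of how it might be filled in, assuming the buffer layout of the kernel above and the same four-point average the question uses (the names above/centre/below are mine, not part of the original code): at the start of each loop iteration, brow indexes the buffer row that is about to be overwritten, which is also the oldest row still resident, i.e. the row above the one being computed.

    // hypothetical stencil selection, placed where "int result = 0;" sits above
    int above  = brow;            // oldest buffered row: the row above the output row
    int centre = (brow + 1) % 3;  // row the stencil is centred on
    int below  = (brow + 2) % 3;  // most recently fetched row: the row below

    int result = (buffer[centre][tid - 1] + buffer[centre][tid + 1]
                + buffer[above][tid] + buffer[below][tid]) / 4;

Because brow only ever takes the values 0, 1 and 2, the modulo arithmetic could equally be replaced by three unrolled cases, which is exactly the "different stencil per step" idea described in the numbered steps above.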
answered by talonmies