Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Performance of static versus dynamic CUDA shared memory allocation

I have 2 kernels that do exactly the same thing. One of them allocates shared memory statically while the other allocates the memory dynamically at run time. I am using the shared memory as 2D array. So for the dynamic allocation, I have a macro that computes the memory location. Now, the results generated by the 2 kernels are exactly the same. However, the timing results I got from both kernels are 3 times apart! The static memory allocation is much faster. I am sorry that I can't post any of my code. Can someone give a justification for this?

like image 772
small_potato Avatar asked Nov 20 '25 14:11

small_potato


1 Answer

I have no evidence that static shared memory allocation is faster than dynamic shared memory allocation. As was evidenced in the comments above, it would be impossible to answer your question without a reproducer. In at least the case of the code below, the timings of the same kernel, when run with static or dynamic shared memory allocations, are exactly the same:

#include <cuda.h>
#include <stdio.h>

#define BLOCK_SIZE 512

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Prints the CUDA error string with the source location and (by default)
// terminates the process with the error code.
// NOTE: `file` must be const char* — __FILE__ expands to a string literal,
// and binding a string literal to non-const char* is ill-formed in C++11+.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess) 
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

/***********************************/
/* SHARED MEMORY STATIC ALLOCATION */
/***********************************/
// Squares each element of d_inout in place, staging values through a
// statically allocated shared-memory buffer (one int per thread).
// Expected launch: 1D grid, blockDim.x == BLOCK_SIZE, no dynamic shared mem.
__global__ void kernel_static_memory_allocation(int *d_inout, int N)
{
    __shared__ int s[BLOCK_SIZE];

    const int tid   = threadIdx.x;
    const int i     = blockIdx.x * blockDim.x + threadIdx.x;

    // Bounds check guards only the memory accesses; the barriers sit OUTSIDE
    // the branch. In the original, __syncthreads() was inside `if (i < N)`,
    // which is undefined behavior for the tail block where threads diverge —
    // every thread of the block must reach each __syncthreads().
    if (i < N) s[tid] = d_inout[i];
    __syncthreads();

    if (i < N) s[tid] = s[tid] * s[tid];
    __syncthreads();

    if (i < N) d_inout[i] = s[tid];
}

/************************************/
/* SHARED MEMORY DYNAMIC ALLOCATION */
/************************************/
// Identical computation to kernel_static_memory_allocation, but the shared
// buffer is dynamically sized: the caller must pass at least
// blockDim.x * sizeof(int) bytes as the third launch parameter.
__global__ void kernel_dynamic_memory_allocation(int *d_inout, int N)
{
    extern __shared__ int s[];

    const int tid   = threadIdx.x;
    const int i     = blockIdx.x * blockDim.x + threadIdx.x;

    // Barriers are hoisted out of the divergent `if (i < N)` branch — a
    // __syncthreads() not reached by all threads of the block is undefined
    // behavior (the original placed both barriers inside the guard).
    if (i < N) s[tid] = d_inout[i];
    __syncthreads();

    if (i < N) s[tid] = s[tid] * s[tid];
    __syncthreads();

    if (i < N) d_inout[i] = s[tid];
}

/********/
/* MAIN */
/********/
// Benchmarks the static- vs dynamic-shared-memory kernels on the same data.
// Fixes over the original: every CUDA call is error-checked, timing uses the
// standard event pattern (record stop, then cudaEventSynchronize — the extra
// cudaDeviceSynchronize before recording `stop` inflated the measurement),
// and all resources (host buffer, device buffer, events) are released.
int main(void)
{
    const int N = 1000000;

    // Host-side test data: a[i] = i.
    int *a = (int *)malloc(N * sizeof(int));
    if (a == NULL) { fprintf(stderr, "host malloc failed\n"); return 1; }
    for (int i = 0; i < N; i++) { a[i] = i; }

    int *d_inout;
    gpuErrchk(cudaMalloc(&d_inout, N * sizeof(int)));
    gpuErrchk(cudaMemcpy(d_inout, a, N * sizeof(int), cudaMemcpyHostToDevice));

    // Ceiling division so the tail partial block is covered.
    const int n_blocks = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;

    float time;
    cudaEvent_t start, stop;
    gpuErrchk(cudaEventCreate(&start));
    gpuErrchk(cudaEventCreate(&stop));

    // --- static shared-memory allocation ---
    gpuErrchk(cudaEventRecord(start, 0));
    kernel_static_memory_allocation<<<n_blocks, BLOCK_SIZE>>>(d_inout, N);
    gpuErrchk(cudaPeekAtLastError());           // catch launch-config errors
    gpuErrchk(cudaEventRecord(stop, 0));
    gpuErrchk(cudaEventSynchronize(stop));      // waits for the kernel to finish
    gpuErrchk(cudaEventElapsedTime(&time, start, stop));
    printf("Static allocation - elapsed time:  %3.3f ms \n", time);

    // --- dynamic shared-memory allocation (3rd launch arg = bytes of smem) ---
    gpuErrchk(cudaEventRecord(start, 0));
    kernel_dynamic_memory_allocation<<<n_blocks, BLOCK_SIZE, BLOCK_SIZE * sizeof(int)>>>(d_inout, N);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaEventRecord(stop, 0));
    gpuErrchk(cudaEventSynchronize(stop));
    gpuErrchk(cudaEventElapsedTime(&time, start, stop));
    printf("Dynamic allocation - elapsed time:  %3.3f ms \n", time);

    // Release all resources (the original leaked every one of these).
    gpuErrchk(cudaEventDestroy(start));
    gpuErrchk(cudaEventDestroy(stop));
    gpuErrchk(cudaFree(d_inout));
    free(a);

    return 0;
}

A likely explanation is that the disassembled code for the two kernels is exactly the same, and does not change even when `int N = 1000000;` is replaced with `int N = rand();`.

like image 66
Vitality Avatar answered Nov 24 '25 23:11

Vitality



Donate For Us

If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!