When I came across this question on SO, I was curious to know the answer, so I wrote the piece of code below to test atomic operation performance in different scenarios. The OS is Ubuntu 12.04 with CUDA 5.5 and the device is a GeForce GTX 780 (Kepler architecture). I compiled the code with the -O3 flag and for CC=3.5.
#include <stdio.h>
#include <stdlib.h>

static void HandleError( cudaError_t err, const char *file, int line ) {
    if (err != cudaSuccess) {
        printf( "%s in %s at line %d\n", cudaGetErrorString( err ), file, line );
        exit( EXIT_FAILURE );
    }
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
#define BLOCK_SIZE 256
#define RESTRICTION_SIZE 32
__global__ void CoalescedAtomicOnGlobalMem(int* data, int nElem)
{
    unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
    for (unsigned int i = tid; i < nElem; i += blockDim.x * gridDim.x) {
        atomicAdd(data + i, 6); // arbitrary number to add
    }
}

__global__ void AddressRestrictedAtomicOnGlobalMem(int* data, int nElem)
{
    unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
    for (unsigned int i = tid; i < nElem; i += blockDim.x * gridDim.x) {
        atomicAdd(data + (i & (RESTRICTION_SIZE - 1)), 6); // arbitrary number to add
    }
}

__global__ void WarpRestrictedAtomicOnGlobalMem(int* data, int nElem)
{
    unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
    for (unsigned int i = tid; i < nElem; i += blockDim.x * gridDim.x) {
        atomicAdd(data + (i >> 5), 6); // arbitrary number to add
    }
}

__global__ void SameAddressAtomicOnGlobalMem(int* data, int nElem)
{
    unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
    for (unsigned int i = tid; i < nElem; i += blockDim.x * gridDim.x) {
        atomicAdd(data, 6); // arbitrary number to add
    }
}

__global__ void CoalescedAtomicOnSharedMem(int* data, int nElem)
{
    __shared__ int smem_data[BLOCK_SIZE];
    unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
    for (unsigned int i = tid; i < nElem; i += blockDim.x * gridDim.x) {
        atomicAdd(smem_data + threadIdx.x, data[i]);
    }
}

__global__ void AddressRestrictedAtomicOnSharedMem(int* data, int nElem)
{
    __shared__ int smem_data[BLOCK_SIZE];
    unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
    for (unsigned int i = tid; i < nElem; i += blockDim.x * gridDim.x) {
        atomicAdd(smem_data + (threadIdx.x & (RESTRICTION_SIZE - 1)), data[i & (RESTRICTION_SIZE - 1)]);
    }
}

__global__ void WarpRestrictedAtomicOnSharedMem(int* data, int nElem)
{
    __shared__ int smem_data[BLOCK_SIZE];
    unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
    for (unsigned int i = tid; i < nElem; i += blockDim.x * gridDim.x) {
        atomicAdd(smem_data + (threadIdx.x >> 5), data[i >> 5]);
    }
}

__global__ void SameAddressAtomicOnSharedMem(int* data, int nElem)
{
    __shared__ int smem_data[BLOCK_SIZE];
    unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
    for (unsigned int i = tid; i < nElem; i += blockDim.x * gridDim.x) {
        atomicAdd(smem_data, data[0]);
    }
}
int main(void)
{
    const int n = 2 << 24;
    int* data = new int[n];
    for (int i = 0; i < n; i++) {
        data[i] = i % 1024 + 1;
    }
    int* dev_data;
    HANDLE_ERROR( cudaMalloc((void **)&dev_data, sizeof(int) * size_t(n)) );
    HANDLE_ERROR( cudaMemset(dev_data, 0, sizeof(int) * size_t(n)) );
    HANDLE_ERROR( cudaMemcpy(dev_data, data, n * sizeof(int), cudaMemcpyHostToDevice) );

    for (int i = 0; i < 50; i++)
    {
        dim3 blocksize(BLOCK_SIZE);
        dim3 gridsize((12 * 2048) / BLOCK_SIZE); // 12 SMX on GTX 780, each can hold 2048 resident threads
        CoalescedAtomicOnGlobalMem<<<gridsize, blocksize>>>(dev_data, n);
        HANDLE_ERROR( cudaPeekAtLastError() );
    }
    HANDLE_ERROR( cudaDeviceSynchronize() );

    for (int i = 0; i < 50; i++)
    {
        dim3 blocksize(BLOCK_SIZE);
        dim3 gridsize((12 * 2048) / BLOCK_SIZE); // 12 SMX on GTX 780, each can hold 2048 resident threads
        AddressRestrictedAtomicOnGlobalMem<<<gridsize, blocksize>>>(dev_data, n);
        HANDLE_ERROR( cudaPeekAtLastError() );
    }
    HANDLE_ERROR( cudaDeviceSynchronize() );

    for (int i = 0; i < 50; i++)
    {
        dim3 blocksize(BLOCK_SIZE);
        dim3 gridsize((12 * 2048) / BLOCK_SIZE); // 12 SMX on GTX 780, each can hold 2048 resident threads
        WarpRestrictedAtomicOnGlobalMem<<<gridsize, blocksize>>>(dev_data, n);
        HANDLE_ERROR( cudaPeekAtLastError() );
    }
    HANDLE_ERROR( cudaDeviceSynchronize() );

    for (int i = 0; i < 50; i++)
    {
        dim3 blocksize(BLOCK_SIZE);
        dim3 gridsize((12 * 2048) / BLOCK_SIZE); // 12 SMX on GTX 780, each can hold 2048 resident threads
        SameAddressAtomicOnGlobalMem<<<gridsize, blocksize>>>(dev_data, n);
        HANDLE_ERROR( cudaPeekAtLastError() );
    }
    HANDLE_ERROR( cudaDeviceSynchronize() );

    for (int i = 0; i < 50; i++)
    {
        dim3 blocksize(BLOCK_SIZE);
        dim3 gridsize((12 * 2048) / BLOCK_SIZE); // 12 SMX on GTX 780, each can hold 2048 resident threads
        CoalescedAtomicOnSharedMem<<<gridsize, blocksize>>>(dev_data, n);
        HANDLE_ERROR( cudaPeekAtLastError() );
    }
    HANDLE_ERROR( cudaDeviceSynchronize() );

    for (int i = 0; i < 50; i++)
    {
        dim3 blocksize(BLOCK_SIZE);
        dim3 gridsize((12 * 2048) / BLOCK_SIZE); // 12 SMX on GTX 780, each can hold 2048 resident threads
        AddressRestrictedAtomicOnSharedMem<<<gridsize, blocksize>>>(dev_data, n);
        HANDLE_ERROR( cudaPeekAtLastError() );
    }
    HANDLE_ERROR( cudaDeviceSynchronize() );

    for (int i = 0; i < 50; i++)
    {
        dim3 blocksize(BLOCK_SIZE);
        dim3 gridsize((12 * 2048) / BLOCK_SIZE); // 12 SMX on GTX 780, each can hold 2048 resident threads
        WarpRestrictedAtomicOnSharedMem<<<gridsize, blocksize>>>(dev_data, n);
        HANDLE_ERROR( cudaPeekAtLastError() );
    }
    HANDLE_ERROR( cudaDeviceSynchronize() );

    for (int i = 0; i < 50; i++)
    {
        dim3 blocksize(BLOCK_SIZE);
        dim3 gridsize((12 * 2048) / BLOCK_SIZE); // 12 SMX on GTX 780, each can hold 2048 resident threads
        SameAddressAtomicOnSharedMem<<<gridsize, blocksize>>>(dev_data, n);
        HANDLE_ERROR( cudaPeekAtLastError() );
    }
    HANDLE_ERROR( cudaDeviceSynchronize() );

    HANDLE_ERROR( cudaDeviceReset() );
    delete[] data;
    printf("Program finished without error.\n");
    return 0;
}
Basically, in the above code there are 8 kernels in which all threads perform atomicAdd over all the data:

1. Coalesced atomic addition on global memory (every thread updates a distinct address).
2. Atomic addition on a restricted address space (RESTRICTION_SIZE addresses) on global memory.
3. Atomic addition in which all lanes of a warp update the same address on global memory.
4. Atomic addition in which all threads of the grid update a single address on global memory.

Items 5 to 8 can be found by replacing global with shared in the above items. The chosen block size is 256.
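(As a side note, the per-kernel times could also be cross-checked with CUDA events rather than nvprof; a minimal sketch for one of the kernels, assuming the same launch configuration as in main above:)

    cudaEvent_t start, stop;
    HANDLE_ERROR( cudaEventCreate(&start) );
    HANDLE_ERROR( cudaEventCreate(&stop) );

    dim3 blocksize(BLOCK_SIZE);
    dim3 gridsize((12 * 2048) / BLOCK_SIZE);

    HANDLE_ERROR( cudaEventRecord(start) );
    CoalescedAtomicOnGlobalMem<<<gridsize, blocksize>>>(dev_data, n);
    HANDLE_ERROR( cudaEventRecord(stop) );
    HANDLE_ERROR( cudaEventSynchronize(stop) );

    float ms = 0.0f;
    HANDLE_ERROR( cudaEventElapsedTime(&ms, start, stop) ); // elapsed time in milliseconds
    printf("CoalescedAtomicOnGlobalMem: %.3f ms\n", ms);

    HANDLE_ERROR( cudaEventDestroy(start) );
    HANDLE_ERROR( cudaEventDestroy(stop) );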
I used nvprof
to profile the program. The output is:
Time(%) Time Calls Avg Min Max Name
44.33% 2.35113s 50 47.023ms 46.987ms 47.062ms SameAddressAtomicOnSharedMem(int*, int)
31.89% 1.69104s 50 33.821ms 33.818ms 33.826ms SameAddressAtomicOnGlobalMem(int*, int)
10.10% 535.88ms 50 10.718ms 10.707ms 10.738ms WarpRestrictedAtomicOnSharedMem(int*, int)
3.96% 209.95ms 50 4.1990ms 4.1895ms 4.2103ms AddressRestrictedAtomicOnSharedMem(int*, int)
3.95% 209.47ms 50 4.1895ms 4.1893ms 4.1900ms AddressRestrictedAtomicOnGlobalMem(int*, int)
3.33% 176.48ms 50 3.5296ms 3.5050ms 3.5498ms WarpRestrictedAtomicOnGlobalMem(int*, int)
1.08% 57.428ms 50 1.1486ms 1.1460ms 1.1510ms CoalescedAtomicOnGlobalMem(int*, int)
0.84% 44.784ms 50 895.68us 888.65us 905.77us CoalescedAtomicOnSharedMem(int*, int)
0.51% 26.805ms 1 26.805ms 26.805ms 26.805ms [CUDA memcpy HtoD]
0.01% 543.61us 1 543.61us 543.61us 543.61us [CUDA memset]
Obviously, coalesced conflict-free atomic operations had the best performance, and same-address atomics had the worst. One thing I couldn't explain is why same-address atomics on shared memory (within a block) are slower than on global memory (shared among all threads).
When all lanes of a warp access the same location in shared memory, performance is very bad, but (surprisingly) this is not the case when they do the same on global memory. I cannot explain why. Another confusing case is that address-restricted atomics on global memory perform worse than the case where all threads within a warp hit the same address, even though the memory contention in the first case seems lower.
Anyway, I would be happy if anyone could explain the above profiling results.
An atomic operation is capable of reading, modifying, and writing a value back to memory without interference from any other thread, which guarantees that a race condition won't occur. Atomic operations in CUDA generally work on both shared memory and global memory.

What is an atomic memory operation?

- An uninterruptible read-modify-write memory operation.
- Requested by threads.
- Updates a value at a specific address.
- Serializes contentious updates from multiple threads.

Atomic operations in shared memory are generally used to prevent race conditions between threads within the same thread block. Atomic operations in global memory are used to prevent race conditions between any two threads, regardless of which thread block they are in.
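To illustrate the two use cases, here is a minimal sketch (a hypothetical histogram kernel, not part of the benchmark above): block-local counts are accumulated with shared-memory atomics, and the per-block results are then folded into the global result with global-memory atomics.

#define NUM_BINS 32

__global__ void histogram(const int* data, int nElem, unsigned int* bins)
{
    __shared__ unsigned int smem_bins[NUM_BINS];

    // Zero the block-local bins.
    for (int b = threadIdx.x; b < NUM_BINS; b += blockDim.x)
        smem_bins[b] = 0;
    __syncthreads();

    // Shared-memory atomics resolve races between threads of the same block.
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < nElem; i += blockDim.x * gridDim.x)
        atomicAdd(&smem_bins[data[i] & (NUM_BINS - 1)], 1u);
    __syncthreads();

    // Global-memory atomics resolve races between threads of different blocks.
    for (int b = threadIdx.x; b < NUM_BINS; b += blockDim.x)
        atomicAdd(&bins[b], smem_bins[b]);
}

Only the final per-block totals touch global memory, so contention on the global bins is far lower than if every element were added to them directly.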
As a caveat, my comments here may to some extent be architecture-specific. But for the architectures at hand (up to cc 3.5, AFAIK), shared memory atomics are implemented via a code sequence created by the assembler. This code sequence, operating on shared memory, is subject to serialization if multiple threads contend for access to the same bank/location.
The R-M-W operation itself is atomic in the sense that no other thread can disrupt the operation (i.e. create incorrect results), but when the threads are contending to do an atomic operation on a single shared memory location, the contention gives rise to serialization, exacerbating the delay associated with atomics.
To quote Nick from the CUDA Handbook:
Unlike global memory, which implements atomics using single instructions (either GATOM or GRED, depending on whether the return value is used), shared memory atomics are implemented with explicit lock/unlock semantics, and the compiler emits code that causes each thread to loop over these lock operations until the thread has performed its atomic operation.
and:
Take care to avoid contention, or the loop in Listing 8-2 may iterate up to 32 times.
I suggest you read the full section 8.1.5, at least.
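To make the quoted lock/unlock loop a bit more concrete, here is a rough analogy written in ordinary CUDA C. This is not the code the assembler actually emits (the real sequence uses dedicated load-with-lock / store-and-unlock shared-memory instructions), and the lockedAdd helper and explicit lock word are purely illustrative; only the retry structure is the point.

// Illustrative analogy of a pre-Maxwell shared-memory atomicAdd: every thread
// loops, trying to acquire a lock covering the target location; only the lock
// holder performs the read-modify-write. With 32 lanes of a warp contending
// for one location, the loop can iterate up to 32 times.
__device__ int lockedAdd(int* addr, int* lock, int val)
{
    int old = 0;
    bool done = false;
    while (!done) {
        if (atomicCAS(lock, 0, 1) == 0) {   // try to take the lock
            old = *addr;                    // read
            *addr = old + val;              // modify and write under the lock
            __threadfence_block();          // make the update visible to the block
            atomicExch(lock, 0);            // release the lock
            done = true;
        }
        // Losing threads simply retry on the next iteration.
    }
    return old;
}

By contrast, the quoted passage notes that global-memory atomics are single instructions (GATOM/GRED), so contended updates are serialized in the memory hierarchy rather than by a per-thread retry loop.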
Starting with the Maxwell architecture, shared memory atomics are no longer implemented via such a code sequence; instead, there are native atomic instructions for shared memory. This can make shared memory atomics run considerably faster on Maxwell and newer architectures.