Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Negative array indexing in shared memory based 1d stencil CUDA implementation

Tags:

arrays

cuda

I'm currently working with CUDA programming and I'm trying to learn off of slides from a workshop I found online, which can be found here. The problem I am having is on slide 48. The following code can be found there:

__global__ void stencil_1d(int *in, int *out) {

    __shared__ int temp[BLOCK_SIZE + 2 * RADIUS];

    int gindex = threadIdx.x + blockIdx.x * blockDim.x;
    int lindex = threadIdx.x + RADIUS;

    // Read input elements into shared memory
    temp[lindex] = in[gindex];
    if (threadIdx.x < RADIUS) {
        temp[lindex - RADIUS] = in[gindex - RADIUS];
        temp[lindex + BLOCK_SIZE] = in[gindex + BLOCK_SIZE];
    }

....

To add a bit of context. We have an array called in which as length say N. We then have another array out which has length N+(2*RADIUS), where RADIUS has a value of 3 for this particular example. The idea is to copy array in, into array out but to place the array in in position 3 from the beginning of array out i.e out = [RADIUS][in][RADIUS], see slide for graphical representation.

The confusion comes in on the following line:

 temp[lindex - RADIUS] = in[gindex - RADIUS];

If gindex is 0 then we have in[-3]. How can we read from a negative index in an array? Any help would really be appreciated.

like image 548
user481610 Avatar asked Oct 29 '14 07:10

user481610


4 Answers

The answer by pQB is correct. You are supposed to offset the input array pointer by RADIUS.

To show this, I'm providing below a full worked example. Hope it would be beneficial to other users.

(I would say you would need a __syncthreads() after the shared memory loads. I have added it in the below example).

#include <thrust/device_vector.h>

#define RADIUS      3
#define BLOCKSIZE   32

/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

/**********/
/* KERNEL */
/**********/
__global__ void moving_average(unsigned int *in, unsigned int *out, unsigned int N) {

    __shared__ unsigned int temp[BLOCKSIZE + 2 * RADIUS];

    unsigned int gindexx = threadIdx.x + blockIdx.x * blockDim.x;

    unsigned int lindexx = threadIdx.x + RADIUS;

    // --- Read input elements into shared memory
    temp[lindexx] = (gindexx < N)? in[gindexx] : 0;
    if (threadIdx.x < RADIUS) {
        temp[threadIdx.x] = (((gindexx - RADIUS) >= 0)&&(gindexx <= N)) ? in[gindexx - RADIUS] : 0;
        temp[threadIdx.x + (RADIUS + BLOCKSIZE)] = ((gindexx + BLOCKSIZE) < N)? in[gindexx + BLOCKSIZE] : 0;
    }

    __syncthreads();

    // --- Apply the stencil
    unsigned int result = 0;
    for (int offset = -RADIUS ; offset <= RADIUS ; offset++) {
        result += temp[lindexx + offset];
    }

    // --- Store the result
    out[gindexx] = result;
}

/********/
/* MAIN */
/********/
int main() {

    const unsigned int N        = 55 + 2 * RADIUS;

    const unsigned int constant = 4;

    thrust::device_vector<unsigned int> d_in(N, constant);
    thrust::device_vector<unsigned int> d_out(N);

    moving_average<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(thrust::raw_pointer_cast(d_in.data()), thrust::raw_pointer_cast(d_out.data()), N);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    thrust::host_vector<unsigned int> h_out = d_out;

    for (int i=0; i<N; i++)
        printf("Element i = %i; h_out = %i\n", i, h_out[i]);

    return 0;

}
like image 143
Vitality Avatar answered Nov 07 '22 14:11

Vitality


You are assuming that in array points to the first position of the memory that has been allocated for this array. However, if you see slide 47, the in array has a halo (orange boxes) of three elements before and after of the data (represented as green cubes).

My assumption is (I have not done the workshop) that the input array is first initialized with an halo and then the pointer is moved in the kernel call. Something like:

stencil_1d<<<dimGrid, dimBlock>>>(in + RADIUS, out);

So, in the kernel, it's safe to do in[-3] because the pointer is not at the beginning of the array.

like image 37
pQB Avatar answered Nov 07 '22 14:11

pQB


There are already good answers, but to focus on the actual point that caused the confusion:

In C (not only in CUDA, but in C in general), when you access an "array" by using the [ brackets ], you are actually doing pointer arithmetic.

For example, consider a pointer like this:

int* data= ... // Points to some memory

When you then write a statement like

data[3] = 42;

you are just accessing a memory location that is "three entries behind the original data pointer". So you could also have written

int* data= ... // Points to some memory
int* dataWithOffset = data+3;
dataWithOffset[0] = 42; // This will write into data[3] 

and consequently,

dataWithOffset[-3] = 123; // This will write into data[0]

In fact, you can say that data[i] is the same as *(data+i), which is the same as *(i+data), which in turn is the same as i[data], but you should not use this in real programs...)

like image 4
Marco13 Avatar answered Nov 07 '22 14:11

Marco13


I can compile @JackOLantern's code, but there is an warning: "pointless comparison of unsigned integer with zero":

enter image description here

And when run, it will abort like: enter image description here

I have modified the code to the following and the warning disappeared and it can get right result:

#include <thrust/device_vector.h>

#define RADIUS      3
#define BLOCKSIZE   32

/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

/**********/
/* KERNEL */
/**********/
__global__ void moving_average(unsigned int *in, unsigned int *out, int N) {

    __shared__ unsigned int temp[BLOCKSIZE + 2 * RADIUS];

    int gindexx = threadIdx.x + blockIdx.x * blockDim.x;

    int lindexx = threadIdx.x + RADIUS;

    // --- Read input elements into shared memory
    temp[lindexx] = (gindexx < N)? in[gindexx] : 0;
    if (threadIdx.x < RADIUS) {
        temp[threadIdx.x] = (((gindexx - RADIUS) >= 0)&&(gindexx <= N)) ? in[gindexx - RADIUS] : 0;
        temp[threadIdx.x + (RADIUS + BLOCKSIZE)] = ((gindexx + BLOCKSIZE) < N)? in[gindexx + BLOCKSIZE] : 0;
    }

    __syncthreads();

    // --- Apply the stencil
    unsigned int result = 0;
    for (int offset = -RADIUS ; offset <= RADIUS ; offset++) {
        result += temp[lindexx + offset];
    }

    // --- Store the result
    out[gindexx] = result;
}

/********/
/* MAIN */
/********/
int main() {

    const int N        = 55 + 2 * RADIUS;

    const unsigned int constant = 4;

    thrust::device_vector<unsigned int> d_in(N, constant);
    thrust::device_vector<unsigned int> d_out(N);

    moving_average<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(thrust::raw_pointer_cast(d_in.data()), thrust::raw_pointer_cast(d_out.data()), N);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    thrust::host_vector<unsigned int> h_out = d_out;

    for (int i=0; i<N; i++)
        printf("Element i = %i; h_out = %i\n", i, h_out[i]);

    return 0;

}

The result is like this:

enter image description here

like image 1
BruceSun Avatar answered Nov 07 '22 13:11

BruceSun