OpenCL

Question

I'm absolutelly novice in OpenCL programming. For my app. (molecular simulaton) I wrote a kernel for calculate intermolecular potential of lennard-jones liquid. In this kernel I need to compute cumulative value of the potential of all particles with one:

__kernel void Molsim(__global const float* inmatrix, __global float* fi, const int c, const float r1, const float r2, const float r3, const float rc, const float epsilon, const float sigma, const float h1, const float h23)
{
   float fi0;
   float fi1;
   float d;

   unsigned int i = get_global_id(0); //number of particles (typically 2000)

   if(c!=i) {
      // potential before particle movement
      d=sqrt(pow((0.5*h1-fabs(0.5*h1-fabs(inmatrix[c*3]-inmatrix[i*3]))),2.0)+pow((0.5*h23-fabs(0.5*h23-fabs(inmatrix[c*3+1]-inmatrix[i*3+1]))),2.0)+pow((0.5*h23-fabs(0.5*h23-fabs(inmatrix[c*3+2]-inmatrix[i*3+2]))),2.0));
      if(d<rc) {
        fi0=4.0*epsilon*(pow(sigma/d,12.0)-pow(sigma/d,6.0));
      }
      else {
        fi0=0;
      }
      // potential after particle movement
      d=sqrt(pow((0.5*h1-fabs(0.5*h1-fabs(r1-inmatrix[i*3]))),2.0)+pow((0.5*h23-fabs(0.5*h23-fabs(r2-inmatrix[i*3+1]))),2.0)+pow((0.5*h23-fabs(0.5*h23-fabs(r3-inmatrix[i*3+2]))),2.0));
      if(d<rc) {
        fi1=4.0*epsilon*(pow(sigma/d,12.0)-pow(sigma/d,6.0));
      }
        else {
          fi1=0;
        }
      // cumulative difference of potentials
      // fi[0]+=fi1-fi0; changed to full size vector
      fi[get_global_id(0)]=fi1-fi0;
      }
}

My problem is in the line: fi[0]+=fi1-fi0;. In the one-element vector fi[0] are wrong results. I read something about sum reduction, but I do not know how to do it during the calculation.

Exist any simple solution of my problem?

Notice: I tried to add next kernel for the sum of the vector components (see code below), but there was an even greater slowdown than when I sum vector using the CPU.

__kernel void Arrsum(__global const float* inmatrix, __global float* outsum, const int inmatrixsize, __local float* resultScratch)
{
       // načtení indexu
      int gid = get_global_id(0);
      int wid = get_local_id(0);
      int wsize = get_local_size(0);
      int grid = get_group_id(0);
      int grcount = get_num_groups(0);

      int i;
      int workAmount = inmatrixsize/grcount;
      int startOffest = workAmount * grid + wid;
      int maxOffest = workAmount * (grid + 1);
      if(maxOffest > inmatrixsize){
        maxOffest = inmatrixsize;
    }

    resultScratch[wid] = 0.0;
    for(i=startOffest;i<maxOffest;i+=wsize){
            resultScratch[wid] += inmatrix[i];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    if(gid == 0){
            for(i=1;i<wsize;i++){
                    resultScratch[0] += resultScratch[i];
            }
            outsum[grid] = resultScratch[0];
    }
}

Alex Placet · Accepted Answer

I think you need the atomic_add atomic function for fi[0]+=fi1-fi0;

Warning: Use an atomic function reduces performance.

Here, two examples with the increment atomic function.

Example without atomic function and 2 workitems:

__kernel void inc(global int * num){
    num[0]++; //num[0] = 0
}

Work Item 1 reads num[0]: 0
Work Item 2 reads num[0]: 0
Work Item 1 increments num[0]: 0 + 1
Work Item 2 increments num[0]: 0 + 1
Work Item 1 writes num[0]: num[0] = 1
Work Item 2 writes num[0]: num[0] = 1

Result : num[0] = 1

Example with atomic function and 2 workitems:

#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable

__kernel void inc(global int * num){
    atom_inc(&num[0]);
}

Work Item 1 reads num[0]: 0
Work Item 1 increments num[0]: 0 + 1
Work Item 1 writes num[0]: num[0] = 1
Work Item 2 reads num[0]: 1
Work Item 2 increments num[0]: 1 + 1
Work Item 2 writes num[0]: num[0] = 2

Result : num[0] = 2

OpenCL - incremental summation during compute

Tags:

Michal

1 Answers

Alex Placet

Recent Activity

Donate For Us