Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

OpenCL double precision different from CPU double precision

Tags:

opencl

I am programming in OpenCL using a GeForce GT 610 card in Linux. My CPU and GPU double precision results are not consistent. I can post part of the code here, but I would first like to know whether anyone else has faced this problem. The difference between the GPU and CPU double precision results get pronounced when I run loops with many iterations. There is really nothing special about the code, but I can post it here if anyone is interested. Thanks a lot. Here is my code. Please excuse the __ and bad formatting as I am new here :) As you can see, I have two loops and my CPU code is essentially almost an identical version.

#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#elif defined(cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64 : enable
#else
#error "Double precision floating point not supported by OpenCL implementation."

#endif

__kernel void simpar(__global double* fp, __global double* fp1,
  __global double* fp3, __global double* fp5,
 __global double* fp6, __global double* fp7,
 __global double* fp8, __global double* fp8Plus,
 __global double* x, __global double* v, __global double* acc,
 __global double* keBuf, __global double* peBuf,
 unsigned int prntstps, unsigned int nprntstps, double dt
 ) {
unsigned int m,i,j,k,l,t;
unsigned int chainlngth=100;
double dxi, twodxi, dxipl1, dximn1, fac, fac1, fac2, fac13, fac23;
double ke,pe,tke,tpe,te,dx;
double hdt, hdt2;
double alpha=0.16;
double beta=0.7;
double cmass;
double peTemp;
nprntstps=1001;
dt=0.01;
prntstps=100;
double alphaby4=beta/4.0;
hdt=0.5*dt;
hdt2=dt*0.5*dt;
double Xlocal,Vlocal,Acclocal;
unsigned int global_id=get_global_id(0);
if (global_id<chainlngth){
Xlocal=x[global_id];
Vlocal=v[global_id];
Acclocal=acc[global_id];
for (m=0;m<nprntstps;m++){

for(l=0;l<prntstps;l++){
               Xlocal =Xlocal+dt *Vlocal+hdt2*Acclocal; 
               x[global_id]=Xlocal;
               barrier(CLK_LOCAL_MEM_FENCE);

              Vlocal =Vlocal+ hdt * Acclocal; 
              barrier(CLK_LOCAL_MEM_FENCE);

            j = global_id - 1;
            k = global_id + 1;
            if (j == -1) {
                    dximn1 = 0.0;
            } else {
                    dximn1 = x[j];
            }
            if (k == chainlngth) {
                    dxipl1 = 0.0;
            } else {
                    dxipl1 = x[k];
            }
            dxi = Xlocal;
            twodxi = 2.0 * dxi;
            fac = dxipl1 + dximn1 - twodxi;
            fac1 = dxipl1 - dxi;
            fac2 = dxi - dximn1;
            fac13 = fac1 * fac1 * fac1;
            fac23 = fac2 * fac2 * fac2;
            Acclocal = alpha * fac + beta * (fac13 - fac23);

            barrier(CLK_GLOBAL_MEM_FENCE);

            Vlocal += hdt * Acclocal;
            v[global_id]=Vlocal;
            acc[global_id]=Acclocal;
            barrier(CLK_GLOBAL_MEM_FENCE);
       }
            barrier(CLK_GLOBAL_MEM_FENCE);

            tke = tpe = te = dx = 0.0;
            ke=0.5*Vlocal*Vlocal;//Vlocal*Vlocal;
           barrier(CLK_GLOBAL_MEM_FENCE);
            fp6[(m*100)+global_id]=ke;
            keBuf[global_id]=ke;
            ke=0.0; 
            barrier(CLK_GLOBAL_MEM_FENCE);


            j = global_id - 1;
            k = global_id + 1;
            if (j == -1) {
                    dximn1 = 0.0;
            } else {
                    dximn1 = x[j];
            }
            if (k == chainlngth) {
                    dxipl1 = 0.0;
            } else {
                    dxipl1 = x[k];
            }
            dxi = Xlocal;
            twodxi = 2.0 * dxi;
            fac = dxipl1 + dximn1 - twodxi;
            fac1 = dxipl1 - dxi;
            fac2 = dxi - dximn1;
            fac13 = fac1 * fac1 * fac1;
            fac23 = fac2 * fac2 * fac2;
            Acclocal = alpha * fac + beta * (fac13 - fac23);

            barrier(CLK_GLOBAL_MEM_FENCE);

            Vlocal += hdt * Acclocal;
            v[global_id]=Vlocal;
            acc[global_id]=Acclocal;
            barrier(CLK_GLOBAL_MEM_FENCE);
       }
            barrier(CLK_GLOBAL_MEM_FENCE);

            tke = tpe = te = dx = 0.0;
            ke=0.5*Vlocal*Vlocal;//Vlocal*Vlocal;
           barrier(CLK_GLOBAL_MEM_FENCE);
            fp6[(m*100)+global_id]=ke;
            keBuf[global_id]=ke;
            ke=0.0; 
            barrier(CLK_GLOBAL_MEM_FENCE);
            j = global_id - 1;
            k = global_id + 1;
            if (j == -1) {
                    dximn1 = 0.0;
            } else {
                    dximn1 = x[j];
            }
            if (k == chainlngth) {
                    dxipl1 = 0.0;
            } else {
                    dxipl1 = x[k];
            }
            dxi = Xlocal;
            twodxi = 2.0 * dxi;
            fac = dxipl1 + dximn1 - twodxi;
            fac1 = dxipl1 - dxi;
            fac2 = dxi - dximn1;
            fac13 = fac1 * fac1 * fac1;
            fac23 = fac2 * fac2 * fac2;
            Acclocal = alpha * fac + beta * (fac13 - fac23);

            barrier(CLK_GLOBAL_MEM_FENCE);

            Vlocal += hdt * Acclocal;
            v[global_id]=Vlocal;
            acc[global_id]=Acclocal;
            barrier(CLK_GLOBAL_MEM_FENCE);
       }
            barrier(CLK_GLOBAL_MEM_FENCE);

            tke = tpe = te = dx = 0.0;
            ke=0.5*Vlocal*Vlocal;//Vlocal*Vlocal;
           barrier(CLK_GLOBAL_MEM_FENCE);
            fp6[(m*100)+global_id]=ke;
            keBuf[global_id]=ke;
            ke=0.0; 
            barrier(CLK_GLOBAL_MEM_FENCE);
     if (global_id ==0){
             for(t=0;t<100;t++)
                  tke+=keBuf[t];
            }

            barrier(CLK_GLOBAL_MEM_FENCE); 
            k = global_id-1;
            if (k == -1) {
                dx = Xlocal;
            }else{
              dx = Xlocal-x[k];
            }

              fac = dx * dx;
              peTemp = alpha * 0.5 * fac + alphaby4 * fac * fac;
              fp8[global_id*m]=peTemp;
              if (global_id == 0)
                    tpe+=peTemp;

              barrier(CLK_GLOBAL_MEM_FENCE);  
              cmass=0.0;  
              dx = -x[100-1];
              fac = dx*dx;

              pe=alpha*0.5*fac+alphaby4*fac*fac;
              if (global_id==0){
              fp8Plus[m]=pe;
              tpe+=peBuf[0];
              fp5[m*2]=i;
              fp5[m*2+1]=cmass;
              te=tke+tpe;
              fp[m*2]=m;
              fp[m*2+1]=te;

             }
   barrier(CLK_GLOBAL_MEM_FENCE);
              //cmass /=100;
             fp1[(m*chainlngth)+global_id]=Xlocal-cmass; 
             // barrier(CLK_GLOBAL_MEM_FENCE);
              fp3[(m*chainlngth)+global_id]=Vlocal;
             // barrier(CLK_GLOBAL_MEM_FENCE);
             fp7[(m*chainlngth)+global_id]=Acclocal;

              barrier(CLK_GLOBAL_MEM_FENCE);
  }
 }

}

like image 680
Newbee Avatar asked Aug 05 '13 11:08

Newbee


1 Answers

This is somewhat expected behavior, actually.

On older x86 CPUs, floating point numbers are 80bits long (Intel's "long double"), and truncated to 64bit only when need be. When SIMD units/instructions for floating point arithmetics arrived for x86 CPUs, floating point double precision became 64bit by default; however, 80bit is still possible, depending on your compiler settings. There's a lot to read about this out there: Wikipedia: Floating Point.

Check your compiler settings for OpenCL and host code on floating point "magic tricks", to get better agreement of your results. Calculate the absolute and relative error of your values and check if this error margin is safe for your application.

like image 146
Sven Avatar answered Dec 24 '22 12:12

Sven