Short storry:
I have function with pass by reference of output variables
void acum( float2 dW, float4 dFE, float2 *W, float4 *FE )
which is supposed increment variables *W, *FE, by dW, dFE if some coditions are fulfilled. I want to make this function general - the output varibales can be either local or global.
acum( dW, dFE, &W , &FE ); // __local acum
acum( W, FE, &Wout[idOut], &FEout[idOut] ); // __global acum
When I try to compile it I got error
error: illegal implicit conversion between two pointers with different address spaces
is it possible to make it somehow??? I was thinking if I can use macro instead of function (but I'm not very familier with macros in C).
An other possibility would be probably:
Backbround (Not necessary to read):
I'm trying program some thermodynamical sampling using OpenCL. Because statistical weight W = exp(-E/kT) can easily overflow float (2^64) for low temperature, I made a composed datatype W = float2(W_digits, W_exponent) and I defined functions to manipulate these numbers "acum".
I try to minimize number of global memory operations, so I let work_items go over Vsurf rather than FEout, becauese I expect that only few points in Vsurf would have considerable statistical weight, so accumulation of FEout would be called only afew times for each workitem. While iteratin over FEout would require sizeof(FEout)*sizeof(Vsurf) memory operations. The whole code is here (any recomandations how to make it more efficinet are welcome):
// ===== function :: FF_vdW - Lenard-Jones Van Der Waals forcefield
float4 FF_vdW ( float3 R ){
const float C6 = 1.0;
const float C12 = 1.0;
float ir2 = 1.0/ dot( R, R );
float ir6 = ir2*ir2*ir2;
float ir12 = ir6*ir6;
float E6 = C6*ir6;
float E12 = C12*ir12;
return (float4)(
( 6*E6 - 12*E12 ) * ir2 * R
, E12 - E6
);}
// ===== function :: FF_spring - harmonic forcefield
float4 FF_spring( float3 R){
const float3 k = (float3)( 1.0, 1.0, 1.0 );
float3 F = k*R;
return (float4)( F,
0.5*dot(F,R)
);}
// ===== function :: EtoW - compute statistical weight
float2 EtoW( float EkT ){
float Wexp = floor( EkT);
return (float2)( exp(EkT - Wexp)
, Wexp
); }
// ===== procedure : addExpInplace -- acumulate F,E with statistical weight dW
void acum( float2 dW, float4 dFE, float2 *W, float4 *FE )
{
float dExp = dW.y - (*W).y; // log(dW)-log(W)
if(dExp>-22){ // e^22 = 2^32 , single_float = 2^+64
float fac = exp(dExp);
if (dExp<0){ // log(dW)<log(W)
dW.x *= fac;
(*FE) += dFE*dW.x;
(*W ).x += dW.x;
}else{ // log(dW)>log(W)
(*FE) = dFE + fac*(*FE);
(*W ).x = dW.x + fac*(*W).x;
(*W ).y = dW.y;
}
}
}
// ===== __kernel :: sampler
__kernel void sampler(
__global float * Vsurf, // in : surface potential (including vdW) // may be faster to precomputed endpoints positions like float8
__global float4 * FEout, // out : Fx,Fy,Fy, E
__global float2 * Wout, // out : W_digits, W_exponent
int3 nV ,
float3 dV ,
int3 nOut ,
int3 iOut0 , // shift of Fout in respect to Vsurf
int3 nCopy , // number of copies of
int3 nSample , // dimension of sampling in each dimension around R0 +nSample,-nSample
float3 RXe0 , // postion Xe relative to Tip
float EcutSurf ,
float EcutTip ,
float logWcut , // accumulate only when log(W) > logWcut
float kT // maximal energy which should be sampled
) {
int id = get_global_id(0); // loop over potential grid points
int idx = id/nV.x;
int3 iV = (int3)( idx/nV.y
, idx%nV.y
, id %nV.x );
float V = Vsurf[id];
float3 RXe = dV*iV;
if (V<EcutSurf){
// loop over tip position
for (int iz=0;iz<nOut.z;iz++ ){
for (int iy=0;iy<nOut.y;iy++ ){
for (int ix=0;ix<nOut.x;ix++ ){
int3 iTip = (int3)( iz, iy, ix );
float3 Rtip = dV*iTip;
float4 FE = 0;
float2 W = 0;
// loop over images of potential
for (int ix=0;ix<nCopy.x;ix++ ){
for (int iy=0;iy<nCopy.y;iy++ ){
float3 dR = RXe - Rtip;
float4 dFE = FF_vdW( dR );
dFE += FF_spring( dR - RXe0 );
dFE.w += V;
if( dFE.w < EcutTip ){
float2 dW = EtoW( - FE.w / kT );
acum( dW, dFE, &W , &FE ); // __local acum
}
}
}
if( W.y > logWcut ){ // accumulate force
int idOut = iOut0.x + iOut0.y*nOut.x + iOut0.z*nOut.x*nOut.y;
acum( W, FE, &Wout[idOut], &FEout[idOut] ); // __global acum
}
}}}}
}
I'm using pyOpenCL on ubuntu 12.04 64bit but I think it has nothink to do with the problem
OK, here what's happening, from the OpenCL man pages:
http://www.khronos.org/registry/cl/sdk/1.1/docs/man/xhtml/global.html
"If the type of an object is qualified by an address space name, the object is allocated in the specified address name; otherwise, the object is allocated in the generic address space"
...
"The generic address space name for arguments to a function in a program, or local variables of a function is __private. All function arguments shall be in the __private address space. "
So your acum(... ) function args are in the __private address space.
So the compiler is correctly saying that
acum( ..&Wout[idOut], &FEout[idOut] )
is being called with &Wout and &FEout in the global addrress space when the function args must in the private address space.
The solution is conversion between global and private.
Create two private temporary vars to receive the results.
call acum( ... ) with these vars.
assign the temporary private values to the global values, after you've called acum( .. )
The code will look is bit messy
Remember on a GPU you have many address spaces, you can't magically jump between them by casting. You have to move data explicitly between address spaces by assignment.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With