I tried to use cuda math api such as sqrtf(), __fdividef() and got errors like the following:
error C3861: '__fdividef':Identifier not found
error C3861: 'rsqrtf':Identifier not found
It seems "NVIDIA CUDA Math API" didn't specify which header we're supposed to include when we want to use these apis. In helper_math.h, it looks like the function e.g.
inline __host__ __device__ float length(float4 v)
{
return sqrtf(dot(v, v));
}
use the implementation of sqrtf from c library math.h, isn't it a host function without a device identifier? Can we use it directly in our device function?
Please correct me. Thanks.
Previously, I implemented the code like this:
// This implementation follows the code from
// https://github.com/erwincoumans/experiments/blob/master/opencl/primitives/AdlPrimitives/Math/MathCL.h
#ifndef UNIFIED_MATH_CUDA_H
#define UNIFIED_MATH_CUDA_H
#include "vector_functions.h"
/*****************************************
Vector
/*****************************************/
__device__
float fastDiv(float numerator, float denominator)
{
return __fdividef(numerator, denominator);
//return numerator/denominator;
}
__device__
float getSqrtf(float f2)
{
return sqrtf(f2);
}
__device__
float getReverseSqrt(float f2)
{
return rsqrtf(f2);
}
__device__
float3 getCrossProduct(float3 a, float3 b)
{
return make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x);
}
__device__
float4 getCrossProduct(float4 a, float4 b)
{
float3 v1 = make_float3(a.x, a.y, a.z);
float3 v2 = make_float3(b.x, b.y, b.z);
float3 v3 = make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x);
return make_float4(v3.x, v3.y, v3.z, 0.0f);
}
__device__
float getDotProduct(float3 a, float3 b)
{
return a.x * b.x + a.y * b.y + a.z * b.z;
}
__device__
float getDotProduct(float4 a, float4 b)
{
return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
}
__device__ float3 getNormalizedVec(const float3 v)
{
float invLen = 1.0f / sqrtf(getDotProduct(v, v));
return make_float3(v.x * invLen, v.y * invLen, v.z * invLen);
}
__device__ float4 getNormalizedVec(const float4 v)
{
float invLen = 1.0f / sqrtf(getDotProduct(v, v));
return make_float4(v.x * invLen, v.y * invLen, v.z * invLen, v.w * invLen);
}
__device__
float dot3F4(float4 a, float4 b)
{
float4 a1 = make_float4(a.x, a.y, a.z,0.f);
float4 b1 = make_float4(b.x, b.y, b.z,0.f);
return getDotProduct(a1, b1);
}
__device__
float getLength(float3 a)
{
return sqrtf(getDotProduct(a, a));
}
__device__
float getLength(float4 a)
{
return sqrtf(getDotProduct(a, a));
}
/*****************************************
Matrix3x3
/*****************************************/
typedef struct
{
float4 m_row[3];
}Matrix3x3_d;
__device__
void setZero(Matrix3x3_d& m)
{
m.m_row[0] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
m.m_row[1] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
m.m_row[2] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
}
__device__
Matrix3x3_d getZeroMatrix3x3()
{
Matrix3x3_d m;
m.m_row[0] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
m.m_row[1] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
m.m_row[2] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
return m;
}
__device__
void setIdentity(Matrix3x3_d& m)
{
m.m_row[0] = make_float4(1,0,0,0);
m.m_row[1] = make_float4(0,1,0,0);
m.m_row[2] = make_float4(0,0,1,0);
}
__device__
Matrix3x3_d getIdentityMatrix3x3()
{
Matrix3x3_d m;
m.m_row[0] = make_float4(1,0,0,0);
m.m_row[1] = make_float4(0,1,0,0);
m.m_row[2] = make_float4(0,0,1,0);
return m;
}
__device__
Matrix3x3_d getTranspose(const Matrix3x3_d m)
{
Matrix3x3_d out;
out.m_row[0] = make_float4(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);
out.m_row[1] = make_float4(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);
out.m_row[2] = make_float4(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);
return out;
}
__device__
Matrix3x3_d MatrixMul(Matrix3x3_d& a, Matrix3x3_d& b)
{
Matrix3x3_d transB = getTranspose( b );
Matrix3x3_d ans;
// why this doesn't run when 0ing in the for{}
a.m_row[0].w = 0.f;
a.m_row[1].w = 0.f;
a.m_row[2].w = 0.f;
for(int i=0; i<3; i++)
{
// a.m_row[i].w = 0.f;
ans.m_row[i].x = dot3F4(a.m_row[i],transB.m_row[0]);
ans.m_row[i].y = dot3F4(a.m_row[i],transB.m_row[1]);
ans.m_row[i].z = dot3F4(a.m_row[i],transB.m_row[2]);
ans.m_row[i].w = 0.f;
}
return ans;
}
__device__
float4 MatrixVectorMul(Matrix3x3_d a, float4 b)
{
float4 ans;
ans.x = dot3F4( a.m_row[0], b );
ans.y = dot3F4( a.m_row[1], b );
ans.z = dot3F4( a.m_row[2], b );
ans.w = 0.f;
return ans;
}
#endif // UNIFIED_MATH_CUDA_H
Since cuda will automatically inline all the device code, I ignored the inline identifier. But It came with the error as I stated in the question:
error C3861: '__fdividef':Identifier not found
error C3861: 'rsqrtf':Identifier not found
Then I reimplemented following the tradional c/c++ style as following.
UnifiedMathCUDA.cuh:
// This implementation follows the code from
// https://github.com/erwincoumans/experiments/blob/master/opencl/primitives/AdlPrimitives/Math/MathCL.h
#ifndef UNIFIED_MATH_CUDA_H
#define UNIFIED_MATH_CUDA_H
#include "vector_functions.h"
/*****************************************
Vector
/*****************************************/
__device__
float fastDiv(float numerator, float denominator);
__device__
float getSqrtf(float f2);
__device__
float getReverseSqrt(float f2);
__device__
float3 getCrossProduct(float3 a, float3 b);
__device__
float4 getCrossProduct(float4 a, float4 b);
__device__
float getDotProduct(float3 a, float3 b);
__device__
float getDotProduct(float4 a, float4 b);
__device__ float3 getNormalizedVec(const float3 v);
__device__ float4 getNormalizedVec(const float4 v);
__device__
float dot3F4(float4 a, float4 b);
__device__
float getLength(float3 a);
__device__
float getLength(float4 a);
/*****************************************
Matrix3x3
/*****************************************/
typedef struct
{
float4 m_row[3];
}Matrix3x3_d;
__device__
void setZero(Matrix3x3_d& m);
__device__
Matrix3x3_d getZeroMatrix3x3();
__device__
void setIdentity(Matrix3x3_d& m);
__device__
Matrix3x3_d getIdentityMatrix3x3();
__device__
Matrix3x3_d getTranspose(const Matrix3x3_d m);
__device__
Matrix3x3_d MatrixMul(Matrix3x3_d& a, Matrix3x3_d& b);
__device__
float4 MatrixVectorMul(Matrix3x3_d a, float4 b);
#endif // UNIFIED_MATH_CUDA_H
and the correspoding implementaion file is like this:
#include "UnifiedMathCUDA.cuh"
/*****************************************
Vector
/*****************************************/
__device__
float fastDiv(float numerator, float denominator)
{
return __fdividef(numerator, denominator);
//return numerator/denominator;
}
__device__
float getSqrtf(float f2)
{
return sqrtf(f2);
}
__device__
float getReverseSqrt(float f2)
{
return rsqrtf(f2);
}
__device__
float3 getCrossProduct(float3 a, float3 b)
{
return make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x);
}
__device__
float4 getCrossProduct(float4 a, float4 b)
{
float3 v1 = make_float3(a.x, a.y, a.z);
float3 v2 = make_float3(b.x, b.y, b.z);
float3 v3 = make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x);
return make_float4(v3.x, v3.y, v3.z, 0.0f);
}
__device__
float getDotProduct(float3 a, float3 b)
{
return a.x * b.x + a.y * b.y + a.z * b.z;
}
__device__
float getDotProduct(float4 a, float4 b)
{
return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
}
__device__ float3 getNormalizedVec(const float3 v)
{
float invLen = 1.0f / sqrtf(getDotProduct(v, v));
return make_float3(v.x * invLen, v.y * invLen, v.z * invLen);
}
__device__ float4 getNormalizedVec(const float4 v)
{
float invLen = 1.0f / sqrtf(getDotProduct(v, v));
return make_float4(v.x * invLen, v.y * invLen, v.z * invLen, v.w * invLen);
}
__device__
float dot3F4(float4 a, float4 b)
{
float4 a1 = make_float4(a.x, a.y, a.z,0.f);
float4 b1 = make_float4(b.x, b.y, b.z,0.f);
return getDotProduct(a1, b1);
}
__device__
float getLength(float3 a)
{
return sqrtf(getDotProduct(a, a));
}
__device__
float getLength(float4 a)
{
return sqrtf(getDotProduct(a, a));
}
/*****************************************
Matrix3x3
/*****************************************/
__device__
void setZero(Matrix3x3_d& m)
{
m.m_row[0] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
m.m_row[1] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
m.m_row[2] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
}
__device__
Matrix3x3_d getZeroMatrix3x3()
{
Matrix3x3_d m;
m.m_row[0] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
m.m_row[1] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
m.m_row[2] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
return m;
}
__device__
void setIdentity(Matrix3x3_d& m)
{
m.m_row[0] = make_float4(1,0,0,0);
m.m_row[1] = make_float4(0,1,0,0);
m.m_row[2] = make_float4(0,0,1,0);
}
__device__
Matrix3x3_d getIdentityMatrix3x3()
{
Matrix3x3_d m;
m.m_row[0] = make_float4(1,0,0,0);
m.m_row[1] = make_float4(0,1,0,0);
m.m_row[2] = make_float4(0,0,1,0);
return m;
}
__device__
Matrix3x3_d getTranspose(const Matrix3x3_d m)
{
Matrix3x3_d out;
out.m_row[0] = make_float4(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);
out.m_row[1] = make_float4(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);
out.m_row[2] = make_float4(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);
return out;
}
__device__
Matrix3x3_d MatrixMul(Matrix3x3_d& a, Matrix3x3_d& b)
{
Matrix3x3_d transB = getTranspose( b );
Matrix3x3_d ans;
// why this doesn't run when 0ing in the for{}
a.m_row[0].w = 0.f;
a.m_row[1].w = 0.f;
a.m_row[2].w = 0.f;
for(int i=0; i<3; i++)
{
// a.m_row[i].w = 0.f;
ans.m_row[i].x = dot3F4(a.m_row[i],transB.m_row[0]);
ans.m_row[i].y = dot3F4(a.m_row[i],transB.m_row[1]);
ans.m_row[i].z = dot3F4(a.m_row[i],transB.m_row[2]);
ans.m_row[i].w = 0.f;
}
return ans;
}
__device__
float4 MatrixVectorMul(Matrix3x3_d a, float4 b)
{
float4 ans;
ans.x = dot3F4( a.m_row[0], b );
ans.y = dot3F4( a.m_row[1], b );
ans.z = dot3F4( a.m_row[2], b );
ans.w = 0.f;
return ans;
}
Now, Problem solved!
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With