MSVC 2013 Ultimate w/ Update 4
I don't understand why I am getting this message on this seemingly simple example:
info C5002: loop not vectorized due to reason '1200'
which is
1200 Loop contains loop-carried data dependences
I don't see how the iterations of the loop could interfere with each other.
// Stores per-entity position, velocity and a scratch result in a
// structure-of-arrays layout (one array per coordinate axis) and provides
// Update() to compute position + velocity * dt for every slot.
__declspec( align( 16 ) ) class PhysicsSystem
{
public:
// Number of elements in each coordinate array below.
static const int32_t MaxEntities = 65535;
// Three parallel arrays of MaxEntities doubles, one per axis.
// NOTE(review): with 8-byte doubles, sizeof( mX ) = 65535 * 8 bytes is not a
// multiple of 16, so mY cannot share mX's 16-byte alignment -- confirm before
// relying on aligned SIMD access to all three arrays.
__declspec( align( 16 ) ) struct VectorizedXYZ
{
double mX[ MaxEntities ];
double mY[ MaxEntities ];
double mZ[ MaxEntities ];
// Zero-fills all three arrays.
VectorizedXYZ()
{
memset( mX, 0, sizeof( mX ) );
memset( mY, 0, sizeof( mY ) );
memset( mZ, 0, sizeof( mZ ) );
}
};
// Writes mPos + mVel * dt into mTmp for every index on each axis.
// mPos and mVel are only read here.
void Update( double dt )
{
// The trailing "<== 1200" on the next line is the poster's marker for where
// the compiler reports C5002 reason 1200; it is not part of the code.
for ( int32_t i = 0; i < MaxEntities; ++i ) <== 1200
{
mTmp.mX[ i ] = mPos.mX[ i ] + mVel.mX[ i ] * dt;
mTmp.mY[ i ] = mPos.mY[ i ] + mVel.mY[ i ] * dt;
mTmp.mZ[ i ] = mPos.mZ[ i ] + mVel.mZ[ i ] * dt;
}
}
private:
VectorizedXYZ mTmp; // destination written by Update()
VectorizedXYZ mPos; // positions read by Update()
VectorizedXYZ mVel; // velocities read by Update()
};
Edit: Judging by http://blogs.msdn.com/b/nativeconcurrency/archive/2012/05/08/auto-vectorizer-in-visual-studio-11-rules-for-loop-body.aspx this would seem to be an example of "Example 1 – Embarrassingly Parallel", but it acts like it thinks the arrays are unsafe from aliasing, which is puzzling to me.
Edit2: It would be nice if someone could share the reasons why the auto vectorization fails on such a seemingly simple example, but after tinkering with it for some time, I opted instead to take the reins myself:
// Advances every entity's position in place: mPos += mVel * dt on each axis,
// handling two doubles per iteration with SSE2 and any leftover element with
// scalar code.
//
// dt: time step. It is broadcast to both SIMD lanes; this assumes Real is
//     double or converts to it (the SSE2 intrinsics used here operate on
//     doubles) -- TODO confirm against the Real typedef.
//
// Implementation notes:
//  - Unaligned loads/stores are used on purpose. Each coordinate array holds
//    MaxEntities doubles; when MaxEntities is odd the array size in bytes is
//    not a multiple of 16, so consecutive arrays cannot all start on a
//    16-byte boundary and the aligned load/store forms would be invalid.
//  - The SIMD loop only covers complete pairs, so it never reads or writes
//    past the end of an array; the scalar loop finishes the remainder.
void PhysicsSystem::Update( Real dt )
{
    const size_t count = static_cast< size_t >( MaxEntities );
    // Largest even number <= count: the part that can be processed in pairs.
    const size_t pairedCount = count - ( count % 2 );

    const __m128d mdt = _mm_set1_pd( dt );

    // advance by 2 since we can do 2 at a time at double precision in __m128d
    for ( size_t i = 0; i < pairedCount; i += 2 )
    {
        const __m128d posX = _mm_loadu_pd( &mPos.mX[ i ] );
        const __m128d posY = _mm_loadu_pd( &mPos.mY[ i ] );
        const __m128d posZ = _mm_loadu_pd( &mPos.mZ[ i ] );

        const __m128d velX = _mm_loadu_pd( &mVel.mX[ i ] );
        const __m128d velY = _mm_loadu_pd( &mVel.mY[ i ] );
        const __m128d velZ = _mm_loadu_pd( &mVel.mZ[ i ] );

        const __m128d velFrameX = _mm_mul_pd( velX, mdt );
        const __m128d velFrameY = _mm_mul_pd( velY, mdt );
        const __m128d velFrameZ = _mm_mul_pd( velZ, mdt );

        // Each axis is advanced from its own position component.
        _mm_storeu_pd( &mPos.mX[ i ], _mm_add_pd( posX, velFrameX ) );
        _mm_storeu_pd( &mPos.mY[ i ], _mm_add_pd( posY, velFrameY ) );
        _mm_storeu_pd( &mPos.mZ[ i ], _mm_add_pd( posZ, velFrameZ ) );
    }

    // Scalar tail: at most one element when the count is odd.
    for ( size_t i = pairedCount; i < count; ++i )
    {
        mPos.mX[ i ] += mVel.mX[ i ] * dt;
        mPos.mY[ i ] += mVel.mY[ i ] * dt;
        mPos.mZ[ i ] += mVel.mZ[ i ] * dt;
    }
}
Not sure if your compiler supports it, but to enforce proper vectorisation, you can portably do this:
// Computes mTmp = mPos + mVel * dt for every entity on each axis, asking the
// compiler to vectorise the loop through the OpenMP `simd` directive.
//
// dt: time step multiplied into the velocity.
//
// The member arrays are first bound to local pointers so the loop body works
// on plain pointers. No `aligned` clause is given: each coordinate array holds
// MaxEntities doubles, and when MaxEntities is odd the array size in bytes is
// not a multiple of 16, so the arrays cannot all start on a SIMD-aligned
// boundary. Asserting alignment for all of them would be a false promise to
// the compiler.
//
// The directive only takes effect when OpenMP (4.0 or later) support is
// enabled in the compiler; otherwise it is ignored and the loop runs as
// ordinary scalar code.
void PhysicsSystem::Update( double dt ) {
    // Destination arrays.
    double *tx = mTmp.mX;
    double *ty = mTmp.mY;
    double *tz = mTmp.mZ;
    // Source arrays are only read.
    const double *px = mPos.mX;
    const double *py = mPos.mY;
    const double *pz = mPos.mZ;
    const double *vx = mVel.mX;
    const double *vy = mVel.mY;
    const double *vz = mVel.mZ;

    #pragma omp simd
    for ( int i = 0; i < MaxEntities; ++i ) {
        tx[ i ] = px[ i ] + vx[ i ] * dt;
        ty[ i ] = py[ i ] + vy[ i ] * dt;
        tz[ i ] = pz[ i ] + vz[ i ] * dt;
    }
}
You then need to enable OpenMP support in your compiler for the directive to be taken into account.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With