Here's a loop that I've tried with std::vector<double>
and with plain old double*
.
For 10 million elements, the vector version consistently runs in about 80% of the time that the double*
version takes; for pretty much any value of N
, vector is notably faster.
Peeking at the GCC STL source code, I don't see that std::vector
is doing anything essentially fancier than what the double*
idiom is doing (i.e., allocate with plain old new[]
, operator[]
dereferences an offset). This question speaks to that, too.
Any ideas why the vector version is faster?
Compiler: GCC 4.6.1
Example compile line: g++ -Ofast -march=native -DNDEBUG \
-ftree-vectorizer-verbose=2 -o vector.bin \
vector.cpp -lrt
OS: CentOS 5
CPU: Opteron 8431
RAM: 128 GB
Results are qualitatively the same if I use icpc 11.1 or run on a Xeon. Also, the vectorizer dump says that only the fill operation in std::vector
's constructor was vectorized.
The vector version:
#include <vector>
#include <iostream>
#include <boost/lexical_cast.hpp>
#include "util.h"
#include "rkck_params.h"
using namespace std;
int main( int argc, char* argv[] )
{
const size_t N = boost::lexical_cast<size_t>( argv[ 1 ] );
vector<double> y_old( N );
vector<double> y_new( N );
vector<double> y_err( N );
vector<double> k0( N );
vector<double> k1( N );
vector<double> k2( N );
vector<double> k3( N );
vector<double> k4( N );
vector<double> k5( N );
const double h = 0.5;
const timespec start = clock_tick();
for ( size_t i = 0 ; i < N ; ++i )
{
y_new[ i ] = y_old[ i ]
+ h
*(
rkck::c[ 0 ]*k0[ i ]
+ rkck::c[ 2 ]*k2[ i ]
+ rkck::c[ 3 ]*k3[ i ]
+ rkck::c[ 5 ]*k5[ i ]
);
y_err[ i ] = h
*(
rkck::cdiff[ 0 ]*k0[ i ]
+ rkck::cdiff[ 2 ]*k2[ i ]
+ rkck::cdiff[ 3 ]*k3[ i ]
+ rkck::cdiff[ 4 ]*k4[ i ]
+ rkck::cdiff[ 5 ]*k5[ i ]
);
}
const timespec stop = clock_tick();
const double total_time = seconds( start, stop );
// Output
cout << "vector\t" << N << "\t" << total_time << endl;
return 0;
}
The double*
version:
#include <iostream>
#include <boost/lexical_cast.hpp>
#include "util.h"
#include "rkck_params.h"
using namespace std;
int main( int argc, char* argv[] )
{
const size_t N = boost::lexical_cast<size_t>( argv[ 1 ] );
double* y_old = new double[ N ];
double* y_new = new double[ N ];
double* y_err = new double[ N ];
double* k0 = new double[ N ];
double* k1 = new double[ N ];
double* k2 = new double[ N ];
double* k3 = new double[ N ];
double* k4 = new double[ N ];
double* k5 = new double[ N ];
const double h = 0.5;
const timespec start = clock_tick();
for ( size_t i = 0 ; i < N ; ++i )
{
y_new[ i ]
= y_old[ i ]
+ h
*(
rkck::c[ 0 ]*k0[ i ]
+ rkck::c[ 2 ]*k2[ i ]
+ rkck::c[ 3 ]*k3[ i ]
+ rkck::c[ 5 ]*k5[ i ]
);
y_err[ i ]
= h
*(
rkck::cdiff[ 0 ]*k0[ i ]
+ rkck::cdiff[ 2 ]*k2[ i ]
+ rkck::cdiff[ 3 ]*k3[ i ]
+ rkck::cdiff[ 4 ]*k4[ i ]
+ rkck::cdiff[ 5 ]*k5[ i ]
);
}
const timespec stop = clock_tick();
const double total_time = seconds( start, stop );
delete [] y_old;
delete [] y_new;
delete [] y_err;
delete [] k0;
delete [] k1;
delete [] k2;
delete [] k3;
delete [] k4;
delete [] k5;
// Output
cout << "plain\t" << N << "\t" << total_time << endl;
return 0;
}
rkck_params.h
:
#ifndef RKCK_PARAMS_H
#define RKCK_PARAMS_H
namespace rkck
{
// C.f. $c_i$ in Ch. 16.2 of NR in C++, 2nd ed.
const double c[ 6 ]
= { 37.0/378.0,
0.0,
250.0/621.0,
125.0/594,
0.0,
512.0/1771.0 };
// C.f. $( c_i - c_i^* )$ in Ch. 16.2 of NR in C++, 2nd ed.
const double cdiff[ 6 ]
= { c[ 0 ] - 2825.0/27648.0,
c[ 1 ] - 0.0,
c[ 2 ] - 18575.0/48384.0,
c[ 3 ] - 13525.0/55296.0,
c[ 4 ] - 277.0/14336.0,
c[ 5 ] - 1.0/4.0 };
}
#endif
util.h
:
#ifndef UTIL_H
#define UTIL_H
#include <time.h>
#include <utility>
inline timespec clock_tick()
{
timespec tick;
clock_gettime( CLOCK_REALTIME, &tick );
return tick;
}
// \cite{www.guyrutenberg.com/2007/09/22/profiling-code-using-clock_gettime}
inline double seconds( const timespec& earlier, const timespec& later )
{
double seconds_diff = -1.0;
double nano_diff = -1.0;
if ( later.tv_nsec < earlier.tv_nsec )
{
seconds_diff = later.tv_sec - earlier.tv_sec - 1;
nano_diff = ( 1.0e9 + later.tv_nsec - earlier.tv_nsec )*1.0e-9;
}
else
{
seconds_diff = later.tv_sec - earlier.tv_sec;
nano_diff = ( later.tv_nsec - earlier.tv_nsec )*1.0e-9;
}
return seconds_diff + nano_diff;
}
#endif
The time complexity for the insertion of a new element is O(log N). Vector is faster for insertion and deletion of elements at the end of the container. Set is faster for insertion and deletion of elements at the middle of the container.
There is a myth that for run-time speed, one should use arrays. A std::vector can never be faster than an array, as it has (a pointer to the first element of) an array as one of its data members. But the difference in run-time speed is slim and absent in any non-trivial program.
So array is twice as quick as vector.
A vector will hold an object of a single type, and only a single type.
In the vector version your data is initialized to zero. In the new
version it's uninitialized, so different work might be done.
Did you run multiple times, in different orders?
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With