I am looking for some help or hints to speed up my code.
I have implemented a routine for computing the gravitational potential at a point (r,phi,lambda) in space from a set of spherical harmonic coefficients C_{n,m} and S_{n,m}. The equation is shown in the link below:
and includes the recursive computation of the latitude (phi) dependent associated Legendre polynomials P_{n,m}, starting with the first two values P{0,0} and P_{1,1}.
At first, I had this implemented as a MATLAB C-MEX code, with only the core part of my code in C-language. I wanted to make a pure C-routine, but found that the code runs 3-5 times slower, which makes me wonder why. Could it be the way I define my structures and use pointers to pointers in the central code? It seems like it is the core computation part that takes the extra time, but that part did not change although before I was passing using pointers directly to variables and now I am using pointers inside structures. Any help is appreciated!
In the following, I will try to explain my code and show some extracts:
At the beginning of the program, I define three structures. One to hold the spherical harmonic coefficients, C_{n,m} and S_{n,m}, (ggm_struct), one to hold the computation coordinates (comp_struct) and one to hold the results (func_struct):
// Define constant variables
const double deg2rad = M_PI/180.0; // degrees to radians conversion factor
const double sfac = 1.0000E-280; // scaling factor
const double sqrt2 = 1.414213562373095; // sqrt(2)
const double sqrt3 = 1.732050807568877; // sqrt(3)
// Define structure to hold geopotential model
struct ggm_struct {
char product_type[100], modelname[100], errors[100], norm[100], tide_system[100];
double GM, R, *C, *S;
int max_degree, ncoef;
};
// Define structure to hold computation coordinates
struct comp_struct {
double *lat, *lon, *h;
double *r, *phi;
int nlat, nlon;
};
/* Define structure to hold results */
struct func_struct {
double *rval;
int npoints;
};
I then have a (sub-)function that starts by allocating space and then loads the coefficients from an ascii file as
int read_gfc(char mfile[100], int *nmax, int *mmax, struct ggm_struct *ggm)
{
// Set file identifier
FILE *fid;
// Declare variables
char str[200], var[100];
int n, m, nid, l00 = 0, l10 = 0, l11 = 0;
double c, s;
// Determine number of coefficients
ggm->ncoef = (*nmax+2)*(*nmax+1)/2;
// Allocate memory for coefficients
ggm->C = (double*) malloc(ggm->ncoef*sizeof(double));
if (ggm->C == NULL){
printf("Error: Memory for C not allocated!");
return -ENOMEM;
}
ggm->S = (double*) malloc(ggm->ncoef*sizeof(double));
if (ggm->S == NULL){
printf("Error: Memory for S not allocated!");
return -ENOMEM;
}
// Open file
fid = fopen(mfile,"r");
// Check that file was opened correctly
if (fid == NULL){
printf("Error: opening file %s!",mfile);
return -ENOMEM;
}
// Read file header
while (fgets(str,200,fid) != NULL && strncmp(str,"end_of_head",11) != 0){
// Extract model parameters
if (strncmp(str,"product_type",12) == 0){ sscanf(str,"%s %s",var,ggm->product_type); }
if (strncmp(str,"modelname",9) == 0){ sscanf(str,"%s %s",var,ggm->modelname); }
if (strncmp(str,"earth_gravity_constant",22) == 0){ sscanf(str,"%s %lf",var,&ggm->GM); }
if (strncmp(str,"radius",6) == 0){ sscanf(str,"%s %lf",var,&ggm->R); }
if (strncmp(str,"max_degree",10) == 0){ sscanf(str,"%s %d",var,&ggm->max_degree); }
if (strncmp(str,"errors",6) == 0){ sscanf(str,"%s %s",var,ggm->errors); }
if (strncmp(str,"norm",4) == 0){ sscanf(str,"%s %s",var,ggm->norm); }
if (strncmp(str,"tide_system",11) == 0){ sscanf(str,"%s %s",var,ggm->tide_system); }
}
// Read coefficients
while (fgets(str,200,fid) != NULL){
// Extract parameters
sscanf(str,"%s %d %d %lf %lf",var,&n,&m,&c,&s);
// Store parameters
if (n <= *nmax && m <= *mmax) {
// Determine index
nid = (n+1)*n/2 + m;
// Store values
*(ggm->C+nid) = c;
*(ggm->S+nid) = s;
}
}
// Close fil
fclose(fid);
// Return from function
return 0;
}
Afterwards, the computation grid is defined by an array of seven components. As an example, the array [-90 90 -180 180 1 1 0] defines a grid from -90 to 90 degrees latitude at 1-degree increments and from -180 to 180 degrees longitude at 1-degree increments. The height is zero. From this array, a computation grid is generated in a (sub-)function:
int make_grid(double *grid, struct comp_struct *inp)
{
// Declare variables
int n;
/* Echo routine */
printf("Creating grid of coordinates\n");
printf(" [lat1,lat2,dlat] = [%f,%f,%f]\n", *grid, *(grid+1), *(grid+4) );
printf(" [lon1,lon2,dlon] = [%f,%f,%f]\n", *(grid+2), *(grid+3), *(grid+5) );
printf(" h = %f\n", *(grid+6));
/* Latitude ------------------------------------------------------------- */
// Determine number of increments
inp->nlat = ceil( ( *(grid+1) - *grid + *(grid+4) ) / *(grid+4) );
// Allocate memory
inp->lat = (double*) malloc(inp->nlat*sizeof(double));
if (inp->lat== NULL){
printf("Error: Memory for LATITUDE (inp.lat) points not allocated!");
return -ENOMEM;
}
// Fill in values
*(inp->lat) = *(grid+1);
for (n = 1; n < inp->nlat-1; n++) {
*(inp->lat+n) = *(inp->lat+n-1) - *(grid+4);
}
*(inp->lat+inp->nlat-1) = *grid;
/* Longitude ------------------------------------------------------------ */
// Determine number of increments
inp->nlon = ceil( ( *(grid+3) - *(grid+2) + *(grid+5) ) / *(grid+5) );
// Allocate memory
inp->lon = (double*) malloc(inp->nlon*sizeof(double));
if (inp->lon== NULL){
printf("Error: Memory for LONGITUDE (inp.lon) points not allocated!");
return -ENOMEM;
}
// Fill in values
*(inp->lon) = *(grid+2);
for (n = 1; n < inp->nlon-1; n++) {
*(inp->lon+n) = *(inp->lon+n-1) + *(grid+5);
}
*(inp->lon+inp->nlon-1) = *(grid+3);
/* Height --------------------------------------------------------------- */
// Allocate memory
inp->h = (double*) malloc(inp->nlat*sizeof(double));
if (inp->h== NULL){
printf("Error: Memory for HEIGHT (inp.h) points not allocated!");
return -ENOMEM;
}
// Fill in values
for (n = 0; n < inp->nlat; n++) {
*(inp->h+n) = *(inp->h+n-1) + *(grid+6);
}
// Return from function
return 0;
}
These geographic coordinates are then transformed to spherical coordinates for the computation using another (sub-)routine
int geo2sph(struct comp_struct *inp, int *lgrid)
{
// Declare variables
double a = 6378137.0, e2 = 6.69437999014E-3; /* WGS84 parameters */
double x, y, z, sinlat, coslat, sinlon, coslon, R_E;
int i, j, nid;
/* Allocate space ------------------------------------------------------- */
// radius
inp->r = (double*) malloc(inp->nlat*sizeof(double));
if (inp->r== NULL){
printf("Error: Memory for SPHERICAL DISTANCE (inp.r) points not allocated!");
return -ENOMEM;
}
// phi
inp->phi = (double*) malloc(inp->nlat*sizeof(double));
if (inp->phi== NULL){
printf("Error: Memory for SPHERICAL LATITUDE (inp.phi) points not allocated!");
return -ENOMEM;
}
/* Loop over latitude =================================================== */
for (i = 0; i < inp->nlat; i++) {
// Compute sine and cosine of latitude
sinlat = sin(*(inp->lat+i));
coslat = cos(*(inp->lat+i));
// Compute radius of curvature
R_E = a / sqrt( 1.0 - e2*sinlat*sinlat );
// Compute sine and cosine of longitude
sinlon = sin(*(inp->lon));
coslon = cos(*(inp->lon));
// Compute rectangular coordinates
x = ( R_E + *(inp->h+i) ) * coslat * coslon;
y = ( R_E + *(inp->h+i) ) * coslat * sinlon;
z = ( R_E*(1.0-e2) + *(inp->h+i) ) * sinlat;
// Compute sqrt( x^2 + y^2 )
R_E = sqrt( x*x + y*y );
// Derive radial distance
*(inp->r+i) = sqrt( R_E * R_E + z*z );
// Derive spherical latitude
if (R_E < 1) {
if (z > 0) { *(inp->phi+i) = M_PI/2.0; }
else { *(inp->phi+i) = -M_PI/2.0; }
}
else {
*(inp->phi+i) = asin( z / *(inp->r+i) );
}
}
// Return from function
return 0;
}
Finally, the gravitational potential is computed within is own (sub-)function. This is the core part of the code, which is more or less the same as for the MATLAB C-MEX function. The only difference seems to be that before (in MATLAB MEX) everything was defined as (simple) double variables - now the variables are located inside a structure which contains pointers.
int gravpot(struct comp_struct *inp, struct ggm_struct *ggm, int *nmax,
int *mmax, int *lgrid, struct func_struct *out)
{
// Declare variables
double GMr, ar, t, u, u2, arn, gnm, hnm, P, Pp1, Pp2, msum;
double Pmm[*nmax+1], CPnm[*mmax+1], SPnm[*mmax+1];
int i, j, n, m, id;
// Allocate memory
out->rval = (double*) malloc(inp->nlat*inp->nlon*sizeof(double));
if (out->rval== NULL){
printf("Error: Memory for OUTPUT (out.rval) not allocated!");
return -ENOMEM;
}
/* Compute sectorial values of associated Legendre polynomials ========== */
// Define seed values ( divided by u^m )
Pmm[0] = sfac;
Pmm[1] = sqrt3 * sfac;
// Compute sectorial values, [1] eq. 13 and 28 ( divided by u^m )
for (m = 2; m <= *nmax; m++) {
Pmm[m] = sqrt( (2.0*m+1.0) / (2.0*m) ) * Pmm[m-1];
}
/* ====================================================================== */
/* Loop over latitude =================================================== */
for (i = 0; i < inp->nlat; i++) {
// Compute ratios to be used in summation
GMr = ggm->GM / *(inp->r+i);
ar = ggm->R / *(inp->r+i);
/* ---------------------------------------------------------------------
* Compute product of Legendre values and spherical harmonic coefficients.
* Products of similar degree are summed together, resulting in mmax
* values. The degree terms are latitude dependent, such that these mmax
* sums are valid for every point with the same latitude.
* The values of the associated Legendre polynomials, Pnm, are scaled by
* sfac = 10^(-280) and divided by u^m in order to prevent underflow and
* overflow of the coefficients.
* ------------------------------------------------------------------ */
// Form coefficients for Legendre recursive algorithm
t = sin(*(inp->phi+i));
u = cos(*(inp->phi+i));
u2 = u * u;
arn = ar;
/* Degree n = 0 terms ----------------------------------------------- */
// Compute order m = 0 term (S term is zero)
CPnm[0] = Pmm[0] * *(ggm->C);
/* Degree n = 1 terms ----------------------------------------------- */
// Compute (1,1) terms, [1] eq. 3
CPnm[1] = ar * Pmm[1] * *(ggm->C+2);
SPnm[1] = ar * Pmm[1] * *(ggm->S+2);
// Compute (1,0) Legendre value, [1] eq. 18 and 27
P = t * Pmm[1];
// Add (1,0) terms to sum (S term is zero), [1] eq. 3
CPnm[0] = CPnm[0] + ar * P * *(ggm->C+1);
/* Degree n = [2,n_max] --------------------------------------------- */
for (n = 2; n <= *nmax; n++) {
// Compute power term
arn = arn * ar;
/* Compute sectorial (m=n) terms ++++++++++++++++++++++++++++++++ */
// Extract associated Legendre value
Pp1 = Pmm[n];
// Compute product terms, [1] eq. 3
if (n <= *mmax) {
id = (n+1)*n/2 + n;
CPnm[n] = arn * Pp1 * *(ggm->C+id);
SPnm[n] = arn * Pp1 * *(ggm->S+id);
}
/* Compute first non-sectorial terms (m=n-1) ++++++++++++++++++++ */
// Compute associated Legendre value, [1] eq. 18 and 27
gnm = sqrt( 2.0*n );
P = gnm * t * Pp1;
// Add terms to summation, eq. 3 in [1]
if (n-1 <= *mmax) {
id = (n+1)*n/2 + n - 1;
CPnm[n-1] = CPnm[n-1] + arn * P * *(ggm->C+id);
SPnm[n-1] = SPnm[n-1] + arn * P * *(ggm->S+id);
}
/* Compute terms of order m = [n-2,1] +++++++++++++++++++++++++++ */
for (m = n-2; m > 0; m--) {
// Set previous values
Pp2 = Pp1;
Pp1 = P;
// Compute associated Legendre value, [1] eq. 18, 19 and 27
gnm = 2.0*(m+1.0) / sqrt( (n-m)*(n+m+1.0) );
hnm = sqrt( (n+m+2.0)*(n-m-1.0)/(n-m)/(n+m+1.0) );
P = gnm * t * Pp1 - hnm * u2 * Pp2;
// Add product terms to summation, eq. 3 in [1]
if (m <= *mmax) {
id = (n+1)*n/2 + m;
CPnm[m] = CPnm[m] + arn * P * *(ggm->C+id);
SPnm[m] = SPnm[m] + arn * P * *(ggm->S+id);
}
}
/* Compute zonal terms (m=0) ++++++++++++++++++++++++++++++++++++ */
// Compute associated Legendre value, [1] eq. 18, 19 and 27
gnm = 2.0 / sqrt( n*(n+1.0) );
hnm = sqrt( (n+2.0)*(n-1.0)/n/(n+1.0) );
P = ( gnm * t * P - hnm * u2 * Pp1 ) / sqrt2;
// Add terms to summation (S term is zero), [1] eq. 3
id = (n+1)*n/2;
CPnm[0] = CPnm[0] + arn * P * *(ggm->C+id);
} /* ---------------------------------------------------------------- */
/* Loop over longitude ============================================== */
for (j = 0; j < inp->nlon; j++) {
/* -----------------------------------------------------------------
* All associated Legendre polynomials (latitude dependent) are now
* computed and multiplied by the corresponding spherical harmonic
* coefficient. These products are scaled by u^m, meaning that
* Horner's scheme is used in the following summation.
* -------------------------------------------------------------- */
// Initialise order-dependent sum
msum = 0.0;
// Derive longitude id
id = j + i * *lgrid;
// Loop over order (m > 0)
for (m = *mmax; m > 0; m--) {
// Add to order-dependent sum using Horner's scheme, [1] eq. 2, 3 and 31
msum = ( msum + cos( m * *(inp->lon+id) ) * CPnm[m]
+ sin( m * *(inp->lon+id) ) * SPnm[m] ) * u;
}
// Add zero order term to sum
msum = msum + CPnm[0];
// Rescale value into gravitational potential, [1] eq. 1
*(out->rval+i+j*inp->nlat) = GMr * msum / sfac;
} /* ================================================================ */
} /* ==================================================================== */
// Return from function
return 0;
}
Again, any help is greatly appreciated and additional information can be supplied if relevant, but this already became a long post. I have a hard time accepting that pure c-code runs slower than the MATLAB C-MEX code.
For example, you want to allow users/programmers to create new games and set values, but they should not have any access to the underlying code within the struct. In order to do this, we can use pointers. Let's look how we can do this.
You can dereference a pointer to get the address. Further, you can create a pointer to a struct. This is useful when creating abstract data types, which are structures in which you hide the details from users/other programs. To reference elements in a struct that is a pointer, use the -> notation to get to those specific values.
It is not a big deal to access pointer inside a structure in c but still, there are a lot of people who make mistakes. In this article, I will write a method to describe the way to how to access a pointer from a structure.
A pointer is a variable that points to the memory address instead of a value. You can dereference a pointer to get the address. Further, you can create a pointer to a struct. This is useful when creating abstract data types, which are structures in which you hide the details from users/other programs.
To put it simply, yes, pointers can prevent some compiler optimizations resulting in a potential slow down. At least, this is clearly the case with ICC and a bit with GCC. The performance of the program is strongly impacted by pointer aliasing and vectorization.
Indeed, the compiler cannot easily know if the provided pointers alias each other or alias with the address with some fields of the provided data structure. As a result, the compilers tends to be conservative and assume that the pointed value can change at any time and reload them often. This can prevent optimizations like the splitting of some loops in gravpot
(with GCC -- see line 119 of this modified code). Moreover, indirections and aliasing tends to prevent the vectorization of the hot loops (ie. the use of SIMD instructions provided by the target processor). Vectorisation can strongly impact the performance of a code.
To give an example, here is the initial code of geo2sph
and here is a slightly modified implementation. In the first case, ICC generate a slow scalar implementation, while in the second case, ICC generate a significantly faster vectorized implementation. The only difference between the two implementation is the use of the restrict
keyword. This keyword tell to the compiler that for the lifetime of the pointer, only the pointer itself or a value directly derived from it (such as pointer+1) will be used to access the object to which it points (see here for more information). Note that the use of the restrict
keyword is dangerous and one should be very careful while using it since the compiler may generate a bad code if the restrict hint is wrong (very hard to debug). Alternatively, you can help the compiler to generate a vectorized code using the OpenMP SIMD directive #pragma omp simd
(see here for the result). Note that you should be sure the target code can be safely vectorized (eg. iterations of must be independent).
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With